/*****************************************************************************
 * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2000 the VideoLAN team
 * $Id$
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damienf@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <string.h>                                            /* strerror() */
#include <stdlib.h>                                      /* malloc(), free() */
#include <stdint.h>                                             /* uintptr_t */

#include <vlc/vlc.h>
#include <vlc_vout.h>

#include "i420_rgb.h"
#if defined (MODULE_NAME_IS_i420_rgb)
#   include "i420_rgb_c.h"
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
#   if defined(HAVE_MMX_INTRINSICS)
#       include <mmintrin.h>
#   endif
#   include "i420_rgb_mmx.h"
#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
#   if defined(HAVE_SSE2_INTRINSICS)
#       include <emmintrin.h>
#   endif
#   include "i420_rgb_mmx.h"
#endif

static void SetOffset( int, int, int, int, vlc_bool_t *,
                       unsigned int *, int * );
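
/* Note on SetOffset (defined further down in this file): judging from the
 * call sites below, it compares the render dimensions against the output
 * dimensions, reports whether horizontal scaling is needed and which kind
 * of vertical scaling applies, and fills the offset table that the
 * SCALE_WIDTH macro consumes. */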

#if defined (MODULE_NAME_IS_i420_rgb)
/*****************************************************************************
 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp with dithering
 *****************************************************************************
 * Horizontal alignment needed:
 *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 *  - output: 1 pixel (2 bytes), margins allowed
 * Vertical alignment needed:
 *  - input: 2 lines (2 Y lines, 1 U/V line)
 *  - output: 1 line
 *****************************************************************************/
void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
                                                      picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
    unsigned int i_real_y;                                          /* y % 4 */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
    int         i_uval, i_vval;                           /* U and V samples */
    int         i_red, i_green, i_blue;          /* U and V modified samples */
    uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
    uint16_t *  p_ybase;                     /* Y dependent conversion table */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    /* The dithering matrices */
    int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
    int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
    int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
    int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
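
    /* Taken together, these four arrays hold the 16 thresholds of a classic
     * 4x4 ordered-dither (Bayer) pattern, a permutation of 0..15. As the
     * conversion loop below suggests, each array is selected per horizontal
     * position and indexed with i_real_y (= i_y & 3) inside the
     * CONVERT_*_PIXEL_DITHER macros; the loop that follows pre-shifts every
     * entry so the threshold lines up with the bits lost to RGB16 rounding. */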

    for(i_x = 0; i_x < 4; i_x++)
    {
        dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
        dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
        dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
        dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
    }

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }
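
    /* The main loop always converts groups of 8 pixels. When the width is
     * not a multiple of 8, i_rewind is how far the pointers must be backed
     * up before one extra group so the trailing pixels get converted too;
     * a few pixels are simply converted twice. */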

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );
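
    /* For instance, with equal aspect ratios (r2/r1 = 1), blowing up a
     * 360x288 picture to x1' = 720 on a 720x576 output gives
     * y1' = 720 * 576/720 * 1 = 576, i.e. the picture scales uniformly. */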

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;
    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        i_real_y = i_y & 0x3;
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;
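
        /* Each iteration handles 8 adjacent pixels: CONVERT_YUV_PIXEL_DITHER
         * fetches a fresh U/V pair (one chroma sample covers two pixels in
         * 4:2:0) while CONVERT_Y_PIXEL_DITHER reuses it and only reads a
         * new Y sample. */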
        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            int *p_dither = dither10;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither10;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            int *p_dither = dither10;
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither10;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
        }
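
        /* Inferred from usage: SCALE_WIDTH copies or stretches the converted
         * line from the work buffer to the output picture when horizontal
         * scaling is on, and SCALE_HEIGHT duplicates or skips lines for
         * vertical scaling, advancing the destination past i_right_margin.
         * The source margins below skip the pitch padding; the chroma
         * pointers move only every other line, since 4:2:0 stores one U/V
         * line per two Y lines. */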
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
}
#endif

/*****************************************************************************
 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
 *****************************************************************************
 * Horizontal alignment needed:
 *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 *  - output: 1 pixel (2 bytes), margins allowed
 * Vertical alignment needed:
 *  - input: 2 lines (2 Y lines, 1 U/V line)
 *  - output: 1 line
 *****************************************************************************/

#if defined (MODULE_NAME_IS_i420_rgb)

void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
    int         i_uval, i_vval;                           /* U and V samples */
    int         i_red, i_green, i_blue;          /* U and V modified samples */
    uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
    uint16_t *  p_ybase;                     /* Y dependent conversion table */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;
    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
}

#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)

void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );


    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128-bit fetch/store instructions are faster
    ** if the memory accesses are 16-byte aligned
    */
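
    /* OR-ing the two pitches and the two start addresses and masking with 15
     * is non-zero as soon as any of them is not 16-byte aligned, so the
     * aligned fast path is taken only when every access on every line stays
     * aligned. */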

    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((uintptr_t)p_y)|
                    ((uintptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
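                /* CAN_COMPILE_SSE2 selects the hand-written GNU inline
                 * assembly; otherwise the same pipeline is expressed through
                 * the SSE2 intrinsics wrappers (the *_INTRINSICS_* macros) */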
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_15_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_16_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_15_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_15_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_16_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_15_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_15
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;
            MMX_INTRINSICS_INIT_16
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_15
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_15
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            {
                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
                uint64_t tmp64;

                MMX_INTRINSICS_INIT_16
                MMX_INTRINSICS_YUV_MUL
                MMX_INTRINSICS_YUV_ADD
                MMX_INTRINSICS_UNPACK_15
            }
#endif
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
    /* re-enable FPU registers */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );


    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128-bit fetch/store instructions are faster
    ** if the memory accesses are 16-byte aligned
    */

    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((uintptr_t)p_y)|
                    ((uintptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_16_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_16_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_16_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_16_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_16_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_16_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_16
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;
            MMX_INTRINSICS_INIT_16
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_16
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_16
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            {
                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
                uint64_t tmp64;

                MMX_INTRINSICS_INIT_16
                MMX_INTRINSICS_YUV_MUL
                MMX_INTRINSICS_YUV_ADD
                MMX_INTRINSICS_UNPACK_16
            }
#endif
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
    /* re-enable FPU registers */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

#endif

/*****************************************************************************
 * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
 *****************************************************************************
 * Horizontal alignment needed:
 *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 *  - output: 1 pixel (4 bytes), margins allowed
 * Vertical alignment needed:
 *  - input: 2 lines (2 Y lines, 1 U/V line)
 *  - output: 1 line
 *****************************************************************************/

#if defined (MODULE_NAME_IS_i420_rgb)

void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
    int         i_uval, i_vval;                           /* U and V samples */
    int         i_red, i_green, i_blue;          /* U and V modified samples */
    uint32_t *  p_yuv = p_vout->chroma.p_sys->p_rgb32;
    uint32_t *  p_ybase;                     /* Y dependent conversion table */

    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;
    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
}

#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)

void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128-bit fetch/store instructions are faster
    ** if the memory accesses are 16-byte aligned
    */

    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((uintptr_t)p_y)|
                    ((uintptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_32_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8; /* 16 Y samples pair with 8 U and 8 V samples */
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_32_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

#else

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( MMX_INIT_32
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );

            __asm__( ".p2align 3"
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_ARGB
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX C intrinsics wrappers */
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_32
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_32_ARGB
#endif
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( ".p2align 3"
                     MMX_INIT_32
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_ARGB
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX intrinsics wrappers */
            {
                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
                uint64_t tmp64;

                MMX_INTRINSICS_INIT_32
                MMX_INTRINSICS_YUV_MUL
                MMX_INTRINSICS_YUV_ADD
                MMX_INTRINSICS_UNPACK_32_ARGB
            }
#endif
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
    /* re-enable FPU registers */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128-bit fetch/store instructions are faster
    ** if the memory accesses are 16-byte aligned
    */

    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((uintptr_t)p_y)|
                    ((uintptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_32_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
                }
#endif
                p_y += 16;
                /* 4:2:0 chroma is subsampled 2x horizontally: the 16 luma
                 * samples just converted consume 8 U and 8 V samples, as in
                 * the unaligned path below */
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
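            /* one chroma line serves two luma lines in 4:2:0, so the U/V
             * margins advance only every other pass */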
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter: the aligned
         * unpack macros are expected to use non-temporal stores, which
         * need a store fence before the picture is consumed */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */
                {
                    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                    SSE2_INTRINSICS_INIT_32_UNALIGNED
                    SSE2_INTRINSICS_YUV_MUL
                    SSE2_INTRINSICS_YUV_ADD
                    SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
                }
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

#else

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( MMX_INIT_32
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );

            /* unpack as BGRA so the main loop agrees with the rewind path
             * and the intrinsics fallback below */
            __asm__( ".p2align 3"
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_BGRA
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX C intrinsics wrappers */
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_32
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_32_BGRA
#endif
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( ".p2align 3"
                     MMX_INIT_32
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_BGRA
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX intrinsics wrappers */
            {
                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
                uint64_t tmp64;

                MMX_INTRINSICS_INIT_32
                MMX_INTRINSICS_YUV_MUL
                MMX_INTRINSICS_YUV_ADD
                MMX_INTRINSICS_UNPACK_32_BGRA
            }
#endif
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
    /* empty the MMX state: the MMX registers alias the x87 FPU stack, so
     * emms must run before any further floating-point code */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

#endif

/* Following functions are local */

/*****************************************************************************
 * SetOffset: build offset array for conversion functions
 *****************************************************************************
 * This function builds the offset array used by the conversion functions
 * and sets the horizontal and vertical scaling indicators.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, vlc_bool_t *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    int i_x;                                    /* x position in destination */
    int i_scale_count;                                     /* modulo counter */

    /*
     * Prepare horizontal offset array
     */
    if( i_pic_width - i_width == 0 )
    {
        /* No horizontal scaling: YUV conversion is done directly to picture */
        *pb_hscale = 0;
    }
    else if( i_pic_width - i_width > 0 )
    {
        /* Prepare scaling array for horizontal extension */
        *pb_hscale = 1;
        i_scale_count = i_pic_width;
        for( i_x = i_width; i_x--; )
        {
            while( (i_scale_count -= i_width) > 0 )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_scale_count += i_pic_width;
        }
    }
    else /* if( i_pic_width - i_width < 0 ) */
    {
        /* Prepare scaling array for horizontal reduction */
        *pb_hscale = 1;
        i_scale_count = i_width;
        for( i_x = i_pic_width; i_x--; )
        {
            *p_offset = 1;
            while( (i_scale_count -= i_pic_width) > 0 )
            {
                *p_offset += 1;
            }
            p_offset++;
            i_scale_count += i_width;
        }
    }
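
    /* Worked example (illustrative figures): expanding i_width = 3 to
     * i_pic_width = 5 yields the offsets 0, 1, 0, 1, 1 -- five entries,
     * one per output pixel, of which exactly three are 1 (advance the
     * source pointer).  Reducing i_width = 5 to i_pic_width = 3 yields
     * 2, 2, 1 -- three entries summing to 5, each giving the number of
     * source pixels consumed for that output pixel. */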

    /*
     * Set vertical scaling indicator
     */
    if( i_pic_height - i_height == 0 )
    {
        *pi_vscale = 0;
    }
    else if( i_pic_height - i_height > 0 )
    {
        *pi_vscale = 1;
    }
    else /* if( i_pic_height - i_height < 0 ) */
    {
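        /* pi_vscale is unsigned, so -1 wraps; the wrapped value simply
         * serves as a marker distinct from 0 and 1, flagging vertical
         * reduction */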
        *pi_vscale = -1;
    }
}
