1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
41 #include "i420_yuy2.h"
43 #define SRC_FOURCC "I420,IYUV,YV12"
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
55 /*****************************************************************************
56 * Local and extern prototypes.
57 *****************************************************************************/
/* Module activation callback: validates the chroma pair and installs one of
 * the converters below into p_vout->chroma.pf_convert. */
58 static int Activate ( vlc_object_t * );
/* Planar I420/IYUV/YV12 to packed 4:2:2 converters, one per output FOURCC. */
60 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
61 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
63 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* IUYV (interleaved) and cyuv (upside-down UYVY) have no AltiVec variant. */
64 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
65 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
67 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Y211 (packed, chroma and luma further subsampled) exists only in plain C. */
68 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
71 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
72 /* Initialize MMX-specific constants */
/* i_00ffw: 0x00ff mask in each 16-bit lane; i_80w: 0x80 byte pattern in the
 * low dword.  Both are operands for the MMX conversion kernels. */
73 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
74 static const uint64_t i_80w = 0x0000000080808080ULL;
77 /*****************************************************************************
79 *****************************************************************************/
/* Module descriptor: each SIMD build of this file registers the same "chroma"
 * capability at a different priority, so the core picks the fastest variant
 * available: SSE2 (120) > MMX / AltiVec (100) > plain C (80). */
81 #if defined (MODULE_NAME_IS_i420_yuy2)
82 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83 set_capability( "chroma", 80 );
84 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
85 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
86 set_capability( "chroma", 100 );
87 add_requirement( MMX );
88 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
89 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
90 set_capability( "chroma", 120 );
91 add_requirement( SSE2 );
92 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
94 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
95 set_capability( "chroma", 100 );
96 add_requirement( ALTIVEC );
/* No deactivation needed: Activate only sets a function pointer. */
98 set_callbacks( Activate, NULL );
101 /*****************************************************************************
102 * Activate: allocate a chroma function
103 *****************************************************************************
104 * This function allocates and initializes a chroma function
105 *****************************************************************************/
106 static int Activate( vlc_object_t *p_this )
108 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* 4:2:0 carries one chroma sample per 2x2 luma block, and the converters
 * below process two lines / two pixels at a time, so odd dimensions are
 * rejected up front. */
110 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
/* Outer switch: accepted source chromas are the I420 family (YV12 shares
 * the layout with U/V planes swapped).  Inner switch: pick the converter
 * matching the requested output FOURCC. */
115 switch( p_vout->render.i_chroma )
117 case VLC_FOURCC('Y','V','1','2'):
118 case VLC_FOURCC('I','4','2','0'):
119 case VLC_FOURCC('I','Y','U','V'):
120 switch( p_vout->output.i_chroma )
122 case VLC_FOURCC('Y','U','Y','2'):
123 case VLC_FOURCC('Y','U','N','V'):
124 p_vout->chroma.pf_convert = I420_YUY2;
127 case VLC_FOURCC('Y','V','Y','U'):
128 p_vout->chroma.pf_convert = I420_YVYU;
131 case VLC_FOURCC('U','Y','V','Y'):
132 case VLC_FOURCC('U','Y','N','V'):
133 case VLC_FOURCC('Y','4','2','2'):
134 p_vout->chroma.pf_convert = I420_UYVY;
136 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
137 case VLC_FOURCC('I','U','Y','V'):
138 p_vout->chroma.pf_convert = I420_IUYV;
141 case VLC_FOURCC('c','y','u','v'):
142 p_vout->chroma.pf_convert = I420_cyuv;
146 #if defined (MODULE_NAME_IS_i420_yuy2)
147 case VLC_FOURCC('Y','2','1','1'):
148 p_vout->chroma.pf_convert = I420_Y211;
/* Profiling helper: read the x86 time-stamp counter via RDTSC.
 * NOTE(review): the "=A" constraint denotes the EDX:EAX register pair only on
 * 32-bit x86; on x86-64 it does not combine the two registers — confirm this
 * helper is compiled for 32-bit targets only. */
165 static inline unsigned long long read_cycles(void)
167 unsigned long long v;
168 __asm__ __volatile__("rdtsc" : "=A" (v): );
174 /* Following functions are local */
175 /*****************************************************************************
176 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
177 *****************************************************************************/
/* Emit Y0 U0 Y1 V0 quadruplets.  Since 4:2:0 chroma is vertically subsampled,
 * two output lines are produced per pass, both sharing the same U/V row. */
178 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
/* Line pairs: p_line1/p_line2 walk the destination, p_y1/p_y2 the luma
 * plane; p_u/p_v walk the half-height chroma planes once per line pair. */
181 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
182 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
183 uint8_t *p_u = p_source->U_PIXELS;
184 uint8_t *p_v = p_source->V_PIXELS;
188 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of lines. */
189 #define VEC_NEXT_LINES( ) \
191 p_line2 += p_dest->p->i_pitch; \
193 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load 16 U and 16 V bytes; one such load covers 32 output pixels. */
195 #define VEC_LOAD_UV( ) \
196 u_vec = vec_ld( 0, p_u ); p_u += 16; \
197 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Interleave U with V ('a' is vec_mergeh or vec_mergel, selecting which half
 * of the chroma vectors to use), then merge luma-first with both lines of Y
 * to store 32 bytes of Y U Y V per line. */
199 #define VEC_MERGE( a ) \
200 uv_vec = a( u_vec, v_vec ); \
201 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
202 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
203 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
204 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
205 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
206 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
208 vector unsigned char u_vec;
209 vector unsigned char v_vec;
210 vector unsigned char uv_vec;
211 vector unsigned char y_vec;
/* AltiVec fast paths require vector-friendly dimensions; anything else
 * falls through to the scalar C implementation below. */
213 if( !( ( p_vout->render.i_width % 32 ) |
214 ( p_vout->render.i_height % 2 ) ) )
216 /* Width is a multiple of 32, we take 2 lines at a time */
217 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
220 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
223 VEC_MERGE( vec_mergeh );
224 VEC_MERGE( vec_mergel );
228 else if( !( ( p_vout->render.i_width % 16 ) |
229 ( p_vout->render.i_height % 4 ) ) )
231 /* Width is only a multiple of 16, we take 4 lines at a time */
232 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
234 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
236 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
239 VEC_MERGE( vec_mergeh );
240 VEC_MERGE( vec_mergel );
243 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
245 VEC_MERGE( vec_mergeh );
247 /* Line 3 and 4, pixels 0 to 16 */
249 VEC_MERGE( vec_mergel );
251 /* Line 3 and 4, pixels 16 to ( width ) */
252 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
255 VEC_MERGE( vec_mergeh );
256 VEC_MERGE( vec_mergel );
262 /* Crap, use the C version */
263 #undef VEC_NEXT_LINES
/* Padding between the visible pixels and the allocated pitch, added at the
 * end of every line to keep the pointers aligned with the buffers. */
268 const int i_source_margin = p_source->p[0].i_pitch
269 - p_source->p[0].i_visible_pitch;
270 const int i_source_margin_c = p_source->p[1].i_pitch
271 - p_source->p[1].i_visible_pitch;
272 const int i_dest_margin = p_dest->p->i_pitch
273 - p_dest->p->i_visible_pitch;
/* C / MMX path: two lines per iteration, 8 pixels per inner step. */
275 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
276 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
279 p_line2 += p_dest->p->i_pitch;
282 p_y2 += p_source->p[Y_PLANE].i_pitch;
284 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
285 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
293 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
295 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover (width % 8) pixels are converted two at a time in C. */
298 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
303 p_y1 += i_source_margin;
304 p_y2 += i_source_margin;
305 p_u += i_source_margin_c;
306 p_v += i_source_margin_c;
307 p_line1 += i_dest_margin;
308 p_line2 += i_dest_margin;
311 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
312 /* re-enable FPU registers */
316 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
320 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
322 ** SSE2 128 bits fetch/store instructions are faster
323 ** if memory access is 16 bytes aligned
/* Aligned variant is usable only when both pitches and both base pointers
 * are 16-byte aligned; otherwise fall back to unaligned loads/stores. */
326 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
327 ((intptr_t)p_line2|(intptr_t)p_y2))) )
329 /* use faster SSE2 aligned fetch and store */
330 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
333 p_line2 += p_dest->p->i_pitch;
336 p_y2 += p_source->p[Y_PLANE].i_pitch;
338 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
340 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
342 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
347 p_y1 += i_source_margin;
348 p_y2 += i_source_margin;
349 p_u += i_source_margin_c;
350 p_v += i_source_margin_c;
351 p_line1 += i_dest_margin;
352 p_line2 += i_dest_margin;
357 /* use slower SSE2 unaligned fetch and store */
358 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
361 p_line2 += p_dest->p->i_pitch;
364 p_y2 += p_source->p[Y_PLANE].i_pitch;
366 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
368 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
370 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
375 p_y1 += i_source_margin;
376 p_y2 += i_source_margin;
377 p_u += i_source_margin_c;
378 p_v += i_source_margin_c;
379 p_line1 += i_dest_margin;
380 p_line2 += i_dest_margin;
383 /* make sure all SSE2 stores are visible thereafter */
386 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
389 /*****************************************************************************
390 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
391 *****************************************************************************/
/* Identical structure to I420_YUY2, but chroma bytes are emitted V before U,
 * producing Y0 V0 Y1 U0 quadruplets. */
392 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
395 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
396 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
397 uint8_t *p_u = p_source->U_PIXELS;
398 uint8_t *p_v = p_source->V_PIXELS;
402 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of lines. */
403 #define VEC_NEXT_LINES( ) \
405 p_line2 += p_dest->p->i_pitch; \
407 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load 16 U and 16 V bytes; one such load covers 32 output pixels. */
409 #define VEC_LOAD_UV( ) \
410 u_vec = vec_ld( 0, p_u ); p_u += 16; \
411 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Note the swapped operands versus I420_YUY2: V is merged ahead of U. */
413 #define VEC_MERGE( a ) \
414 vu_vec = a( v_vec, u_vec ); \
415 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
416 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
417 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
418 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
419 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
420 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
422 vector unsigned char u_vec;
423 vector unsigned char v_vec;
424 vector unsigned char vu_vec;
425 vector unsigned char y_vec;
/* AltiVec fast paths need vector-friendly dimensions, else use the C code. */
427 if( !( ( p_vout->render.i_width % 32 ) |
428 ( p_vout->render.i_height % 2 ) ) )
430 /* Width is a multiple of 32, we take 2 lines at a time */
431 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
434 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
437 VEC_MERGE( vec_mergeh );
438 VEC_MERGE( vec_mergel );
442 else if( !( ( p_vout->render.i_width % 16 ) |
443 ( p_vout->render.i_height % 4 ) ) )
445 /* Width is only a multiple of 16, we take 4 lines at a time */
446 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
448 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
450 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
453 VEC_MERGE( vec_mergeh );
454 VEC_MERGE( vec_mergel );
457 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
459 VEC_MERGE( vec_mergeh );
461 /* Line 3 and 4, pixels 0 to 16 */
463 VEC_MERGE( vec_mergel );
465 /* Line 3 and 4, pixels 16 to ( width ) */
466 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
469 VEC_MERGE( vec_mergeh );
470 VEC_MERGE( vec_mergel );
476 /* Crap, use the C version */
477 #undef VEC_NEXT_LINES
/* Per-line padding between visible pixels and the allocated pitch. */
482 const int i_source_margin = p_source->p[0].i_pitch
483 - p_source->p[0].i_visible_pitch;
484 const int i_source_margin_c = p_source->p[1].i_pitch
485 - p_source->p[1].i_visible_pitch;
486 const int i_dest_margin = p_dest->p->i_pitch
487 - p_dest->p->i_visible_pitch;
/* C / MMX path: two lines per iteration, 8 pixels per inner step. */
489 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
490 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
493 p_line2 += p_dest->p->i_pitch;
496 p_y2 += p_source->p[Y_PLANE].i_pitch;
498 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
500 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
506 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover (width % 8) pixels are converted two at a time in C. */
509 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
514 p_y1 += i_source_margin;
515 p_y2 += i_source_margin;
516 p_u += i_source_margin_c;
517 p_v += i_source_margin_c;
518 p_line1 += i_dest_margin;
519 p_line2 += i_dest_margin;
522 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
523 /* re-enable FPU registers */
527 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
531 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
533 ** SSE2 128 bits fetch/store instructions are faster
534 ** if memory access is 16 bytes aligned
/* Aligned variant only when pitches and base pointers are 16-byte aligned. */
536 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
537 ((intptr_t)p_line2|(intptr_t)p_y2))) )
539 /* use faster SSE2 aligned fetch and store */
540 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
543 p_line2 += p_dest->p->i_pitch;
546 p_y2 += p_source->p[Y_PLANE].i_pitch;
548 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
550 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
552 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
557 p_y1 += i_source_margin;
558 p_y2 += i_source_margin;
559 p_u += i_source_margin_c;
560 p_v += i_source_margin_c;
561 p_line1 += i_dest_margin;
562 p_line2 += i_dest_margin;
567 /* use slower SSE2 unaligned fetch and store */
568 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
571 p_line2 += p_dest->p->i_pitch;
574 p_y2 += p_source->p[Y_PLANE].i_pitch;
576 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
578 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
580 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
585 p_y1 += i_source_margin;
586 p_y2 += i_source_margin;
587 p_u += i_source_margin_c;
588 p_v += i_source_margin_c;
589 p_line1 += i_dest_margin;
590 p_line2 += i_dest_margin;
593 /* make sure all SSE2 stores are visible thereafter */
595 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
598 /*****************************************************************************
599 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
600 *****************************************************************************/
/* Same structure again, but chroma leads luma: U0 Y0 V0 Y1 quadruplets
 * (uv_vec is the FIRST operand of the merge below, unlike I420_YUY2). */
601 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
604 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
605 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
606 uint8_t *p_u = p_source->U_PIXELS;
607 uint8_t *p_v = p_source->V_PIXELS;
611 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of lines. */
612 #define VEC_NEXT_LINES( ) \
614 p_line2 += p_dest->p->i_pitch; \
616 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load 16 U and 16 V bytes; one such load covers 32 output pixels. */
618 #define VEC_LOAD_UV( ) \
619 u_vec = vec_ld( 0, p_u ); p_u += 16; \
620 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Chroma-first merge: interleaved UV bytes precede each luma byte. */
622 #define VEC_MERGE( a ) \
623 uv_vec = a( u_vec, v_vec ); \
624 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
625 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
626 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
627 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
628 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
629 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
631 vector unsigned char u_vec;
632 vector unsigned char v_vec;
633 vector unsigned char uv_vec;
634 vector unsigned char y_vec;
/* AltiVec fast paths need vector-friendly dimensions, else use the C code. */
636 if( !( ( p_vout->render.i_width % 32 ) |
637 ( p_vout->render.i_height % 2 ) ) )
639 /* Width is a multiple of 32, we take 2 lines at a time */
640 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
643 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
646 VEC_MERGE( vec_mergeh );
647 VEC_MERGE( vec_mergel );
651 else if( !( ( p_vout->render.i_width % 16 ) |
652 ( p_vout->render.i_height % 4 ) ) )
654 /* Width is only a multiple of 16, we take 4 lines at a time */
655 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
657 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
659 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
662 VEC_MERGE( vec_mergeh );
663 VEC_MERGE( vec_mergel );
666 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
668 VEC_MERGE( vec_mergeh );
670 /* Line 3 and 4, pixels 0 to 16 */
672 VEC_MERGE( vec_mergel );
674 /* Line 3 and 4, pixels 16 to ( width ) */
675 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
678 VEC_MERGE( vec_mergeh );
679 VEC_MERGE( vec_mergel );
685 /* Crap, use the C version */
686 #undef VEC_NEXT_LINES
/* Per-line padding between visible pixels and the allocated pitch. */
691 const int i_source_margin = p_source->p[0].i_pitch
692 - p_source->p[0].i_visible_pitch;
693 const int i_source_margin_c = p_source->p[1].i_pitch
694 - p_source->p[1].i_visible_pitch;
695 const int i_dest_margin = p_dest->p->i_pitch
696 - p_dest->p->i_visible_pitch;
/* C / MMX path: two lines per iteration, 8 pixels per inner step. */
698 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
699 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
702 p_line2 += p_dest->p->i_pitch;
705 p_y2 += p_source->p[Y_PLANE].i_pitch;
707 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
709 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
715 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover (width % 8) pixels are converted two at a time in C. */
718 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
723 p_y1 += i_source_margin;
724 p_y2 += i_source_margin;
725 p_u += i_source_margin_c;
726 p_v += i_source_margin_c;
727 p_line1 += i_dest_margin;
728 p_line2 += i_dest_margin;
731 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
732 /* re-enable FPU registers */
736 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
740 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
742 ** SSE2 128 bits fetch/store instructions are faster
743 ** if memory access is 16 bytes aligned
/* Aligned variant only when pitches and base pointers are 16-byte aligned. */
745 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
746 ((intptr_t)p_line2|(intptr_t)p_y2))) )
748 /* use faster SSE2 aligned fetch and store */
749 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
752 p_line2 += p_dest->p->i_pitch;
755 p_y2 += p_source->p[Y_PLANE].i_pitch;
757 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
759 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
761 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
766 p_y1 += i_source_margin;
767 p_y2 += i_source_margin;
768 p_u += i_source_margin_c;
769 p_v += i_source_margin_c;
770 p_line1 += i_dest_margin;
771 p_line2 += i_dest_margin;
776 /* use slower SSE2 unaligned fetch and store */
777 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
780 p_line2 += p_dest->p->i_pitch;
783 p_y2 += p_source->p[Y_PLANE].i_pitch;
785 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
787 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
789 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
794 p_y1 += i_source_margin;
795 p_y2 += i_source_margin;
796 p_u += i_source_margin_c;
797 p_v += i_source_margin_c;
798 p_line1 += i_dest_margin;
799 p_line2 += i_dest_margin;
802 /* make sure all SSE2 stores are visible thereafter */
804 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
807 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
808 /*****************************************************************************
809 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
810 *****************************************************************************/
/* Stub: this conversion was never implemented.  It only logs an error; the
 * source and destination pictures are intentionally unused. */
811 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
814 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
816 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
819 /*****************************************************************************
820 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
821 *****************************************************************************/
/* Like I420_UYVY but vertically flipped: destination line pointers start at
 * the BOTTOM of the picture and walk upward while the source is read
 * top-down. */
822 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
/* p_line1 starts one pitch past the last visible line, p_line2 at the last
 * visible line; each pass rewinds them by 3 pitches (see below), netting one
 * line-pair upward per iteration once the forward writes are accounted for. */
825 uint8_t *p_line1 = p_dest->p->p_pixels +
826 p_dest->p->i_visible_lines * p_dest->p->i_pitch
827 + p_dest->p->i_pitch;
828 uint8_t *p_line2 = p_dest->p->p_pixels +
829 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
830 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
831 uint8_t *p_u = p_source->U_PIXELS;
832 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding between visible pixels and the allocated pitch. */
836 const int i_source_margin = p_source->p[0].i_pitch
837 - p_source->p[0].i_visible_pitch;
838 const int i_source_margin_c = p_source->p[1].i_pitch
839 - p_source->p[1].i_visible_pitch;
840 const int i_dest_margin = p_dest->p->i_pitch
841 - p_dest->p->i_visible_pitch;
/* C / MMX path: source advances downward, destination rewinds upward. */
843 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
844 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
846 p_line1 -= 3 * p_dest->p->i_pitch;
847 p_line2 -= 3 * p_dest->p->i_pitch;
850 p_y2 += p_source->p[Y_PLANE].i_pitch;
852 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
854 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Pixel layout is UYVY, so the UYVY kernels are reused as-is. */
860 MMX_CALL( MMX_YUV420_UYVY );
863 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
868 p_y1 += i_source_margin;
869 p_y2 += i_source_margin;
870 p_u += i_source_margin_c;
871 p_v += i_source_margin_c;
872 p_line1 += i_dest_margin;
873 p_line2 += i_dest_margin;
876 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
877 /* re-enable FPU registers */
881 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
883 ** SSE2 128 bits fetch/store instructions are faster
884 ** if memory access is 16 bytes aligned
/* NOTE(review): unlike the C/MMX path above, the visible SSE2 loop advances
 * p_line2 forward (+= pitch) — confirm the elided lines perform the
 * backward stepping required for the vertical flip. */
886 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
887 ((intptr_t)p_line2|(intptr_t)p_y2))) )
889 /* use faster SSE2 aligned fetch and store */
890 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
893 p_line2 += p_dest->p->i_pitch;
896 p_y2 += p_source->p[Y_PLANE].i_pitch;
898 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
900 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
902 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
907 p_y1 += i_source_margin;
908 p_y2 += i_source_margin;
909 p_u += i_source_margin_c;
910 p_v += i_source_margin_c;
911 p_line1 += i_dest_margin;
912 p_line2 += i_dest_margin;
917 /* use slower SSE2 unaligned fetch and store */
918 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
921 p_line2 += p_dest->p->i_pitch;
924 p_y2 += p_source->p[Y_PLANE].i_pitch;
926 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
928 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
930 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
935 p_y1 += i_source_margin;
936 p_y2 += i_source_margin;
937 p_u += i_source_margin_c;
938 p_v += i_source_margin_c;
939 p_line1 += i_dest_margin;
940 p_line2 += i_dest_margin;
943 /* make sure all SSE2 stores are visible thereafter */
945 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
947 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
949 /*****************************************************************************
950 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
951 *****************************************************************************/
/* Plain-C-only converter to the Y211 packed format; two lines per pass as
 * above, with the inner loop consuming 8 luma pixels per step. */
952 #if defined (MODULE_NAME_IS_i420_yuy2)
953 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
956 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
957 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
958 uint8_t *p_u = p_source->U_PIXELS;
959 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding between visible pixels and the allocated pitch. */
963 const int i_source_margin = p_source->p[0].i_pitch
964 - p_source->p[0].i_visible_pitch;
965 const int i_source_margin_c = p_source->p[1].i_pitch
966 - p_source->p[1].i_visible_pitch;
967 const int i_dest_margin = p_dest->p->i_pitch
968 - p_dest->p->i_visible_pitch;
970 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
973 p_line2 += p_dest->p->i_pitch;
976 p_y2 += p_source->p[Y_PLANE].i_pitch;
978 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
/* Skip the pitch padding on every plane before the next line pair. */
984 p_y1 += i_source_margin;
985 p_y2 += i_source_margin;
986 p_u += i_source_margin_c;
987 p_v += i_source_margin_c;
988 p_line1 += i_dest_margin;
989 p_line2 += i_dest_margin;