git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
#include <stdint.h>                                            /* uintptr_t */
#include <string.h>                                            /* strerror() */

#include <vlc/vlc.h>
#include <vlc_vout.h>

#if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
#   include <altivec.h>
#endif

#include "i420_yuy2.h"
  38
  39 #define SRC_FOURCC  "I420,IYUV,YV12"
  40
  41 #if defined (MODULE_NAME_IS_i420_yuy2)
  42 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  43 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  44 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  45 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  46 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  47 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  48 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  49 #endif
  50
  51 /*****************************************************************************
  52  * Local and extern prototypes.
  53  *****************************************************************************/
  54 static int  Activate ( vlc_object_t * );
  55
  56 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
  57 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
  58 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
  59 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  60 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
  61 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
  62 #endif
  63 #if defined (MODULE_NAME_IS_i420_yuy2)
  64 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
  65 #endif
  66
  67 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  68 /* Initialize MMX-specific constants */
  69 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
  70 static const uint64_t i_80w   = 0x0000000080808080ULL;
  71 #endif
  72
  73 /*****************************************************************************
  74  * Module descriptor.
  75  *****************************************************************************/
  76 vlc_module_begin();
  77 #if defined (MODULE_NAME_IS_i420_yuy2)
  78     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  79     set_capability( "chroma", 80 );
  80 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  81     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  82     set_capability( "chroma", 100 );
  83     add_requirement( MMX );
  84 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  85     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  86     set_capability( "chroma", 120 );
  87     add_requirement( SSE2 );
  88 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  89     set_description(
  90             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  91     set_capability( "chroma", 100 );
  92     add_requirement( ALTIVEC );
  93 #endif
  94     set_callbacks( Activate, NULL );
  95 vlc_module_end();
  96
  97 /*****************************************************************************
  98  * Activate: allocate a chroma function
  99  *****************************************************************************
 100  * This function allocates and initializes a chroma function
 101  *****************************************************************************/
 102 static int Activate( vlc_object_t *p_this )
 103 {
 104     vout_thread_t *p_vout = (vout_thread_t *)p_this;
 105
 106     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
 107     {
 108         return -1;
 109     }
 110
 111     switch( p_vout->render.i_chroma )
 112     {
 113         case VLC_FOURCC('Y','V','1','2'):
 114         case VLC_FOURCC('I','4','2','0'):
 115         case VLC_FOURCC('I','Y','U','V'):
 116             switch( p_vout->output.i_chroma )
 117             {
 118                 case VLC_FOURCC('Y','U','Y','2'):
 119                 case VLC_FOURCC('Y','U','N','V'):
 120                     p_vout->chroma.pf_convert = I420_YUY2;
 121                     break;
 122
 123                 case VLC_FOURCC('Y','V','Y','U'):
 124                     p_vout->chroma.pf_convert = I420_YVYU;
 125                     break;
 126
 127                 case VLC_FOURCC('U','Y','V','Y'):
 128                 case VLC_FOURCC('U','Y','N','V'):
 129                 case VLC_FOURCC('Y','4','2','2'):
 130                     p_vout->chroma.pf_convert = I420_UYVY;
 131                     break;
 132 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 133                 case VLC_FOURCC('I','U','Y','V'):
 134                     p_vout->chroma.pf_convert = I420_IUYV;
 135                     break;
 136
 137                 case VLC_FOURCC('c','y','u','v'):
 138                     p_vout->chroma.pf_convert = I420_cyuv;
 139                     break;
 140 #endif
 141
 142 #if defined (MODULE_NAME_IS_i420_yuy2)
 143                 case VLC_FOURCC('Y','2','1','1'):
 144                     p_vout->chroma.pf_convert = I420_Y211;
 145                     break;
 146 #endif
 147
 148                 default:
 149                     return -1;
 150             }
 151             break;
 152
 153         default:
 154             return -1;
 155     }
 156
 157     return 0;
 158 }
 159
 160 #if 0
 161 static inline unsigned long long read_cycles(void)
 162 {
 163     unsigned long long v;
 164     __asm__ __volatile__("rdtsc" : "=A" (v): );
 165
 166     return v;
 167 }
 168 #endif
 169
 170 /* Following functions are local */
 171 /*****************************************************************************
 172  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 173  *****************************************************************************/
 174 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 175                                               picture_t *p_dest )
 176 {
 177     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 178     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 179     uint8_t *p_u = p_source->U_PIXELS;
 180     uint8_t *p_v = p_source->V_PIXELS;
 181
 182     int i_x, i_y;
 183
 184 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 185 #define VEC_NEXT_LINES( ) \
 186     p_line1  = p_line2; \
 187     p_line2 += p_dest->p->i_pitch; \
 188     p_y1     = p_y2; \
 189     p_y2    += p_source->p[Y_PLANE].i_pitch;
 190
 191 #define VEC_LOAD_UV( ) \
 192     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 193     v_vec = vec_ld( 0, p_v ); p_v += 16;
 194
 195 #define VEC_MERGE( a ) \
 196     uv_vec = a( u_vec, v_vec ); \
 197     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 198     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 199     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 200     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 201     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 202     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 203
 204     vector unsigned char u_vec;
 205     vector unsigned char v_vec;
 206     vector unsigned char uv_vec;
 207     vector unsigned char y_vec;
 208
 209     if( !( ( p_vout->render.i_width % 32 ) |
 210            ( p_vout->render.i_height % 2 ) ) )
 211     {
 212         /* Width is a multiple of 32, we take 2 lines at a time */
 213         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 214         {
 215             VEC_NEXT_LINES( );
 216             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 217             {
 218                 VEC_LOAD_UV( );
 219                 VEC_MERGE( vec_mergeh );
 220                 VEC_MERGE( vec_mergel );
 221             }
 222         }
 223     }
 224     else if( !( ( p_vout->render.i_width % 16 ) |
 225                 ( p_vout->render.i_height % 4 ) ) )
 226     {
 227         /* Width is only a multiple of 16, we take 4 lines at a time */
 228         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 229         {
 230             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 231             VEC_NEXT_LINES( );
 232             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 233             {
 234                 VEC_LOAD_UV( );
 235                 VEC_MERGE( vec_mergeh );
 236                 VEC_MERGE( vec_mergel );
 237             }
 238
 239             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 240             VEC_LOAD_UV( );
 241             VEC_MERGE( vec_mergeh );
 242
 243             /* Line 3 and 4, pixels 0 to 16 */
 244             VEC_NEXT_LINES( );
 245             VEC_MERGE( vec_mergel );
 246
 247             /* Line 3 and 4, pixels 16 to ( width ) */
 248             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 249             {
 250                 VEC_LOAD_UV( );
 251                 VEC_MERGE( vec_mergeh );
 252                 VEC_MERGE( vec_mergel );
 253             }
 254         }
 255     }
 256     else
 257     {
 258         /* Crap, use the C version */
 259 #undef VEC_NEXT_LINES
 260 #undef VEC_LOAD_UV
 261 #undef VEC_MERGE
 262 #endif
 263
 264     const int i_source_margin = p_source->p[0].i_pitch
 265                                  - p_source->p[0].i_visible_pitch;
 266     const int i_source_margin_c = p_source->p[1].i_pitch
 267                                  - p_source->p[1].i_visible_pitch;
 268     const int i_dest_margin = p_dest->p->i_pitch
 269                                - p_dest->p->i_visible_pitch;
 270
 271 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 272     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 273     {
 274         p_line1 = p_line2;
 275         p_line2 += p_dest->p->i_pitch;
 276
 277         p_y1 = p_y2;
 278         p_y2 += p_source->p[Y_PLANE].i_pitch;
 279
 280 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 281         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
 282         {
 283             C_YUV420_YUYV( );
 284             C_YUV420_YUYV( );
 285             C_YUV420_YUYV( );
 286             C_YUV420_YUYV( );
 287         }
 288 #else
 289         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 290         {
 291             MMX_CALL( MMX_YUV420_YUYV );
 292         }
 293 #endif
 294         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 295         {
 296             C_YUV420_YUYV( );
 297         }
 298
 299         p_y1 += i_source_margin;
 300         p_y2 += i_source_margin;
 301         p_u += i_source_margin_c;
 302         p_v += i_source_margin_c;
 303         p_line1 += i_dest_margin;
 304         p_line2 += i_dest_margin;
 305     }
 306
 307 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 308     /* re-enable FPU registers */
 309     MMX_END;
 310 #endif
 311
 312 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 313     }
 314 #endif
 315
 316 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 317     /*
 318     ** SSE2 128 bits fetch/store instructions are faster
 319     ** if memory access is 16 bytes aligned
 320     */
 321
 322     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 323         ((int)p_line2|(int)p_y2))) )
 324     {
 325         /* use faster SSE2 aligned fetch and store */
 326         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 327         {
 328             p_line1 = p_line2;
 329             p_line2 += p_dest->p->i_pitch;
 330
 331             p_y1 = p_y2;
 332             p_y2 += p_source->p[Y_PLANE].i_pitch;
 333
 334             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 335             {
 336                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 337             }
 338             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 339             {
 340                 C_YUV420_YUYV( );
 341             }
 342
 343             p_y1 += i_source_margin;
 344             p_y2 += i_source_margin;
 345             p_u += i_source_margin_c;
 346             p_v += i_source_margin_c;
 347             p_line1 += i_dest_margin;
 348             p_line2 += i_dest_margin;
 349         }
 350     }
 351     else
 352     {
 353         /* use slower SSE2 unaligned fetch and store */
 354         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 355         {
 356             p_line1 = p_line2;
 357             p_line2 += p_dest->p->i_pitch;
 358
 359             p_y1 = p_y2;
 360             p_y2 += p_source->p[Y_PLANE].i_pitch;
 361
 362             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 363             {
 364                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 365             }
 366             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 367             {
 368                 C_YUV420_YUYV( );
 369             }
 370
 371             p_y1 += i_source_margin;
 372             p_y2 += i_source_margin;
 373             p_u += i_source_margin_c;
 374             p_v += i_source_margin_c;
 375             p_line1 += i_dest_margin;
 376             p_line2 += i_dest_margin;
 377         }
 378     }
 379     /* make sure all SSE2 stores are visible thereafter */
 380     SSE2_END;
 381
 382 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 383 }
 384
 385 /*****************************************************************************
 386  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 387  *****************************************************************************/
 388 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
 389                                               picture_t *p_dest )
 390 {
 391     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 392     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 393     uint8_t *p_u = p_source->U_PIXELS;
 394     uint8_t *p_v = p_source->V_PIXELS;
 395
 396     int i_x, i_y;
 397
 398 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 399 #define VEC_NEXT_LINES( ) \
 400     p_line1  = p_line2; \
 401     p_line2 += p_dest->p->i_pitch; \
 402     p_y1     = p_y2; \
 403     p_y2    += p_source->p[Y_PLANE].i_pitch;
 404
 405 #define VEC_LOAD_UV( ) \
 406     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 407     v_vec = vec_ld( 0, p_v ); p_v += 16;
 408
 409 #define VEC_MERGE( a ) \
 410     vu_vec = a( v_vec, u_vec ); \
 411     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 412     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 413     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 414     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 415     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 416     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 417
 418     vector unsigned char u_vec;
 419     vector unsigned char v_vec;
 420     vector unsigned char vu_vec;
 421     vector unsigned char y_vec;
 422
 423     if( !( ( p_vout->render.i_width % 32 ) |
 424            ( p_vout->render.i_height % 2 ) ) )
 425     {
 426         /* Width is a multiple of 32, we take 2 lines at a time */
 427         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 428         {
 429             VEC_NEXT_LINES( );
 430             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 431             {
 432                 VEC_LOAD_UV( );
 433                 VEC_MERGE( vec_mergeh );
 434                 VEC_MERGE( vec_mergel );
 435             }
 436         }
 437     }
 438     else if( !( ( p_vout->render.i_width % 16 ) |
 439                 ( p_vout->render.i_height % 4 ) ) )
 440     {
 441         /* Width is only a multiple of 16, we take 4 lines at a time */
 442         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 443         {
 444             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 445             VEC_NEXT_LINES( );
 446             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 447             {
 448                 VEC_LOAD_UV( );
 449                 VEC_MERGE( vec_mergeh );
 450                 VEC_MERGE( vec_mergel );
 451             }
 452
 453             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 454             VEC_LOAD_UV( );
 455             VEC_MERGE( vec_mergeh );
 456
 457             /* Line 3 and 4, pixels 0 to 16 */
 458             VEC_NEXT_LINES( );
 459             VEC_MERGE( vec_mergel );
 460
 461             /* Line 3 and 4, pixels 16 to ( width ) */
 462             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 463             {
 464                 VEC_LOAD_UV( );
 465                 VEC_MERGE( vec_mergeh );
 466                 VEC_MERGE( vec_mergel );
 467             }
 468         }
 469     }
 470     else
 471     {
 472         /* Crap, use the C version */
 473 #undef VEC_NEXT_LINES
 474 #undef VEC_LOAD_UV
 475 #undef VEC_MERGE
 476 #endif
 477
 478     const int i_source_margin = p_source->p[0].i_pitch
 479                                  - p_source->p[0].i_visible_pitch;
 480     const int i_source_margin_c = p_source->p[1].i_pitch
 481                                  - p_source->p[1].i_visible_pitch;
 482     const int i_dest_margin = p_dest->p->i_pitch
 483                                - p_dest->p->i_visible_pitch;
 484
 485 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 486     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 487     {
 488         p_line1 = p_line2;
 489         p_line2 += p_dest->p->i_pitch;
 490
 491         p_y1 = p_y2;
 492         p_y2 += p_source->p[Y_PLANE].i_pitch;
 493
 494         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 495         {
 496 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 497             C_YUV420_YVYU( );
 498             C_YUV420_YVYU( );
 499             C_YUV420_YVYU( );
 500             C_YUV420_YVYU( );
 501 #else
 502             MMX_CALL( MMX_YUV420_YVYU );
 503 #endif
 504         }
 505         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 506         {
 507             C_YUV420_YVYU( );
 508         }
 509
 510         p_y1 += i_source_margin;
 511         p_y2 += i_source_margin;
 512         p_u += i_source_margin_c;
 513         p_v += i_source_margin_c;
 514         p_line1 += i_dest_margin;
 515         p_line2 += i_dest_margin;
 516     }
 517
 518 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 519     /* re-enable FPU registers */
 520     MMX_END;
 521 #endif
 522
 523 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 524     }
 525 #endif
 526
 527 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 528     /*
 529     ** SSE2 128 bits fetch/store instructions are faster
 530     ** if memory access is 16 bytes aligned
 531     */
 532     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 533         ((int)p_line2|(int)p_y2))) )
 534     {
 535         /* use faster SSE2 aligned fetch and store */
 536         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 537         {
 538             p_line1 = p_line2;
 539             p_line2 += p_dest->p->i_pitch;
 540
 541             p_y1 = p_y2;
 542             p_y2 += p_source->p[Y_PLANE].i_pitch;
 543
 544             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 545             {
 546                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 547             }
 548             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 549             {
 550                 C_YUV420_YVYU( );
 551             }
 552
 553             p_y1 += i_source_margin;
 554             p_y2 += i_source_margin;
 555             p_u += i_source_margin_c;
 556             p_v += i_source_margin_c;
 557             p_line1 += i_dest_margin;
 558             p_line2 += i_dest_margin;
 559         }
 560     }
 561     else
 562     {
 563         /* use slower SSE2 unaligned fetch and store */
 564         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 565         {
 566             p_line1 = p_line2;
 567             p_line2 += p_dest->p->i_pitch;
 568
 569             p_y1 = p_y2;
 570             p_y2 += p_source->p[Y_PLANE].i_pitch;
 571
 572             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 573             {
 574                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 575             }
 576             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 577             {
 578                 C_YUV420_YVYU( );
 579             }
 580
 581             p_y1 += i_source_margin;
 582             p_y2 += i_source_margin;
 583             p_u += i_source_margin_c;
 584             p_v += i_source_margin_c;
 585             p_line1 += i_dest_margin;
 586             p_line2 += i_dest_margin;
 587         }
 588     }
 589     /* make sure all SSE2 stores are visible thereafter */
 590     SSE2_END;
 591 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 592 }
 593
 594 /*****************************************************************************
 595  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 596  *****************************************************************************/
 597 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 598                                               picture_t *p_dest )
 599 {
 600     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 601     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 602     uint8_t *p_u = p_source->U_PIXELS;
 603     uint8_t *p_v = p_source->V_PIXELS;
 604
 605     int i_x, i_y;
 606
 607 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 608 #define VEC_NEXT_LINES( ) \
 609     p_line1  = p_line2; \
 610     p_line2 += p_dest->p->i_pitch; \
 611     p_y1     = p_y2; \
 612     p_y2    += p_source->p[Y_PLANE].i_pitch;
 613
 614 #define VEC_LOAD_UV( ) \
 615     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 616     v_vec = vec_ld( 0, p_v ); p_v += 16;
 617
 618 #define VEC_MERGE( a ) \
 619     uv_vec = a( u_vec, v_vec ); \
 620     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 621     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 622     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 623     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 624     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 625     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 626
 627     vector unsigned char u_vec;
 628     vector unsigned char v_vec;
 629     vector unsigned char uv_vec;
 630     vector unsigned char y_vec;
 631
 632     if( !( ( p_vout->render.i_width % 32 ) |
 633            ( p_vout->render.i_height % 2 ) ) )
 634     {
 635         /* Width is a multiple of 32, we take 2 lines at a time */
 636         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 637         {
 638             VEC_NEXT_LINES( );
 639             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 640             {
 641                 VEC_LOAD_UV( );
 642                 VEC_MERGE( vec_mergeh );
 643                 VEC_MERGE( vec_mergel );
 644             }
 645         }
 646     }
 647     else if( !( ( p_vout->render.i_width % 16 ) |
 648                 ( p_vout->render.i_height % 4 ) ) )
 649     {
 650         /* Width is only a multiple of 16, we take 4 lines at a time */
 651         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 652         {
 653             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 654             VEC_NEXT_LINES( );
 655             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 656             {
 657                 VEC_LOAD_UV( );
 658                 VEC_MERGE( vec_mergeh );
 659                 VEC_MERGE( vec_mergel );
 660             }
 661
 662             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 663             VEC_LOAD_UV( );
 664             VEC_MERGE( vec_mergeh );
 665
 666             /* Line 3 and 4, pixels 0 to 16 */
 667             VEC_NEXT_LINES( );
 668             VEC_MERGE( vec_mergel );
 669
 670             /* Line 3 and 4, pixels 16 to ( width ) */
 671             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 672             {
 673                 VEC_LOAD_UV( );
 674                 VEC_MERGE( vec_mergeh );
 675                 VEC_MERGE( vec_mergel );
 676             }
 677         }
 678     }
 679     else
 680     {
 681         /* Crap, use the C version */
 682 #undef VEC_NEXT_LINES
 683 #undef VEC_LOAD_UV
 684 #undef VEC_MERGE
 685 #endif
 686
 687     const int i_source_margin = p_source->p[0].i_pitch
 688                                  - p_source->p[0].i_visible_pitch;
 689     const int i_source_margin_c = p_source->p[1].i_pitch
 690                                  - p_source->p[1].i_visible_pitch;
 691     const int i_dest_margin = p_dest->p->i_pitch
 692                                - p_dest->p->i_visible_pitch;
 693
 694 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 695     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 696     {
 697         p_line1 = p_line2;
 698         p_line2 += p_dest->p->i_pitch;
 699
 700         p_y1 = p_y2;
 701         p_y2 += p_source->p[Y_PLANE].i_pitch;
 702
 703         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 704         {
 705 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 706             C_YUV420_UYVY( );
 707             C_YUV420_UYVY( );
 708             C_YUV420_UYVY( );
 709             C_YUV420_UYVY( );
 710 #else
 711             MMX_CALL( MMX_YUV420_UYVY );
 712 #endif
 713         }
 714         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
 715         {
 716             C_YUV420_UYVY( );
 717         }
 718
 719         p_y1 += i_source_margin;
 720         p_y2 += i_source_margin;
 721         p_u += i_source_margin_c;
 722         p_v += i_source_margin_c;
 723         p_line1 += i_dest_margin;
 724         p_line2 += i_dest_margin;
 725     }
 726
 727 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 728     /* re-enable FPU registers */
 729     MMX_END;
 730 #endif
 731
 732 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 733     }
 734 #endif
 735
 736 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 737     /*
 738     ** SSE2 128 bits fetch/store instructions are faster
 739     ** if memory access is 16 bytes aligned
 740     */
 741     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 742         ((int)p_line2|(int)p_y2))) )
 743     {
 744         /* use faster SSE2 aligned fetch and store */
 745         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 746         {
 747             p_line1 = p_line2;
 748             p_line2 += p_dest->p->i_pitch;
 749
 750             p_y1 = p_y2;
 751             p_y2 += p_source->p[Y_PLANE].i_pitch;
 752
 753             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 754             {
 755                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 756             }
 757             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 758             {
 759                 C_YUV420_UYVY( );
 760             }
 761
 762             p_y1 += i_source_margin;
 763             p_y2 += i_source_margin;
 764             p_u += i_source_margin_c;
 765             p_v += i_source_margin_c;
 766             p_line1 += i_dest_margin;
 767             p_line2 += i_dest_margin;
 768         }
 769     }
 770     else
 771     {
 772         /* use slower SSE2 unaligned fetch and store */
 773         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 774         {
 775             p_line1 = p_line2;
 776             p_line2 += p_dest->p->i_pitch;
 777
 778             p_y1 = p_y2;
 779             p_y2 += p_source->p[Y_PLANE].i_pitch;
 780
 781             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 782             {
 783                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 784             }
 785             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 786             {
 787                 C_YUV420_UYVY( );
 788             }
 789
 790             p_y1 += i_source_margin;
 791             p_y2 += i_source_margin;
 792             p_u += i_source_margin_c;
 793             p_v += i_source_margin_c;
 794             p_line1 += i_dest_margin;
 795             p_line2 += i_dest_margin;
 796         }
 797     }
 798     /* make sure all SSE2 stores are visible thereafter */
 799     SSE2_END;
 800 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 801 }
 802
 803 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 804 /*****************************************************************************
 805  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 806  *****************************************************************************/
 807 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
 808                                               picture_t *p_dest )
 809 {
 810     /* FIXME: TODO ! */
 811     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 812 }
 813
 814 /*****************************************************************************
 815  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 816  *****************************************************************************/
 817 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 818                                               picture_t *p_dest )
 819 {
 820     uint8_t *p_line1 = p_dest->p->p_pixels +
 821                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 822                        + p_dest->p->i_pitch;
 823     uint8_t *p_line2 = p_dest->p->p_pixels +
 824                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 825     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 826     uint8_t *p_u = p_source->U_PIXELS;
 827     uint8_t *p_v = p_source->V_PIXELS;
 828
 829     int i_x, i_y;
 830
 831     const int i_source_margin = p_source->p[0].i_pitch
 832                                  - p_source->p[0].i_visible_pitch;
 833     const int i_source_margin_c = p_source->p[1].i_pitch
 834                                  - p_source->p[1].i_visible_pitch;
 835     const int i_dest_margin = p_dest->p->i_pitch
 836                                - p_dest->p->i_visible_pitch;
 837
 838 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 839     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 840     {
 841         p_line1 -= 3 * p_dest->p->i_pitch;
 842         p_line2 -= 3 * p_dest->p->i_pitch;
 843
 844         p_y1 = p_y2;
 845         p_y2 += p_source->p[Y_PLANE].i_pitch;
 846
 847         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 848         {
 849 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 850             C_YUV420_UYVY( );
 851             C_YUV420_UYVY( );
 852             C_YUV420_UYVY( );
 853             C_YUV420_UYVY( );
 854 #else
 855             MMX_CALL( MMX_YUV420_UYVY );
 856 #endif
 857         }
 858         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 859         {
 860             C_YUV420_UYVY( );
 861         }
 862
 863         p_y1 += i_source_margin;
 864         p_y2 += i_source_margin;
 865         p_u += i_source_margin_c;
 866         p_v += i_source_margin_c;
 867         p_line1 += i_dest_margin;
 868         p_line2 += i_dest_margin;
 869     }
 870
 871 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 872     /* re-enable FPU registers */
 873     MMX_END;
 874 #endif
 875
 876 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 877     /*
 878     ** SSE2 128 bits fetch/store instructions are faster
 879     ** if memory access is 16 bytes aligned
 880     */
 881     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 882         ((int)p_line2|(int)p_y2))) )
 883     {
 884         /* use faster SSE2 aligned fetch and store */
 885         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 886         {
 887             p_line1 = p_line2;
 888             p_line2 += p_dest->p->i_pitch;
 889
 890             p_y1 = p_y2;
 891             p_y2 += p_source->p[Y_PLANE].i_pitch;
 892
 893             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 894             {
 895                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 896             }
 897             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 898             {
 899                 C_YUV420_UYVY( );
 900             }
 901
 902             p_y1 += i_source_margin;
 903             p_y2 += i_source_margin;
 904             p_u += i_source_margin_c;
 905             p_v += i_source_margin_c;
 906             p_line1 += i_dest_margin;
 907             p_line2 += i_dest_margin;
 908         }
 909     }
 910     else
 911     {
 912         /* use slower SSE2 unaligned fetch and store */
 913         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 914         {
 915             p_line1 = p_line2;
 916             p_line2 += p_dest->p->i_pitch;
 917
 918             p_y1 = p_y2;
 919             p_y2 += p_source->p[Y_PLANE].i_pitch;
 920
 921             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 922             {
 923                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 924             }
 925             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 926             {
 927                 C_YUV420_UYVY( );
 928             }
 929
 930             p_y1 += i_source_margin;
 931             p_y2 += i_source_margin;
 932             p_u += i_source_margin_c;
 933             p_v += i_source_margin_c;
 934             p_line1 += i_dest_margin;
 935             p_line2 += i_dest_margin;
 936         }
 937     }
 938     /* make sure all SSE2 stores are visible thereafter */
 939     SSE2_END;
 940 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 941 }
 942 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 943
 944 /*****************************************************************************
 945  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 946  *****************************************************************************/
 947 #if defined (MODULE_NAME_IS_i420_yuy2)
 948 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
 949                                               picture_t *p_dest )
 950 {
 951     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 952     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 953     uint8_t *p_u = p_source->U_PIXELS;
 954     uint8_t *p_v = p_source->V_PIXELS;
 955
 956     int i_x, i_y;
 957
 958     const int i_source_margin = p_source->p[0].i_pitch
 959                                  - p_source->p[0].i_visible_pitch;
 960     const int i_source_margin_c = p_source->p[1].i_pitch
 961                                  - p_source->p[1].i_visible_pitch;
 962     const int i_dest_margin = p_dest->p->i_pitch
 963                                - p_dest->p->i_visible_pitch;
 964
 965     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 966     {
 967         p_line1 = p_line2;
 968         p_line2 += p_dest->p->i_pitch;
 969
 970         p_y1 = p_y2;
 971         p_y2 += p_source->p[Y_PLANE].i_pitch;
 972
 973         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 974         {
 975             C_YUV420_Y211( );
 976             C_YUV420_Y211( );
 977         }
 978
 979         p_y1 += i_source_margin;
 980         p_y2 += i_source_margin;
 981         p_u += i_source_margin_c;
 982         p_v += i_source_margin_c;
 983         p_line1 += i_dest_margin;
 984         p_line2 += i_dest_margin;
 985     }
 986 }
 987 #endif