git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
#include <stdint.h>                                             /* uintptr_t */
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                            /* strerror() */
  30
  31 #include <vlc/vlc.h>
  32 #include <vlc_vout.h>
  33
  34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include "i420_yuy2.h"
  39
/* Source chromas, as text; pasted into the module description strings of the
 * module descriptor in this file. */
#define SRC_FOURCC  "I420,IYUV,YV12"

/* Destination chromas, as text, for the variant being built.  The lists
 * mirror the cases compiled into Activate(): only the plain C build has
 * Y211, and the AltiVec build has neither IUYV nor cyuv. */
#if defined (MODULE_NAME_IS_i420_yuy2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#endif
  51
  52 /*****************************************************************************
  53  * Local and extern prototypes.
  54  *****************************************************************************/
/* Module entry point, registered through set_callbacks() */
static int  Activate ( vlc_object_t * );

/* Converters; Activate() stores one of them in chroma.pf_convert.
 * All share the same signature: (vout, source picture, destination picture) */
static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
/* Not built for the AltiVec variant */
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
#endif
/* Only built for the plain C variant */
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
#endif

#ifdef MODULE_NAME_IS_i420_yuy2_mmx
/* Initialize MMX-specific constants */
/* i_00ffw: 0x00ff repeated in each of the four 16-bit words.
 * i_80w:   0x80 in each of the four low bytes, upper 32 bits zero.
 * Neither is referenced in the visible part of this file; presumably they
 * are used by the MMX macros from i420_yuy2.h -- TODO confirm. */
static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
static const uint64_t i_80w   = 0x0000000080808080ULL;
#endif
  73
  74 /*****************************************************************************
  75  * Module descriptor.
  76  *****************************************************************************/
/* One descriptor per build variant; exactly one MODULE_NAME_IS_* branch is
 * expected to be active.  The number passed to set_capability() is
 * presumably the module's priority for the "chroma" capability (higher
 * preferred) -- TODO confirm against the module loader. */
vlc_module_begin();
#if defined (MODULE_NAME_IS_i420_yuy2)
    /* Plain C variant: score 80, no CPU requirement */
    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 80 );
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* MMX variant: score 100 */
    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( MMX );
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
    /* SSE2 variant: score 120, the highest of the four */
    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 120 );
    add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* AltiVec variant: score 100 */
    set_description(
            _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( ALTIVEC );
#endif
    /* Activate is the open callback; no close callback is registered */
    set_callbacks( Activate, NULL );
vlc_module_end();
  97
  98 /*****************************************************************************
  99  * Activate: allocate a chroma function
 100  *****************************************************************************
 101  * This function allocates and initializes a chroma function
 102  *****************************************************************************/
 103 static int Activate( vlc_object_t *p_this )
 104 {
 105     vout_thread_t *p_vout = (vout_thread_t *)p_this;
 106
 107     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
 108     {
 109         return -1;
 110     }
 111
 112     switch( p_vout->render.i_chroma )
 113     {
 114         case VLC_FOURCC('Y','V','1','2'):
 115         case VLC_FOURCC('I','4','2','0'):
 116         case VLC_FOURCC('I','Y','U','V'):
 117             switch( p_vout->output.i_chroma )
 118             {
 119                 case VLC_FOURCC('Y','U','Y','2'):
 120                 case VLC_FOURCC('Y','U','N','V'):
 121                     p_vout->chroma.pf_convert = I420_YUY2;
 122                     break;
 123
 124                 case VLC_FOURCC('Y','V','Y','U'):
 125                     p_vout->chroma.pf_convert = I420_YVYU;
 126                     break;
 127
 128                 case VLC_FOURCC('U','Y','V','Y'):
 129                 case VLC_FOURCC('U','Y','N','V'):
 130                 case VLC_FOURCC('Y','4','2','2'):
 131                     p_vout->chroma.pf_convert = I420_UYVY;
 132                     break;
 133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 134                 case VLC_FOURCC('I','U','Y','V'):
 135                     p_vout->chroma.pf_convert = I420_IUYV;
 136                     break;
 137
 138                 case VLC_FOURCC('c','y','u','v'):
 139                     p_vout->chroma.pf_convert = I420_cyuv;
 140                     break;
 141 #endif
 142
 143 #if defined (MODULE_NAME_IS_i420_yuy2)
 144                 case VLC_FOURCC('Y','2','1','1'):
 145                     p_vout->chroma.pf_convert = I420_Y211;
 146                     break;
 147 #endif
 148
 149                 default:
 150                     return -1;
 151             }
 152             break;
 153
 154         default:
 155             return -1;
 156     }
 157
 158     return 0;
 159 }
 160
#if 0
/* Disabled helper (compiled out by the #if 0): executes the x86 "rdtsc"
 * instruction and returns the result as an unsigned 64-bit value.
 * NOTE(review): the "=A" output constraint is understood to name the
 * edx:eax register pair only on 32-bit x86; on x86-64 it would not capture
 * both halves -- confirm against the GCC constraint documentation before
 * re-enabling this code. */
static inline unsigned long long read_cycles(void)
{
    unsigned long long v;
    __asm__ __volatile__("rdtsc" : "=A" (v): );

    return v;
}
#endif
 170
 171 /* Following functions are local */
 172 /*****************************************************************************
 173  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 174  *****************************************************************************/
 175 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 176                                               picture_t *p_dest )
 177 {
 178     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 179     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 180     uint8_t *p_u = p_source->U_PIXELS;
 181     uint8_t *p_v = p_source->V_PIXELS;
 182
 183     int i_x, i_y;
 184
 185 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 186 #define VEC_NEXT_LINES( ) \
 187     p_line1  = p_line2; \
 188     p_line2 += p_dest->p->i_pitch; \
 189     p_y1     = p_y2; \
 190     p_y2    += p_source->p[Y_PLANE].i_pitch;
 191
 192 #define VEC_LOAD_UV( ) \
 193     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 194     v_vec = vec_ld( 0, p_v ); p_v += 16;
 195
 196 #define VEC_MERGE( a ) \
 197     uv_vec = a( u_vec, v_vec ); \
 198     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 199     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 200     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 201     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 202     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 203     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 204
 205     vector unsigned char u_vec;
 206     vector unsigned char v_vec;
 207     vector unsigned char uv_vec;
 208     vector unsigned char y_vec;
 209
 210     if( !( ( p_vout->render.i_width % 32 ) |
 211            ( p_vout->render.i_height % 2 ) ) )
 212     {
 213         /* Width is a multiple of 32, we take 2 lines at a time */
 214         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 215         {
 216             VEC_NEXT_LINES( );
 217             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 218             {
 219                 VEC_LOAD_UV( );
 220                 VEC_MERGE( vec_mergeh );
 221                 VEC_MERGE( vec_mergel );
 222             }
 223         }
 224     }
 225     else if( !( ( p_vout->render.i_width % 16 ) |
 226                 ( p_vout->render.i_height % 4 ) ) )
 227     {
 228         /* Width is only a multiple of 16, we take 4 lines at a time */
 229         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 230         {
 231             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 232             VEC_NEXT_LINES( );
 233             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 234             {
 235                 VEC_LOAD_UV( );
 236                 VEC_MERGE( vec_mergeh );
 237                 VEC_MERGE( vec_mergel );
 238             }
 239
 240             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 241             VEC_LOAD_UV( );
 242             VEC_MERGE( vec_mergeh );
 243
 244             /* Line 3 and 4, pixels 0 to 16 */
 245             VEC_NEXT_LINES( );
 246             VEC_MERGE( vec_mergel );
 247
 248             /* Line 3 and 4, pixels 16 to ( width ) */
 249             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 250             {
 251                 VEC_LOAD_UV( );
 252                 VEC_MERGE( vec_mergeh );
 253                 VEC_MERGE( vec_mergel );
 254             }
 255         }
 256     }
 257     else
 258     {
 259         /* Crap, use the C version */
 260 #undef VEC_NEXT_LINES
 261 #undef VEC_LOAD_UV
 262 #undef VEC_MERGE
 263 #endif
 264
 265     const int i_source_margin = p_source->p[0].i_pitch
 266                                  - p_source->p[0].i_visible_pitch;
 267     const int i_source_margin_c = p_source->p[1].i_pitch
 268                                  - p_source->p[1].i_visible_pitch;
 269     const int i_dest_margin = p_dest->p->i_pitch
 270                                - p_dest->p->i_visible_pitch;
 271
 272 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 273     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 274     {
 275         p_line1 = p_line2;
 276         p_line2 += p_dest->p->i_pitch;
 277
 278         p_y1 = p_y2;
 279         p_y2 += p_source->p[Y_PLANE].i_pitch;
 280
 281 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 282         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
 283         {
 284             C_YUV420_YUYV( );
 285             C_YUV420_YUYV( );
 286             C_YUV420_YUYV( );
 287             C_YUV420_YUYV( );
 288         }
 289 #else
 290         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 291         {
 292             MMX_CALL( MMX_YUV420_YUYV );
 293         }
 294 #endif
 295         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 296         {
 297             C_YUV420_YUYV( );
 298         }
 299
 300         p_y1 += i_source_margin;
 301         p_y2 += i_source_margin;
 302         p_u += i_source_margin_c;
 303         p_v += i_source_margin_c;
 304         p_line1 += i_dest_margin;
 305         p_line2 += i_dest_margin;
 306     }
 307
 308 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 309     __asm__ __volatile__("emms" :: );
 310 #endif
 311
 312 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 313     }
 314 #endif
 315
 316 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 317     /*
 318     ** SSE2 128 bits fetch/store instructions are faster
 319     ** if memory access is 16 bytes aligned
 320     */
 321
 322     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 323         ((int)p_line2|(int)p_y2))) )
 324     {
 325         /* use faster SSE2 aligned fetch and store */
 326         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 327         {
 328             p_line1 = p_line2;
 329             p_line2 += p_dest->p->i_pitch;
 330
 331             p_y1 = p_y2;
 332             p_y2 += p_source->p[Y_PLANE].i_pitch;
 333
 334             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 335             {
 336                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 337             }
 338             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 339             {
 340                 C_YUV420_YUYV( );
 341             }
 342
 343             p_y1 += i_source_margin;
 344             p_y2 += i_source_margin;
 345             p_u += i_source_margin_c;
 346             p_v += i_source_margin_c;
 347             p_line1 += i_dest_margin;
 348             p_line2 += i_dest_margin;
 349         }
 350     }
 351     else
 352     {
 353         /* use slower SSE2 unaligned fetch and store */
 354         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 355         {
 356             p_line1 = p_line2;
 357             p_line2 += p_dest->p->i_pitch;
 358
 359             p_y1 = p_y2;
 360             p_y2 += p_source->p[Y_PLANE].i_pitch;
 361
 362             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 363             {
 364                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 365             }
 366             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 367             {
 368                 C_YUV420_YUYV( );
 369             }
 370
 371             p_y1 += i_source_margin;
 372             p_y2 += i_source_margin;
 373             p_u += i_source_margin_c;
 374             p_v += i_source_margin_c;
 375             p_line1 += i_dest_margin;
 376             p_line2 += i_dest_margin;
 377         }
 378     }
 379
 380 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 381 }
 382
 383 /*****************************************************************************
 384  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 385  *****************************************************************************/
 386 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
 387                                               picture_t *p_dest )
 388 {
 389     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 390     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 391     uint8_t *p_u = p_source->U_PIXELS;
 392     uint8_t *p_v = p_source->V_PIXELS;
 393
 394     int i_x, i_y;
 395
 396 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 397 #define VEC_NEXT_LINES( ) \
 398     p_line1  = p_line2; \
 399     p_line2 += p_dest->p->i_pitch; \
 400     p_y1     = p_y2; \
 401     p_y2    += p_source->p[Y_PLANE].i_pitch;
 402
 403 #define VEC_LOAD_UV( ) \
 404     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 405     v_vec = vec_ld( 0, p_v ); p_v += 16;
 406
 407 #define VEC_MERGE( a ) \
 408     vu_vec = a( v_vec, u_vec ); \
 409     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 410     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 411     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 412     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 413     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 414     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 415
 416     vector unsigned char u_vec;
 417     vector unsigned char v_vec;
 418     vector unsigned char vu_vec;
 419     vector unsigned char y_vec;
 420
 421     if( !( ( p_vout->render.i_width % 32 ) |
 422            ( p_vout->render.i_height % 2 ) ) )
 423     {
 424         /* Width is a multiple of 32, we take 2 lines at a time */
 425         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 426         {
 427             VEC_NEXT_LINES( );
 428             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 429             {
 430                 VEC_LOAD_UV( );
 431                 VEC_MERGE( vec_mergeh );
 432                 VEC_MERGE( vec_mergel );
 433             }
 434         }
 435     }
 436     else if( !( ( p_vout->render.i_width % 16 ) |
 437                 ( p_vout->render.i_height % 4 ) ) )
 438     {
 439         /* Width is only a multiple of 16, we take 4 lines at a time */
 440         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 441         {
 442             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 443             VEC_NEXT_LINES( );
 444             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 445             {
 446                 VEC_LOAD_UV( );
 447                 VEC_MERGE( vec_mergeh );
 448                 VEC_MERGE( vec_mergel );
 449             }
 450
 451             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 452             VEC_LOAD_UV( );
 453             VEC_MERGE( vec_mergeh );
 454
 455             /* Line 3 and 4, pixels 0 to 16 */
 456             VEC_NEXT_LINES( );
 457             VEC_MERGE( vec_mergel );
 458
 459             /* Line 3 and 4, pixels 16 to ( width ) */
 460             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 461             {
 462                 VEC_LOAD_UV( );
 463                 VEC_MERGE( vec_mergeh );
 464                 VEC_MERGE( vec_mergel );
 465             }
 466         }
 467     }
 468     else
 469     {
 470         /* Crap, use the C version */
 471 #undef VEC_NEXT_LINES
 472 #undef VEC_LOAD_UV
 473 #undef VEC_MERGE
 474 #endif
 475
 476     const int i_source_margin = p_source->p[0].i_pitch
 477                                  - p_source->p[0].i_visible_pitch;
 478     const int i_source_margin_c = p_source->p[1].i_pitch
 479                                  - p_source->p[1].i_visible_pitch;
 480     const int i_dest_margin = p_dest->p->i_pitch
 481                                - p_dest->p->i_visible_pitch;
 482
 483 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 484     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 485     {
 486         p_line1 = p_line2;
 487         p_line2 += p_dest->p->i_pitch;
 488
 489         p_y1 = p_y2;
 490         p_y2 += p_source->p[Y_PLANE].i_pitch;
 491
 492         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 493         {
 494 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 495             C_YUV420_YVYU( );
 496             C_YUV420_YVYU( );
 497             C_YUV420_YVYU( );
 498             C_YUV420_YVYU( );
 499 #else
 500             MMX_CALL( MMX_YUV420_YVYU );
 501 #endif
 502         }
 503         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 504         {
 505             C_YUV420_YVYU( );
 506         }
 507
 508         p_y1 += i_source_margin;
 509         p_y2 += i_source_margin;
 510         p_u += i_source_margin_c;
 511         p_v += i_source_margin_c;
 512         p_line1 += i_dest_margin;
 513         p_line2 += i_dest_margin;
 514     }
 515
 516 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 517     __asm__ __volatile__("emms" :: );
 518 #endif
 519
 520 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 521     }
 522 #endif
 523
 524 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 525     /*
 526     ** SSE2 128 bits fetch/store instructions are faster
 527     ** if memory access is 16 bytes aligned
 528     */
 529     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 530         ((int)p_line2|(int)p_y2))) )
 531     {
 532         /* use faster SSE2 aligned fetch and store */
 533         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 534         {
 535             p_line1 = p_line2;
 536             p_line2 += p_dest->p->i_pitch;
 537
 538             p_y1 = p_y2;
 539             p_y2 += p_source->p[Y_PLANE].i_pitch;
 540
 541             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 542             {
 543                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 544             }
 545             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 546             {
 547                 C_YUV420_YVYU( );
 548             }
 549
 550             p_y1 += i_source_margin;
 551             p_y2 += i_source_margin;
 552             p_u += i_source_margin_c;
 553             p_v += i_source_margin_c;
 554             p_line1 += i_dest_margin;
 555             p_line2 += i_dest_margin;
 556         }
 557     }
 558     else
 559     {
 560         /* use slower SSE2 unaligned fetch and store */
 561         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 562         {
 563             p_line1 = p_line2;
 564             p_line2 += p_dest->p->i_pitch;
 565
 566             p_y1 = p_y2;
 567             p_y2 += p_source->p[Y_PLANE].i_pitch;
 568
 569             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 570             {
 571                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 572             }
 573             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 574             {
 575                 C_YUV420_YVYU( );
 576             }
 577
 578             p_y1 += i_source_margin;
 579             p_y2 += i_source_margin;
 580             p_u += i_source_margin_c;
 581             p_v += i_source_margin_c;
 582             p_line1 += i_dest_margin;
 583             p_line2 += i_dest_margin;
 584         }
 585     }
 586 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 587 }
 588
 589 /*****************************************************************************
 590  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 591  *****************************************************************************/
 592 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 593                                               picture_t *p_dest )
 594 {
 595     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 596     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 597     uint8_t *p_u = p_source->U_PIXELS;
 598     uint8_t *p_v = p_source->V_PIXELS;
 599
 600     int i_x, i_y;
 601
 602 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 603 #define VEC_NEXT_LINES( ) \
 604     p_line1  = p_line2; \
 605     p_line2 += p_dest->p->i_pitch; \
 606     p_y1     = p_y2; \
 607     p_y2    += p_source->p[Y_PLANE].i_pitch;
 608
 609 #define VEC_LOAD_UV( ) \
 610     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 611     v_vec = vec_ld( 0, p_v ); p_v += 16;
 612
 613 #define VEC_MERGE( a ) \
 614     uv_vec = a( u_vec, v_vec ); \
 615     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 616     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 617     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 618     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 619     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 620     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 621
 622     vector unsigned char u_vec;
 623     vector unsigned char v_vec;
 624     vector unsigned char uv_vec;
 625     vector unsigned char y_vec;
 626
 627     if( !( ( p_vout->render.i_width % 32 ) |
 628            ( p_vout->render.i_height % 2 ) ) )
 629     {
 630         /* Width is a multiple of 32, we take 2 lines at a time */
 631         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 632         {
 633             VEC_NEXT_LINES( );
 634             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 635             {
 636                 VEC_LOAD_UV( );
 637                 VEC_MERGE( vec_mergeh );
 638                 VEC_MERGE( vec_mergel );
 639             }
 640         }
 641     }
 642     else if( !( ( p_vout->render.i_width % 16 ) |
 643                 ( p_vout->render.i_height % 4 ) ) )
 644     {
 645         /* Width is only a multiple of 16, we take 4 lines at a time */
 646         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 647         {
 648             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 649             VEC_NEXT_LINES( );
 650             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 651             {
 652                 VEC_LOAD_UV( );
 653                 VEC_MERGE( vec_mergeh );
 654                 VEC_MERGE( vec_mergel );
 655             }
 656
 657             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 658             VEC_LOAD_UV( );
 659             VEC_MERGE( vec_mergeh );
 660
 661             /* Line 3 and 4, pixels 0 to 16 */
 662             VEC_NEXT_LINES( );
 663             VEC_MERGE( vec_mergel );
 664
 665             /* Line 3 and 4, pixels 16 to ( width ) */
 666             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 667             {
 668                 VEC_LOAD_UV( );
 669                 VEC_MERGE( vec_mergeh );
 670                 VEC_MERGE( vec_mergel );
 671             }
 672         }
 673     }
 674     else
 675     {
 676         /* Crap, use the C version */
 677 #undef VEC_NEXT_LINES
 678 #undef VEC_LOAD_UV
 679 #undef VEC_MERGE
 680 #endif
 681
 682     const int i_source_margin = p_source->p[0].i_pitch
 683                                  - p_source->p[0].i_visible_pitch;
 684     const int i_source_margin_c = p_source->p[1].i_pitch
 685                                  - p_source->p[1].i_visible_pitch;
 686     const int i_dest_margin = p_dest->p->i_pitch
 687                                - p_dest->p->i_visible_pitch;
 688
 689 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 690     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 691     {
 692         p_line1 = p_line2;
 693         p_line2 += p_dest->p->i_pitch;
 694
 695         p_y1 = p_y2;
 696         p_y2 += p_source->p[Y_PLANE].i_pitch;
 697
 698         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 699         {
 700 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 701             C_YUV420_UYVY( );
 702             C_YUV420_UYVY( );
 703             C_YUV420_UYVY( );
 704             C_YUV420_UYVY( );
 705 #else
 706             MMX_CALL( MMX_YUV420_UYVY );
 707 #endif
 708         }
 709         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
 710         {
 711             C_YUV420_UYVY( );
 712         }
 713
 714         p_y1 += i_source_margin;
 715         p_y2 += i_source_margin;
 716         p_u += i_source_margin_c;
 717         p_v += i_source_margin_c;
 718         p_line1 += i_dest_margin;
 719         p_line2 += i_dest_margin;
 720     }
 721
 722 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 723     __asm__ __volatile__("emms" :: );
 724 #endif
 725
 726 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 727     }
 728 #endif
 729
 730 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 731     /*
 732     ** SSE2 128 bits fetch/store instructions are faster
 733     ** if memory access is 16 bytes aligned
 734     */
 735     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 736         ((int)p_line2|(int)p_y2))) )
 737     {
 738         /* use faster SSE2 aligned fetch and store */
 739         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 740         {
 741             p_line1 = p_line2;
 742             p_line2 += p_dest->p->i_pitch;
 743
 744             p_y1 = p_y2;
 745             p_y2 += p_source->p[Y_PLANE].i_pitch;
 746
 747             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 748             {
 749                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 750             }
 751             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 752             {
 753                 C_YUV420_UYVY( );
 754             }
 755
 756             p_y1 += i_source_margin;
 757             p_y2 += i_source_margin;
 758             p_u += i_source_margin_c;
 759             p_v += i_source_margin_c;
 760             p_line1 += i_dest_margin;
 761             p_line2 += i_dest_margin;
 762         }
 763     }
 764     else
 765     {
 766         /* use slower SSE2 unaligned fetch and store */
 767         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 768         {
 769             p_line1 = p_line2;
 770             p_line2 += p_dest->p->i_pitch;
 771
 772             p_y1 = p_y2;
 773             p_y2 += p_source->p[Y_PLANE].i_pitch;
 774
 775             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 776             {
 777                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 778             }
 779             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 780             {
 781                 C_YUV420_UYVY( );
 782             }
 783
 784             p_y1 += i_source_margin;
 785             p_y2 += i_source_margin;
 786             p_u += i_source_margin_c;
 787             p_v += i_source_margin_c;
 788             p_line1 += i_dest_margin;
 789             p_line2 += i_dest_margin;
 790         }
 791     }
 792 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 793 }
 794
 795 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 796 /*****************************************************************************
 797  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 798  *****************************************************************************/
 799 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
 800                                               picture_t *p_dest )
 801 {
 802     /* FIXME: TODO ! */
 803     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 804 }
 805
 806 /*****************************************************************************
 807  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 808  *****************************************************************************/
 809 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 810                                               picture_t *p_dest )
 811 {
 812     uint8_t *p_line1 = p_dest->p->p_pixels +
 813                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 814                        + p_dest->p->i_pitch;
 815     uint8_t *p_line2 = p_dest->p->p_pixels +
 816                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 817     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 818     uint8_t *p_u = p_source->U_PIXELS;
 819     uint8_t *p_v = p_source->V_PIXELS;
 820
 821     int i_x, i_y;
 822
 823     const int i_source_margin = p_source->p[0].i_pitch
 824                                  - p_source->p[0].i_visible_pitch;
 825     const int i_source_margin_c = p_source->p[1].i_pitch
 826                                  - p_source->p[1].i_visible_pitch;
 827     const int i_dest_margin = p_dest->p->i_pitch
 828                                - p_dest->p->i_visible_pitch;
 829
 830 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 831     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 832     {
 833         p_line1 -= 3 * p_dest->p->i_pitch;
 834         p_line2 -= 3 * p_dest->p->i_pitch;
 835
 836         p_y1 = p_y2;
 837         p_y2 += p_source->p[Y_PLANE].i_pitch;
 838
 839         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 840         {
 841 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 842             C_YUV420_UYVY( );
 843             C_YUV420_UYVY( );
 844             C_YUV420_UYVY( );
 845             C_YUV420_UYVY( );
 846 #else
 847             MMX_CALL( MMX_YUV420_UYVY );
 848 #endif
 849         }
 850         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 851         {
 852             C_YUV420_UYVY( );
 853         }
 854
 855         p_y1 += i_source_margin;
 856         p_y2 += i_source_margin;
 857         p_u += i_source_margin_c;
 858         p_v += i_source_margin_c;
 859         p_line1 += i_dest_margin;
 860         p_line2 += i_dest_margin;
 861     }
 862
 863 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 864     __asm__ __volatile__("emms" :: );
 865 #endif
 866
 867 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 868     /*
 869     ** SSE2 128 bits fetch/store instructions are faster
 870     ** if memory access is 16 bytes aligned
 871     */
 872     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 873         ((int)p_line2|(int)p_y2))) )
 874     {
 875         /* use faster SSE2 aligned fetch and store */
 876         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 877         {
 878             p_line1 = p_line2;
 879             p_line2 += p_dest->p->i_pitch;
 880
 881             p_y1 = p_y2;
 882             p_y2 += p_source->p[Y_PLANE].i_pitch;
 883
 884             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 885             {
 886                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 887             }
 888             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 889             {
 890                 C_YUV420_UYVY( );
 891             }
 892
 893             p_y1 += i_source_margin;
 894             p_y2 += i_source_margin;
 895             p_u += i_source_margin_c;
 896             p_v += i_source_margin_c;
 897             p_line1 += i_dest_margin;
 898             p_line2 += i_dest_margin;
 899         }
 900     }
 901     else
 902     {
 903         /* use slower SSE2 unaligned fetch and store */
 904         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 905         {
 906             p_line1 = p_line2;
 907             p_line2 += p_dest->p->i_pitch;
 908
 909             p_y1 = p_y2;
 910             p_y2 += p_source->p[Y_PLANE].i_pitch;
 911
 912             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 913             {
 914                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 915             }
 916             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 917             {
 918                 C_YUV420_UYVY( );
 919             }
 920
 921             p_y1 += i_source_margin;
 922             p_y2 += i_source_margin;
 923             p_u += i_source_margin_c;
 924             p_v += i_source_margin_c;
 925             p_line1 += i_dest_margin;
 926             p_line2 += i_dest_margin;
 927         }
 928     }
 929 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 930 }
 931 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 932
 933 /*****************************************************************************
 934  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 935  *****************************************************************************/
 936 #if defined (MODULE_NAME_IS_i420_yuy2)
 937 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
 938                                               picture_t *p_dest )
 939 {
 940     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 941     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 942     uint8_t *p_u = p_source->U_PIXELS;
 943     uint8_t *p_v = p_source->V_PIXELS;
 944
 945     int i_x, i_y;
 946
 947     const int i_source_margin = p_source->p[0].i_pitch
 948                                  - p_source->p[0].i_visible_pitch;
 949     const int i_source_margin_c = p_source->p[1].i_pitch
 950                                  - p_source->p[1].i_visible_pitch;
 951     const int i_dest_margin = p_dest->p->i_pitch
 952                                - p_dest->p->i_visible_pitch;
 953
 954     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 955     {
 956         p_line1 = p_line2;
 957         p_line2 += p_dest->p->i_pitch;
 958
 959         p_y1 = p_y2;
 960         p_y2 += p_source->p[Y_PLANE].i_pitch;
 961
 962         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 963         {
 964             C_YUV420_Y211( );
 965             C_YUV420_Y211( );
 966         }
 967
 968         p_y1 += i_source_margin;
 969         p_y2 += i_source_margin;
 970         p_u += i_source_margin_c;
 971         p_v += i_source_margin_c;
 972         p_line1 += i_dest_margin;
 973         p_line2 += i_dest_margin;
 974     }
 975 }
 976 #endif