git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #include <vlc/vlc.h>
  30 #include <vlc_vout.h>
  31
  32 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  33 #   include <altivec.h>
  34 #endif
  35
  36 #include "i420_yuy2.h"
  37
     /* FourCC lists pasted into the module description strings of the module
      * descriptor below.  SRC_FOURCC is shared by every build variant;
      * DEST_FOURCC depends on which variant of this file is being compiled
      * (selected by the MODULE_NAME_IS_* macro) and mirrors the output chromas
      * that Activate() accepts for that variant: only the plain C build lists
      * Y211, and the AltiVec build lists neither IUYV nor cyuv. */
   38 #define SRC_FOURCC  "I420,IYUV,YV12"
   39
   40 #if defined (MODULE_NAME_IS_i420_yuy2)
   41 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
   42 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
   43 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
   44 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
   45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
   46 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
   47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
   48 #endif
  49
   50 /*****************************************************************************
   51  * Local and extern prototypes.
   52  *****************************************************************************/
     /* Module entry point, registered through set_callbacks() below. */
   53 static int  Activate ( vlc_object_t * );
   54
     /* Conversion routines.  Each one takes (vout, source picture, destination
      * picture); Activate() stores the selected one in
      * p_vout->chroma.pf_convert. */
   55 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
   56 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
   57 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
     /* IUYV and cyuv output is not available in the AltiVec build. */
   58 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
   59 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
   60 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
   61 #endif
     /* Y211 output exists only in the plain C build. */
   62 #if defined (MODULE_NAME_IS_i420_yuy2)
   63 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
   64 #endif
   65
   66 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
   67 /* MMX-specific 64-bit constants.
      * NOTE(review): nothing in the functions reviewed here names these
      * directly; presumably they are consumed by the MMX macros in i420_yuy2.h
      * -- confirm before removing. */
   68 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
   69 static const uint64_t i_80w   = 0x0000000080808080ULL;
   70 #endif
  71
   72 /*****************************************************************************
   73  * Module descriptor.
      *****************************************************************************
      * The same source file is built once per variant; MODULE_NAME_IS_* selects
      * the description string, the "chroma" capability score (plain C: 80,
      * MMX and AltiVec: 100, SSE2: 120) and, for the SIMD variants, the CPU
      * requirement.  Presumably a higher score makes the variant preferred when
      * several can be loaded -- confirm against the VLC module API.
      * Activate is the only callback; no close/deactivate callback is set.
   74  *****************************************************************************/
   75 vlc_module_begin();
   76 #if defined (MODULE_NAME_IS_i420_yuy2)
   77     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
   78     set_capability( "chroma", 80 );
   79 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
   80     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
   81     set_capability( "chroma", 100 );
   82     add_requirement( MMX );
   83 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
   84     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
   85     set_capability( "chroma", 120 );
   86     add_requirement( SSE2 );
   87 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
   88     set_description(
   89             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
   90     set_capability( "chroma", 100 );
   91     add_requirement( ALTIVEC );
   92 #endif
   93     set_callbacks( Activate, NULL );
   94 vlc_module_end();
  95
   96 /*****************************************************************************
   97  * Activate: select the chroma conversion function for a video output
   98  *****************************************************************************
   99  * Nothing is allocated here: the function only inspects the render (source)
      * and output chromas of the vout and stores the matching I420_* routine in
      * p_vout->chroma.pf_convert.
      *
      * p_this : the object being probed; it is cast to vout_thread_t without
      *          any further check.
      *
      * Returns 0 when a converter was installed, -1 when
      *  - the render width or height is odd,
      *  - the render chroma is not YV12, I420 or IYUV, or
      *  - the output chroma is not handled by this build variant.
  100  *****************************************************************************/
  101 static int Activate( vlc_object_t *p_this )
  102 {
  103     vout_thread_t *p_vout = (vout_thread_t *)p_this;
  104
         /* Odd sizes are refused: the converters below walk the picture two
          * rows at a time and two pixels at a time. */
  105     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
  106     {
  107         return -1;
  108     }
  109
  110     switch( p_vout->render.i_chroma )
  111     {
             /* All three planar 4:2:0 source chromas share the same converters */
  112         case VLC_FOURCC('Y','V','1','2'):
  113         case VLC_FOURCC('I','4','2','0'):
  114         case VLC_FOURCC('I','Y','U','V'):
  115             switch( p_vout->output.i_chroma )
  116             {
  117                 case VLC_FOURCC('Y','U','Y','2'):
  118                 case VLC_FOURCC('Y','U','N','V'):
  119                     p_vout->chroma.pf_convert = I420_YUY2;
  120                     break;
  121
  122                 case VLC_FOURCC('Y','V','Y','U'):
  123                     p_vout->chroma.pf_convert = I420_YVYU;
  124                     break;
  125
  126                 case VLC_FOURCC('U','Y','V','Y'):
  127                 case VLC_FOURCC('U','Y','N','V'):
  128                 case VLC_FOURCC('Y','4','2','2'):
  129                     p_vout->chroma.pf_convert = I420_UYVY;
  130                     break;
                     /* Not offered by the AltiVec build */
  131 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  132                 case VLC_FOURCC('I','U','Y','V'):
  133                     p_vout->chroma.pf_convert = I420_IUYV;
  134                     break;
  135
  136                 case VLC_FOURCC('c','y','u','v'):
  137                     p_vout->chroma.pf_convert = I420_cyuv;
  138                     break;
  139 #endif
  140
                     /* Only offered by the plain C build */
  141 #if defined (MODULE_NAME_IS_i420_yuy2)
  142                 case VLC_FOURCC('Y','2','1','1'):
  143                     p_vout->chroma.pf_convert = I420_Y211;
  144                     break;
  145 #endif
  146
  147                 default:
  148                     return -1;
  149             }
  150             break;
  151
  152         default:
  153             return -1;
  154     }
  155
  156     return 0;
  157 }
 158
     /* Disabled helper (compiled out by "#if 0"): executes the x86 "rdtsc"
      * instruction through inline assembly and returns the value it produces.
      * No code in the part of the file reviewed here calls it. */
  159 #if 0
  160 static inline unsigned long long read_cycles(void)
  161 {
  162     unsigned long long v;
  163     __asm__ __volatile__("rdtsc" : "=A" (v): );
  164
  165     return v;
  166 }
  167 #endif
 168
 169 /* Following functions are local */
  170 /*****************************************************************************
  171  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
      *****************************************************************************
      * p_vout   : only render.i_width and render.i_height are read
      * p_source : planar picture; its Y, U and V planes are read
      * p_dest   : packed picture; only its first plane (p_dest->p) is written
      *
      * Rows are handled in pairs: p_y1/p_y2 walk two consecutive luma rows and
      * p_line1/p_line2 the two matching output rows.  In the AltiVec macros the
      * same merged chroma vector is stored to both output rows; the C, MMX and
      * SSE2 pixel macros come from i420_yuy2.h and are presumed to share chroma
      * between the two rows in the same way.
      *
      * Exactly one inner-loop flavour is compiled, chosen by MODULE_NAME_IS_*:
      *  - AltiVec: vector paths for widths that are a multiple of 32 or 16,
      *    otherwise the scalar loop;
      *  - SSE2: aligned or unaligned 16-pixel groups plus a scalar tail;
      *  - MMX: 8-pixel groups plus a scalar tail;
      *  - plain C: the scalar loop only.
  172  *****************************************************************************/
  173 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
  174                                               picture_t *p_dest )
  175 {
         /* p_line2 and p_y2 start on the first row; p_line1 and p_y1 are
          * assigned from them at the top of every row-pair iteration. */
  176     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  177     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  178     uint8_t *p_u = p_source->U_PIXELS;
  179     uint8_t *p_v = p_source->V_PIXELS;
  180
  181     int i_x, i_y;
  182
  183 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
     /* Step both row pairs: the old second row becomes the first row and the
      * second row moves one pitch further. */
  184 #define VEC_NEXT_LINES( ) \
  185     p_line1  = p_line2; \
  186     p_line2 += p_dest->p->i_pitch; \
  187     p_y1     = p_y2; \
  188     p_y2    += p_source->p[Y_PLANE].i_pitch;
  189
     /* Load 16 U bytes and 16 V bytes and advance both chroma pointers. */
  190 #define VEC_LOAD_UV( ) \
  191     u_vec = vec_ld( 0, p_u ); p_u += 16; \
  192     v_vec = vec_ld( 0, p_v ); p_v += 16;
  193
     /* a is vec_mergeh or vec_mergel: interleave one half of the loaded U and V
      * bytes, then interleave 16 luma bytes of each row with that result
      * (luma first) and store 32 output bytes per row. */
  194 #define VEC_MERGE( a ) \
  195     uv_vec = a( u_vec, v_vec ); \
  196     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
  197     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
  198     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
  199     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
  200     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
  201     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
  202
  203     vector unsigned char u_vec;
  204     vector unsigned char v_vec;
  205     vector unsigned char uv_vec;
  206     vector unsigned char y_vec;
  207
         /* NOTE(review): neither vector path below skips any row padding, and
          * VEC_NEXT_LINES() continues from wherever the previous row pair
          * stopped; both therefore appear to rely on each pitch being equal to
          * the number of bytes processed per row -- confirm with the callers. */
  208     if( !( ( p_vout->render.i_width % 32 ) |
  209            ( p_vout->render.i_height % 2 ) ) )
  210     {
  211         /* Width is a multiple of 32, we take 2 lines at a time */
  212         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  213         {
  214             VEC_NEXT_LINES( );
  215             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  216             {
  217                 VEC_LOAD_UV( );
  218                 VEC_MERGE( vec_mergeh );
  219                 VEC_MERGE( vec_mergel );
  220             }
  221         }
  222     }
  223     else if( !( ( p_vout->render.i_width % 16 ) |
  224                 ( p_vout->render.i_height % 4 ) ) )
  225     {
  226         /* Width is only a multiple of 16, we take 4 lines at a time */
  227         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
  228         {
  229             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  230             VEC_NEXT_LINES( );
  231             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  232             {
  233                 VEC_LOAD_UV( );
  234                 VEC_MERGE( vec_mergeh );
  235                 VEC_MERGE( vec_mergel );
  236             }
  237
  238             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
  239             VEC_LOAD_UV( );
  240             VEC_MERGE( vec_mergeh );
  241
  242             /* Line 3 and 4, pixels 0 to 16 */
                 /* The second half of the chroma vectors loaded just above is
                  * reused here for the next row pair. */
  243             VEC_NEXT_LINES( );
  244             VEC_MERGE( vec_mergel );
  245
  246             /* Line 3 and 4, pixels 16 to ( width ) */
  247             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  248             {
  249                 VEC_LOAD_UV( );
  250                 VEC_MERGE( vec_mergeh );
  251                 VEC_MERGE( vec_mergel );
  252             }
  253         }
  254     }
  255     else
  256     {
  257         /* Neither vector layout applies: fall back to the scalar C loop */
  258 #undef VEC_NEXT_LINES
  259 #undef VEC_LOAD_UV
  260 #undef VEC_MERGE
  261 #endif
  262
         /* Bytes of padding at the end of each row (pitch minus visible pitch)
          * for the luma plane, the chroma planes (taken from plane 1) and the
          * destination plane. */
  263     const int i_source_margin = p_source->p[0].i_pitch
  264                                  - p_source->p[0].i_visible_pitch;
  265     const int i_source_margin_c = p_source->p[1].i_pitch
  266                                  - p_source->p[1].i_visible_pitch;
  267     const int i_dest_margin = p_dest->p->i_pitch
  268                                - p_dest->p->i_visible_pitch;
  269
  270 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  271     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  272     {
  273         p_line1 = p_line2;
  274         p_line2 += p_dest->p->i_pitch;
  275
  276         p_y1 = p_y2;
  277         p_y2 += p_source->p[Y_PLANE].i_pitch;
  278
             /* Groups of 8 pixels: four C macro invocations (2 pixels each) or
              * one MMX invocation. */
  279 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  280         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
  281         {
  282             C_YUV420_YUYV( );
  283             C_YUV420_YUYV( );
  284             C_YUV420_YUYV( );
  285             C_YUV420_YUYV( );
  286         }
  287 #else
  288         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
  289         {
  290             MMX_CALL( MMX_YUV420_YUYV );
  291         }
  292 #endif
             /* Leftover pixels when the width is not a multiple of 8, handled
              * two at a time in plain C. */
  293         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
  294         {
  295             C_YUV420_YUYV( );
  296         }
  297
             /* Skip the row padding on every pointer */
  298         p_y1 += i_source_margin;
  299         p_y2 += i_source_margin;
  300         p_u += i_source_margin_c;
  301         p_v += i_source_margin_c;
  302         p_line1 += i_dest_margin;
  303         p_line2 += i_dest_margin;
  304     }
  305
  306 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  307     /* re-enable FPU registers */
  308     MMX_END;
  309 #endif
  310
     /* Closes the "else" opened in the AltiVec dispatch above */
  311 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  312     }
  313 #endif
  314
  315 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  316     /*
  317     ** SSE2 128 bits fetch/store instructions are faster
  318     ** if memory access is 16 bytes aligned
         **
         ** The test ORs together the luma pitch, the destination pitch and the
         ** start addresses of both buffers: when the low 4 bits of the result
         ** are zero, every luma row and every output row starts on a 16-byte
         ** boundary.  The chroma pointers and pitch are not part of the test;
         ** presumably the aligned macro does not need them aligned -- confirm
         ** in i420_yuy2.h.
  319     */
  320
  321     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  322         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  323     {
  324         /* use faster SSE2 aligned fetch and store */
  325         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  326         {
  327             p_line1 = p_line2;
  328             p_line2 += p_dest->p->i_pitch;
  329
  330             p_y1 = p_y2;
  331             p_y2 += p_source->p[Y_PLANE].i_pitch;
  332
  333             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
  334             {
  335                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
  336             }
                 /* Leftover pixels (width not a multiple of 16), in plain C */
  337             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
  338             {
  339                 C_YUV420_YUYV( );
  340             }
  341
  342             p_y1 += i_source_margin;
  343             p_y2 += i_source_margin;
  344             p_u += i_source_margin_c;
  345             p_v += i_source_margin_c;
  346             p_line1 += i_dest_margin;
  347             p_line2 += i_dest_margin;
  348         }
  349     }
  350     else
  351     {
  352         /* use slower SSE2 unaligned fetch and store */
  353         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  354         {
  355             p_line1 = p_line2;
  356             p_line2 += p_dest->p->i_pitch;
  357
  358             p_y1 = p_y2;
  359             p_y2 += p_source->p[Y_PLANE].i_pitch;
  360
  361             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
  362             {
  363                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
  364             }
                 /* Leftover pixels (width not a multiple of 16), in plain C */
  365             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
  366             {
  367                 C_YUV420_YUYV( );
  368             }
  369
  370             p_y1 += i_source_margin;
  371             p_y2 += i_source_margin;
  372             p_u += i_source_margin_c;
  373             p_v += i_source_margin_c;
  374             p_line1 += i_dest_margin;
  375             p_line2 += i_dest_margin;
  376         }
  377     }
  378     /* make sure all SSE2 stores are visible thereafter */
  379     SSE2_END;
  380
  381 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  382 }
 383
  384 /*****************************************************************************
  385  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
      *****************************************************************************
      * p_vout   : only render.i_width and render.i_height are read
      * p_source : planar picture; its Y, U and V planes are read
      * p_dest   : packed picture; only its first plane (p_dest->p) is written
      *
      * Same structure as I420_YUY2; the differences are that the AltiVec merge
      * interleaves V before U and that the *_YVYU pixel macros are used.
      * Rows are handled in pairs (p_y1/p_y2 in, p_line1/p_line2 out) and exactly
      * one inner-loop flavour (AltiVec, SSE2, MMX or plain C) is compiled,
      * chosen by MODULE_NAME_IS_*.
  386  *****************************************************************************/
  387 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
  388                                               picture_t *p_dest )
  389 {
         /* p_line2 and p_y2 start on the first row; p_line1 and p_y1 are
          * assigned from them at the top of every row-pair iteration. */
  390     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  391     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  392     uint8_t *p_u = p_source->U_PIXELS;
  393     uint8_t *p_v = p_source->V_PIXELS;
  394
  395     int i_x, i_y;
  396
  397 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
     /* Step both row pairs: the old second row becomes the first row and the
      * second row moves one pitch further. */
  398 #define VEC_NEXT_LINES( ) \
  399     p_line1  = p_line2; \
  400     p_line2 += p_dest->p->i_pitch; \
  401     p_y1     = p_y2; \
  402     p_y2    += p_source->p[Y_PLANE].i_pitch;
  403
     /* Load 16 U bytes and 16 V bytes and advance both chroma pointers. */
  404 #define VEC_LOAD_UV( ) \
  405     u_vec = vec_ld( 0, p_u ); p_u += 16; \
  406     v_vec = vec_ld( 0, p_v ); p_v += 16;
  407
     /* a is vec_mergeh or vec_mergel: interleave one half of the loaded V and U
      * bytes (V first), then interleave 16 luma bytes of each row with that
      * result (luma first) and store 32 output bytes per row. */
  408 #define VEC_MERGE( a ) \
  409     vu_vec = a( v_vec, u_vec ); \
  410     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
  411     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
  412     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
  413     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
  414     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
  415     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
  416
  417     vector unsigned char u_vec;
  418     vector unsigned char v_vec;
  419     vector unsigned char vu_vec;
  420     vector unsigned char y_vec;
  421
         /* NOTE(review): neither vector path below skips any row padding, and
          * VEC_NEXT_LINES() continues from wherever the previous row pair
          * stopped; both therefore appear to rely on each pitch being equal to
          * the number of bytes processed per row -- confirm with the callers. */
  422     if( !( ( p_vout->render.i_width % 32 ) |
  423            ( p_vout->render.i_height % 2 ) ) )
  424     {
  425         /* Width is a multiple of 32, we take 2 lines at a time */
  426         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  427         {
  428             VEC_NEXT_LINES( );
  429             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  430             {
  431                 VEC_LOAD_UV( );
  432                 VEC_MERGE( vec_mergeh );
  433                 VEC_MERGE( vec_mergel );
  434             }
  435         }
  436     }
  437     else if( !( ( p_vout->render.i_width % 16 ) |
  438                 ( p_vout->render.i_height % 4 ) ) )
  439     {
  440         /* Width is only a multiple of 16, we take 4 lines at a time */
  441         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
  442         {
  443             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  444             VEC_NEXT_LINES( );
  445             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  446             {
  447                 VEC_LOAD_UV( );
  448                 VEC_MERGE( vec_mergeh );
  449                 VEC_MERGE( vec_mergel );
  450             }
  451
  452             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
  453             VEC_LOAD_UV( );
  454             VEC_MERGE( vec_mergeh );
  455
  456             /* Line 3 and 4, pixels 0 to 16 */
                 /* The second half of the chroma vectors loaded just above is
                  * reused here for the next row pair. */
  457             VEC_NEXT_LINES( );
  458             VEC_MERGE( vec_mergel );
  459
  460             /* Line 3 and 4, pixels 16 to ( width ) */
  461             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  462             {
  463                 VEC_LOAD_UV( );
  464                 VEC_MERGE( vec_mergeh );
  465                 VEC_MERGE( vec_mergel );
  466             }
  467         }
  468     }
  469     else
  470     {
  471         /* Neither vector layout applies: fall back to the scalar C loop */
  472 #undef VEC_NEXT_LINES
  473 #undef VEC_LOAD_UV
  474 #undef VEC_MERGE
  475 #endif
  476
         /* Bytes of padding at the end of each row (pitch minus visible pitch)
          * for the luma plane, the chroma planes (taken from plane 1) and the
          * destination plane. */
  477     const int i_source_margin = p_source->p[0].i_pitch
  478                                  - p_source->p[0].i_visible_pitch;
  479     const int i_source_margin_c = p_source->p[1].i_pitch
  480                                  - p_source->p[1].i_visible_pitch;
  481     const int i_dest_margin = p_dest->p->i_pitch
  482                                - p_dest->p->i_visible_pitch;
  483
  484 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  485     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  486     {
  487         p_line1 = p_line2;
  488         p_line2 += p_dest->p->i_pitch;
  489
  490         p_y1 = p_y2;
  491         p_y2 += p_source->p[Y_PLANE].i_pitch;
  492
             /* Groups of 8 pixels: four C macro invocations (2 pixels each) or
              * one MMX invocation. */
  493         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
  494         {
  495 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  496             C_YUV420_YVYU( );
  497             C_YUV420_YVYU( );
  498             C_YUV420_YVYU( );
  499             C_YUV420_YVYU( );
  500 #else
  501             MMX_CALL( MMX_YUV420_YVYU );
  502 #endif
  503         }
             /* Leftover pixels when the width is not a multiple of 8, handled
              * two at a time in plain C. */
  504         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
  505         {
  506             C_YUV420_YVYU( );
  507         }
  508
             /* Skip the row padding on every pointer */
  509         p_y1 += i_source_margin;
  510         p_y2 += i_source_margin;
  511         p_u += i_source_margin_c;
  512         p_v += i_source_margin_c;
  513         p_line1 += i_dest_margin;
  514         p_line2 += i_dest_margin;
  515     }
  516
  517 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  518     /* re-enable FPU registers */
  519     MMX_END;
  520 #endif
  521
     /* Closes the "else" opened in the AltiVec dispatch above */
  522 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  523     }
  524 #endif
  525
  526 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  527     /*
  528     ** SSE2 128 bits fetch/store instructions are faster
  529     ** if memory access is 16 bytes aligned
         **
         ** The test ORs together the luma pitch, the destination pitch and the
         ** start addresses of both buffers: when the low 4 bits of the result
         ** are zero, every luma row and every output row starts on a 16-byte
         ** boundary.  The chroma pointers and pitch are not part of the test;
         ** presumably the aligned macro does not need them aligned -- confirm
         ** in i420_yuy2.h.
  530     */
  531     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  532         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  533     {
  534         /* use faster SSE2 aligned fetch and store */
  535         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  536         {
  537             p_line1 = p_line2;
  538             p_line2 += p_dest->p->i_pitch;
  539
  540             p_y1 = p_y2;
  541             p_y2 += p_source->p[Y_PLANE].i_pitch;
  542
  543             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
  544             {
  545                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
  546             }
                 /* Leftover pixels (width not a multiple of 16), in plain C */
  547             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
  548             {
  549                 C_YUV420_YVYU( );
  550             }
  551
  552             p_y1 += i_source_margin;
  553             p_y2 += i_source_margin;
  554             p_u += i_source_margin_c;
  555             p_v += i_source_margin_c;
  556             p_line1 += i_dest_margin;
  557             p_line2 += i_dest_margin;
  558         }
  559     }
  560     else
  561     {
  562         /* use slower SSE2 unaligned fetch and store */
  563         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  564         {
  565             p_line1 = p_line2;
  566             p_line2 += p_dest->p->i_pitch;
  567
  568             p_y1 = p_y2;
  569             p_y2 += p_source->p[Y_PLANE].i_pitch;
  570
  571             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
  572             {
  573                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
  574             }
                 /* Leftover pixels (width not a multiple of 16), in plain C */
  575             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
  576             {
  577                 C_YUV420_YVYU( );
  578             }
  579
  580             p_y1 += i_source_margin;
  581             p_y2 += i_source_margin;
  582             p_u += i_source_margin_c;
  583             p_v += i_source_margin_c;
  584             p_line1 += i_dest_margin;
  585             p_line2 += i_dest_margin;
  586         }
  587     }
  588     /* make sure all SSE2 stores are visible thereafter */
  589     SSE2_END;
  590 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  591 }
 592
  593 /*****************************************************************************
  594  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
      *****************************************************************************
      * p_vout   : only render.i_width and render.i_height are read
      * p_source : planar picture; its Y, U and V planes are read
      * p_dest   : packed picture; only its first plane (p_dest->p) is written
      *
      * Same structure as I420_YUY2; the differences are that the AltiVec merge
      * puts the chroma byte before the luma byte and that the *_UYVY pixel
      * macros are used.  Rows are handled in pairs (p_y1/p_y2 in,
      * p_line1/p_line2 out) and exactly one inner-loop flavour (AltiVec, SSE2,
      * MMX or plain C) is compiled, chosen by MODULE_NAME_IS_*.
  595  *****************************************************************************/
  596 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
  597                                               picture_t *p_dest )
  598 {
         /* p_line2 and p_y2 start on the first row; p_line1 and p_y1 are
          * assigned from them at the top of every row-pair iteration. */
  599     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  600     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  601     uint8_t *p_u = p_source->U_PIXELS;
  602     uint8_t *p_v = p_source->V_PIXELS;
  603
  604     int i_x, i_y;
  605
  606 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
     /* Step both row pairs: the old second row becomes the first row and the
      * second row moves one pitch further. */
  607 #define VEC_NEXT_LINES( ) \
  608     p_line1  = p_line2; \
  609     p_line2 += p_dest->p->i_pitch; \
  610     p_y1     = p_y2; \
  611     p_y2    += p_source->p[Y_PLANE].i_pitch;
  612
     /* Load 16 U bytes and 16 V bytes and advance both chroma pointers. */
  613 #define VEC_LOAD_UV( ) \
  614     u_vec = vec_ld( 0, p_u ); p_u += 16; \
  615     v_vec = vec_ld( 0, p_v ); p_v += 16;
  616
     /* a is vec_mergeh or vec_mergel: interleave one half of the loaded U and V
      * bytes, then interleave that result with 16 luma bytes of each row
      * (chroma first) and store 32 output bytes per row. */
  617 #define VEC_MERGE( a ) \
  618     uv_vec = a( u_vec, v_vec ); \
  619     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
  620     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
  621     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
  622     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
  623     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
  624     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
  625
  626     vector unsigned char u_vec;
  627     vector unsigned char v_vec;
  628     vector unsigned char uv_vec;
  629     vector unsigned char y_vec;
  630
         /* NOTE(review): neither vector path below skips any row padding, and
          * VEC_NEXT_LINES() continues from wherever the previous row pair
          * stopped; both therefore appear to rely on each pitch being equal to
          * the number of bytes processed per row -- confirm with the callers. */
  631     if( !( ( p_vout->render.i_width % 32 ) |
  632            ( p_vout->render.i_height % 2 ) ) )
  633     {
  634         /* Width is a multiple of 32, we take 2 lines at a time */
  635         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  636         {
  637             VEC_NEXT_LINES( );
  638             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  639             {
  640                 VEC_LOAD_UV( );
  641                 VEC_MERGE( vec_mergeh );
  642                 VEC_MERGE( vec_mergel );
  643             }
  644         }
  645     }
  646     else if( !( ( p_vout->render.i_width % 16 ) |
  647                 ( p_vout->render.i_height % 4 ) ) )
  648     {
  649         /* Width is only a multiple of 16, we take 4 lines at a time */
  650         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
  651         {
  652             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  653             VEC_NEXT_LINES( );
  654             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  655             {
  656                 VEC_LOAD_UV( );
  657                 VEC_MERGE( vec_mergeh );
  658                 VEC_MERGE( vec_mergel );
  659             }
  660
  661             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
  662             VEC_LOAD_UV( );
  663             VEC_MERGE( vec_mergeh );
  664
  665             /* Line 3 and 4, pixels 0 to 16 */
                 /* The second half of the chroma vectors loaded just above is
                  * reused here for the next row pair. */
  666             VEC_NEXT_LINES( );
  667             VEC_MERGE( vec_mergel );
  668
  669             /* Line 3 and 4, pixels 16 to ( width ) */
  670             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
  671             {
  672                 VEC_LOAD_UV( );
  673                 VEC_MERGE( vec_mergeh );
  674                 VEC_MERGE( vec_mergel );
  675             }
  676         }
  677     }
  678     else
  679     {
  680         /* Neither vector layout applies: fall back to the scalar C loop */
  681 #undef VEC_NEXT_LINES
  682 #undef VEC_LOAD_UV
  683 #undef VEC_MERGE
  684 #endif
  685
         /* Bytes of padding at the end of each row (pitch minus visible pitch)
          * for the luma plane, the chroma planes (taken from plane 1) and the
          * destination plane. */
  686     const int i_source_margin = p_source->p[0].i_pitch
  687                                  - p_source->p[0].i_visible_pitch;
  688     const int i_source_margin_c = p_source->p[1].i_pitch
  689                                  - p_source->p[1].i_visible_pitch;
  690     const int i_dest_margin = p_dest->p->i_pitch
  691                                - p_dest->p->i_visible_pitch;
  692
  693 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
  694     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  695     {
  696         p_line1 = p_line2;
  697         p_line2 += p_dest->p->i_pitch;
  698
  699         p_y1 = p_y2;
  700         p_y2 += p_source->p[Y_PLANE].i_pitch;
  701
             /* Groups of 8 pixels: four C macro invocations (2 pixels each) or
              * one MMX invocation. */
  702         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
  703         {
  704 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  705             C_YUV420_UYVY( );
  706             C_YUV420_UYVY( );
  707             C_YUV420_UYVY( );
  708             C_YUV420_UYVY( );
  709 #else
  710             MMX_CALL( MMX_YUV420_UYVY );
  711 #endif
  712         }
             /* Leftover pixels when the width is not a multiple of 8, handled
              * two at a time in plain C. */
  713         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
  714         {
  715             C_YUV420_UYVY( );
  716         }
  717
             /* Skip the row padding on every pointer */
  718         p_y1 += i_source_margin;
  719         p_y2 += i_source_margin;
  720         p_u += i_source_margin_c;
  721         p_v += i_source_margin_c;
  722         p_line1 += i_dest_margin;
  723         p_line2 += i_dest_margin;
  724     }
  725
  726 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  727     /* re-enable FPU registers */
  728     MMX_END;
  729 #endif
  730
     /* Closes the "else" opened in the AltiVec dispatch above */
  731 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
  732     }
  733 #endif
  734
  735 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  736     /*
  737     ** SSE2 128 bits fetch/store instructions are faster
  738     ** if memory access is 16 bytes aligned
         **
         ** The test ORs together the luma pitch, the destination pitch and the
         ** start addresses of both buffers: when the low 4 bits of the result
         ** are zero, every luma row and every output row starts on a 16-byte
         ** boundary.  The chroma pointers and pitch are not part of the test;
         ** presumably the aligned macro does not need them aligned -- confirm
         ** in i420_yuy2.h.
  739     */
  740     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  741         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  742     {
  743         /* use faster SSE2 aligned fetch and store */
  744         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  745         {
  746             p_line1 = p_line2;
  747             p_line2 += p_dest->p->i_pitch;
  748
  749             p_y1 = p_y2;
  750             p_y2 += p_source->p[Y_PLANE].i_pitch;
  751
  752             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
  753             {
  754                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
  755             }
                 /* Leftover pixels (width not a multiple of 16), in plain C */
  756             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
  757             {
  758                 C_YUV420_UYVY( );
  759             }
  760
  761             p_y1 += i_source_margin;
  762             p_y2 += i_source_margin;
  763             p_u += i_source_margin_c;
  764             p_v += i_source_margin_c;
  765             p_line1 += i_dest_margin;
  766             p_line2 += i_dest_margin;
  767         }
  768     }
  769     else
  770     {
  771         /* use slower SSE2 unaligned fetch and store */
  772         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
  773         {
  774             p_line1 = p_line2;
  775             p_line2 += p_dest->p->i_pitch;
  776
  777             p_y1 = p_y2;
  778             p_y2 += p_source->p[Y_PLANE].i_pitch;
  779
  780             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
  781             {
  782                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
  783             }
                 /* Leftover pixels (width not a multiple of 16), in plain C */
  784             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
  785             {
  786                 C_YUV420_UYVY( );
  787             }
  788
  789             p_y1 += i_source_margin;
  790             p_y2 += i_source_margin;
  791             p_u += i_source_margin_c;
  792             p_v += i_source_margin_c;
  793             p_line1 += i_dest_margin;
  794             p_line2 += i_dest_margin;
  795         }
  796     }
  797     /* make sure all SSE2 stores are visible thereafter */
  798     SSE2_END;
  799 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  800 }
 801
 802 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  803 /*****************************************************************************
  804  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
      *****************************************************************************
      * Not implemented.  No pixel is read or written: the function only reports
      * an error through msg_Err() on p_vout, and p_source and p_dest are unused.
      * Activate() still installs it for the 'IUYV' output chroma in non-AltiVec
      * builds, so with that chroma the destination picture is left untouched.
  805  *****************************************************************************/
  806 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
  807                                               picture_t *p_dest )
  808 {
  809     /* FIXME: TODO ! */
  810     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
  811 }
 812
 813 /*****************************************************************************
 814  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 815  *****************************************************************************/
 816 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 817                                               picture_t *p_dest )
 818 {
 819     uint8_t *p_line1 = p_dest->p->p_pixels +
 820                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 821                        + p_dest->p->i_pitch;
 822     uint8_t *p_line2 = p_dest->p->p_pixels +
 823                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 824     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 825     uint8_t *p_u = p_source->U_PIXELS;
 826     uint8_t *p_v = p_source->V_PIXELS;
 827
 828     int i_x, i_y;
 829
 830     const int i_source_margin = p_source->p[0].i_pitch
 831                                  - p_source->p[0].i_visible_pitch;
 832     const int i_source_margin_c = p_source->p[1].i_pitch
 833                                  - p_source->p[1].i_visible_pitch;
 834     const int i_dest_margin = p_dest->p->i_pitch
 835                                - p_dest->p->i_visible_pitch;
 836
 837 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 838     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 839     {
 840         p_line1 -= 3 * p_dest->p->i_pitch;
 841         p_line2 -= 3 * p_dest->p->i_pitch;
 842
 843         p_y1 = p_y2;
 844         p_y2 += p_source->p[Y_PLANE].i_pitch;
 845
 846         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 847         {
 848 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 849             C_YUV420_UYVY( );
 850             C_YUV420_UYVY( );
 851             C_YUV420_UYVY( );
 852             C_YUV420_UYVY( );
 853 #else
 854             MMX_CALL( MMX_YUV420_UYVY );
 855 #endif
 856         }
 857         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 858         {
 859             C_YUV420_UYVY( );
 860         }
 861
 862         p_y1 += i_source_margin;
 863         p_y2 += i_source_margin;
 864         p_u += i_source_margin_c;
 865         p_v += i_source_margin_c;
 866         p_line1 += i_dest_margin;
 867         p_line2 += i_dest_margin;
 868     }
 869
 870 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 871     /* re-enable FPU registers */
 872     MMX_END;
 873 #endif
 874
 875 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 876     /*
 877     ** SSE2 128 bits fetch/store instructions are faster
 878     ** if memory access is 16 bytes aligned
 879     */
 880     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 881         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 882     {
 883         /* use faster SSE2 aligned fetch and store */
 884         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 885         {
 886             p_line1 = p_line2;
 887             p_line2 += p_dest->p->i_pitch;
 888
 889             p_y1 = p_y2;
 890             p_y2 += p_source->p[Y_PLANE].i_pitch;
 891
 892             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 893             {
 894                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 895             }
 896             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 897             {
 898                 C_YUV420_UYVY( );
 899             }
 900
 901             p_y1 += i_source_margin;
 902             p_y2 += i_source_margin;
 903             p_u += i_source_margin_c;
 904             p_v += i_source_margin_c;
 905             p_line1 += i_dest_margin;
 906             p_line2 += i_dest_margin;
 907         }
 908     }
 909     else
 910     {
 911         /* use slower SSE2 unaligned fetch and store */
 912         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 913         {
 914             p_line1 = p_line2;
 915             p_line2 += p_dest->p->i_pitch;
 916
 917             p_y1 = p_y2;
 918             p_y2 += p_source->p[Y_PLANE].i_pitch;
 919
 920             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 921             {
 922                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 923             }
 924             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 925             {
 926                 C_YUV420_UYVY( );
 927             }
 928
 929             p_y1 += i_source_margin;
 930             p_y2 += i_source_margin;
 931             p_u += i_source_margin_c;
 932             p_v += i_source_margin_c;
 933             p_line1 += i_dest_margin;
 934             p_line2 += i_dest_margin;
 935         }
 936     }
 937     /* make sure all SSE2 stores are visible thereafter */
 938     SSE2_END;
 939 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 940 }
 941 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 942
 943 /*****************************************************************************
 944  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 945  *****************************************************************************/
 946 #if defined (MODULE_NAME_IS_i420_yuy2)
 947 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
 948                                               picture_t *p_dest )
 949 {
 950     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 951     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 952     uint8_t *p_u = p_source->U_PIXELS;
 953     uint8_t *p_v = p_source->V_PIXELS;
 954
 955     int i_x, i_y;
 956
 957     const int i_source_margin = p_source->p[0].i_pitch
 958                                  - p_source->p[0].i_visible_pitch;
 959     const int i_source_margin_c = p_source->p[1].i_pitch
 960                                  - p_source->p[1].i_visible_pitch;
 961     const int i_dest_margin = p_dest->p->i_pitch
 962                                - p_dest->p->i_visible_pitch;
 963
 964     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 965     {
 966         p_line1 = p_line2;
 967         p_line2 += p_dest->p->i_pitch;
 968
 969         p_y1 = p_y2;
 970         p_y2 += p_source->p[Y_PLANE].i_pitch;
 971
 972         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 973         {
 974             C_YUV420_Y211( );
 975             C_YUV420_Y211( );
 976         }
 977
 978         p_y1 += i_source_margin;
 979         p_y2 += i_source_margin;
 980         p_u += i_source_margin_c;
 981         p_v += i_source_margin_c;
 982         p_line1 += i_dest_margin;
 983         p_line2 += i_dest_margin;
 984     }
 985 }
 986 #endif