git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc/vlc.h>
  34 #include <vlc_vout.h>
  35
  36 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  37 #   include <altivec.h>
  38 #endif
  39
  40 #include "i420_yuy2.h"
  41
     /* Source chromas named in the module description string of every build
      * variant of this file. */
  42 #define SRC_FOURCC  "I420,IYUV,YV12"
  43
     /* Destination chromas named in the module description, one list per build
      * variant: only the plain C build lists Y211, and the AltiVec build lists
      * neither IUYV nor cyuv. These lists mirror the #if guards used around the
      * matching cases in Activate(). */
  44 #if defined (MODULE_NAME_IS_i420_yuy2)
  45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  46 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  48 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  50 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  52 #endif
  53
  54 /*****************************************************************************
  55  * Local and extern prototypes.
  56  *****************************************************************************/
     /* Module entry point; handed to set_callbacks() in the module descriptor. */
  57 static int  Activate ( vlc_object_t * );
  58
     /* Conversion routines. All share the same signature; Activate() stores
      * exactly one of them in p_vout->chroma.pf_convert. */
  59 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
  60 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
  61 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
     /* IUYV and cyuv converters are not built in the AltiVec variant. */
  62 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  63 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
  64 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
  65 #endif
     /* The Y211 converter only exists in the plain C variant. */
  66 #if defined (MODULE_NAME_IS_i420_yuy2)
  67 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
  68 #endif
  69
  70 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  71 /* Initialize MMX-specific constants */
     /* NOTE(review): neither constant is referenced by name in the functions
      * visible in this part of the file; presumably they are consumed by the
      * MMX macros coming from i420_yuy2.h - TODO confirm. */
  72 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
  73 static const uint64_t i_80w   = 0x0000000080808080ULL;
  74 #endif
  75
  76 /*****************************************************************************
  77  * Module descriptor.
  78  *****************************************************************************/
     /* Module registration. Exactly one description/capability group is
      * compiled in, chosen by whichever MODULE_NAME_IS_* macro is defined
      * (presumably set by the build system - TODO confirm).
      * The number given with the "chroma" capability differs per variant:
      * 80 for plain C, 100 for MMX and AltiVec, 120 for SSE2. It is presumably
      * a priority that makes the accelerated variants preferred - confirm
      * against the module loader. add_requirement() presumably restricts the
      * variant to CPUs offering the named instruction set - confirm. */
  79 vlc_module_begin();
  80 #if defined (MODULE_NAME_IS_i420_yuy2)
  81     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  82     set_capability( "chroma", 80 );
  83 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  84     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  85     set_capability( "chroma", 100 );
  86     add_requirement( MMX );
  87 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  88     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  89     set_capability( "chroma", 120 );
  90     add_requirement( SSE2 );
  91 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  92     set_description(
  93             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  94     set_capability( "chroma", 100 );
  95     add_requirement( ALTIVEC );
  96 #endif
     /* Activate is the only callback supplied; the second one is NULL. */
  97     set_callbacks( Activate, NULL );
  98 vlc_module_end();
  99
 100 /*****************************************************************************
 101  * Activate: allocate a chroma function
 102  *****************************************************************************
 103  * This function allocates and initializes a chroma function
 104  *****************************************************************************/
 105 static int Activate( vlc_object_t *p_this )
 106 {
 107     vout_thread_t *p_vout = (vout_thread_t *)p_this;
 108
 109     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
 110     {
 111         return -1;
 112     }
 113
 114     switch( p_vout->render.i_chroma )
 115     {
 116         case VLC_FOURCC('Y','V','1','2'):
 117         case VLC_FOURCC('I','4','2','0'):
 118         case VLC_FOURCC('I','Y','U','V'):
 119             switch( p_vout->output.i_chroma )
 120             {
 121                 case VLC_FOURCC('Y','U','Y','2'):
 122                 case VLC_FOURCC('Y','U','N','V'):
 123                     p_vout->chroma.pf_convert = I420_YUY2;
 124                     break;
 125
 126                 case VLC_FOURCC('Y','V','Y','U'):
 127                     p_vout->chroma.pf_convert = I420_YVYU;
 128                     break;
 129
 130                 case VLC_FOURCC('U','Y','V','Y'):
 131                 case VLC_FOURCC('U','Y','N','V'):
 132                 case VLC_FOURCC('Y','4','2','2'):
 133                     p_vout->chroma.pf_convert = I420_UYVY;
 134                     break;
 135 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 136                 case VLC_FOURCC('I','U','Y','V'):
 137                     p_vout->chroma.pf_convert = I420_IUYV;
 138                     break;
 139
 140                 case VLC_FOURCC('c','y','u','v'):
 141                     p_vout->chroma.pf_convert = I420_cyuv;
 142                     break;
 143 #endif
 144
 145 #if defined (MODULE_NAME_IS_i420_yuy2)
 146                 case VLC_FOURCC('Y','2','1','1'):
 147                     p_vout->chroma.pf_convert = I420_Y211;
 148                     break;
 149 #endif
 150
 151                 default:
 152                     return -1;
 153             }
 154             break;
 155
 156         default:
 157             return -1;
 158     }
 159
 160     return 0;
 161 }
 162
 163 #if 0
 164 static inline unsigned long long read_cycles(void)
 165 {
 166     unsigned long long v;
 167     __asm__ __volatile__("rdtsc" : "=A" (v): );
 168
 169     return v;
 170 }
 171 #endif
 172
 173 /* Following functions are local */
 174 /*****************************************************************************
 175  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
      *****************************************************************************
      * p_vout   : supplies the picture size (render.i_width / render.i_height)
      * p_source : planar input, read through Y_PIXELS, U_PIXELS and V_PIXELS
      * p_dest   : packed output, written through its first plane (p_dest->p)
      *
      * Lines are processed in pairs: p_y1/p_y2 walk two consecutive luma lines
      * while p_line1/p_line2 walk the two matching destination lines.
      * The body is shared by four build variants (plain C, MMX, SSE2, AltiVec)
      * selected by the MODULE_NAME_IS_* macros. The per-pixel work is done by
      * the C_YUV420_YUYV / MMX_* / SSE2_* macros, which are not defined in this
      * file (presumably they come from i420_yuy2.h) and which operate directly
      * on the local variables declared below.
 176  *****************************************************************************/
 177 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 178                                               picture_t *p_dest )
 179 {
         /* p_line2 / p_y2 start at the first destination / luma line; p_line1 and
          * p_y1 are reloaded from them at the start of every line pair. */
 180     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 181     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 182     uint8_t *p_u = p_source->U_PIXELS;
 183     uint8_t *p_v = p_source->V_PIXELS;
 184
 185     int i_x, i_y;
 186
 187 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
     /* Step to the next pair of lines: the old second line becomes the first
      * one, and the second-line pointers move one pitch further. */
 188 #define VEC_NEXT_LINES( ) \
 189     p_line1  = p_line2; \
 190     p_line2 += p_dest->p->i_pitch; \
 191     p_y1     = p_y2; \
 192     p_y2    += p_source->p[Y_PLANE].i_pitch;
 193
     /* Load 16 bytes of U and 16 bytes of V and advance both pointers. */
 194 #define VEC_LOAD_UV( ) \
 195     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 196     v_vec = vec_ld( 0, p_v ); p_v += 16;
 197
     /* `a` is vec_mergeh or vec_mergel and picks which half of u_vec / v_vec is
      * interleaved into uv_vec (U first). uv_vec is then interleaved, luma
      * first, with 16 luma bytes of each of the two lines; 32 bytes are stored
      * per line. The same uv_vec is reused for both lines of the pair. */
 198 #define VEC_MERGE( a ) \
 199     uv_vec = a( u_vec, v_vec ); \
 200     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 201     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 202     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 203     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 204     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 205     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 206
 207     vector unsigned char u_vec;
 208     vector unsigned char v_vec;
 209     vector unsigned char uv_vec;
 210     vector unsigned char y_vec;
 211
         /* NOTE(review): neither vector path below adds any pitch margin, so both
          * appear to assume that a line's pitch equals the bytes consumed or
          * produced per line - confirm against the picture allocator. */
 212     if( !( ( p_vout->render.i_width % 32 ) |
 213            ( p_vout->render.i_height % 2 ) ) )
 214     {
 215         /* Width is a multiple of 32, we take 2 lines at a time */
 216         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 217         {
 218             VEC_NEXT_LINES( );
 219             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 220             {
                     /* one chroma load feeds two merges, i.e. 32 pixels per line */
 221                 VEC_LOAD_UV( );
 222                 VEC_MERGE( vec_mergeh );
 223                 VEC_MERGE( vec_mergel );
 224             }
 225         }
 226     }
 227     else if( !( ( p_vout->render.i_width % 16 ) |
 228                 ( p_vout->render.i_height % 4 ) ) )
 229     {
 230         /* Width is only a multiple of 16, we take 4 lines at a time */
             /* The chroma vector loaded for the last 16 pixels of lines 1-2 is
              * only half used there (vec_mergeh); its other half (vec_mergel)
              * serves the first 16 pixels of lines 3-4. */
 231         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 232         {
 233             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 234             VEC_NEXT_LINES( );
 235             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 236             {
 237                 VEC_LOAD_UV( );
 238                 VEC_MERGE( vec_mergeh );
 239                 VEC_MERGE( vec_mergel );
 240             }
 241
 242             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 243             VEC_LOAD_UV( );
 244             VEC_MERGE( vec_mergeh );
 245
 246             /* Line 3 and 4, pixels 0 to 16 */
 247             VEC_NEXT_LINES( );
 248             VEC_MERGE( vec_mergel );
 249
 250             /* Line 3 and 4, pixels 16 to ( width ) */
 251             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 252             {
 253                 VEC_LOAD_UV( );
 254                 VEC_MERGE( vec_mergeh );
 255                 VEC_MERGE( vec_mergel );
 256             }
 257         }
 258     }
 259     else
 260     {
 261         /* Crap, use the C version */
 262 #undef VEC_NEXT_LINES
 263 #undef VEC_LOAD_UV
 264 #undef VEC_MERGE
 265 #endif
 266
         /* Bytes between the end of the visible part of a line and the start of
          * the next one, for luma (plane 0), chroma (plane 1) and destination. */
 267     const int i_source_margin = p_source->p[0].i_pitch
 268                                  - p_source->p[0].i_visible_pitch;
 269     const int i_source_margin_c = p_source->p[1].i_pitch
 270                                  - p_source->p[1].i_visible_pitch;
 271     const int i_dest_margin = p_dest->p->i_pitch
 272                                - p_dest->p->i_visible_pitch;
 273
 274 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
         /* Plain C, MMX and AltiVec-fallback path: one iteration per line pair */
 275     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 276     {
 277         p_line1 = p_line2;
 278         p_line2 += p_dest->p->i_pitch;
 279
 280         p_y1 = p_y2;
 281         p_y2 += p_source->p[Y_PLANE].i_pitch;
 282
             /* Main loop, 8 pixels per iteration: four scalar macro invocations
              * or a single MMX one. */
 283 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 284         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
 285         {
 286             C_YUV420_YUYV( );
 287             C_YUV420_YUYV( );
 288             C_YUV420_YUYV( );
 289             C_YUV420_YUYV( );
 290         }
 291 #else
 292         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 293         {
 294             MMX_CALL( MMX_YUV420_YUYV );
 295         }
 296 #endif
             /* Leftover ( width % 8 ) pixels, handled by the scalar macro; the
              * loop counts imply two pixels per invocation. */
 297         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 298         {
 299             C_YUV420_YUYV( );
 300         }
 301
             /* Skip the padding at the end of each line */
 302         p_y1 += i_source_margin;
 303         p_y2 += i_source_margin;
 304         p_u += i_source_margin_c;
 305         p_v += i_source_margin_c;
 306         p_line1 += i_dest_margin;
 307         p_line2 += i_dest_margin;
 308     }
 309
 310 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 311     /* re-enable FPU registers */
 312     MMX_END;
 313 #endif
 314
 315 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
         /* closes the "use the C version" else branch opened above */
 316     }
 317 #endif
 318
 319 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 320     /*
 321     ** SSE2 128 bits fetch/store instructions are faster
 322     ** if memory access is 16 bytes aligned
 323     */
 324
         /* Aligned path only when the luma pitch, the destination pitch and both
          * start addresses are all multiples of 16 (low four bits of their OR
          * are zero). */
 325     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 326         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 327     {
 328         /* use faster SSE2 aligned fetch and store */
 329         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 330         {
 331             p_line1 = p_line2;
 332             p_line2 += p_dest->p->i_pitch;
 333
 334             p_y1 = p_y2;
 335             p_y2 += p_source->p[Y_PLANE].i_pitch;
 336
                 /* 16 pixels per SSE2 invocation */
 337             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 338             {
 339                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 340             }
                 /* leftover ( width % 16 ) pixels, scalar */
 341             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 342             {
 343                 C_YUV420_YUYV( );
 344             }
 345
 346             p_y1 += i_source_margin;
 347             p_y2 += i_source_margin;
 348             p_u += i_source_margin_c;
 349             p_v += i_source_margin_c;
 350             p_line1 += i_dest_margin;
 351             p_line2 += i_dest_margin;
 352         }
 353     }
 354     else
 355     {
 356         /* use slower SSE2 unaligned fetch and store */
 357         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 358         {
 359             p_line1 = p_line2;
 360             p_line2 += p_dest->p->i_pitch;
 361
 362             p_y1 = p_y2;
 363             p_y2 += p_source->p[Y_PLANE].i_pitch;
 364
 365             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 366             {
 367                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 368             }
 369             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 370             {
 371                 C_YUV420_YUYV( );
 372             }
 373
 374             p_y1 += i_source_margin;
 375             p_y2 += i_source_margin;
 376             p_u += i_source_margin_c;
 377             p_v += i_source_margin_c;
 378             p_line1 += i_dest_margin;
 379             p_line2 += i_dest_margin;
 380         }
 381     }
 382     /* make sure all SSE2 stores are visible thereafter */
 383     SSE2_END;
 384
 385 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 386 }
 387
 388 /*****************************************************************************
 389  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
      *****************************************************************************
      * p_vout   : supplies the picture size (render.i_width / render.i_height)
      * p_source : planar input, read through Y_PIXELS, U_PIXELS and V_PIXELS
      * p_dest   : packed output, written through its first plane (p_dest->p)
      *
      * Same structure as I420_YUY2: lines are processed in pairs and the body
      * is shared by the plain C, MMX, SSE2 and AltiVec builds. The differences
      * are the *_YVYU macros (not defined in this file, presumably provided by
      * i420_yuy2.h) and, in the AltiVec path, V being merged ahead of U.
 390  *****************************************************************************/
 391 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
 392                                               picture_t *p_dest )
 393 {
         /* p_line2 / p_y2 start at the first destination / luma line; p_line1 and
          * p_y1 are reloaded from them at the start of every line pair. */
 394     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 395     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 396     uint8_t *p_u = p_source->U_PIXELS;
 397     uint8_t *p_v = p_source->V_PIXELS;
 398
 399     int i_x, i_y;
 400
 401 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
     /* Step to the next pair of lines: the old second line becomes the first
      * one, and the second-line pointers move one pitch further. */
 402 #define VEC_NEXT_LINES( ) \
 403     p_line1  = p_line2; \
 404     p_line2 += p_dest->p->i_pitch; \
 405     p_y1     = p_y2; \
 406     p_y2    += p_source->p[Y_PLANE].i_pitch;
 407
     /* Load 16 bytes of U and 16 bytes of V and advance both pointers. */
 408 #define VEC_LOAD_UV( ) \
 409     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 410     v_vec = vec_ld( 0, p_v ); p_v += 16;
 411
     /* `a` is vec_mergeh or vec_mergel and picks which half of v_vec / u_vec is
      * interleaved into vu_vec (V first, unlike I420_YUY2). vu_vec is then
      * interleaved, luma first, with 16 luma bytes of each of the two lines;
      * 32 bytes are stored per line. */
 412 #define VEC_MERGE( a ) \
 413     vu_vec = a( v_vec, u_vec ); \
 414     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 415     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 416     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 417     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 418     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 419     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 420
 421     vector unsigned char u_vec;
 422     vector unsigned char v_vec;
 423     vector unsigned char vu_vec;
 424     vector unsigned char y_vec;
 425
         /* NOTE(review): neither vector path below adds any pitch margin, so both
          * appear to assume that a line's pitch equals the bytes consumed or
          * produced per line - confirm against the picture allocator. */
 426     if( !( ( p_vout->render.i_width % 32 ) |
 427            ( p_vout->render.i_height % 2 ) ) )
 428     {
 429         /* Width is a multiple of 32, we take 2 lines at a time */
 430         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 431         {
 432             VEC_NEXT_LINES( );
 433             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 434             {
                     /* one chroma load feeds two merges, i.e. 32 pixels per line */
 435                 VEC_LOAD_UV( );
 436                 VEC_MERGE( vec_mergeh );
 437                 VEC_MERGE( vec_mergel );
 438             }
 439         }
 440     }
 441     else if( !( ( p_vout->render.i_width % 16 ) |
 442                 ( p_vout->render.i_height % 4 ) ) )
 443     {
 444         /* Width is only a multiple of 16, we take 4 lines at a time */
             /* The chroma vector loaded for the last 16 pixels of lines 1-2 is
              * only half used there (vec_mergeh); its other half (vec_mergel)
              * serves the first 16 pixels of lines 3-4. */
 445         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 446         {
 447             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 448             VEC_NEXT_LINES( );
 449             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 450             {
 451                 VEC_LOAD_UV( );
 452                 VEC_MERGE( vec_mergeh );
 453                 VEC_MERGE( vec_mergel );
 454             }
 455
 456             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 457             VEC_LOAD_UV( );
 458             VEC_MERGE( vec_mergeh );
 459
 460             /* Line 3 and 4, pixels 0 to 16 */
 461             VEC_NEXT_LINES( );
 462             VEC_MERGE( vec_mergel );
 463
 464             /* Line 3 and 4, pixels 16 to ( width ) */
 465             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 466             {
 467                 VEC_LOAD_UV( );
 468                 VEC_MERGE( vec_mergeh );
 469                 VEC_MERGE( vec_mergel );
 470             }
 471         }
 472     }
 473     else
 474     {
 475         /* Crap, use the C version */
 476 #undef VEC_NEXT_LINES
 477 #undef VEC_LOAD_UV
 478 #undef VEC_MERGE
 479 #endif
 480
         /* Bytes between the end of the visible part of a line and the start of
          * the next one, for luma (plane 0), chroma (plane 1) and destination. */
 481     const int i_source_margin = p_source->p[0].i_pitch
 482                                  - p_source->p[0].i_visible_pitch;
 483     const int i_source_margin_c = p_source->p[1].i_pitch
 484                                  - p_source->p[1].i_visible_pitch;
 485     const int i_dest_margin = p_dest->p->i_pitch
 486                                - p_dest->p->i_visible_pitch;
 487
 488 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
         /* Plain C, MMX and AltiVec-fallback path: one iteration per line pair */
 489     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 490     {
 491         p_line1 = p_line2;
 492         p_line2 += p_dest->p->i_pitch;
 493
 494         p_y1 = p_y2;
 495         p_y2 += p_source->p[Y_PLANE].i_pitch;
 496
             /* Main loop, 8 pixels per iteration: four scalar macro invocations
              * or a single MMX one. */
 497         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 498         {
 499 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 500             C_YUV420_YVYU( );
 501             C_YUV420_YVYU( );
 502             C_YUV420_YVYU( );
 503             C_YUV420_YVYU( );
 504 #else
 505             MMX_CALL( MMX_YUV420_YVYU );
 506 #endif
 507         }
             /* Leftover ( width % 8 ) pixels, handled by the scalar macro; the
              * loop counts imply two pixels per invocation. */
 508         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 509         {
 510             C_YUV420_YVYU( );
 511         }
 512
             /* Skip the padding at the end of each line */
 513         p_y1 += i_source_margin;
 514         p_y2 += i_source_margin;
 515         p_u += i_source_margin_c;
 516         p_v += i_source_margin_c;
 517         p_line1 += i_dest_margin;
 518         p_line2 += i_dest_margin;
 519     }
 520
 521 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 522     /* re-enable FPU registers */
 523     MMX_END;
 524 #endif
 525
 526 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
         /* closes the "use the C version" else branch opened above */
 527     }
 528 #endif
 529
 530 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 531     /*
 532     ** SSE2 128 bits fetch/store instructions are faster
 533     ** if memory access is 16 bytes aligned
 534     */
         /* Aligned path only when the luma pitch, the destination pitch and both
          * start addresses are all multiples of 16 (low four bits of their OR
          * are zero). */
 535     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 536         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 537     {
 538         /* use faster SSE2 aligned fetch and store */
 539         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 540         {
 541             p_line1 = p_line2;
 542             p_line2 += p_dest->p->i_pitch;
 543
 544             p_y1 = p_y2;
 545             p_y2 += p_source->p[Y_PLANE].i_pitch;
 546
                 /* 16 pixels per SSE2 invocation */
 547             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 548             {
 549                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 550             }
                 /* leftover ( width % 16 ) pixels, scalar */
 551             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 552             {
 553                 C_YUV420_YVYU( );
 554             }
 555
 556             p_y1 += i_source_margin;
 557             p_y2 += i_source_margin;
 558             p_u += i_source_margin_c;
 559             p_v += i_source_margin_c;
 560             p_line1 += i_dest_margin;
 561             p_line2 += i_dest_margin;
 562         }
 563     }
 564     else
 565     {
 566         /* use slower SSE2 unaligned fetch and store */
 567         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 568         {
 569             p_line1 = p_line2;
 570             p_line2 += p_dest->p->i_pitch;
 571
 572             p_y1 = p_y2;
 573             p_y2 += p_source->p[Y_PLANE].i_pitch;
 574
 575             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 576             {
 577                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 578             }
 579             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 580             {
 581                 C_YUV420_YVYU( );
 582             }
 583
 584             p_y1 += i_source_margin;
 585             p_y2 += i_source_margin;
 586             p_u += i_source_margin_c;
 587             p_v += i_source_margin_c;
 588             p_line1 += i_dest_margin;
 589             p_line2 += i_dest_margin;
 590         }
 591     }
 592     /* make sure all SSE2 stores are visible thereafter */
 593     SSE2_END;
 594 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 595 }
 596
 597 /*****************************************************************************
 598  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
      *****************************************************************************
      * p_vout   : supplies the picture size (render.i_width / render.i_height)
      * p_source : planar input, read through Y_PIXELS, U_PIXELS and V_PIXELS
      * p_dest   : packed output, written through its first plane (p_dest->p)
      *
      * Same structure as I420_YUY2: lines are processed in pairs and the body
      * is shared by the plain C, MMX, SSE2 and AltiVec builds. The differences
      * are the *_UYVY macros (not defined in this file, presumably provided by
      * i420_yuy2.h) and, in the AltiVec path, chroma being placed ahead of
      * luma in the final merge.
 599  *****************************************************************************/
 600 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 601                                               picture_t *p_dest )
 602 {
         /* p_line2 / p_y2 start at the first destination / luma line; p_line1 and
          * p_y1 are reloaded from them at the start of every line pair. */
 603     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 604     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 605     uint8_t *p_u = p_source->U_PIXELS;
 606     uint8_t *p_v = p_source->V_PIXELS;
 607
 608     int i_x, i_y;
 609
 610 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
     /* Step to the next pair of lines: the old second line becomes the first
      * one, and the second-line pointers move one pitch further. */
 611 #define VEC_NEXT_LINES( ) \
 612     p_line1  = p_line2; \
 613     p_line2 += p_dest->p->i_pitch; \
 614     p_y1     = p_y2; \
 615     p_y2    += p_source->p[Y_PLANE].i_pitch;
 616
     /* Load 16 bytes of U and 16 bytes of V and advance both pointers. */
 617 #define VEC_LOAD_UV( ) \
 618     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 619     v_vec = vec_ld( 0, p_v ); p_v += 16;
 620
     /* `a` is vec_mergeh or vec_mergel and picks which half of u_vec / v_vec is
      * interleaved into uv_vec (U first). uv_vec is then interleaved, chroma
      * first (unlike I420_YUY2), with 16 luma bytes of each of the two lines;
      * 32 bytes are stored per line. */
 621 #define VEC_MERGE( a ) \
 622     uv_vec = a( u_vec, v_vec ); \
 623     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 624     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 625     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 626     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 627     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 628     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 629
 630     vector unsigned char u_vec;
 631     vector unsigned char v_vec;
 632     vector unsigned char uv_vec;
 633     vector unsigned char y_vec;
 634
         /* NOTE(review): neither vector path below adds any pitch margin, so both
          * appear to assume that a line's pitch equals the bytes consumed or
          * produced per line - confirm against the picture allocator. */
 635     if( !( ( p_vout->render.i_width % 32 ) |
 636            ( p_vout->render.i_height % 2 ) ) )
 637     {
 638         /* Width is a multiple of 32, we take 2 lines at a time */
 639         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 640         {
 641             VEC_NEXT_LINES( );
 642             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 643             {
                     /* one chroma load feeds two merges, i.e. 32 pixels per line */
 644                 VEC_LOAD_UV( );
 645                 VEC_MERGE( vec_mergeh );
 646                 VEC_MERGE( vec_mergel );
 647             }
 648         }
 649     }
 650     else if( !( ( p_vout->render.i_width % 16 ) |
 651                 ( p_vout->render.i_height % 4 ) ) )
 652     {
 653         /* Width is only a multiple of 16, we take 4 lines at a time */
             /* The chroma vector loaded for the last 16 pixels of lines 1-2 is
              * only half used there (vec_mergeh); its other half (vec_mergel)
              * serves the first 16 pixels of lines 3-4. */
 654         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 655         {
 656             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 657             VEC_NEXT_LINES( );
 658             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 659             {
 660                 VEC_LOAD_UV( );
 661                 VEC_MERGE( vec_mergeh );
 662                 VEC_MERGE( vec_mergel );
 663             }
 664
 665             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 666             VEC_LOAD_UV( );
 667             VEC_MERGE( vec_mergeh );
 668
 669             /* Line 3 and 4, pixels 0 to 16 */
 670             VEC_NEXT_LINES( );
 671             VEC_MERGE( vec_mergel );
 672
 673             /* Line 3 and 4, pixels 16 to ( width ) */
 674             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 675             {
 676                 VEC_LOAD_UV( );
 677                 VEC_MERGE( vec_mergeh );
 678                 VEC_MERGE( vec_mergel );
 679             }
 680         }
 681     }
 682     else
 683     {
 684         /* Crap, use the C version */
 685 #undef VEC_NEXT_LINES
 686 #undef VEC_LOAD_UV
 687 #undef VEC_MERGE
 688 #endif
 689
         /* Bytes between the end of the visible part of a line and the start of
          * the next one, for luma (plane 0), chroma (plane 1) and destination. */
 690     const int i_source_margin = p_source->p[0].i_pitch
 691                                  - p_source->p[0].i_visible_pitch;
 692     const int i_source_margin_c = p_source->p[1].i_pitch
 693                                  - p_source->p[1].i_visible_pitch;
 694     const int i_dest_margin = p_dest->p->i_pitch
 695                                - p_dest->p->i_visible_pitch;
 696
 697 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
         /* Plain C, MMX and AltiVec-fallback path: one iteration per line pair */
 698     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 699     {
 700         p_line1 = p_line2;
 701         p_line2 += p_dest->p->i_pitch;
 702
 703         p_y1 = p_y2;
 704         p_y2 += p_source->p[Y_PLANE].i_pitch;
 705
             /* Main loop, 8 pixels per iteration: four scalar macro invocations
              * or a single MMX one. */
 706         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 707         {
 708 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 709             C_YUV420_UYVY( );
 710             C_YUV420_UYVY( );
 711             C_YUV420_UYVY( );
 712             C_YUV420_UYVY( );
 713 #else
 714             MMX_CALL( MMX_YUV420_UYVY );
 715 #endif
 716         }
             /* Leftover ( width % 8 ) pixels, handled by the scalar macro; the
              * loop counts imply two pixels per invocation. */
 717         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
 718         {
 719             C_YUV420_UYVY( );
 720         }
 721
             /* Skip the padding at the end of each line */
 722         p_y1 += i_source_margin;
 723         p_y2 += i_source_margin;
 724         p_u += i_source_margin_c;
 725         p_v += i_source_margin_c;
 726         p_line1 += i_dest_margin;
 727         p_line2 += i_dest_margin;
 728     }
 729
 730 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 731     /* re-enable FPU registers */
 732     MMX_END;
 733 #endif
 734
 735 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
         /* closes the "use the C version" else branch opened above */
 736     }
 737 #endif
 738
 739 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 740     /*
 741     ** SSE2 128 bits fetch/store instructions are faster
 742     ** if memory access is 16 bytes aligned
 743     */
         /* Aligned path only when the luma pitch, the destination pitch and both
          * start addresses are all multiples of 16 (low four bits of their OR
          * are zero). */
 744     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 745         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 746     {
 747         /* use faster SSE2 aligned fetch and store */
 748         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 749         {
 750             p_line1 = p_line2;
 751             p_line2 += p_dest->p->i_pitch;
 752
 753             p_y1 = p_y2;
 754             p_y2 += p_source->p[Y_PLANE].i_pitch;
 755
                 /* 16 pixels per SSE2 invocation */
 756             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 757             {
 758                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 759             }
                 /* leftover ( width % 16 ) pixels, scalar */
 760             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 761             {
 762                 C_YUV420_UYVY( );
 763             }
 764
 765             p_y1 += i_source_margin;
 766             p_y2 += i_source_margin;
 767             p_u += i_source_margin_c;
 768             p_v += i_source_margin_c;
 769             p_line1 += i_dest_margin;
 770             p_line2 += i_dest_margin;
 771         }
 772     }
 773     else
 774     {
 775         /* use slower SSE2 unaligned fetch and store */
 776         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 777         {
 778             p_line1 = p_line2;
 779             p_line2 += p_dest->p->i_pitch;
 780
 781             p_y1 = p_y2;
 782             p_y2 += p_source->p[Y_PLANE].i_pitch;
 783
 784             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 785             {
 786                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 787             }
 788             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 789             {
 790                 C_YUV420_UYVY( );
 791             }
 792
 793             p_y1 += i_source_margin;
 794             p_y2 += i_source_margin;
 795             p_u += i_source_margin_c;
 796             p_v += i_source_margin_c;
 797             p_line1 += i_dest_margin;
 798             p_line2 += i_dest_margin;
 799         }
 800     }
 801     /* make sure all SSE2 stores are visible thereafter */
 802     SSE2_END;
 803 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 804 }
 805
 806 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 807 /*****************************************************************************
 808  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 809  *****************************************************************************/
 810 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
 811                                               picture_t *p_dest )
 812 {
 813     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
 814     /* FIXME: TODO ! */
 815     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 816 }
 817
 818 /*****************************************************************************
 819  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 820  *****************************************************************************/
 821 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 822                                               picture_t *p_dest )
 823 {
 824     uint8_t *p_line1 = p_dest->p->p_pixels +
 825                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 826                        + p_dest->p->i_pitch;
 827     uint8_t *p_line2 = p_dest->p->p_pixels +
 828                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 829     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 830     uint8_t *p_u = p_source->U_PIXELS;
 831     uint8_t *p_v = p_source->V_PIXELS;
 832
 833     int i_x, i_y;
 834
 835     const int i_source_margin = p_source->p[0].i_pitch
 836                                  - p_source->p[0].i_visible_pitch;
 837     const int i_source_margin_c = p_source->p[1].i_pitch
 838                                  - p_source->p[1].i_visible_pitch;
 839     const int i_dest_margin = p_dest->p->i_pitch
 840                                - p_dest->p->i_visible_pitch;
 841
 842 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 843     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 844     {
 845         p_line1 -= 3 * p_dest->p->i_pitch;
 846         p_line2 -= 3 * p_dest->p->i_pitch;
 847
 848         p_y1 = p_y2;
 849         p_y2 += p_source->p[Y_PLANE].i_pitch;
 850
 851         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 852         {
 853 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 854             C_YUV420_UYVY( );
 855             C_YUV420_UYVY( );
 856             C_YUV420_UYVY( );
 857             C_YUV420_UYVY( );
 858 #else
 859             MMX_CALL( MMX_YUV420_UYVY );
 860 #endif
 861         }
 862         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 863         {
 864             C_YUV420_UYVY( );
 865         }
 866
 867         p_y1 += i_source_margin;
 868         p_y2 += i_source_margin;
 869         p_u += i_source_margin_c;
 870         p_v += i_source_margin_c;
 871         p_line1 += i_dest_margin;
 872         p_line2 += i_dest_margin;
 873     }
 874
 875 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 876     /* re-enable FPU registers */
 877     MMX_END;
 878 #endif
 879
 880 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 881     /*
 882     ** SSE2 128 bits fetch/store instructions are faster
 883     ** if memory access is 16 bytes aligned
 884     */
 885     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 886         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 887     {
 888         /* use faster SSE2 aligned fetch and store */
 889         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 890         {
 891             p_line1 = p_line2;
 892             p_line2 += p_dest->p->i_pitch;
 893
 894             p_y1 = p_y2;
 895             p_y2 += p_source->p[Y_PLANE].i_pitch;
 896
 897             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 898             {
 899                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 900             }
 901             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 902             {
 903                 C_YUV420_UYVY( );
 904             }
 905
 906             p_y1 += i_source_margin;
 907             p_y2 += i_source_margin;
 908             p_u += i_source_margin_c;
 909             p_v += i_source_margin_c;
 910             p_line1 += i_dest_margin;
 911             p_line2 += i_dest_margin;
 912         }
 913     }
 914     else
 915     {
 916         /* use slower SSE2 unaligned fetch and store */
 917         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 918         {
 919             p_line1 = p_line2;
 920             p_line2 += p_dest->p->i_pitch;
 921
 922             p_y1 = p_y2;
 923             p_y2 += p_source->p[Y_PLANE].i_pitch;
 924
 925             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 926             {
 927                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 928             }
 929             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 930             {
 931                 C_YUV420_UYVY( );
 932             }
 933
 934             p_y1 += i_source_margin;
 935             p_y2 += i_source_margin;
 936             p_u += i_source_margin_c;
 937             p_v += i_source_margin_c;
 938             p_line1 += i_dest_margin;
 939             p_line2 += i_dest_margin;
 940         }
 941     }
 942     /* make sure all SSE2 stores are visible thereafter */
 943     SSE2_END;
 944 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 945 }
 946 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 947
 948 /*****************************************************************************
 949  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 950  *****************************************************************************/
 951 #if defined (MODULE_NAME_IS_i420_yuy2)
 952 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
 953                                               picture_t *p_dest )
 954 {
 955     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 956     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 957     uint8_t *p_u = p_source->U_PIXELS;
 958     uint8_t *p_v = p_source->V_PIXELS;
 959
 960     int i_x, i_y;
 961
 962     const int i_source_margin = p_source->p[0].i_pitch
 963                                  - p_source->p[0].i_visible_pitch;
 964     const int i_source_margin_c = p_source->p[1].i_pitch
 965                                  - p_source->p[1].i_visible_pitch;
 966     const int i_dest_margin = p_dest->p->i_pitch
 967                                - p_dest->p->i_visible_pitch;
 968
 969     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 970     {
 971         p_line1 = p_line2;
 972         p_line2 += p_dest->p->i_pitch;
 973
 974         p_y1 = p_y2;
 975         p_y2 += p_source->p[Y_PLANE].i_pitch;
 976
 977         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 978         {
 979             C_YUV420_Y211( );
 980             C_YUV420_Y211( );
 981         }
 982
 983         p_y1 += i_source_margin;
 984         p_y2 += i_source_margin;
 985         p_u += i_source_margin_c;
 986         p_v += i_source_margin_c;
 987         p_line1 += i_dest_margin;
 988         p_line2 += i_dest_margin;
 989     }
 990 }
 991 #endif