1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
29 #include <stdlib.h> /* malloc(), free() */
34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #include "i420_yuy2.h"
40 #define SRC_FOURCC "I420,IYUV,YV12"
42 #if defined (MODULE_NAME_IS_i420_yuy2)
43 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
52 /*****************************************************************************
53 * Local and extern prototypes.
54 *****************************************************************************/
55 static int Activate ( vlc_object_t * );
57 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
69 /* Initialize MMX-specific constants */
70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
71 static const uint64_t i_80w = 0x0000000080808080ULL;
74 /*****************************************************************************
76 *****************************************************************************/
/* Module descriptor: one source file builds four module variants (C, MMX,
 * SSE2, AltiVec); the capability score decides which one VLC prefers at
 * runtime.  NOTE(review): vlc_module_begin/end lines are outside this view. */
78 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Plain C build: lowest priority (80) so SIMD builds win when available */
79 set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
80 set_capability( "chroma", 80 );
81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
82 set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83 set_capability( "chroma", 100 );
84 add_requirement( MMX );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
/* SSE2 build: highest priority (120) of the four variants */
86 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87 set_capability( "chroma", 120 );
88 add_requirement( SSE2 );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
91 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
92 set_capability( "chroma", 100 );
93 add_requirement( ALTIVEC );
/* No deactivation callback is needed: Activate only assigns a function ptr */
95 set_callbacks( Activate, NULL );
98 /*****************************************************************************
99 * Activate: allocate a chroma function
100 *****************************************************************************
101 * This function allocates and initializes a chroma function
102 *****************************************************************************/
103 static int Activate( vlc_object_t *p_this )
105 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* 4:2:0 chroma planes are subsampled 2x2, so odd dimensions cannot be
 * converted.  NOTE(review): the early-return body is on elided lines —
 * presumably returns an error code; confirm against the full file. */
107 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
112 switch( p_vout->render.i_chroma )
/* The three fourccs below all denote planar YUV 4:2:0 layouts */
114 case VLC_FOURCC('Y','V','1','2'):
115 case VLC_FOURCC('I','4','2','0'):
116 case VLC_FOURCC('I','Y','U','V'):
/* Install the converter matching the requested packed output format */
117 switch( p_vout->output.i_chroma )
119 case VLC_FOURCC('Y','U','Y','2'):
120 case VLC_FOURCC('Y','U','N','V'):
121 p_vout->chroma.pf_convert = I420_YUY2;
124 case VLC_FOURCC('Y','V','Y','U'):
125 p_vout->chroma.pf_convert = I420_YVYU;
128 case VLC_FOURCC('U','Y','V','Y'):
129 case VLC_FOURCC('U','Y','N','V'):
130 case VLC_FOURCC('Y','4','2','2'):
131 p_vout->chroma.pf_convert = I420_UYVY;
/* IUYV and cyuv converters are not compiled into the AltiVec variant */
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134 case VLC_FOURCC('I','U','Y','V'):
135 p_vout->chroma.pf_convert = I420_IUYV;
138 case VLC_FOURCC('c','y','u','v'):
139 p_vout->chroma.pf_convert = I420_cyuv;
/* Y211 output exists only in the plain C build (see DEST_FOURCC above) */
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144 case VLC_FOURCC('Y','2','1','1'):
145 p_vout->chroma.pf_convert = I420_Y211;
/* Reads the x86 time-stamp counter via RDTSC.  The "=A" constraint pairs
 * EDX:EAX into the 64-bit result, which is only valid on 32-bit x86.
 * NOTE(review): the return statement is on a line elided from this view —
 * presumably `return v;`. */
162 static inline unsigned long long read_cycles(void)
164 unsigned long long v;
165 __asm__ __volatile__("rdtsc" : "=A" (v): );
171 /* Following functions are local */
172 /*****************************************************************************
173 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
174 *****************************************************************************/
/* Planar I420 -> packed YUY2 (byte order Y0 U0 Y1 V0).  One chroma row
 * feeds two luma rows, so every path below consumes the picture two
 * lines at a time. */
175 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
178 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
179 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
180 uint8_t *p_u = p_source->U_PIXELS;
181 uint8_t *p_v = p_source->V_PIXELS;
185 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of rows */
186 #define VEC_NEXT_LINES( ) \
188 p_line2 += p_dest->p->i_pitch; \
190 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V bytes — enough chroma for 32 output pixels */
192 #define VEC_LOAD_UV( ) \
193 u_vec = vec_ld( 0, p_u ); p_u += 16; \
194 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Interleave Y with the UV vector and store 32 bytes per output line;
 * 'a' picks the high (vec_mergeh) or low (vec_mergel) chroma half.
 * Y comes first in each pair, giving the YUYV byte order. */
196 #define VEC_MERGE( a ) \
197 uv_vec = a( u_vec, v_vec ); \
198 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
199 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
200 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
201 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
202 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
203 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
205 vector unsigned char u_vec;
206 vector unsigned char v_vec;
207 vector unsigned char uv_vec;
208 vector unsigned char y_vec;
/* NOTE(review): vec_ld/vec_st need 16-byte-aligned addresses; only sizes
 * are checked here, so alignment is presumably guaranteed by the picture
 * allocator — TODO confirm. */
210 if( !( ( p_vout->render.i_width % 32 ) |
211 ( p_vout->render.i_height % 2 ) ) )
213 /* Width is a multiple of 32, we take 2 lines at a time */
214 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
217 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
220 VEC_MERGE( vec_mergeh );
221 VEC_MERGE( vec_mergel );
225 else if( !( ( p_vout->render.i_width % 16 ) |
226 ( p_vout->render.i_height % 4 ) ) )
228 /* Width is only a multiple of 16, we take 4 lines at a time */
229 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
231 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
233 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
236 VEC_MERGE( vec_mergeh );
237 VEC_MERGE( vec_mergel );
240 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
242 VEC_MERGE( vec_mergeh );
244 /* Line 3 and 4, pixels 0 to 16 */
246 VEC_MERGE( vec_mergel );
248 /* Line 3 and 4, pixels 16 to ( width ) */
249 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
252 VEC_MERGE( vec_mergeh );
253 VEC_MERGE( vec_mergel );
259 /* Crap, use the C version */
260 #undef VEC_NEXT_LINES
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
265 const int i_source_margin = p_source->p[0].i_pitch
266 - p_source->p[0].i_visible_pitch;
267 const int i_source_margin_c = p_source->p[1].i_pitch
268 - p_source->p[1].i_visible_pitch;
269 const int i_dest_margin = p_dest->p->i_pitch
270 - p_dest->p->i_visible_pitch;
/* C and MMX paths: two rows per iteration, 8 pixels per inner step */
272 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
273 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
276 p_line2 += p_dest->p->i_pitch;
279 p_y2 += p_source->p[Y_PLANE].i_pitch;
281 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
282 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
290 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
292 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover pixels (width % 8) are finished in plain C, 2 at a time */
295 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
300 p_y1 += i_source_margin;
301 p_y2 += i_source_margin;
302 p_u += i_source_margin_c;
303 p_v += i_source_margin_c;
304 p_line1 += i_dest_margin;
305 p_line2 += i_dest_margin;
308 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
309 /* re-enable FPU registers */
310 __asm__ __volatile__ ( "emms" );
313 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
317 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
319 ** SSE2 128 bits fetch/store instructions are faster
320 ** if memory access is 16 bytes aligned
/* NOTE(review): casting a pointer to int truncates on LP64 targets;
 * harmless here (only the low 4 bits are tested) but uintptr_t would be
 * the correct type. */
323 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
324 ((int)p_line2|(int)p_y2))) )
326 /* use faster SSE2 aligned fetch and store */
327 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
330 p_line2 += p_dest->p->i_pitch;
333 p_y2 += p_source->p[Y_PLANE].i_pitch;
335 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
337 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
/* Tail pixels (width % 16) handled in C */
339 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
344 p_y1 += i_source_margin;
345 p_y2 += i_source_margin;
346 p_u += i_source_margin_c;
347 p_v += i_source_margin_c;
348 p_line1 += i_dest_margin;
349 p_line2 += i_dest_margin;
351 /* make sure all SSE2 stores are visible thereafter */
352 __asm__ __volatile__ ( "sfence" );
356 /* use slower SSE2 unaligned fetch and store */
357 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
360 p_line2 += p_dest->p->i_pitch;
363 p_y2 += p_source->p[Y_PLANE].i_pitch;
365 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
367 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
369 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
374 p_y1 += i_source_margin;
375 p_y2 += i_source_margin;
376 p_u += i_source_margin_c;
377 p_v += i_source_margin_c;
378 p_line1 += i_dest_margin;
379 p_line2 += i_dest_margin;
383 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
386 /*****************************************************************************
387 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
388 *****************************************************************************/
/* Planar I420 -> packed YVYU (byte order Y0 V0 Y1 U0).  Identical
 * structure to I420_YUY2 except the chroma pair is merged V-first
 * (vu_vec) instead of U-first. */
389 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
392 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
393 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
394 uint8_t *p_u = p_source->U_PIXELS;
395 uint8_t *p_v = p_source->V_PIXELS;
399 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of rows */
400 #define VEC_NEXT_LINES( ) \
402 p_line2 += p_dest->p->i_pitch; \
404 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V bytes — enough chroma for 32 output pixels */
406 #define VEC_LOAD_UV( ) \
407 u_vec = vec_ld( 0, p_u ); p_u += 16; \
408 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* V is merged before U here, producing the YVYU byte order */
410 #define VEC_MERGE( a ) \
411 vu_vec = a( v_vec, u_vec ); \
412 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
413 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
414 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
415 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
416 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
417 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
419 vector unsigned char u_vec;
420 vector unsigned char v_vec;
421 vector unsigned char vu_vec;
422 vector unsigned char y_vec;
424 if( !( ( p_vout->render.i_width % 32 ) |
425 ( p_vout->render.i_height % 2 ) ) )
427 /* Width is a multiple of 32, we take 2 lines at a time */
428 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
431 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
434 VEC_MERGE( vec_mergeh );
435 VEC_MERGE( vec_mergel );
439 else if( !( ( p_vout->render.i_width % 16 ) |
440 ( p_vout->render.i_height % 4 ) ) )
442 /* Width is only a multiple of 16, we take 4 lines at a time */
443 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
445 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
447 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
450 VEC_MERGE( vec_mergeh );
451 VEC_MERGE( vec_mergel );
454 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
456 VEC_MERGE( vec_mergeh );
458 /* Line 3 and 4, pixels 0 to 16 */
460 VEC_MERGE( vec_mergel );
462 /* Line 3 and 4, pixels 16 to ( width ) */
463 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
466 VEC_MERGE( vec_mergeh );
467 VEC_MERGE( vec_mergel );
473 /* Crap, use the C version */
474 #undef VEC_NEXT_LINES
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
479 const int i_source_margin = p_source->p[0].i_pitch
480 - p_source->p[0].i_visible_pitch;
481 const int i_source_margin_c = p_source->p[1].i_pitch
482 - p_source->p[1].i_visible_pitch;
483 const int i_dest_margin = p_dest->p->i_pitch
484 - p_dest->p->i_visible_pitch;
/* C and MMX paths: two rows per iteration, 8 pixels per inner step */
486 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
487 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
490 p_line2 += p_dest->p->i_pitch;
493 p_y2 += p_source->p[Y_PLANE].i_pitch;
495 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
497 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
503 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover pixels (width % 8) are finished in plain C, 2 at a time */
506 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
511 p_y1 += i_source_margin;
512 p_y2 += i_source_margin;
513 p_u += i_source_margin_c;
514 p_v += i_source_margin_c;
515 p_line1 += i_dest_margin;
516 p_line2 += i_dest_margin;
519 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
520 /* re-enable FPU registers */
521 __asm__ __volatile__ ( "emms" );
524 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
528 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
530 ** SSE2 128 bits fetch/store instructions are faster
531 ** if memory access is 16 bytes aligned
/* NOTE(review): (int) pointer cast truncates on LP64; uintptr_t is the
 * correct type, though only the low 4 bits matter here. */
533 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
534 ((int)p_line2|(int)p_y2))) )
536 /* use faster SSE2 aligned fetch and store */
537 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
540 p_line2 += p_dest->p->i_pitch;
543 p_y2 += p_source->p[Y_PLANE].i_pitch;
545 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
547 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
549 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
554 p_y1 += i_source_margin;
555 p_y2 += i_source_margin;
556 p_u += i_source_margin_c;
557 p_v += i_source_margin_c;
558 p_line1 += i_dest_margin;
559 p_line2 += i_dest_margin;
561 /* make sure all SSE2 stores are visible thereafter */
562 __asm__ __volatile__ ( "sfence" );
566 /* use slower SSE2 unaligned fetch and store */
567 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
570 p_line2 += p_dest->p->i_pitch;
573 p_y2 += p_source->p[Y_PLANE].i_pitch;
575 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
577 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
579 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
584 p_y1 += i_source_margin;
585 p_y2 += i_source_margin;
586 p_u += i_source_margin_c;
587 p_v += i_source_margin_c;
588 p_line1 += i_dest_margin;
589 p_line2 += i_dest_margin;
592 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
595 /*****************************************************************************
596 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
597 *****************************************************************************/
/* Planar I420 -> packed UYVY (byte order U0 Y0 V0 Y1).  Same structure
 * as I420_YUY2 but the merge order is swapped: chroma byte first, then
 * luma (vec_mergeh(uv_vec, y_vec)). */
598 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
601 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
602 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
603 uint8_t *p_u = p_source->U_PIXELS;
604 uint8_t *p_v = p_source->V_PIXELS;
608 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of rows */
609 #define VEC_NEXT_LINES( ) \
611 p_line2 += p_dest->p->i_pitch; \
613 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V bytes — enough chroma for 32 output pixels */
615 #define VEC_LOAD_UV( ) \
616 u_vec = vec_ld( 0, p_u ); p_u += 16; \
617 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Chroma vector is the first merge operand, giving the UYVY byte order */
619 #define VEC_MERGE( a ) \
620 uv_vec = a( u_vec, v_vec ); \
621 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
622 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
623 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
624 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
625 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
626 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
628 vector unsigned char u_vec;
629 vector unsigned char v_vec;
630 vector unsigned char uv_vec;
631 vector unsigned char y_vec;
633 if( !( ( p_vout->render.i_width % 32 ) |
634 ( p_vout->render.i_height % 2 ) ) )
636 /* Width is a multiple of 32, we take 2 lines at a time */
637 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
640 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
643 VEC_MERGE( vec_mergeh );
644 VEC_MERGE( vec_mergel );
648 else if( !( ( p_vout->render.i_width % 16 ) |
649 ( p_vout->render.i_height % 4 ) ) )
651 /* Width is only a multiple of 16, we take 4 lines at a time */
652 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
654 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
656 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
659 VEC_MERGE( vec_mergeh );
660 VEC_MERGE( vec_mergel );
663 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
665 VEC_MERGE( vec_mergeh );
667 /* Line 3 and 4, pixels 0 to 16 */
669 VEC_MERGE( vec_mergel );
671 /* Line 3 and 4, pixels 16 to ( width ) */
672 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
675 VEC_MERGE( vec_mergeh );
676 VEC_MERGE( vec_mergel );
682 /* Crap, use the C version */
683 #undef VEC_NEXT_LINES
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
688 const int i_source_margin = p_source->p[0].i_pitch
689 - p_source->p[0].i_visible_pitch;
690 const int i_source_margin_c = p_source->p[1].i_pitch
691 - p_source->p[1].i_visible_pitch;
692 const int i_dest_margin = p_dest->p->i_pitch
693 - p_dest->p->i_visible_pitch;
/* C and MMX paths: two rows per iteration, 8 pixels per inner step */
695 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
696 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
699 p_line2 += p_dest->p->i_pitch;
702 p_y2 += p_source->p[Y_PLANE].i_pitch;
704 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
706 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
712 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover pixels (width % 8) are finished in plain C, 2 at a time */
715 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
720 p_y1 += i_source_margin;
721 p_y2 += i_source_margin;
722 p_u += i_source_margin_c;
723 p_v += i_source_margin_c;
724 p_line1 += i_dest_margin;
725 p_line2 += i_dest_margin;
728 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
729 /* re-enable FPU registers */
730 __asm__ __volatile__ ( "emms" );
733 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
737 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
739 ** SSE2 128 bits fetch/store instructions are faster
740 ** if memory access is 16 bytes aligned
/* NOTE(review): (int) pointer cast truncates on LP64; uintptr_t is the
 * correct type, though only the low 4 bits matter here. */
742 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
743 ((int)p_line2|(int)p_y2))) )
745 /* use faster SSE2 aligned fetch and store */
746 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
749 p_line2 += p_dest->p->i_pitch;
752 p_y2 += p_source->p[Y_PLANE].i_pitch;
754 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
756 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
758 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
763 p_y1 += i_source_margin;
764 p_y2 += i_source_margin;
765 p_u += i_source_margin_c;
766 p_v += i_source_margin_c;
767 p_line1 += i_dest_margin;
768 p_line2 += i_dest_margin;
770 /* make sure all SSE2 stores are visible thereafter */
771 __asm__ __volatile__ ( "sfence" );
775 /* use slower SSE2 unaligned fetch and store */
776 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
779 p_line2 += p_dest->p->i_pitch;
782 p_y2 += p_source->p[Y_PLANE].i_pitch;
784 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
786 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
788 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
793 p_y1 += i_source_margin;
794 p_y2 += i_source_margin;
795 p_u += i_source_margin_c;
796 p_v += i_source_margin_c;
797 p_line1 += i_dest_margin;
798 p_line2 += i_dest_margin;
801 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
804 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
805 /*****************************************************************************
806 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
807 *****************************************************************************/
/* Stub: IUYV (interleaved UYVY) output is not implemented; it only logs
 * an error.  Callers still reach it via Activate's fourcc table. */
808 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
812 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
815 /*****************************************************************************
816 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
817 *****************************************************************************/
/* Planar I420 -> upside-down packed UYVY ("cyuv"): destination rows are
 * written bottom-up.  The line pointers start one/two rows past the end
 * of the visible destination and the C path moves them backwards. */
818 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
821 uint8_t *p_line1 = p_dest->p->p_pixels +
822 p_dest->p->i_visible_lines * p_dest->p->i_pitch
823 + p_dest->p->i_pitch;
824 uint8_t *p_line2 = p_dest->p->p_pixels +
825 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
826 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
827 uint8_t *p_u = p_source->U_PIXELS;
828 uint8_t *p_v = p_source->V_PIXELS;
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
832 const int i_source_margin = p_source->p[0].i_pitch
833 - p_source->p[0].i_visible_pitch;
834 const int i_source_margin_c = p_source->p[1].i_pitch
835 - p_source->p[1].i_visible_pitch;
836 const int i_dest_margin = p_dest->p->i_pitch
837 - p_dest->p->i_visible_pitch;
839 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
840 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* Step back 3 pitches, then write 2 rows forward: net -2 rows per
 * iteration, which walks the destination upward (upside-down output) */
842 p_line1 -= 3 * p_dest->p->i_pitch;
843 p_line2 -= 3 * p_dest->p->i_pitch;
846 p_y2 += p_source->p[Y_PLANE].i_pitch;
848 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
850 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Pixel packing itself is plain UYVY; only row order differs */
856 MMX_CALL( MMX_YUV420_UYVY );
859 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
864 p_y1 += i_source_margin;
865 p_y2 += i_source_margin;
866 p_u += i_source_margin_c;
867 p_v += i_source_margin_c;
/* NOTE(review): adding the forward dest margin while walking upward
 * looks suspect for padded pictures — verify against the full file. */
868 p_line1 += i_dest_margin;
869 p_line2 += i_dest_margin;
872 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
873 /* re-enable FPU registers */
874 __asm__ __volatile__ ( "emms" );
877 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
879 ** SSE2 128 bits fetch/store instructions are faster
880 ** if memory access is 16 bytes aligned
882 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
883 ((int)p_line2|(int)p_y2))) )
885 /* use faster SSE2 aligned fetch and store */
886 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* NOTE(review): this SSE2 path advances the line pointers forward even
 * though they start at the bottom of the picture — either elided lines
 * rewind them or this is a latent copy-paste bug from I420_UYVY.
 * TODO confirm against the full file. */
889 p_line2 += p_dest->p->i_pitch;
892 p_y2 += p_source->p[Y_PLANE].i_pitch;
894 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
896 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
898 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
903 p_y1 += i_source_margin;
904 p_y2 += i_source_margin;
905 p_u += i_source_margin_c;
906 p_v += i_source_margin_c;
907 p_line1 += i_dest_margin;
908 p_line2 += i_dest_margin;
910 /* make sure all SSE2 stores are visible thereafter */
911 __asm__ __volatile__ ( "sfence" );
915 /* use slower SSE2 unaligned fetch and store */
916 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
919 p_line2 += p_dest->p->i_pitch;
922 p_y2 += p_source->p[Y_PLANE].i_pitch;
924 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
926 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
928 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
933 p_y1 += i_source_margin;
934 p_y2 += i_source_margin;
935 p_u += i_source_margin_c;
936 p_v += i_source_margin_c;
937 p_line1 += i_dest_margin;
938 p_line2 += i_dest_margin;
941 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
943 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
945 /*****************************************************************************
946 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
947 *****************************************************************************/
948 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Planar I420 -> packed Y211 (plain C build only).  Processes two rows
 * per iteration, 8 input pixels per inner step; the packing statements
 * themselves are on lines elided from this view. */
949 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
952 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
953 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
954 uint8_t *p_u = p_source->U_PIXELS;
955 uint8_t *p_v = p_source->V_PIXELS;
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
959 const int i_source_margin = p_source->p[0].i_pitch
960 - p_source->p[0].i_visible_pitch;
961 const int i_source_margin_c = p_source->p[1].i_pitch
962 - p_source->p[1].i_visible_pitch;
963 const int i_dest_margin = p_dest->p->i_pitch
964 - p_dest->p->i_visible_pitch;
966 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
969 p_line2 += p_dest->p->i_pitch;
972 p_y2 += p_source->p[Y_PLANE].i_pitch;
974 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
/* Skip row padding on every plane before the next pair of rows */
980 p_y1 += i_source_margin;
981 p_y2 += i_source_margin;
982 p_u += i_source_margin_c;
983 p_v += i_source_margin_c;
984 p_line1 += i_dest_margin;
985 p_line2 += i_dest_margin;