git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28 #include <string.h>                                            /* strerror() */
  29 #include <stdlib.h>                                      /* malloc(), free() */
  30
  31 #include <vlc/vlc.h>
  32 #include <vlc_vout.h>
  33
  34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include "i420_yuy2.h"
  39
  /* Comma-separated list of source chroma names; only used below to build the
   * human-readable module description string. */
  40 #define SRC_FOURCC  "I420,IYUV,YV12"
  41
  /* Comma-separated list of destination chroma names, selected per build
   * flavour (exactly one MODULE_NAME_IS_* macro is expected to be defined).
   * Like SRC_FOURCC it is only pasted into the description string; the
   * conversions actually offered are decided by the switch in Activate(). */
  42 #if defined (MODULE_NAME_IS_i420_yuy2)
  43 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  50 #endif
  51
  52 /*****************************************************************************
  53  * Local and extern prototypes.
  54  *****************************************************************************/
  /* Module activation callback, registered through set_callbacks() below. */
  55 static int  Activate ( vlc_object_t * );
  56
  /* Converters installed into p_vout->chroma.pf_convert by Activate().
   * Arguments are ( vout thread, source picture, destination picture ). */
  57 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
  58 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
  59 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
  /* IUYV and cyuv are not built for the AltiVec flavour. */
  60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  61 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
  62 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
  63 #endif
  /* Y211 exists only in the plain C flavour. */
  64 #if defined (MODULE_NAME_IS_i420_yuy2)
  65 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
  66 #endif
  67
  68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  69 /* MMX-specific 64-bit constants. They are not referenced in this file's
      * visible code; presumably consumed by the MMX macros in i420_yuy2.h
      * (TODO confirm).
      *   i_00ffw: 0x00ff in each of the four 16-bit words
      *   i_80w  : 0x80 in each of the four low bytes, high 32 bits zero */
  70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
  71 static const uint64_t i_80w   = 0x0000000080808080ULL;
  72 #endif
  73
  74 /*****************************************************************************
  75  * Module descriptor.
  76  *****************************************************************************/
  /* One descriptor is emitted per build flavour: each sets a description
   * string built from SRC_FOURCC/DEST_FOURCC, registers the "chroma"
   * capability with a flavour-specific score (presumably a selection
   * priority -- TODO confirm against the module loader), and, for the SIMD
   * flavours, declares the CPU feature it requires. */
  77 vlc_module_begin();
  78 #if defined (MODULE_NAME_IS_i420_yuy2)
  /* Plain C flavour: lowest score, no CPU requirement. */
  79     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  80     set_capability( "chroma", 80 );
  81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  82     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  83     set_capability( "chroma", 100 );
  84     add_requirement( MMX );
  85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  /* SSE2 flavour: highest score of the four. */
  86     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  87     set_capability( "chroma", 120 );
  88     add_requirement( SSE2 );
  89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  90     set_description(
  91             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  92     set_capability( "chroma", 100 );
  93     add_requirement( ALTIVEC );
  94 #endif
  /* Activate() is the open callback; no close callback is registered. */
  95     set_callbacks( Activate, NULL );
  96 vlc_module_end();
  97
  98 /*****************************************************************************
  99  * Activate: allocate a chroma function
 100  *****************************************************************************
 101  * This function allocates and initializes a chroma function
 102  *****************************************************************************/
 103 static int Activate( vlc_object_t *p_this )
 104 {
 105     vout_thread_t *p_vout = (vout_thread_t *)p_this;
 106
 107     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
 108     {
 109         return -1;
 110     }
 111
 112     switch( p_vout->render.i_chroma )
 113     {
 114         case VLC_FOURCC('Y','V','1','2'):
 115         case VLC_FOURCC('I','4','2','0'):
 116         case VLC_FOURCC('I','Y','U','V'):
 117             switch( p_vout->output.i_chroma )
 118             {
 119                 case VLC_FOURCC('Y','U','Y','2'):
 120                 case VLC_FOURCC('Y','U','N','V'):
 121                     p_vout->chroma.pf_convert = I420_YUY2;
 122                     break;
 123
 124                 case VLC_FOURCC('Y','V','Y','U'):
 125                     p_vout->chroma.pf_convert = I420_YVYU;
 126                     break;
 127
 128                 case VLC_FOURCC('U','Y','V','Y'):
 129                 case VLC_FOURCC('U','Y','N','V'):
 130                 case VLC_FOURCC('Y','4','2','2'):
 131                     p_vout->chroma.pf_convert = I420_UYVY;
 132                     break;
 133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 134                 case VLC_FOURCC('I','U','Y','V'):
 135                     p_vout->chroma.pf_convert = I420_IUYV;
 136                     break;
 137
 138                 case VLC_FOURCC('c','y','u','v'):
 139                     p_vout->chroma.pf_convert = I420_cyuv;
 140                     break;
 141 #endif
 142
 143 #if defined (MODULE_NAME_IS_i420_yuy2)
 144                 case VLC_FOURCC('Y','2','1','1'):
 145                     p_vout->chroma.pf_convert = I420_Y211;
 146                     break;
 147 #endif
 148
 149                 default:
 150                     return -1;
 151             }
 152             break;
 153
 154         default:
 155             return -1;
 156     }
 157
 158     return 0;
 159 }
 160
 161 /* Following functions are local */
 162
 163 /*****************************************************************************
 164  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 165  *****************************************************************************/
 166 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 167                                               picture_t *p_dest )
 168 {
 169     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 170     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 171     uint8_t *p_u = p_source->U_PIXELS;
 172     uint8_t *p_v = p_source->V_PIXELS;
 173
 174     int i_x, i_y;
 175
 176 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 177 #define VEC_NEXT_LINES( ) \
 178     p_line1  = p_line2; \
 179     p_line2 += p_dest->p->i_pitch; \
 180     p_y1     = p_y2; \
 181     p_y2    += p_source->p[Y_PLANE].i_pitch;
 182
 183 #define VEC_LOAD_UV( ) \
 184     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 185     v_vec = vec_ld( 0, p_v ); p_v += 16;
 186
 187 #define VEC_MERGE( a ) \
 188     uv_vec = a( u_vec, v_vec ); \
 189     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 190     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 191     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 192     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 193     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 194     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 195
 196     vector unsigned char u_vec;
 197     vector unsigned char v_vec;
 198     vector unsigned char uv_vec;
 199     vector unsigned char y_vec;
 200
 201     if( !( ( p_vout->render.i_width % 32 ) |
 202            ( p_vout->render.i_height % 2 ) ) )
 203     {
 204         /* Width is a multiple of 32, we take 2 lines at a time */
 205         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 206         {
 207             VEC_NEXT_LINES( );
 208             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 209             {
 210                 VEC_LOAD_UV( );
 211                 VEC_MERGE( vec_mergeh );
 212                 VEC_MERGE( vec_mergel );
 213             }
 214         }
 215     }
 216     else if( !( ( p_vout->render.i_width % 16 ) |
 217                 ( p_vout->render.i_height % 4 ) ) )
 218     {
 219         /* Width is only a multiple of 16, we take 4 lines at a time */
 220         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 221         {
 222             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 223             VEC_NEXT_LINES( );
 224             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 225             {
 226                 VEC_LOAD_UV( );
 227                 VEC_MERGE( vec_mergeh );
 228                 VEC_MERGE( vec_mergel );
 229             }
 230
 231             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 232             VEC_LOAD_UV( );
 233             VEC_MERGE( vec_mergeh );
 234
 235             /* Line 3 and 4, pixels 0 to 16 */
 236             VEC_NEXT_LINES( );
 237             VEC_MERGE( vec_mergel );
 238
 239             /* Line 3 and 4, pixels 16 to ( width ) */
 240             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 241             {
 242                 VEC_LOAD_UV( );
 243                 VEC_MERGE( vec_mergeh );
 244                 VEC_MERGE( vec_mergel );
 245             }
 246         }
 247     }
 248     else
 249     {
 250         /* Crap, use the C version */
 251 #undef VEC_NEXT_LINES
 252 #undef VEC_LOAD_UV
 253 #undef VEC_MERGE
 254 #endif
 255
 256     const int i_source_margin = p_source->p[0].i_pitch
 257                                  - p_source->p[0].i_visible_pitch;
 258     const int i_source_margin_c = p_source->p[1].i_pitch
 259                                  - p_source->p[1].i_visible_pitch;
 260     const int i_dest_margin = p_dest->p->i_pitch
 261                                - p_dest->p->i_visible_pitch;
 262
 263 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 264     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 265     {
 266         p_line1 = p_line2;
 267         p_line2 += p_dest->p->i_pitch;
 268
 269         p_y1 = p_y2;
 270         p_y2 += p_source->p[Y_PLANE].i_pitch;
 271
 272 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 273         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
 274         {
 275             C_YUV420_YUYV( );
 276             C_YUV420_YUYV( );
 277             C_YUV420_YUYV( );
 278             C_YUV420_YUYV( );
 279         }
 280 #else
 281         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 282         {
 283             MMX_CALL( MMX_YUV420_YUYV );
 284         }
 285 #endif
 286         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
 287         {
 288             C_YUV420_YUYV( );
 289         }
 290
 291         p_y1 += i_source_margin;
 292         p_y2 += i_source_margin;
 293         p_u += i_source_margin_c;
 294         p_v += i_source_margin_c;
 295         p_line1 += i_dest_margin;
 296         p_line2 += i_dest_margin;
 297     }
 298
 299 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 300     __asm__ __volatile__("emms" :: );
 301 #endif
 302
 303 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 304     }
 305 #endif
 306
 307 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 308     /*
 309     ** SSE2 128 bytes fetch/store instructions are faster
 310     ** if memory access is 16 bytes aligned
 311     */
 312     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 313         ((int)p_line2|(int)p_y2))) )
 314     {
 315         /* use faster SSE2 aligned fetch and store */
 316         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 317         {
 318             p_line1 = p_line2;
 319             p_line2 += p_dest->p->i_pitch;
 320
 321             p_y1 = p_y2;
 322             p_y2 += p_source->p[Y_PLANE].i_pitch;
 323
 324             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 325             {
 326                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 327             }
 328             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 329             {
 330                 C_YUV420_YUYV( );
 331             }
 332
 333             p_y1 += i_source_margin;
 334             p_y2 += i_source_margin;
 335             p_u += i_source_margin_c;
 336             p_v += i_source_margin_c;
 337             p_line1 += i_dest_margin;
 338             p_line2 += i_dest_margin;
 339         }
 340     }
 341     else
 342     {
 343         /* use slower SSE2 unaligned fetch and store */
 344         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 345         {
 346             p_line1 = p_line2;
 347             p_line2 += p_dest->p->i_pitch;
 348
 349             p_y1 = p_y2;
 350             p_y2 += p_source->p[Y_PLANE].i_pitch;
 351
 352             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 353             {
 354                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 355             }
 356             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 357             {
 358                 C_YUV420_YUYV( );
 359             }
 360
 361             p_y1 += i_source_margin;
 362             p_y2 += i_source_margin;
 363             p_u += i_source_margin_c;
 364             p_v += i_source_margin_c;
 365             p_line1 += i_dest_margin;
 366             p_line2 += i_dest_margin;
 367         }
 368     }
 369 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 370 }
 371
 372 /*****************************************************************************
 373  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 374  *****************************************************************************/
 375 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
 376                                               picture_t *p_dest )
 377 {
 378     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 379     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 380     uint8_t *p_u = p_source->U_PIXELS;
 381     uint8_t *p_v = p_source->V_PIXELS;
 382
 383     int i_x, i_y;
 384
 385 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 386 #define VEC_NEXT_LINES( ) \
 387     p_line1  = p_line2; \
 388     p_line2 += p_dest->p->i_pitch; \
 389     p_y1     = p_y2; \
 390     p_y2    += p_source->p[Y_PLANE].i_pitch;
 391
 392 #define VEC_LOAD_UV( ) \
 393     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 394     v_vec = vec_ld( 0, p_v ); p_v += 16;
 395
 396 #define VEC_MERGE( a ) \
 397     vu_vec = a( v_vec, u_vec ); \
 398     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 399     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 400     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 401     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 402     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 403     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 404
 405     vector unsigned char u_vec;
 406     vector unsigned char v_vec;
 407     vector unsigned char vu_vec;
 408     vector unsigned char y_vec;
 409
 410     if( !( ( p_vout->render.i_width % 32 ) |
 411            ( p_vout->render.i_height % 2 ) ) )
 412     {
 413         /* Width is a multiple of 32, we take 2 lines at a time */
 414         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 415         {
 416             VEC_NEXT_LINES( );
 417             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 418             {
 419                 VEC_LOAD_UV( );
 420                 VEC_MERGE( vec_mergeh );
 421                 VEC_MERGE( vec_mergel );
 422             }
 423         }
 424     }
 425     else if( !( ( p_vout->render.i_width % 16 ) |
 426                 ( p_vout->render.i_height % 4 ) ) )
 427     {
 428         /* Width is only a multiple of 16, we take 4 lines at a time */
 429         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 430         {
 431             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 432             VEC_NEXT_LINES( );
 433             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 434             {
 435                 VEC_LOAD_UV( );
 436                 VEC_MERGE( vec_mergeh );
 437                 VEC_MERGE( vec_mergel );
 438             }
 439
 440             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 441             VEC_LOAD_UV( );
 442             VEC_MERGE( vec_mergeh );
 443
 444             /* Line 3 and 4, pixels 0 to 16 */
 445             VEC_NEXT_LINES( );
 446             VEC_MERGE( vec_mergel );
 447
 448             /* Line 3 and 4, pixels 16 to ( width ) */
 449             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 450             {
 451                 VEC_LOAD_UV( );
 452                 VEC_MERGE( vec_mergeh );
 453                 VEC_MERGE( vec_mergel );
 454             }
 455         }
 456     }
 457     else
 458     {
 459         /* Crap, use the C version */
 460 #undef VEC_NEXT_LINES
 461 #undef VEC_LOAD_UV
 462 #undef VEC_MERGE
 463 #endif
 464
 465     const int i_source_margin = p_source->p[0].i_pitch
 466                                  - p_source->p[0].i_visible_pitch;
 467     const int i_source_margin_c = p_source->p[1].i_pitch
 468                                  - p_source->p[1].i_visible_pitch;
 469     const int i_dest_margin = p_dest->p->i_pitch
 470                                - p_dest->p->i_visible_pitch;
 471
 472 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 473     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 474     {
 475         p_line1 = p_line2;
 476         p_line2 += p_dest->p->i_pitch;
 477
 478         p_y1 = p_y2;
 479         p_y2 += p_source->p[Y_PLANE].i_pitch;
 480
 481         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 482         {
 483 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 484             C_YUV420_YVYU( );
 485             C_YUV420_YVYU( );
 486             C_YUV420_YVYU( );
 487             C_YUV420_YVYU( );
 488 #else
 489             MMX_CALL( MMX_YUV420_YVYU );
 490 #endif
 491         }
 492
 493         p_y1 += i_source_margin;
 494         p_y2 += i_source_margin;
 495         p_u += i_source_margin_c;
 496         p_v += i_source_margin_c;
 497         p_line1 += i_dest_margin;
 498         p_line2 += i_dest_margin;
 499     }
 500
 501 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 502     __asm__ __volatile__("emms" :: );
 503 #endif
 504
 505 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 506     }
 507 #endif
 508
 509 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 510     /*
 511     ** SSE2 128 bytes fetch/store instructions are faster
 512     ** if memory access is 16 bytes aligned
 513     */
 514     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 515         ((int)p_line2|(int)p_y2))) )
 516     {
 517         /* use faster SSE2 aligned fetch and store */
 518         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 519         {
 520             p_line1 = p_line2;
 521             p_line2 += p_dest->p->i_pitch;
 522
 523             p_y1 = p_y2;
 524             p_y2 += p_source->p[Y_PLANE].i_pitch;
 525
 526             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 527             {
 528                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 529             }
 530             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 531             {
 532                 C_YUV420_YVYU( );
 533             }
 534
 535             p_y1 += i_source_margin;
 536             p_y2 += i_source_margin;
 537             p_u += i_source_margin_c;
 538             p_v += i_source_margin_c;
 539             p_line1 += i_dest_margin;
 540             p_line2 += i_dest_margin;
 541         }
 542     }
 543     else
 544     {
 545         /* use slower SSE2 unaligned fetch and store */
 546         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 547         {
 548             p_line1 = p_line2;
 549             p_line2 += p_dest->p->i_pitch;
 550
 551             p_y1 = p_y2;
 552             p_y2 += p_source->p[Y_PLANE].i_pitch;
 553
 554             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 555             {
 556                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 557             }
 558             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 559             {
 560                 C_YUV420_YVYU( );
 561             }
 562
 563             p_y1 += i_source_margin;
 564             p_y2 += i_source_margin;
 565             p_u += i_source_margin_c;
 566             p_v += i_source_margin_c;
 567             p_line1 += i_dest_margin;
 568             p_line2 += i_dest_margin;
 569         }
 570     }
 571 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 572 }
 573
 574 /*****************************************************************************
 575  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 576  *****************************************************************************/
 577 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 578                                               picture_t *p_dest )
 579 {
 580     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 581     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 582     uint8_t *p_u = p_source->U_PIXELS;
 583     uint8_t *p_v = p_source->V_PIXELS;
 584
 585     int i_x, i_y;
 586
 587 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 588 #define VEC_NEXT_LINES( ) \
 589     p_line1  = p_line2; \
 590     p_line2 += p_dest->p->i_pitch; \
 591     p_y1     = p_y2; \
 592     p_y2    += p_source->p[Y_PLANE].i_pitch;
 593
 594 #define VEC_LOAD_UV( ) \
 595     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 596     v_vec = vec_ld( 0, p_v ); p_v += 16;
 597
 598 #define VEC_MERGE( a ) \
 599     uv_vec = a( u_vec, v_vec ); \
 600     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 601     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 602     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 603     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 604     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 605     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 606
 607     vector unsigned char u_vec;
 608     vector unsigned char v_vec;
 609     vector unsigned char uv_vec;
 610     vector unsigned char y_vec;
 611
 612     if( !( ( p_vout->render.i_width % 32 ) |
 613            ( p_vout->render.i_height % 2 ) ) )
 614     {
 615         /* Width is a multiple of 32, we take 2 lines at a time */
 616         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 617         {
 618             VEC_NEXT_LINES( );
 619             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 620             {
 621                 VEC_LOAD_UV( );
 622                 VEC_MERGE( vec_mergeh );
 623                 VEC_MERGE( vec_mergel );
 624             }
 625         }
 626     }
 627     else if( !( ( p_vout->render.i_width % 16 ) |
 628                 ( p_vout->render.i_height % 4 ) ) )
 629     {
 630         /* Width is only a multiple of 16, we take 4 lines at a time */
 631         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
 632         {
 633             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 634             VEC_NEXT_LINES( );
 635             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 636             {
 637                 VEC_LOAD_UV( );
 638                 VEC_MERGE( vec_mergeh );
 639                 VEC_MERGE( vec_mergel );
 640             }
 641
 642             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 643             VEC_LOAD_UV( );
 644             VEC_MERGE( vec_mergeh );
 645
 646             /* Line 3 and 4, pixels 0 to 16 */
 647             VEC_NEXT_LINES( );
 648             VEC_MERGE( vec_mergel );
 649
 650             /* Line 3 and 4, pixels 16 to ( width ) */
 651             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
 652             {
 653                 VEC_LOAD_UV( );
 654                 VEC_MERGE( vec_mergeh );
 655                 VEC_MERGE( vec_mergel );
 656             }
 657         }
 658     }
 659     else
 660     {
 661         /* Crap, use the C version */
 662 #undef VEC_NEXT_LINES
 663 #undef VEC_LOAD_UV
 664 #undef VEC_MERGE
 665 #endif
 666
 667     const int i_source_margin = p_source->p[0].i_pitch
 668                                  - p_source->p[0].i_visible_pitch;
 669     const int i_source_margin_c = p_source->p[1].i_pitch
 670                                  - p_source->p[1].i_visible_pitch;
 671     const int i_dest_margin = p_dest->p->i_pitch
 672                                - p_dest->p->i_visible_pitch;
 673
 674 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 675     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 676     {
 677         p_line1 = p_line2;
 678         p_line2 += p_dest->p->i_pitch;
 679
 680         p_y1 = p_y2;
 681         p_y2 += p_source->p[Y_PLANE].i_pitch;
 682
 683         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 684         {
 685 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 686             C_YUV420_UYVY( );
 687             C_YUV420_UYVY( );
 688             C_YUV420_UYVY( );
 689             C_YUV420_UYVY( );
 690 #else
 691             MMX_CALL( MMX_YUV420_UYVY );
 692 #endif
 693         }
 694         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
 695         {
 696             C_YUV420_UYVY( );
 697         }
 698
 699         p_y1 += i_source_margin;
 700         p_y2 += i_source_margin;
 701         p_u += i_source_margin_c;
 702         p_v += i_source_margin_c;
 703         p_line1 += i_dest_margin;
 704         p_line2 += i_dest_margin;
 705     }
 706
 707 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 708     __asm__ __volatile__("emms" :: );
 709 #endif
 710
 711 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 712     }
 713 #endif
 714
 715 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 716     /*
 717     ** SSE2 128 bytes fetch/store instructions are faster
 718     ** if memory access is 16 bytes aligned
 719     */
 720     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 721         ((int)p_line2|(int)p_y2))) )
 722     {
 723         /* use faster SSE2 aligned fetch and store */
 724         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 725         {
 726             p_line1 = p_line2;
 727             p_line2 += p_dest->p->i_pitch;
 728
 729             p_y1 = p_y2;
 730             p_y2 += p_source->p[Y_PLANE].i_pitch;
 731
 732             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 733             {
 734                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 735             }
 736             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 737             {
 738                 C_YUV420_UYVY( );
 739             }
 740
 741             p_y1 += i_source_margin;
 742             p_y2 += i_source_margin;
 743             p_u += i_source_margin_c;
 744             p_v += i_source_margin_c;
 745             p_line1 += i_dest_margin;
 746             p_line2 += i_dest_margin;
 747         }
 748     }
 749     else
 750     {
 751         /* use slower SSE2 unaligned fetch and store */
 752         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 753         {
 754             p_line1 = p_line2;
 755             p_line2 += p_dest->p->i_pitch;
 756
 757             p_y1 = p_y2;
 758             p_y2 += p_source->p[Y_PLANE].i_pitch;
 759
 760             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 761             {
 762                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 763             }
 764             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 765             {
 766                 C_YUV420_UYVY( );
 767             }
 768
 769             p_y1 += i_source_margin;
 770             p_y2 += i_source_margin;
 771             p_u += i_source_margin_c;
 772             p_v += i_source_margin_c;
 773             p_line1 += i_dest_margin;
 774             p_line2 += i_dest_margin;
 775         }
 776     }
 777 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 778 }
 779
 780 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 781 /*****************************************************************************
 782  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 783  *****************************************************************************/
 784 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
 785                                               picture_t *p_dest )
 786 {
 787     /* FIXME: TODO ! */
 788     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 789 }
 790
 791 /*****************************************************************************
 792  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 793  *****************************************************************************/
 794 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 795                                               picture_t *p_dest )
 796 {
 797     uint8_t *p_line1 = p_dest->p->p_pixels +
 798                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 799                        + p_dest->p->i_pitch;
 800     uint8_t *p_line2 = p_dest->p->p_pixels +
 801                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 802     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 803     uint8_t *p_u = p_source->U_PIXELS;
 804     uint8_t *p_v = p_source->V_PIXELS;
 805
 806     int i_x, i_y;
 807
 808     const int i_source_margin = p_source->p[0].i_pitch
 809                                  - p_source->p[0].i_visible_pitch;
 810     const int i_source_margin_c = p_source->p[1].i_pitch
 811                                  - p_source->p[1].i_visible_pitch;
 812     const int i_dest_margin = p_dest->p->i_pitch
 813                                - p_dest->p->i_visible_pitch;
 814
 815 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 816     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 817     {
 818         p_line1 -= 3 * p_dest->p->i_pitch;
 819         p_line2 -= 3 * p_dest->p->i_pitch;
 820
 821         p_y1 = p_y2;
 822         p_y2 += p_source->p[Y_PLANE].i_pitch;
 823
 824         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 825         {
 826 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 827             C_YUV420_UYVY( );
 828             C_YUV420_UYVY( );
 829             C_YUV420_UYVY( );
 830             C_YUV420_UYVY( );
 831 #else
 832             MMX_CALL( MMX_YUV420_UYVY );
 833 #endif
 834         }
 835
 836         p_y1 += i_source_margin;
 837         p_y2 += i_source_margin;
 838         p_u += i_source_margin_c;
 839         p_v += i_source_margin_c;
 840         p_line1 += i_dest_margin;
 841         p_line2 += i_dest_margin;
 842     }
 843
 844 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 845     __asm__ __volatile__("emms" :: );
 846 #endif
 847
 848 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 849     /*
 850     ** SSE2 128 bytes fetch/store instructions are faster
 851     ** if memory access is 16 bytes aligned
 852     */
 853     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 854         ((int)p_line2|(int)p_y2))) )
 855     {
 856         /* use faster SSE2 aligned fetch and store */
 857         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 858         {
 859             p_line1 = p_line2;
 860             p_line2 += p_dest->p->i_pitch;
 861
 862             p_y1 = p_y2;
 863             p_y2 += p_source->p[Y_PLANE].i_pitch;
 864
 865             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 866             {
 867                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 868             }
 869             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 870             {
 871                 C_YUV420_UYVY( );
 872             }
 873
 874             p_y1 += i_source_margin;
 875             p_y2 += i_source_margin;
 876             p_u += i_source_margin_c;
 877             p_v += i_source_margin_c;
 878             p_line1 += i_dest_margin;
 879             p_line2 += i_dest_margin;
 880         }
 881     }
 882     else
 883     {
 884         /* use slower SSE2 unaligned fetch and store */
 885         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 886         {
 887             p_line1 = p_line2;
 888             p_line2 += p_dest->p->i_pitch;
 889
 890             p_y1 = p_y2;
 891             p_y2 += p_source->p[Y_PLANE].i_pitch;
 892
 893             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
 894             {
 895                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 896             }
 897             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
 898             {
 899                 C_YUV420_UYVY( );
 900             }
 901
 902             p_y1 += i_source_margin;
 903             p_y2 += i_source_margin;
 904             p_u += i_source_margin_c;
 905             p_v += i_source_margin_c;
 906             p_line1 += i_dest_margin;
 907             p_line2 += i_dest_margin;
 908         }
 909     }
 910 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 911 }
 912 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 913
 914 /*****************************************************************************
 915  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 916  *****************************************************************************/
 917 #if defined (MODULE_NAME_IS_i420_yuy2)
 918 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
 919                                               picture_t *p_dest )
 920 {
 921     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 922     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 923     uint8_t *p_u = p_source->U_PIXELS;
 924     uint8_t *p_v = p_source->V_PIXELS;
 925
 926     int i_x, i_y;
 927
 928     const int i_source_margin = p_source->p[0].i_pitch
 929                                  - p_source->p[0].i_visible_pitch;
 930     const int i_source_margin_c = p_source->p[1].i_pitch
 931                                  - p_source->p[1].i_visible_pitch;
 932     const int i_dest_margin = p_dest->p->i_pitch
 933                                - p_dest->p->i_visible_pitch;
 934
 935     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
 936     {
 937         p_line1 = p_line2;
 938         p_line2 += p_dest->p->i_pitch;
 939
 940         p_y1 = p_y2;
 941         p_y2 += p_source->p[Y_PLANE].i_pitch;
 942
 943         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
 944         {
 945             C_YUV420_Y211( );
 946             C_YUV420_Y211( );
 947         }
 948
 949         p_y1 += i_source_margin;
 950         p_y2 += i_source_margin;
 951         p_u += i_source_margin_c;
 952         p_v += i_source_margin_c;
 953         p_line1 += i_dest_margin;
 954         p_line2 += i_dest_margin;
 955     }
 956 }
 957 #endif