git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_plugin.h>
  35 #include <vlc_filter.h>
  36
  37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  38 #   include <altivec.h>
  39 #endif
  40
  41 #include "i420_yuy2.h"
  42
      /* Human-readable fourcc lists. In this part of the file they are only
       * pasted into the module description strings of the module descriptor. */
   43 #define SRC_FOURCC  "I420,IYUV,YV12"
   44
      /* The destination list depends on which variant of the module is being
       * built; the variant is selected by the MODULE_NAME_IS_* macro. */
   45 #if defined (MODULE_NAME_IS_i420_yuy2)
   46 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
   47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
   48 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
   49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
   50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
   51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
   52 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
   53 #endif
  54
   55 /*****************************************************************************
   56  * Local prototypes (every function declared here is static).
   57  *****************************************************************************/
      /* Module entry point, registered through set_callbacks() in the module
       * descriptor. */
   58 static int  Activate ( vlc_object_t * );
   59
      /* Conversion routines, called as ( filter, source picture, destination
       * picture ), followed by the matching *_Filter entry points that
       * Activate() stores in pf_video_filter. */
   60 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
   61 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
   62 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
   63 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
   64 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
   65 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
      /* IUYV and cyuv outputs are not declared for the AltiVec build. */
   66 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
   67 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
   68 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
   69 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
   70 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
   71 #endif
      /* Y211 output is only declared for the plain (non-SIMD) build. */
   72 #if defined (MODULE_NAME_IS_i420_yuy2)
   73 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
   74 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
   75 #endif
  76
   77 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
   78 /* MMX-specific 64-bit constants. They are not referenced in this part of
       * the file; presumably they are operands of the MMX macros declared in
       * i420_yuy2.h (TODO confirm). */
      /* 0x00ff in each of the four 16-bit words */
   79 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
      /* 0x80 in each of the four low bytes, upper 32 bits zero */
   80 static const uint64_t i_80w   = 0x0000000080808080ULL;
   81 #endif
  82
  83 /*****************************************************************************
  84  * Module descriptor.
  85  *****************************************************************************/
  86 vlc_module_begin ()
  87 #if defined (MODULE_NAME_IS_i420_yuy2)
  88     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  89     set_capability( "video filter2", 80 )
  90 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  91     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  92     set_capability( "video filter2", 160 )
  93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  94     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  95     set_capability( "video filter2", 250 )
  96 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  97     set_description(
  98             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  99     set_capability( "video filter2", 250 )
 100 #endif
 101     set_callbacks( Activate, NULL )
 102 vlc_module_end ()
 103
 104 /*****************************************************************************
 105  * Activate: allocate a chroma function
 106  *****************************************************************************
 107  * This function allocates and initializes a chroma function
 108  *****************************************************************************/
 109 static int Activate( vlc_object_t *p_this )
 110 {
 111     filter_t *p_filter = (filter_t *)p_this;
 112
 113     if( p_filter->fmt_in.video.i_width & 1
 114      || p_filter->fmt_in.video.i_height & 1 )
 115     {
 116         return -1;
 117     }
 118
 119     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
 120      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
 121         return -1;
 122
 123     switch( p_filter->fmt_in.video.i_chroma )
 124     {
 125         case VLC_CODEC_YV12:
 126         case VLC_CODEC_I420:
 127             switch( p_filter->fmt_out.video.i_chroma )
 128             {
 129                 case VLC_CODEC_YUYV:
 130                     p_filter->pf_video_filter = I420_YUY2_Filter;
 131                     break;
 132
 133                 case VLC_CODEC_YVYU:
 134                     p_filter->pf_video_filter = I420_YVYU_Filter;
 135                     break;
 136
 137                 case VLC_CODEC_UYVY:
 138                     p_filter->pf_video_filter = I420_UYVY_Filter;
 139                     break;
 140 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 141                 case VLC_FOURCC('I','U','Y','V'):
 142                     p_filter->pf_video_filter = I420_IUYV_Filter;
 143                     break;
 144
 145                 case VLC_CODEC_CYUV:
 146                     p_filter->pf_video_filter = I420_cyuv_Filter;
 147                     break;
 148 #endif
 149
 150 #if defined (MODULE_NAME_IS_i420_yuy2)
 151                 case VLC_CODEC_Y211:
 152                     p_filter->pf_video_filter = I420_Y211_Filter;
 153                     break;
 154 #endif
 155
 156                 default:
 157                     return -1;
 158             }
 159             break;
 160
 161         default:
 162             return -1;
 163     }
 164
 165     return 0;
 166 }
 167
 168 #if 0
 169 static inline unsigned long long read_cycles(void)
 170 {
 171     unsigned long long v;
 172     __asm__ __volatile__("rdtsc" : "=A" (v): );
 173
 174     return v;
 175 }
 176 #endif
 177
  178 /* Following functions are local */
      /* Each VIDEO_FILTER_WRAPPER( name ) line presumably expands to the
       * name_Filter entry point that is prototyped near the top of the file and
       * installed by Activate(). The macro is not defined in this file
       * (presumably vlc_filter.h) -- TODO confirm. The set of wrappers follows
       * the same build conditions as the prototypes. */
  179
  180 VIDEO_FILTER_WRAPPER( I420_YUY2 )
  181 VIDEO_FILTER_WRAPPER( I420_YVYU )
  182 VIDEO_FILTER_WRAPPER( I420_UYVY )
  183 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  184 VIDEO_FILTER_WRAPPER( I420_IUYV )
  185 VIDEO_FILTER_WRAPPER( I420_cyuv )
  186 #endif
  187 #if defined (MODULE_NAME_IS_i420_yuy2)
  188 VIDEO_FILTER_WRAPPER( I420_Y211 )
  189 #endif
 190
  191 /*****************************************************************************
  192  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
       *****************************************************************************
       * p_filter: filter instance; its fmt_in.video.i_width / i_height drive the
       *           loops below
       * p_source: input picture, the Y, U and V planes are read
       * p_dest:   output picture, its first plane is written
       *
       * Rows are handled in pairs: p_y1/p_y2 feed p_line1/p_line2. The per-pixel
       * work is done by the C_*, MMX_* and SSE2_* macros, which are not defined
       * in this file (presumably i420_yuy2.h) and take no arguments, so they
       * apparently operate on the local pointers by name -- keep the names.
       *
       * Build variants: AltiVec (two vector paths plus the C fallback), SSE2
       * (aligned / unaligned paths), MMX and plain C.
  193  *****************************************************************************/
  194 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
  195                                            picture_t *p_dest )
  196 {
          /* p_line2 / p_y2 start on the first row; every iteration hands them over
           * to p_line1 / p_y1 and moves them one pitch down, so each pair always
           * addresses two consecutive rows. */
  197     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  198     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  199     uint8_t *p_u = p_source->U_PIXELS;
  200     uint8_t *p_v = p_source->V_PIXELS;
  201
  202     int i_x, i_y;
  203
  204 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
      /* Step the destination and luma pointers to the next pair of rows. */
  205 #define VEC_NEXT_LINES( ) \
  206     p_line1  = p_line2; \
  207     p_line2 += p_dest->p->i_pitch; \
  208     p_y1     = p_y2; \
  209     p_y2    += p_source->p[Y_PLANE].i_pitch;
  210
      /* Load one vector (16 bytes) of U and of V and advance both pointers. */
  211 #define VEC_LOAD_UV( ) \
  212     u_vec = vec_ld( 0, p_u ); p_u += 16; \
  213     v_vec = vec_ld( 0, p_v ); p_v += 16;
  214
      /* a is vec_mergeh or vec_mergel: it interleaves one half of the loaded U
       * and V vectors. The same chroma vector is then merged (luma first) with
       * 16 luma bytes of each of the two rows, storing 32 bytes per row. */
  215 #define VEC_MERGE( a ) \
  216     uv_vec = a( u_vec, v_vec ); \
  217     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
  218     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
  219     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
  220     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
  221     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
  222     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
  223
  224     vector unsigned char u_vec;
  225     vector unsigned char v_vec;
  226     vector unsigned char uv_vec;
  227     vector unsigned char y_vec;
  228
          /* NOTE(review): neither vector path below adds the pitch-minus-visible-
           * pitch margins that the C path uses; they appear to assume that rows
           * are contiguous in memory -- confirm. */
  229     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
  230            ( p_filter->fmt_in.video.i_height % 2 ) ) )
  231     {
  232         /* Width is a multiple of 32, we take 2 lines at a time */
  233         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  234         {
  235             VEC_NEXT_LINES( );
  236             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  237             {
                      /* one chroma load serves two merges, i.e. 32 pixels */
  238                 VEC_LOAD_UV( );
  239                 VEC_MERGE( vec_mergeh );
  240                 VEC_MERGE( vec_mergel );
  241             }
  242         }
  243     }
  244     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
  245                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
  246     {
  247         /* Width is only a multiple of 16, we take 4 lines at a time */
  248         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
  249         {
  250             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  251             VEC_NEXT_LINES( );
  252             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  253             {
  254                 VEC_LOAD_UV( );
  255                 VEC_MERGE( vec_mergeh );
  256                 VEC_MERGE( vec_mergel );
  257             }
  258
  259             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
                  /* This chroma load is split across the row change: its first
                   * half is used here, its second half right after
                   * VEC_NEXT_LINES() for the next pair of rows. */
  260             VEC_LOAD_UV( );
  261             VEC_MERGE( vec_mergeh );
  262
  263             /* Line 3 and 4, pixels 0 to 16 */
  264             VEC_NEXT_LINES( );
  265             VEC_MERGE( vec_mergel );
  266
  267             /* Line 3 and 4, pixels 16 to ( width ) */
  268             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  269             {
  270                 VEC_LOAD_UV( );
  271                 VEC_MERGE( vec_mergeh );
  272                 VEC_MERGE( vec_mergel );
  273             }
  274         }
  275     }
  276     else
  277     {
  278         /* Crap, use the C version */
              /* The brace opened above is closed by the AltiVec-only "}" after
               * the C loop further down. */
  279 #undef VEC_NEXT_LINES
  280 #undef VEC_LOAD_UV
  281 #undef VEC_MERGE
  282 #endif
  283
          /* Bytes between the end of the visible part of a row and the start of
           * the next row: luma source, chroma source (taken from plane 1) and
           * destination. */
  284     const int i_source_margin = p_source->p[0].i_pitch
  285                                  - p_source->p[0].i_visible_pitch;
  286     const int i_source_margin_c = p_source->p[1].i_pitch
  287                                  - p_source->p[1].i_visible_pitch;
  288     const int i_dest_margin = p_dest->p->i_pitch
  289                                - p_dest->p->i_visible_pitch;
  290
  291 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
          /* Plain C, MMX and AltiVec-fallback path: one iteration per row pair. */
  292     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  293     {
  294         p_line1 = p_line2;
  295         p_line2 += p_dest->p->i_pitch;
  296
  297         p_y1 = p_y2;
  298         p_y2 += p_source->p[Y_PLANE].i_pitch;
  299
  300 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
              /* 8 pixels per iteration: four calls, i.e. 2 pixels per call */
  301         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
  302         {
  303             C_YUV420_YUYV( );
  304             C_YUV420_YUYV( );
  305             C_YUV420_YUYV( );
  306             C_YUV420_YUYV( );
  307         }
  308 #else
              /* 8 pixels per iteration through the MMX macro */
  309         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  310         {
  311             MMX_CALL( MMX_YUV420_YUYV );
  312         }
  313 #endif
              /* Remaining ( width % 8 ) pixels, two per call */
  314         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
  315         {
  316             C_YUV420_YUYV( );
  317         }
  318
              /* Skip the row padding; the macros above are expected to have
               * advanced the pointers over the visible part of the rows. */
  319         p_y1 += i_source_margin;
  320         p_y2 += i_source_margin;
  321         p_u += i_source_margin_c;
  322         p_v += i_source_margin_c;
  323         p_line1 += i_dest_margin;
  324         p_line2 += i_dest_margin;
  325     }
  326
  327 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  328     /* re-enable FPU registers */
  329     MMX_END;
  330 #endif
  331
  332 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
          /* closes the "else" block of the AltiVec dispatch above */
  333     }
  334 #endif
  335
  336 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  337     /*
  338     ** SSE2 128 bits fetch/store instructions are faster
  339     ** if memory access is 16 bytes aligned
  340     */
  341
          /* Aligned path only when both pitches and both start addresses (luma
           * source and destination) are multiples of 16. */
  342     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  343         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  344     {
  345         /* use faster SSE2 aligned fetch and store */
  346         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  347         {
  348             p_line1 = p_line2;
  349             p_line2 += p_dest->p->i_pitch;
  350
  351             p_y1 = p_y2;
  352             p_y2 += p_source->p[Y_PLANE].i_pitch;
  353
                  /* 16 pixels per SSE2 iteration */
  354             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  355             {
  356                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
  357             }
                  /* Remaining ( width % 16 ) pixels in C, two per call */
  358             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  359             {
  360                 C_YUV420_YUYV( );
  361             }
  362
  363             p_y1 += i_source_margin;
  364             p_y2 += i_source_margin;
  365             p_u += i_source_margin_c;
  366             p_v += i_source_margin_c;
  367             p_line1 += i_dest_margin;
  368             p_line2 += i_dest_margin;
  369         }
  370     }
  371     else
  372     {
  373         /* use slower SSE2 unaligned fetch and store */
  374         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  375         {
  376             p_line1 = p_line2;
  377             p_line2 += p_dest->p->i_pitch;
  378
  379             p_y1 = p_y2;
  380             p_y2 += p_source->p[Y_PLANE].i_pitch;
  381
  382             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  383             {
  384                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
  385             }
  386             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  387             {
  388                 C_YUV420_YUYV( );
  389             }
  390
  391             p_y1 += i_source_margin;
  392             p_y2 += i_source_margin;
  393             p_u += i_source_margin_c;
  394             p_v += i_source_margin_c;
  395             p_line1 += i_dest_margin;
  396             p_line2 += i_dest_margin;
  397         }
  398     }
  399     /* make sure all SSE2 stores are visible thereafter */
  400     SSE2_END;
  401
  402 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  403 }
 404
  405 /*****************************************************************************
  406  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
       *****************************************************************************
       * p_filter: filter instance; its fmt_in.video.i_width / i_height drive the
       *           loops below
       * p_source: input picture, the Y, U and V planes are read
       * p_dest:   output picture, its first plane is written
       *
       * Same structure as I420_YUY2; the visible difference is that the chroma
       * is merged in V,U order (see VEC_MERGE) and the *_YVYU macros are used.
       * Those C_*, MMX_* and SSE2_* macros are not defined in this file
       * (presumably i420_yuy2.h) and take no arguments, so they apparently
       * operate on the local pointers by name -- keep the names.
  407  *****************************************************************************/
  408 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
  409                                            picture_t *p_dest )
  410 {
          /* p_line2 / p_y2 start on the first row; every iteration hands them over
           * to p_line1 / p_y1 and moves them one pitch down. */
  411     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  412     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  413     uint8_t *p_u = p_source->U_PIXELS;
  414     uint8_t *p_v = p_source->V_PIXELS;
  415
  416     int i_x, i_y;
  417
  418 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
      /* Step the destination and luma pointers to the next pair of rows. */
  419 #define VEC_NEXT_LINES( ) \
  420     p_line1  = p_line2; \
  421     p_line2 += p_dest->p->i_pitch; \
  422     p_y1     = p_y2; \
  423     p_y2    += p_source->p[Y_PLANE].i_pitch;
  424
      /* Load one vector (16 bytes) of U and of V and advance both pointers. */
  425 #define VEC_LOAD_UV( ) \
  426     u_vec = vec_ld( 0, p_u ); p_u += 16; \
  427     v_vec = vec_ld( 0, p_v ); p_v += 16;
  428
      /* a is vec_mergeh or vec_mergel: it interleaves one half of the loaded V
       * and U vectors (V first). The same chroma vector is then merged (luma
       * first) with 16 luma bytes of each of the two rows. */
  429 #define VEC_MERGE( a ) \
  430     vu_vec = a( v_vec, u_vec ); \
  431     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
  432     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
  433     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
  434     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
  435     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
  436     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
  437
  438     vector unsigned char u_vec;
  439     vector unsigned char v_vec;
  440     vector unsigned char vu_vec;
  441     vector unsigned char y_vec;
  442
          /* NOTE(review): neither vector path below adds the pitch-minus-visible-
           * pitch margins that the C path uses; they appear to assume that rows
           * are contiguous in memory -- confirm. */
  443     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
  444            ( p_filter->fmt_in.video.i_height % 2 ) ) )
  445     {
  446         /* Width is a multiple of 32, we take 2 lines at a time */
  447         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  448         {
  449             VEC_NEXT_LINES( );
  450             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  451             {
                      /* one chroma load serves two merges, i.e. 32 pixels */
  452                 VEC_LOAD_UV( );
  453                 VEC_MERGE( vec_mergeh );
  454                 VEC_MERGE( vec_mergel );
  455             }
  456         }
  457     }
  458     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
  459                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
  460     {
  461         /* Width is only a multiple of 16, we take 4 lines at a time */
  462         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
  463         {
  464             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  465             VEC_NEXT_LINES( );
  466             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  467             {
  468                 VEC_LOAD_UV( );
  469                 VEC_MERGE( vec_mergeh );
  470                 VEC_MERGE( vec_mergel );
  471             }
  472
  473             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
                  /* This chroma load is split across the row change: its first
                   * half is used here, its second half right after
                   * VEC_NEXT_LINES() for the next pair of rows. */
  474             VEC_LOAD_UV( );
  475             VEC_MERGE( vec_mergeh );
  476
  477             /* Line 3 and 4, pixels 0 to 16 */
  478             VEC_NEXT_LINES( );
  479             VEC_MERGE( vec_mergel );
  480
  481             /* Line 3 and 4, pixels 16 to ( width ) */
  482             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  483             {
  484                 VEC_LOAD_UV( );
  485                 VEC_MERGE( vec_mergeh );
  486                 VEC_MERGE( vec_mergel );
  487             }
  488         }
  489     }
  490     else
  491     {
  492         /* Crap, use the C version */
              /* The brace opened above is closed by the AltiVec-only "}" after
               * the C loop further down. */
  493 #undef VEC_NEXT_LINES
  494 #undef VEC_LOAD_UV
  495 #undef VEC_MERGE
  496 #endif
  497
          /* Bytes between the end of the visible part of a row and the start of
           * the next row: luma source, chroma source (taken from plane 1) and
           * destination. */
  498     const int i_source_margin = p_source->p[0].i_pitch
  499                                  - p_source->p[0].i_visible_pitch;
  500     const int i_source_margin_c = p_source->p[1].i_pitch
  501                                  - p_source->p[1].i_visible_pitch;
  502     const int i_dest_margin = p_dest->p->i_pitch
  503                                - p_dest->p->i_visible_pitch;
  504
  505 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
          /* Plain C, MMX and AltiVec-fallback path: one iteration per row pair. */
  506     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  507     {
  508         p_line1 = p_line2;
  509         p_line2 += p_dest->p->i_pitch;
  510
  511         p_y1 = p_y2;
  512         p_y2 += p_source->p[Y_PLANE].i_pitch;
  513
              /* 8 pixels per iteration: four C calls (2 pixels each) or one MMX
               * call */
  514         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  515         {
  516 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  517             C_YUV420_YVYU( );
  518             C_YUV420_YVYU( );
  519             C_YUV420_YVYU( );
  520             C_YUV420_YVYU( );
  521 #else
  522             MMX_CALL( MMX_YUV420_YVYU );
  523 #endif
  524         }
              /* Remaining ( width % 8 ) pixels, two per call */
  525         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
  526         {
  527             C_YUV420_YVYU( );
  528         }
  529
              /* Skip the row padding; the macros above are expected to have
               * advanced the pointers over the visible part of the rows. */
  530         p_y1 += i_source_margin;
  531         p_y2 += i_source_margin;
  532         p_u += i_source_margin_c;
  533         p_v += i_source_margin_c;
  534         p_line1 += i_dest_margin;
  535         p_line2 += i_dest_margin;
  536     }
  537
  538 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  539     /* re-enable FPU registers */
  540     MMX_END;
  541 #endif
  542
  543 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
          /* closes the "else" block of the AltiVec dispatch above */
  544     }
  545 #endif
  546
  547 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  548     /*
  549     ** SSE2 128 bits fetch/store instructions are faster
  550     ** if memory access is 16 bytes aligned
  551     */
          /* Aligned path only when both pitches and both start addresses (luma
           * source and destination) are multiples of 16. */
  552     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  553         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  554     {
  555         /* use faster SSE2 aligned fetch and store */
  556         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  557         {
  558             p_line1 = p_line2;
  559             p_line2 += p_dest->p->i_pitch;
  560
  561             p_y1 = p_y2;
  562             p_y2 += p_source->p[Y_PLANE].i_pitch;
  563
                  /* 16 pixels per SSE2 iteration */
  564             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  565             {
  566                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
  567             }
                  /* Remaining ( width % 16 ) pixels in C, two per call */
  568             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  569             {
  570                 C_YUV420_YVYU( );
  571             }
  572
  573             p_y1 += i_source_margin;
  574             p_y2 += i_source_margin;
  575             p_u += i_source_margin_c;
  576             p_v += i_source_margin_c;
  577             p_line1 += i_dest_margin;
  578             p_line2 += i_dest_margin;
  579         }
  580     }
  581     else
  582     {
  583         /* use slower SSE2 unaligned fetch and store */
  584         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  585         {
  586             p_line1 = p_line2;
  587             p_line2 += p_dest->p->i_pitch;
  588
  589             p_y1 = p_y2;
  590             p_y2 += p_source->p[Y_PLANE].i_pitch;
  591
  592             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  593             {
  594                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
  595             }
  596             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  597             {
  598                 C_YUV420_YVYU( );
  599             }
  600
  601             p_y1 += i_source_margin;
  602             p_y2 += i_source_margin;
  603             p_u += i_source_margin_c;
  604             p_v += i_source_margin_c;
  605             p_line1 += i_dest_margin;
  606             p_line2 += i_dest_margin;
  607         }
  608     }
  609     /* make sure all SSE2 stores are visible thereafter */
  610     SSE2_END;
  611 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  612 }
 613
  614 /*****************************************************************************
  615  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
       *****************************************************************************
       * p_filter: filter instance; its fmt_in.video.i_width / i_height drive the
       *           loops below
       * p_source: input picture, the Y, U and V planes are read
       * p_dest:   output picture, its first plane is written
       *
       * Same structure as I420_YUY2; the visible difference is that the chroma
       * bytes are placed before the luma bytes (see VEC_MERGE) and the *_UYVY
       * macros are used. Those C_*, MMX_* and SSE2_* macros are not defined in
       * this file (presumably i420_yuy2.h) and take no arguments, so they
       * apparently operate on the local pointers by name -- keep the names.
  616  *****************************************************************************/
  617 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
  618                                            picture_t *p_dest )
  619 {
          /* p_line2 / p_y2 start on the first row; every iteration hands them over
           * to p_line1 / p_y1 and moves them one pitch down. */
  620     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
  621     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
  622     uint8_t *p_u = p_source->U_PIXELS;
  623     uint8_t *p_v = p_source->V_PIXELS;
  624
  625     int i_x, i_y;
  626
  627 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
      /* Step the destination and luma pointers to the next pair of rows. */
  628 #define VEC_NEXT_LINES( ) \
  629     p_line1  = p_line2; \
  630     p_line2 += p_dest->p->i_pitch; \
  631     p_y1     = p_y2; \
  632     p_y2    += p_source->p[Y_PLANE].i_pitch;
  633
      /* Load one vector (16 bytes) of U and of V and advance both pointers. */
  634 #define VEC_LOAD_UV( ) \
  635     u_vec = vec_ld( 0, p_u ); p_u += 16; \
  636     v_vec = vec_ld( 0, p_v ); p_v += 16;
  637
      /* a is vec_mergeh or vec_mergel: it interleaves one half of the loaded U
       * and V vectors. The same chroma vector is then merged (chroma first)
       * with 16 luma bytes of each of the two rows. */
  638 #define VEC_MERGE( a ) \
  639     uv_vec = a( u_vec, v_vec ); \
  640     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
  641     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
  642     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
  643     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
  644     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
  645     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
  646
  647     vector unsigned char u_vec;
  648     vector unsigned char v_vec;
  649     vector unsigned char uv_vec;
  650     vector unsigned char y_vec;
  651
          /* NOTE(review): neither vector path below adds the pitch-minus-visible-
           * pitch margins that the C path uses; they appear to assume that rows
           * are contiguous in memory -- confirm. */
  652     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
  653            ( p_filter->fmt_in.video.i_height % 2 ) ) )
  654     {
  655         /* Width is a multiple of 32, we take 2 lines at a time */
  656         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  657         {
  658             VEC_NEXT_LINES( );
  659             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  660             {
                      /* one chroma load serves two merges, i.e. 32 pixels */
  661                 VEC_LOAD_UV( );
  662                 VEC_MERGE( vec_mergeh );
  663                 VEC_MERGE( vec_mergel );
  664             }
  665         }
  666     }
  667     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
  668                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
  669     {
  670         /* Width is only a multiple of 16, we take 4 lines at a time */
  671         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
  672         {
  673             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
  674             VEC_NEXT_LINES( );
  675             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  676             {
  677                 VEC_LOAD_UV( );
  678                 VEC_MERGE( vec_mergeh );
  679                 VEC_MERGE( vec_mergel );
  680             }
  681
  682             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
                  /* This chroma load is split across the row change: its first
                   * half is used here, its second half right after
                   * VEC_NEXT_LINES() for the next pair of rows. */
  683             VEC_LOAD_UV( );
  684             VEC_MERGE( vec_mergeh );
  685
  686             /* Line 3 and 4, pixels 0 to 16 */
  687             VEC_NEXT_LINES( );
  688             VEC_MERGE( vec_mergel );
  689
  690             /* Line 3 and 4, pixels 16 to ( width ) */
  691             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
  692             {
  693                 VEC_LOAD_UV( );
  694                 VEC_MERGE( vec_mergeh );
  695                 VEC_MERGE( vec_mergel );
  696             }
  697         }
  698     }
  699     else
  700     {
  701         /* Crap, use the C version */
              /* The brace opened above is closed by the AltiVec-only "}" after
               * the C loop further down. */
  702 #undef VEC_NEXT_LINES
  703 #undef VEC_LOAD_UV
  704 #undef VEC_MERGE
  705 #endif
  706
          /* Bytes between the end of the visible part of a row and the start of
           * the next row: luma source, chroma source (taken from plane 1) and
           * destination. */
  707     const int i_source_margin = p_source->p[0].i_pitch
  708                                  - p_source->p[0].i_visible_pitch;
  709     const int i_source_margin_c = p_source->p[1].i_pitch
  710                                  - p_source->p[1].i_visible_pitch;
  711     const int i_dest_margin = p_dest->p->i_pitch
  712                                - p_dest->p->i_visible_pitch;
  713
  714 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
          /* Plain C, MMX and AltiVec-fallback path: one iteration per row pair. */
  715     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  716     {
  717         p_line1 = p_line2;
  718         p_line2 += p_dest->p->i_pitch;
  719
  720         p_y1 = p_y2;
  721         p_y2 += p_source->p[Y_PLANE].i_pitch;
  722
              /* 8 pixels per iteration: four C calls (2 pixels each) or one MMX
               * call */
  723         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
  724         {
  725 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
  726             C_YUV420_UYVY( );
  727             C_YUV420_UYVY( );
  728             C_YUV420_UYVY( );
  729             C_YUV420_UYVY( );
  730 #else
  731             MMX_CALL( MMX_YUV420_UYVY );
  732 #endif
  733         }
              /* Remaining ( width % 8 ) pixels, two per call */
  734         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
  735         {
  736             C_YUV420_UYVY( );
  737         }
  738
              /* Skip the row padding; the macros above are expected to have
               * advanced the pointers over the visible part of the rows. */
  739         p_y1 += i_source_margin;
  740         p_y2 += i_source_margin;
  741         p_u += i_source_margin_c;
  742         p_v += i_source_margin_c;
  743         p_line1 += i_dest_margin;
  744         p_line2 += i_dest_margin;
  745     }
  746
  747 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
  748     /* re-enable FPU registers */
  749     MMX_END;
  750 #endif
  751
  752 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
          /* closes the "else" block of the AltiVec dispatch above */
  753     }
  754 #endif
  755
  756 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  757     /*
  758     ** SSE2 128 bits fetch/store instructions are faster
  759     ** if memory access is 16 bytes aligned
  760     */
          /* Aligned path only when both pitches and both start addresses (luma
           * source and destination) are multiples of 16. */
  761     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
  762         ((intptr_t)p_line2|(intptr_t)p_y2))) )
  763     {
  764         /* use faster SSE2 aligned fetch and store */
  765         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  766         {
  767             p_line1 = p_line2;
  768             p_line2 += p_dest->p->i_pitch;
  769
  770             p_y1 = p_y2;
  771             p_y2 += p_source->p[Y_PLANE].i_pitch;
  772
                  /* 16 pixels per SSE2 iteration */
  773             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  774             {
  775                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
  776             }
                  /* Remaining ( width % 16 ) pixels in C, two per call */
  777             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  778             {
  779                 C_YUV420_UYVY( );
  780             }
  781
  782             p_y1 += i_source_margin;
  783             p_y2 += i_source_margin;
  784             p_u += i_source_margin_c;
  785             p_v += i_source_margin_c;
  786             p_line1 += i_dest_margin;
  787             p_line2 += i_dest_margin;
  788         }
  789     }
  790     else
  791     {
  792         /* use slower SSE2 unaligned fetch and store */
  793         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
  794         {
  795             p_line1 = p_line2;
  796             p_line2 += p_dest->p->i_pitch;
  797
  798             p_y1 = p_y2;
  799             p_y2 += p_source->p[Y_PLANE].i_pitch;
  800
  801             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
  802             {
  803                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
  804             }
  805             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
  806             {
  807                 C_YUV420_UYVY( );
  808             }
  809
  810             p_y1 += i_source_margin;
  811             p_y2 += i_source_margin;
  812             p_u += i_source_margin_c;
  813             p_v += i_source_margin_c;
  814             p_line1 += i_dest_margin;
  815             p_line2 += i_dest_margin;
  816         }
  817     }
  818     /* make sure all SSE2 stores are visible thereafter */
  819     SSE2_END;
  820 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
  821 }
 822
 823 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 824 /*****************************************************************************
 825  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 826  *****************************************************************************/
 827 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
 828                                            picture_t *p_dest )
 829 {
 830     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
 831     /* FIXME: TODO ! */
 832     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 833 }
 834
 835 /*****************************************************************************
 836  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 837  *****************************************************************************/
 838 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
 839                                            picture_t *p_dest )
 840 {
 841     uint8_t *p_line1 = p_dest->p->p_pixels +
 842                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 843                        + p_dest->p->i_pitch;
 844     uint8_t *p_line2 = p_dest->p->p_pixels +
 845                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 846     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 847     uint8_t *p_u = p_source->U_PIXELS;
 848     uint8_t *p_v = p_source->V_PIXELS;
 849
 850     int i_x, i_y;
 851
 852     const int i_source_margin = p_source->p[0].i_pitch
 853                                  - p_source->p[0].i_visible_pitch;
 854     const int i_source_margin_c = p_source->p[1].i_pitch
 855                                  - p_source->p[1].i_visible_pitch;
 856     const int i_dest_margin = p_dest->p->i_pitch
 857                                - p_dest->p->i_visible_pitch;
 858
 859 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 860     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 861     {
 862         p_line1 -= 3 * p_dest->p->i_pitch;
 863         p_line2 -= 3 * p_dest->p->i_pitch;
 864
 865         p_y1 = p_y2;
 866         p_y2 += p_source->p[Y_PLANE].i_pitch;
 867
 868         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 869         {
 870 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 871             C_YUV420_UYVY( );
 872             C_YUV420_UYVY( );
 873             C_YUV420_UYVY( );
 874             C_YUV420_UYVY( );
 875 #else
 876             MMX_CALL( MMX_YUV420_UYVY );
 877 #endif
 878         }
 879         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 880         {
 881             C_YUV420_UYVY( );
 882         }
 883
 884         p_y1 += i_source_margin;
 885         p_y2 += i_source_margin;
 886         p_u += i_source_margin_c;
 887         p_v += i_source_margin_c;
 888         p_line1 += i_dest_margin;
 889         p_line2 += i_dest_margin;
 890     }
 891
 892 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 893     /* re-enable FPU registers */
 894     MMX_END;
 895 #endif
 896
 897 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 898     /*
 899     ** SSE2 128 bits fetch/store instructions are faster
 900     ** if memory access is 16 bytes aligned
 901     */
 902     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 903         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 904     {
 905         /* use faster SSE2 aligned fetch and store */
 906         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 907         {
 908             p_line1 = p_line2;
 909             p_line2 += p_dest->p->i_pitch;
 910
 911             p_y1 = p_y2;
 912             p_y2 += p_source->p[Y_PLANE].i_pitch;
 913
 914             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 915             {
 916                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 917             }
 918             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 919             {
 920                 C_YUV420_UYVY( );
 921             }
 922
 923             p_y1 += i_source_margin;
 924             p_y2 += i_source_margin;
 925             p_u += i_source_margin_c;
 926             p_v += i_source_margin_c;
 927             p_line1 += i_dest_margin;
 928             p_line2 += i_dest_margin;
 929         }
 930     }
 931     else
 932     {
 933         /* use slower SSE2 unaligned fetch and store */
 934         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 935         {
 936             p_line1 = p_line2;
 937             p_line2 += p_dest->p->i_pitch;
 938
 939             p_y1 = p_y2;
 940             p_y2 += p_source->p[Y_PLANE].i_pitch;
 941
 942             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 943             {
 944                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 945             }
 946             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 947             {
 948                 C_YUV420_UYVY( );
 949             }
 950
 951             p_y1 += i_source_margin;
 952             p_y2 += i_source_margin;
 953             p_u += i_source_margin_c;
 954             p_v += i_source_margin_c;
 955             p_line1 += i_dest_margin;
 956             p_line2 += i_dest_margin;
 957         }
 958     }
 959     /* make sure all SSE2 stores are visible thereafter */
 960     SSE2_END;
 961 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 962 }
 963 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 964
 965 /*****************************************************************************
 966  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 967  *****************************************************************************/
 968 #if defined (MODULE_NAME_IS_i420_yuy2)
 969 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
 970                                            picture_t *p_dest )
 971 {
 972     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 973     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 974     uint8_t *p_u = p_source->U_PIXELS;
 975     uint8_t *p_v = p_source->V_PIXELS;
 976
 977     int i_x, i_y;
 978
 979     const int i_source_margin = p_source->p[0].i_pitch
 980                                  - p_source->p[0].i_visible_pitch;
 981     const int i_source_margin_c = p_source->p[1].i_pitch
 982                                  - p_source->p[1].i_visible_pitch;
 983     const int i_dest_margin = p_dest->p->i_pitch
 984                                - p_dest->p->i_visible_pitch;
 985
 986     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 987     {
 988         p_line1 = p_line2;
 989         p_line2 += p_dest->p->i_pitch;
 990
 991         p_y1 = p_y2;
 992         p_y2 += p_source->p[Y_PLANE].i_pitch;
 993
 994         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 995         {
 996             C_YUV420_Y211( );
 997             C_YUV420_Y211( );
 998         }
 999
1000         p_y1 += i_source_margin;
1001         p_y2 += i_source_margin;
1002         p_u += i_source_margin_c;
1003         p_v += i_source_margin_c;
1004         p_line1 += i_dest_margin;
1005         p_line2 += i_dest_margin;
1006     }
1007 }
1008 #endif