git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 VLC authors and VideoLAN
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify it
  11  * under the terms of the GNU Lesser General Public License as published by
  12  * the Free Software Foundation; either version 2.1 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18  * GNU Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this program; if not, write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_plugin.h>
  35 #include <vlc_filter.h>
  36 #include <vlc_cpu.h>
  37
  38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  39 #   include <altivec.h>
  40 #endif
  41
  42 #include "i420_yuy2.h"
  43
/* Input FOURCCs, pasted into the module description strings below. */
#define SRC_FOURCC  "I420,IYUV,YV12"

/*
 * Per-build configuration, selected by the MODULE_NAME_IS_* symbol:
 *  - DEST_FOURCC: output FOURCCs, pasted into the module description string;
 *  - VLC_TARGET:  token placed in front of the conversion functions. It is
 *                 empty for the plain C and AltiVec builds.
 */
#if defined (MODULE_NAME_IS_i420_yuy2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
#    define VLC_TARGET
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#    define VLC_TARGET VLC_MMX
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#    define VLC_TARGET VLC_SSE
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#    define VLC_TARGET
#endif
  59
  60 /*****************************************************************************
  61  * Local and extern prototypes.
  62  *****************************************************************************/
/* Module entry point, registered with set_callbacks() in the descriptor. */
static int  Activate ( vlc_object_t * );

/*
 * Conversion routines take (filter, source picture, destination picture).
 * The matching *_Filter functions are the ones Activate() stores in
 * pf_video_filter.
 */
static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
/* The two conversions below are not built for the AltiVec variant. */
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
#endif
/* Y211 output is only built for the plain C variant. */
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
#endif
  81
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
/* Initialize MMX-specific constants */
/* NOTE(review): neither constant is referenced in the part of the file shown
 * here; presumably they are operands of the MMX macros in i420_yuy2.h --
 * confirm before touching them. */
static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
static const uint64_t i_80w   = 0x0000000080808080ULL;
#endif
  87
  88 /*****************************************************************************
  89  * Module descriptor.
  90  *****************************************************************************/
  91 vlc_module_begin ()
  92 #if defined (MODULE_NAME_IS_i420_yuy2)
  93     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  94     set_capability( "video filter2", 80 )
  95 # define vlc_CPU_capable() (true)
  96 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  97     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  98     set_capability( "video filter2", 160 )
  99 # define vlc_CPU_capable() vlc_CPU_MMX()
 100 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
 101     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
 102     set_capability( "video filter2", 250 )
 103 # define vlc_CPU_capable() vlc_CPU_SSE2()
 104 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
 105     set_description(
 106             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
 107     set_capability( "video filter2", 250 )
 108 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
 109 #endif
 110     set_callbacks( Activate, NULL )
 111 vlc_module_end ()
 112
 113 /*****************************************************************************
 114  * Activate: allocate a chroma function
 115  *****************************************************************************
 116  * This function allocates and initializes a chroma function
 117  *****************************************************************************/
 118 static int Activate( vlc_object_t *p_this )
 119 {
 120     filter_t *p_filter = (filter_t *)p_this;
 121
 122     if( !vlc_CPU_capable() )
 123         return VLC_EGENERIC;
 124     if( p_filter->fmt_in.video.i_width & 1
 125      || p_filter->fmt_in.video.i_height & 1 )
 126     {
 127         return -1;
 128     }
 129
 130     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
 131      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
 132         return -1;
 133
 134     switch( p_filter->fmt_in.video.i_chroma )
 135     {
 136         case VLC_CODEC_YV12:
 137         case VLC_CODEC_I420:
 138             switch( p_filter->fmt_out.video.i_chroma )
 139             {
 140                 case VLC_CODEC_YUYV:
 141                     p_filter->pf_video_filter = I420_YUY2_Filter;
 142                     break;
 143
 144                 case VLC_CODEC_YVYU:
 145                     p_filter->pf_video_filter = I420_YVYU_Filter;
 146                     break;
 147
 148                 case VLC_CODEC_UYVY:
 149                     p_filter->pf_video_filter = I420_UYVY_Filter;
 150                     break;
 151 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 152                 case VLC_FOURCC('I','U','Y','V'):
 153                     p_filter->pf_video_filter = I420_IUYV_Filter;
 154                     break;
 155
 156                 case VLC_CODEC_CYUV:
 157                     p_filter->pf_video_filter = I420_cyuv_Filter;
 158                     break;
 159 #endif
 160
 161 #if defined (MODULE_NAME_IS_i420_yuy2)
 162                 case VLC_CODEC_Y211:
 163                     p_filter->pf_video_filter = I420_Y211_Filter;
 164                     break;
 165 #endif
 166
 167                 default:
 168                     return -1;
 169             }
 170             break;
 171
 172         default:
 173             return -1;
 174     }
 175
 176     return 0;
 177 }
 178
#if 0
/*
 * Disabled helper: returns the value produced by the x86 "rdtsc" instruction.
 * NOTE(review): the "=A" output constraint names the edx:eax register pair on
 * 32-bit x86 only; check the GCC constraint documentation before re-enabling
 * this on x86-64.
 */
static inline unsigned long long read_cycles(void)
{
    unsigned long long v;
    __asm__ __volatile__("rdtsc" : "=A" (v): );

    return v;
}
#endif
 188
 189 /* Following functions are local */
 190
/*
 * One wrapper per conversion routine, guarded by the same build conditions as
 * the prototypes at the top of the file.
 * NOTE(review): VIDEO_FILTER_WRAPPER is not defined in this file; presumably
 * it expands to the <name>_Filter function that Activate() installs -- confirm
 * against its definition.
 */
VIDEO_FILTER_WRAPPER( I420_YUY2 )
VIDEO_FILTER_WRAPPER( I420_YVYU )
VIDEO_FILTER_WRAPPER( I420_UYVY )
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
VIDEO_FILTER_WRAPPER( I420_IUYV )
VIDEO_FILTER_WRAPPER( I420_cyuv )
#endif
#if defined (MODULE_NAME_IS_i420_yuy2)
VIDEO_FILTER_WRAPPER( I420_Y211 )
#endif
 201
/*****************************************************************************
 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *****************************************************************************
 * Reads the Y, U and V planes of p_source and writes the first plane of
 * p_dest. Every outer-loop iteration handles two picture lines, which share
 * the single pair of chroma pointers p_u / p_v.
 * The code path is chosen at compile time by MODULE_NAME_IS_*: plain C, MMX,
 * SSE2 (aligned or unaligned accesses), or AltiVec with a C fallback.
 *****************************************************************************/
VLC_TARGET
static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* p_line1/p_line2: the two output lines being written;
     * p_y1/p_y2: the two luma lines being read. The "1" pointers are
     * (re)assigned from the "2" pointers at the top of every line pair. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines: line 1 takes over the previous line-2
 * position, line 2 moves one pitch further. */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 U and 16 V samples and advance both chroma pointers. */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* `a` is vec_mergeh or vec_mergel and picks which half of the loaded chroma
 * is interleaved into uv_vec. That vector is then merged with 16 luma bytes
 * of each line (Y first), producing 32 output bytes on line 1 and 32 on
 * line 2. */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* NOTE(review): the vector loop below never applies the pitch margins
     * (they are only declared inside the C fallback branch), so it looks like
     * it relies on pitch == visible pitch for every plane -- confirm. */
    if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
           ( p_filter->fmt_in.video.i_height % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
#warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
#if 0
    else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
                ( p_filter->fmt_in.video.i_height % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
#endif
    else
    {
        /* Crap, use the C version */
        /* The brace opened above is closed further down, inside the matching
         * AltiVec-only #if just after the scalar loop. */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Bytes between the end of the visible data and the start of the next
     * line, for luma, chroma (taken from plane 1) and the output plane. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* C / MMX / AltiVec-fallback path: one iteration per pair of lines. */
    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
        /* 8 pixels per iteration: four calls of the scalar macro
         * (2 pixels each, judging by the loop counts). */
        for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
        {
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
        }
#else
        /* 8 pixels per iteration through the MMX macro. */
        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
        {
            MMX_CALL( MMX_YUV420_YUYV );
        }
#endif
        /* Remaining pixels of the line (width not a multiple of 8). */
        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV420_YUYV( );
        }

        /* Skip the line padding. The p_y1 / p_line1 adjustments are
         * overwritten at the top of the next iteration. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the "use the C version" else-branch opened above. */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    /* Aligned path only when both pitches and both start pointers (luma and
     * output) are multiples of 16. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            /* 16 pixels per SSE2 call, then a scalar tail. */
            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
            }
            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YUYV( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
            }
            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YUYV( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
 419
/*****************************************************************************
 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *****************************************************************************
 * Same structure as I420_YUY2, with V placed before U in the output. Reads
 * the Y, U and V planes of p_source and writes the first plane of p_dest,
 * two picture lines per outer-loop iteration.
 * The code path is chosen at compile time by MODULE_NAME_IS_*: plain C, MMX,
 * SSE2 (aligned or unaligned accesses), or AltiVec with a C fallback.
 *****************************************************************************/
VLC_TARGET
static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* p_line1/p_line2: the two output lines being written;
     * p_y1/p_y2: the two luma lines being read. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines. */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 U and 16 V samples and advance both chroma pointers. */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* `a` is vec_mergeh or vec_mergel; chroma is interleaved V first, then
 * merged with 16 luma bytes of each line (Y first). */
#define VEC_MERGE( a ) \
    vu_vec = a( v_vec, u_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char vu_vec;
    vector unsigned char y_vec;

    /* NOTE(review): neither vector branch applies the pitch margins (they are
     * only declared inside the C fallback branch), so both look like they rely
     * on pitch == visible pitch for every plane -- confirm. */
    if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
           ( p_filter->fmt_in.video.i_height % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    /* NOTE(review): the equivalent branch in I420_YUY2 is compiled out with a
     * "broken on altivec" FIXME; confirm whether this one is affected too. */
    else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
                ( p_filter->fmt_in.video.i_height % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            /* Uses the other half of the chroma vectors loaded just above. */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* Crap, use the C version */
        /* The brace opened above is closed further down, inside the matching
         * AltiVec-only #if just after the scalar loop. */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Bytes between the end of the visible data and the start of the next
     * line, for luma, chroma (taken from plane 1) and the output plane. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* C / MMX / AltiVec-fallback path: one iteration per pair of lines. */
    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels per iteration: four scalar calls or one MMX call. */
        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
        {
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
#else
            MMX_CALL( MMX_YUV420_YVYU );
#endif
        }
        /* Remaining pixels of the line (width not a multiple of 8). */
        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV420_YVYU( );
        }

        /* Skip the line padding. The p_y1 / p_line1 adjustments are
         * overwritten at the top of the next iteration. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the "use the C version" else-branch opened above. */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* Aligned path only when both pitches and both start pointers (luma and
     * output) are multiples of 16. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            /* 16 pixels per SSE2 call, then a scalar tail. */
            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
            }
            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YVYU( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
            }
            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YVYU( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
 629
/*****************************************************************************
 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *****************************************************************************
 * Same structure as I420_YUY2, with the chroma byte placed before the luma
 * byte in the output. Reads the Y, U and V planes of p_source and writes the
 * first plane of p_dest, two picture lines per outer-loop iteration.
 * The code path is chosen at compile time by MODULE_NAME_IS_*: plain C, MMX,
 * SSE2 (aligned or unaligned accesses), or AltiVec with a C fallback.
 *****************************************************************************/
VLC_TARGET
static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* p_line1/p_line2: the two output lines being written;
     * p_y1/p_y2: the two luma lines being read. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines. */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 U and 16 V samples and advance both chroma pointers. */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* `a` is vec_mergeh or vec_mergel; chroma is interleaved U first, then
 * merged with 16 luma bytes of each line with the chroma byte leading. */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* NOTE(review): neither vector branch applies the pitch margins (they are
     * only declared inside the C fallback branch), so both look like they rely
     * on pitch == visible pitch for every plane -- confirm. */
    if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
           ( p_filter->fmt_in.video.i_height % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    /* NOTE(review): the equivalent branch in I420_YUY2 is compiled out with a
     * "broken on altivec" FIXME; confirm whether this one is affected too. */
    else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
                ( p_filter->fmt_in.video.i_height % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            /* Uses the other half of the chroma vectors loaded just above. */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* Crap, use the C version */
        /* The brace opened above is closed further down, inside the matching
         * AltiVec-only #if just after the scalar loop. */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Bytes between the end of the visible data and the start of the next
     * line, for luma, chroma (taken from plane 1) and the output plane. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* C / MMX / AltiVec-fallback path: one iteration per pair of lines. */
    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels per iteration: four scalar calls or one MMX call. */
        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
        {
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
#else
            MMX_CALL( MMX_YUV420_UYVY );
#endif
        }
        /* Remaining pixels of the line (width not a multiple of 8). */
        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
        {
            C_YUV420_UYVY( );
        }

        /* Skip the line padding. The p_y1 / p_line1 adjustments are
         * overwritten at the top of the next iteration. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the "use the C version" else-branch opened above. */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* Aligned path only when both pitches and both start pointers (luma and
     * output) are multiples of 16. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            /* 16 pixels per SSE2 call, then a scalar tail. */
            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
            }
            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_UYVY( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
            }
            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_UYVY( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
 839
 840 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 841 /*****************************************************************************
 842  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 843  *****************************************************************************/
 844 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
 845                                            picture_t *p_dest )
 846 {
 847     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
 848     /* FIXME: TODO ! */
 849     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 850 }
 851
 852 /*****************************************************************************
 853  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 854  *****************************************************************************/
 855 VLC_TARGET
 856 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
 857                                            picture_t *p_dest )
 858 {
 859     uint8_t *p_line1 = p_dest->p->p_pixels +
 860                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 861                        + p_dest->p->i_pitch;
 862     uint8_t *p_line2 = p_dest->p->p_pixels +
 863                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 864     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 865     uint8_t *p_u = p_source->U_PIXELS;
 866     uint8_t *p_v = p_source->V_PIXELS;
 867
 868     int i_x, i_y;
 869
 870     const int i_source_margin = p_source->p[0].i_pitch
 871                                  - p_source->p[0].i_visible_pitch;
 872     const int i_source_margin_c = p_source->p[1].i_pitch
 873                                  - p_source->p[1].i_visible_pitch;
 874     const int i_dest_margin = p_dest->p->i_pitch
 875                                - p_dest->p->i_visible_pitch;
 876
 877 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 878     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 879     {
 880         p_line1 -= 3 * p_dest->p->i_pitch;
 881         p_line2 -= 3 * p_dest->p->i_pitch;
 882
 883         p_y1 = p_y2;
 884         p_y2 += p_source->p[Y_PLANE].i_pitch;
 885
 886         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 887         {
 888 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 889             C_YUV420_UYVY( );
 890             C_YUV420_UYVY( );
 891             C_YUV420_UYVY( );
 892             C_YUV420_UYVY( );
 893 #else
 894             MMX_CALL( MMX_YUV420_UYVY );
 895 #endif
 896         }
 897         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 898         {
 899             C_YUV420_UYVY( );
 900         }
 901
 902         p_y1 += i_source_margin;
 903         p_y2 += i_source_margin;
 904         p_u += i_source_margin_c;
 905         p_v += i_source_margin_c;
 906         p_line1 += i_dest_margin;
 907         p_line2 += i_dest_margin;
 908     }
 909
 910 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 911     /* re-enable FPU registers */
 912     MMX_END;
 913 #endif
 914
 915 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 916     /*
 917     ** SSE2 128 bits fetch/store instructions are faster
 918     ** if memory access is 16 bytes aligned
 919     */
 920     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 921         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 922     {
 923         /* use faster SSE2 aligned fetch and store */
 924         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 925         {
 926             p_line1 = p_line2;
 927             p_line2 += p_dest->p->i_pitch;
 928
 929             p_y1 = p_y2;
 930             p_y2 += p_source->p[Y_PLANE].i_pitch;
 931
 932             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 933             {
 934                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 935             }
 936             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 937             {
 938                 C_YUV420_UYVY( );
 939             }
 940
 941             p_y1 += i_source_margin;
 942             p_y2 += i_source_margin;
 943             p_u += i_source_margin_c;
 944             p_v += i_source_margin_c;
 945             p_line1 += i_dest_margin;
 946             p_line2 += i_dest_margin;
 947         }
 948     }
 949     else
 950     {
 951         /* use slower SSE2 unaligned fetch and store */
 952         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 953         {
 954             p_line1 = p_line2;
 955             p_line2 += p_dest->p->i_pitch;
 956
 957             p_y1 = p_y2;
 958             p_y2 += p_source->p[Y_PLANE].i_pitch;
 959
 960             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 961             {
 962                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 963             }
 964             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 965             {
 966                 C_YUV420_UYVY( );
 967             }
 968
 969             p_y1 += i_source_margin;
 970             p_y2 += i_source_margin;
 971             p_u += i_source_margin_c;
 972             p_v += i_source_margin_c;
 973             p_line1 += i_dest_margin;
 974             p_line2 += i_dest_margin;
 975         }
 976     }
 977     /* make sure all SSE2 stores are visible thereafter */
 978     SSE2_END;
 979 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 980 }
 981 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 982
 983 /*****************************************************************************
 984  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 985  *****************************************************************************/
 986 #if defined (MODULE_NAME_IS_i420_yuy2)
 987 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
 988                                            picture_t *p_dest )
 989 {
 990     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 991     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 992     uint8_t *p_u = p_source->U_PIXELS;
 993     uint8_t *p_v = p_source->V_PIXELS;
 994
 995     int i_x, i_y;
 996
 997     const int i_source_margin = p_source->p[0].i_pitch
 998                                  - p_source->p[0].i_visible_pitch;
 999     const int i_source_margin_c = p_source->p[1].i_pitch
1000                                  - p_source->p[1].i_visible_pitch;
1001     const int i_dest_margin = p_dest->p->i_pitch
1002                                - p_dest->p->i_visible_pitch;
1003
1004     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
1005     {
1006         p_line1 = p_line2;
1007         p_line2 += p_dest->p->i_pitch;
1008
1009         p_y1 = p_y2;
1010         p_y2 += p_source->p[Y_PLANE].i_pitch;
1011
1012         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
1013         {
1014             C_YUV420_Y211( );
1015             C_YUV420_Y211( );
1016         }
1017
1018         p_y1 += i_source_margin;
1019         p_y2 += i_source_margin;
1020         p_u += i_source_margin_c;
1021         p_v += i_source_margin_c;
1022         p_line1 += i_dest_margin;
1023         p_line2 += i_dest_margin;
1024     }
1025 }
1026 #endif