git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_plugin.h>
  35 #include <vlc_filter.h>
  36 #include <vlc_cpu.h>
  37
  38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  39 #   include <altivec.h>
  40 #endif
  41
  42 #include "i420_yuy2.h"
  43
  44 #define SRC_FOURCC  "I420,IYUV,YV12"
  45
  46 #if defined (MODULE_NAME_IS_i420_yuy2)
  47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  54 #endif
  55
  56 /*****************************************************************************
  57  * Local and extern prototypes.
  58  *****************************************************************************/
  59 static int  Activate ( vlc_object_t * );
  60
  61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
  62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
  63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
  64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
  65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
  66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
  67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
  69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
  70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
  71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
  72 #endif
  73 #if defined (MODULE_NAME_IS_i420_yuy2)
  74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
  75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
  76 #endif
  77
  78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  79 /* Initialize MMX-specific constants */
  80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
  81 static const uint64_t i_80w   = 0x0000000080808080ULL;
  82 #endif
  83
  84 /*****************************************************************************
  85  * Module descriptor.
  86  *****************************************************************************/
  87 vlc_module_begin ()
  88 #if defined (MODULE_NAME_IS_i420_yuy2)
  89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  90     set_capability( "video filter2", 80 )
  91 # define vlc_CPU_capable() (true)
  92 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  93     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  94     set_capability( "video filter2", 160 )
  95 # define vlc_CPU_capable() vlc_CPU_MMX()
  96 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  97     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  98     set_capability( "video filter2", 250 )
  99 # define vlc_CPU_capable() vlc_CPU_SSE2()
 100 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
 101     set_description(
 102             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
 103     set_capability( "video filter2", 250 )
 104 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
 105 #endif
 106     set_callbacks( Activate, NULL )
 107 vlc_module_end ()
 108
 109 /*****************************************************************************
 110  * Activate: allocate a chroma function
 111  *****************************************************************************
 112  * This function allocates and initializes a chroma function
 113  *****************************************************************************/
 114 static int Activate( vlc_object_t *p_this )
 115 {
 116     filter_t *p_filter = (filter_t *)p_this;
 117
 118     if( !vlc_CPU_capable() )
 119         return VLC_EGENERIC;
 120     if( p_filter->fmt_in.video.i_width & 1
 121      || p_filter->fmt_in.video.i_height & 1 )
 122     {
 123         return -1;
 124     }
 125
 126     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
 127      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
 128         return -1;
 129
 130     switch( p_filter->fmt_in.video.i_chroma )
 131     {
 132         case VLC_CODEC_YV12:
 133         case VLC_CODEC_I420:
 134             switch( p_filter->fmt_out.video.i_chroma )
 135             {
 136                 case VLC_CODEC_YUYV:
 137                     p_filter->pf_video_filter = I420_YUY2_Filter;
 138                     break;
 139
 140                 case VLC_CODEC_YVYU:
 141                     p_filter->pf_video_filter = I420_YVYU_Filter;
 142                     break;
 143
 144                 case VLC_CODEC_UYVY:
 145                     p_filter->pf_video_filter = I420_UYVY_Filter;
 146                     break;
 147 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 148                 case VLC_FOURCC('I','U','Y','V'):
 149                     p_filter->pf_video_filter = I420_IUYV_Filter;
 150                     break;
 151
 152                 case VLC_CODEC_CYUV:
 153                     p_filter->pf_video_filter = I420_cyuv_Filter;
 154                     break;
 155 #endif
 156
 157 #if defined (MODULE_NAME_IS_i420_yuy2)
 158                 case VLC_CODEC_Y211:
 159                     p_filter->pf_video_filter = I420_Y211_Filter;
 160                     break;
 161 #endif
 162
 163                 default:
 164                     return -1;
 165             }
 166             break;
 167
 168         default:
 169             return -1;
 170     }
 171
 172     return 0;
 173 }
 174
 175 #if 0
 176 static inline unsigned long long read_cycles(void)
 177 {
 178     unsigned long long v;
 179     __asm__ __volatile__("rdtsc" : "=A" (v): );
 180
 181     return v;
 182 }
 183 #endif
 184
 185 /* Following functions are local */
 186
 187 VIDEO_FILTER_WRAPPER( I420_YUY2 )
 188 VIDEO_FILTER_WRAPPER( I420_YVYU )
 189 VIDEO_FILTER_WRAPPER( I420_UYVY )
 190 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 191 VIDEO_FILTER_WRAPPER( I420_IUYV )
 192 VIDEO_FILTER_WRAPPER( I420_cyuv )
 193 #endif
 194 #if defined (MODULE_NAME_IS_i420_yuy2)
 195 VIDEO_FILTER_WRAPPER( I420_Y211 )
 196 #endif
 197
 198 /*****************************************************************************
 199  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 200  *****************************************************************************/
 201 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
 202                                            picture_t *p_dest )
 203 {
 204     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 205     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 206     uint8_t *p_u = p_source->U_PIXELS;
 207     uint8_t *p_v = p_source->V_PIXELS;
 208
 209     int i_x, i_y;
 210
 211 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 212 #define VEC_NEXT_LINES( ) \
 213     p_line1  = p_line2; \
 214     p_line2 += p_dest->p->i_pitch; \
 215     p_y1     = p_y2; \
 216     p_y2    += p_source->p[Y_PLANE].i_pitch;
 217
 218 #define VEC_LOAD_UV( ) \
 219     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 220     v_vec = vec_ld( 0, p_v ); p_v += 16;
 221
 222 #define VEC_MERGE( a ) \
 223     uv_vec = a( u_vec, v_vec ); \
 224     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 225     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 226     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 227     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 228     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 229     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 230
 231     vector unsigned char u_vec;
 232     vector unsigned char v_vec;
 233     vector unsigned char uv_vec;
 234     vector unsigned char y_vec;
 235
 236     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 237            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 238     {
 239         /* Width is a multiple of 32, we take 2 lines at a time */
 240         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 241         {
 242             VEC_NEXT_LINES( );
 243             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 244             {
 245                 VEC_LOAD_UV( );
 246                 VEC_MERGE( vec_mergeh );
 247                 VEC_MERGE( vec_mergel );
 248             }
 249         }
 250     }
 251 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
 252 #if 0
 253     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 254                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 255     {
 256         /* Width is only a multiple of 16, we take 4 lines at a time */
 257         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 258         {
 259             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 260             VEC_NEXT_LINES( );
 261             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 262             {
 263                 VEC_LOAD_UV( );
 264                 VEC_MERGE( vec_mergeh );
 265                 VEC_MERGE( vec_mergel );
 266             }
 267
 268             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 269             VEC_LOAD_UV( );
 270             VEC_MERGE( vec_mergeh );
 271
 272             /* Line 3 and 4, pixels 0 to 16 */
 273             VEC_NEXT_LINES( );
 274             VEC_MERGE( vec_mergel );
 275
 276             /* Line 3 and 4, pixels 16 to ( width ) */
 277             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 278             {
 279                 VEC_LOAD_UV( );
 280                 VEC_MERGE( vec_mergeh );
 281                 VEC_MERGE( vec_mergel );
 282             }
 283         }
 284     }
 285 #endif
 286     else
 287     {
 288         /* Crap, use the C version */
 289 #undef VEC_NEXT_LINES
 290 #undef VEC_LOAD_UV
 291 #undef VEC_MERGE
 292 #endif
 293
 294     const int i_source_margin = p_source->p[0].i_pitch
 295                                  - p_source->p[0].i_visible_pitch;
 296     const int i_source_margin_c = p_source->p[1].i_pitch
 297                                  - p_source->p[1].i_visible_pitch;
 298     const int i_dest_margin = p_dest->p->i_pitch
 299                                - p_dest->p->i_visible_pitch;
 300
 301 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 302     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 303     {
 304         p_line1 = p_line2;
 305         p_line2 += p_dest->p->i_pitch;
 306
 307         p_y1 = p_y2;
 308         p_y2 += p_source->p[Y_PLANE].i_pitch;
 309
 310 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 311         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
 312         {
 313             C_YUV420_YUYV( );
 314             C_YUV420_YUYV( );
 315             C_YUV420_YUYV( );
 316             C_YUV420_YUYV( );
 317         }
 318 #else
 319         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 320         {
 321             MMX_CALL( MMX_YUV420_YUYV );
 322         }
 323 #endif
 324         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 325         {
 326             C_YUV420_YUYV( );
 327         }
 328
 329         p_y1 += i_source_margin;
 330         p_y2 += i_source_margin;
 331         p_u += i_source_margin_c;
 332         p_v += i_source_margin_c;
 333         p_line1 += i_dest_margin;
 334         p_line2 += i_dest_margin;
 335     }
 336
 337 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 338     /* re-enable FPU registers */
 339     MMX_END;
 340 #endif
 341
 342 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 343     }
 344 #endif
 345
 346 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 347     /*
 348     ** SSE2 128 bits fetch/store instructions are faster
 349     ** if memory access is 16 bytes aligned
 350     */
 351
 352     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 353         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 354     {
 355         /* use faster SSE2 aligned fetch and store */
 356         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 357         {
 358             p_line1 = p_line2;
 359             p_line2 += p_dest->p->i_pitch;
 360
 361             p_y1 = p_y2;
 362             p_y2 += p_source->p[Y_PLANE].i_pitch;
 363
 364             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 365             {
 366                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 367             }
 368             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 369             {
 370                 C_YUV420_YUYV( );
 371             }
 372
 373             p_y1 += i_source_margin;
 374             p_y2 += i_source_margin;
 375             p_u += i_source_margin_c;
 376             p_v += i_source_margin_c;
 377             p_line1 += i_dest_margin;
 378             p_line2 += i_dest_margin;
 379         }
 380     }
 381     else
 382     {
 383         /* use slower SSE2 unaligned fetch and store */
 384         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 385         {
 386             p_line1 = p_line2;
 387             p_line2 += p_dest->p->i_pitch;
 388
 389             p_y1 = p_y2;
 390             p_y2 += p_source->p[Y_PLANE].i_pitch;
 391
 392             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 393             {
 394                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 395             }
 396             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 397             {
 398                 C_YUV420_YUYV( );
 399             }
 400
 401             p_y1 += i_source_margin;
 402             p_y2 += i_source_margin;
 403             p_u += i_source_margin_c;
 404             p_v += i_source_margin_c;
 405             p_line1 += i_dest_margin;
 406             p_line2 += i_dest_margin;
 407         }
 408     }
 409     /* make sure all SSE2 stores are visible thereafter */
 410     SSE2_END;
 411
 412 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 413 }
 414
 415 /*****************************************************************************
 416  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 417  *****************************************************************************/
 418 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
 419                                            picture_t *p_dest )
 420 {
 421     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 422     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 423     uint8_t *p_u = p_source->U_PIXELS;
 424     uint8_t *p_v = p_source->V_PIXELS;
 425
 426     int i_x, i_y;
 427
 428 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 429 #define VEC_NEXT_LINES( ) \
 430     p_line1  = p_line2; \
 431     p_line2 += p_dest->p->i_pitch; \
 432     p_y1     = p_y2; \
 433     p_y2    += p_source->p[Y_PLANE].i_pitch;
 434
 435 #define VEC_LOAD_UV( ) \
 436     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 437     v_vec = vec_ld( 0, p_v ); p_v += 16;
 438
 439 #define VEC_MERGE( a ) \
 440     vu_vec = a( v_vec, u_vec ); \
 441     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 442     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 443     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 444     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 445     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 446     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 447
 448     vector unsigned char u_vec;
 449     vector unsigned char v_vec;
 450     vector unsigned char vu_vec;
 451     vector unsigned char y_vec;
 452
 453     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 454            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 455     {
 456         /* Width is a multiple of 32, we take 2 lines at a time */
 457         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 458         {
 459             VEC_NEXT_LINES( );
 460             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 461             {
 462                 VEC_LOAD_UV( );
 463                 VEC_MERGE( vec_mergeh );
 464                 VEC_MERGE( vec_mergel );
 465             }
 466         }
 467     }
 468     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 469                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 470     {
 471         /* Width is only a multiple of 16, we take 4 lines at a time */
 472         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 473         {
 474             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 475             VEC_NEXT_LINES( );
 476             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 477             {
 478                 VEC_LOAD_UV( );
 479                 VEC_MERGE( vec_mergeh );
 480                 VEC_MERGE( vec_mergel );
 481             }
 482
 483             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 484             VEC_LOAD_UV( );
 485             VEC_MERGE( vec_mergeh );
 486
 487             /* Line 3 and 4, pixels 0 to 16 */
 488             VEC_NEXT_LINES( );
 489             VEC_MERGE( vec_mergel );
 490
 491             /* Line 3 and 4, pixels 16 to ( width ) */
 492             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 493             {
 494                 VEC_LOAD_UV( );
 495                 VEC_MERGE( vec_mergeh );
 496                 VEC_MERGE( vec_mergel );
 497             }
 498         }
 499     }
 500     else
 501     {
 502         /* Crap, use the C version */
 503 #undef VEC_NEXT_LINES
 504 #undef VEC_LOAD_UV
 505 #undef VEC_MERGE
 506 #endif
 507
 508     const int i_source_margin = p_source->p[0].i_pitch
 509                                  - p_source->p[0].i_visible_pitch;
 510     const int i_source_margin_c = p_source->p[1].i_pitch
 511                                  - p_source->p[1].i_visible_pitch;
 512     const int i_dest_margin = p_dest->p->i_pitch
 513                                - p_dest->p->i_visible_pitch;
 514
 515 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 516     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 517     {
 518         p_line1 = p_line2;
 519         p_line2 += p_dest->p->i_pitch;
 520
 521         p_y1 = p_y2;
 522         p_y2 += p_source->p[Y_PLANE].i_pitch;
 523
 524         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 525         {
 526 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 527             C_YUV420_YVYU( );
 528             C_YUV420_YVYU( );
 529             C_YUV420_YVYU( );
 530             C_YUV420_YVYU( );
 531 #else
 532             MMX_CALL( MMX_YUV420_YVYU );
 533 #endif
 534         }
 535         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 536         {
 537             C_YUV420_YVYU( );
 538         }
 539
 540         p_y1 += i_source_margin;
 541         p_y2 += i_source_margin;
 542         p_u += i_source_margin_c;
 543         p_v += i_source_margin_c;
 544         p_line1 += i_dest_margin;
 545         p_line2 += i_dest_margin;
 546     }
 547
 548 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 549     /* re-enable FPU registers */
 550     MMX_END;
 551 #endif
 552
 553 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 554     }
 555 #endif
 556
 557 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 558     /*
 559     ** SSE2 128 bits fetch/store instructions are faster
 560     ** if memory access is 16 bytes aligned
 561     */
 562     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 563         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 564     {
 565         /* use faster SSE2 aligned fetch and store */
 566         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 567         {
 568             p_line1 = p_line2;
 569             p_line2 += p_dest->p->i_pitch;
 570
 571             p_y1 = p_y2;
 572             p_y2 += p_source->p[Y_PLANE].i_pitch;
 573
 574             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 575             {
 576                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 577             }
 578             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 579             {
 580                 C_YUV420_YVYU( );
 581             }
 582
 583             p_y1 += i_source_margin;
 584             p_y2 += i_source_margin;
 585             p_u += i_source_margin_c;
 586             p_v += i_source_margin_c;
 587             p_line1 += i_dest_margin;
 588             p_line2 += i_dest_margin;
 589         }
 590     }
 591     else
 592     {
 593         /* use slower SSE2 unaligned fetch and store */
 594         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 595         {
 596             p_line1 = p_line2;
 597             p_line2 += p_dest->p->i_pitch;
 598
 599             p_y1 = p_y2;
 600             p_y2 += p_source->p[Y_PLANE].i_pitch;
 601
 602             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 603             {
 604                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 605             }
 606             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 607             {
 608                 C_YUV420_YVYU( );
 609             }
 610
 611             p_y1 += i_source_margin;
 612             p_y2 += i_source_margin;
 613             p_u += i_source_margin_c;
 614             p_v += i_source_margin_c;
 615             p_line1 += i_dest_margin;
 616             p_line2 += i_dest_margin;
 617         }
 618     }
 619     /* make sure all SSE2 stores are visible thereafter */
 620     SSE2_END;
 621 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 622 }
 623
 624 /*****************************************************************************
 625  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 626  *****************************************************************************/
 627 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
 628                                            picture_t *p_dest )
 629 {
 630     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 631     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 632     uint8_t *p_u = p_source->U_PIXELS;
 633     uint8_t *p_v = p_source->V_PIXELS;
 634
 635     int i_x, i_y;
 636
 637 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 638 #define VEC_NEXT_LINES( ) \
 639     p_line1  = p_line2; \
 640     p_line2 += p_dest->p->i_pitch; \
 641     p_y1     = p_y2; \
 642     p_y2    += p_source->p[Y_PLANE].i_pitch;
 643
 644 #define VEC_LOAD_UV( ) \
 645     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 646     v_vec = vec_ld( 0, p_v ); p_v += 16;
 647
 648 #define VEC_MERGE( a ) \
 649     uv_vec = a( u_vec, v_vec ); \
 650     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 651     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 652     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 653     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 654     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 655     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 656
 657     vector unsigned char u_vec;
 658     vector unsigned char v_vec;
 659     vector unsigned char uv_vec;
 660     vector unsigned char y_vec;
 661
 662     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 663            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 664     {
 665         /* Width is a multiple of 32, we take 2 lines at a time */
 666         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 667         {
 668             VEC_NEXT_LINES( );
 669             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 670             {
 671                 VEC_LOAD_UV( );
 672                 VEC_MERGE( vec_mergeh );
 673                 VEC_MERGE( vec_mergel );
 674             }
 675         }
 676     }
 677     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 678                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 679     {
 680         /* Width is only a multiple of 16, we take 4 lines at a time */
 681         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 682         {
 683             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 684             VEC_NEXT_LINES( );
 685             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 686             {
 687                 VEC_LOAD_UV( );
 688                 VEC_MERGE( vec_mergeh );
 689                 VEC_MERGE( vec_mergel );
 690             }
 691
 692             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 693             VEC_LOAD_UV( );
 694             VEC_MERGE( vec_mergeh );
 695
 696             /* Line 3 and 4, pixels 0 to 16 */
 697             VEC_NEXT_LINES( );
 698             VEC_MERGE( vec_mergel );
 699
 700             /* Line 3 and 4, pixels 16 to ( width ) */
 701             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 702             {
 703                 VEC_LOAD_UV( );
 704                 VEC_MERGE( vec_mergeh );
 705                 VEC_MERGE( vec_mergel );
 706             }
 707         }
 708     }
 709     else
 710     {
 711         /* Crap, use the C version */
 712 #undef VEC_NEXT_LINES
 713 #undef VEC_LOAD_UV
 714 #undef VEC_MERGE
 715 #endif
 716
 717     const int i_source_margin = p_source->p[0].i_pitch
 718                                  - p_source->p[0].i_visible_pitch;
 719     const int i_source_margin_c = p_source->p[1].i_pitch
 720                                  - p_source->p[1].i_visible_pitch;
 721     const int i_dest_margin = p_dest->p->i_pitch
 722                                - p_dest->p->i_visible_pitch;
 723
 724 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 725     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 726     {
 727         p_line1 = p_line2;
 728         p_line2 += p_dest->p->i_pitch;
 729
 730         p_y1 = p_y2;
 731         p_y2 += p_source->p[Y_PLANE].i_pitch;
 732
 733         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 734         {
 735 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 736             C_YUV420_UYVY( );
 737             C_YUV420_UYVY( );
 738             C_YUV420_UYVY( );
 739             C_YUV420_UYVY( );
 740 #else
 741             MMX_CALL( MMX_YUV420_UYVY );
 742 #endif
 743         }
 744         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
 745         {
 746             C_YUV420_UYVY( );
 747         }
 748
 749         p_y1 += i_source_margin;
 750         p_y2 += i_source_margin;
 751         p_u += i_source_margin_c;
 752         p_v += i_source_margin_c;
 753         p_line1 += i_dest_margin;
 754         p_line2 += i_dest_margin;
 755     }
 756
 757 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 758     /* re-enable FPU registers */
 759     MMX_END;
 760 #endif
 761
 762 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 763     }
 764 #endif
 765
 766 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 767     /*
 768     ** SSE2 128 bits fetch/store instructions are faster
 769     ** if memory access is 16 bytes aligned
 770     */
 771     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 772         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 773     {
 774         /* use faster SSE2 aligned fetch and store */
 775         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 776         {
 777             p_line1 = p_line2;
 778             p_line2 += p_dest->p->i_pitch;
 779
 780             p_y1 = p_y2;
 781             p_y2 += p_source->p[Y_PLANE].i_pitch;
 782
 783             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 784             {
 785                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 786             }
 787             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 788             {
 789                 C_YUV420_UYVY( );
 790             }
 791
 792             p_y1 += i_source_margin;
 793             p_y2 += i_source_margin;
 794             p_u += i_source_margin_c;
 795             p_v += i_source_margin_c;
 796             p_line1 += i_dest_margin;
 797             p_line2 += i_dest_margin;
 798         }
 799     }
 800     else
 801     {
 802         /* use slower SSE2 unaligned fetch and store */
 803         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 804         {
 805             p_line1 = p_line2;
 806             p_line2 += p_dest->p->i_pitch;
 807
 808             p_y1 = p_y2;
 809             p_y2 += p_source->p[Y_PLANE].i_pitch;
 810
 811             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 812             {
 813                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 814             }
 815             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 816             {
 817                 C_YUV420_UYVY( );
 818             }
 819
 820             p_y1 += i_source_margin;
 821             p_y2 += i_source_margin;
 822             p_u += i_source_margin_c;
 823             p_v += i_source_margin_c;
 824             p_line1 += i_dest_margin;
 825             p_line2 += i_dest_margin;
 826         }
 827     }
 828     /* make sure all SSE2 stores are visible thereafter */
 829     SSE2_END;
 830 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 831 }
 832
 833 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 834 /*****************************************************************************
 835  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 836  *****************************************************************************/
 837 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
 838                                            picture_t *p_dest )
 839 {
 840     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
 841     /* FIXME: TODO ! */
 842     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 843 }
 844
 845 /*****************************************************************************
 846  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 847  *****************************************************************************/
 848 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
 849                                            picture_t *p_dest )
 850 {
 851     uint8_t *p_line1 = p_dest->p->p_pixels +
 852                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 853                        + p_dest->p->i_pitch;
 854     uint8_t *p_line2 = p_dest->p->p_pixels +
 855                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 856     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 857     uint8_t *p_u = p_source->U_PIXELS;
 858     uint8_t *p_v = p_source->V_PIXELS;
 859
 860     int i_x, i_y;
 861
 862     const int i_source_margin = p_source->p[0].i_pitch
 863                                  - p_source->p[0].i_visible_pitch;
 864     const int i_source_margin_c = p_source->p[1].i_pitch
 865                                  - p_source->p[1].i_visible_pitch;
 866     const int i_dest_margin = p_dest->p->i_pitch
 867                                - p_dest->p->i_visible_pitch;
 868
 869 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 870     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 871     {
 872         p_line1 -= 3 * p_dest->p->i_pitch;
 873         p_line2 -= 3 * p_dest->p->i_pitch;
 874
 875         p_y1 = p_y2;
 876         p_y2 += p_source->p[Y_PLANE].i_pitch;
 877
 878         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 879         {
 880 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 881             C_YUV420_UYVY( );
 882             C_YUV420_UYVY( );
 883             C_YUV420_UYVY( );
 884             C_YUV420_UYVY( );
 885 #else
 886             MMX_CALL( MMX_YUV420_UYVY );
 887 #endif
 888         }
 889         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 890         {
 891             C_YUV420_UYVY( );
 892         }
 893
 894         p_y1 += i_source_margin;
 895         p_y2 += i_source_margin;
 896         p_u += i_source_margin_c;
 897         p_v += i_source_margin_c;
 898         p_line1 += i_dest_margin;
 899         p_line2 += i_dest_margin;
 900     }
 901
 902 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 903     /* re-enable FPU registers */
 904     MMX_END;
 905 #endif
 906
 907 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 908     /*
 909     ** SSE2 128 bits fetch/store instructions are faster
 910     ** if memory access is 16 bytes aligned
 911     */
 912     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 913         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 914     {
 915         /* use faster SSE2 aligned fetch and store */
 916         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 917         {
 918             p_line1 = p_line2;
 919             p_line2 += p_dest->p->i_pitch;
 920
 921             p_y1 = p_y2;
 922             p_y2 += p_source->p[Y_PLANE].i_pitch;
 923
 924             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 925             {
 926                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 927             }
 928             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 929             {
 930                 C_YUV420_UYVY( );
 931             }
 932
 933             p_y1 += i_source_margin;
 934             p_y2 += i_source_margin;
 935             p_u += i_source_margin_c;
 936             p_v += i_source_margin_c;
 937             p_line1 += i_dest_margin;
 938             p_line2 += i_dest_margin;
 939         }
 940     }
 941     else
 942     {
 943         /* use slower SSE2 unaligned fetch and store */
 944         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 945         {
 946             p_line1 = p_line2;
 947             p_line2 += p_dest->p->i_pitch;
 948
 949             p_y1 = p_y2;
 950             p_y2 += p_source->p[Y_PLANE].i_pitch;
 951
 952             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 953             {
 954                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 955             }
 956             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 957             {
 958                 C_YUV420_UYVY( );
 959             }
 960
 961             p_y1 += i_source_margin;
 962             p_y2 += i_source_margin;
 963             p_u += i_source_margin_c;
 964             p_v += i_source_margin_c;
 965             p_line1 += i_dest_margin;
 966             p_line2 += i_dest_margin;
 967         }
 968     }
 969     /* make sure all SSE2 stores are visible thereafter */
 970     SSE2_END;
 971 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 972 }
 973 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 974
 975 /*****************************************************************************
 976  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 977  *****************************************************************************/
 978 #if defined (MODULE_NAME_IS_i420_yuy2)
 979 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
 980                                            picture_t *p_dest )
 981 {
 982     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 983     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 984     uint8_t *p_u = p_source->U_PIXELS;
 985     uint8_t *p_v = p_source->V_PIXELS;
 986
 987     int i_x, i_y;
 988
 989     const int i_source_margin = p_source->p[0].i_pitch
 990                                  - p_source->p[0].i_visible_pitch;
 991     const int i_source_margin_c = p_source->p[1].i_pitch
 992                                  - p_source->p[1].i_visible_pitch;
 993     const int i_dest_margin = p_dest->p->i_pitch
 994                                - p_dest->p->i_visible_pitch;
 995
 996     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 997     {
 998         p_line1 = p_line2;
 999         p_line2 += p_dest->p->i_pitch;
1000
1001         p_y1 = p_y2;
1002         p_y2 += p_source->p[Y_PLANE].i_pitch;
1003
1004         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
1005         {
1006             C_YUV420_Y211( );
1007             C_YUV420_Y211( );
1008         }
1009
1010         p_y1 += i_source_margin;
1011         p_y2 += i_source_margin;
1012         p_u += i_source_margin_c;
1013         p_v += i_source_margin_c;
1014         p_line1 += i_dest_margin;
1015         p_line2 += i_dest_margin;
1016     }
1017 }
1018 #endif