git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16_x86.c

   1 /*****************************************************************************
   2  * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000 VLC authors and VideoLAN
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damienf@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify it
  11  * under the terms of the GNU Lesser General Public License as published by
  12  * the Free Software Foundation; either version 2.1 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18  * GNU Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this program; if not, write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 #ifdef HAVE_CONFIG_H
  26 # include "config.h"
  27 #endif
  28
  29 #include <vlc_common.h>
  30 #include <vlc_filter.h>
  31 #include <vlc_cpu.h>
  32
  33 #include "i420_rgb.h"
  34 #ifdef SSE2
  35 # include "i420_rgb_sse2.h"
  36 # define VLC_TARGET VLC_SSE
  37 #else
  38 # include "i420_rgb_mmx.h"
  39 # define VLC_TARGET VLC_MMX
  40 #endif
  41
  42 /*****************************************************************************
  43  * SetOffset: build offset array for conversion functions
  44  *****************************************************************************
  45  * This function will build an offset array used in later conversion functions.
  46  * It will also set horizontal and vertical scaling indicators.
  47  *****************************************************************************/
  48 static void SetOffset( int i_width, int i_height, int i_pic_width,
  49                        int i_pic_height, bool *pb_hscale,
  50                        unsigned int *pi_vscale, int *p_offset )
  51 {
  52     /*
  53      * Prepare horizontal offset array
  54      */
  55     if( i_pic_width - i_width == 0 )
  56     {   /* No horizontal scaling: YUV conversion is done directly to picture */
  57         *pb_hscale = 0;
  58     }
  59     else if( i_pic_width - i_width > 0 )
  60     {   /* Prepare scaling array for horizontal extension */
  61         int i_scale_count = i_pic_width;
  62
  63         *pb_hscale = 1;
  64         for( int i_x = i_width; i_x--; )
  65         {
  66             while( (i_scale_count -= i_width) > 0 )
  67             {
  68                 *p_offset++ = 0;
  69             }
  70             *p_offset++ = 1;
  71             i_scale_count += i_pic_width;
  72         }
  73     }
  74     else /* if( i_pic_width - i_width < 0 ) */
  75     {   /* Prepare scaling array for horizontal reduction */
  76         int i_scale_count = i_pic_width;
  77
  78         *pb_hscale = 1;
  79         for( int i_x = i_pic_width; i_x--; )
  80         {
  81             *p_offset = 1;
  82             while( (i_scale_count -= i_pic_width) > 0 )
  83             {
  84                 *p_offset += 1;
  85             }
  86             p_offset++;
  87             i_scale_count += i_width;
  88         }
  89     }
  90
  91     /*
  92      * Set vertical scaling indicator
  93      */
  94     if( i_pic_height - i_height == 0 )
  95         *pi_vscale = 0;
  96     else if( i_pic_height - i_height > 0 )
  97         *pi_vscale = 1;
  98     else /* if( i_pic_height - i_height < 0 ) */
  99         *pi_vscale = -1;
 100 }
 101
/*****************************************************************************
 * I420_R5G5B5: convert an I420 picture to 16-bit RGB pixels
 *****************************************************************************
 * Reads the Y, U and V planes of p_src and writes 16-bit pixels to the first
 * plane of p_dest, one source row per loop iteration.
 *  - p_filter: supplies the input and output dimensions, plus the conversion
 *              buffer and the offset array through p_sys
 *  - p_src:    source picture
 *  - p_dest:   destination picture
 *
 * The arithmetic is done by the SSE2_* / MMX_* macros and the scaling by
 * SCALE_WIDTH / SCALE_HEIGHT. None of them is defined in this file, and they
 * appear to use the local variables below by name (several locals are never
 * read in the visible code), so those names must not be changed.
 *
 * NOTE(review): the function name and the *_UNPACK_15 macros suggest a
 * 5-5-5 bit layout - confirm against the macro definitions.
 * NOTE(review): when the source width is smaller than one block (16 pixels
 * for SSE2, 8 for MMX) the main loop does not run and the rewind below moves
 * the cursors before the start of the row - confirm callers never do that.
 *****************************************************************************/
VLC_TARGET
void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
{
    /* Destination cursor (16-bit pixels) and source plane cursors */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    bool  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;  /* destination pitch minus its visible pitch */
    int         i_rewind;     /* pixels the last, partial block is moved back */
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_filter->p_sys->p_offset;
    int *       p_offset;

    /* Padding at the end of each source row, for luma and for chroma */
    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_filter->fmt_in.video.i_width,
               p_filter->fmt_in.video.i_height,
               p_filter->fmt_out.video.i_width,
               p_filter->fmt_out.video.i_height,
               &b_hscale, &i_vscale, p_offset_start );


    /*
     * Perform conversion
     */
    /* Start the vertical counter from the output height when enlarging,
     * from the input height otherwise */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_filter->fmt_out.video.i_height :
                    p_filter->fmt_in.video.i_height;

#ifdef SSE2

    /* Pixels missing to reach the next multiple of 16 (0 when the width is
     * already a multiple of 16) */
    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    /* Convert into the intermediate buffer when scaling horizontally,
     * directly into the picture otherwise */
    p_buffer = b_hscale ? p_buffer_start : p_pic;
    /* Aligned path only if both pitches, the luma pointer and the output
     * pointer are all multiples of 16 */
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((intptr_t)p_y)|
                    ((intptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            /* Full blocks: each pass advances 16 luma samples, 8 samples of
             * each chroma plane and 16 output pixels */
            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_16_ALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_15_ALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                /* Step back so that the last block ends exactly on the last
                 * pixel of the row; it can no longer be assumed aligned */
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

                SSE2_CALL (
                    SSE2_INIT_16_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_15_UNALIGNED
                );
                /* p_buffer is not advanced here: it is reassigned at the
                 * bottom of the row loop */
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            /* Skip the source row padding. Chroma padding is skipped after
             * odd rows only; NOTE(review): the chroma cursors are presumably
             * rewound after even rows inside SCALE_HEIGHT - confirm */
            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            /* Output cursor for the next row */
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            /* Full blocks: each pass advances 16 luma samples, 8 samples of
             * each chroma plane and 16 output pixels */
            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_16_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_15_UNALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                /* Step back so that the last block ends exactly on the last
                 * pixel of the row */
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

                SSE2_CALL (
                    SSE2_INIT_16_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_15_UNALIGNED
                );
                /* p_buffer is not advanced here: it is reassigned at the
                 * top of the row loop */
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            /* Skip the source row padding (chroma after odd rows only) */
            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#else /* SSE2 */

    /* Pixels missing to reach the next multiple of 8 (0 when the width is
     * already a multiple of 8) */
    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;

    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        /* Convert into the intermediate buffer when scaling horizontally,
         * directly into the picture otherwise */
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        /* Full blocks: each pass advances 8 luma samples, 4 samples of each
         * chroma plane and 8 output pixels */
        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
        {
            MMX_CALL (
                MMX_INIT_16
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_15
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            /* Step back so that the last block ends exactly on the last
             * pixel of the row */
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

            MMX_CALL (
                MMX_INIT_16
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_15
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 2 );

        /* Skip the source row padding (chroma after odd rows only) */
        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
    /* re-enable FPU registers */
    MMX_END;

#endif /* SSE2 */
}
 331
/*****************************************************************************
 * I420_R5G6B5: convert an I420 picture to 16-bit RGB pixels
 *****************************************************************************
 * Reads the Y, U and V planes of p_src and writes 16-bit pixels to the first
 * plane of p_dest, one source row per loop iteration.
 *  - p_filter: supplies the input and output dimensions, plus the conversion
 *              buffer and the offset array through p_sys
 *  - p_src:    source picture
 *  - p_dest:   destination picture
 *
 * The arithmetic is done by the SSE2_* / MMX_* macros and the scaling by
 * SCALE_WIDTH / SCALE_HEIGHT. None of them is defined in this file, and they
 * appear to use the local variables below by name (several locals are never
 * read in the visible code), so those names must not be changed.
 *
 * NOTE(review): the function name and the *_UNPACK_16 macros suggest a
 * 5-6-5 bit layout - confirm against the macro definitions.
 * NOTE(review): when the source width is smaller than one block (16 pixels
 * for SSE2, 8 for MMX) the main loop does not run and the rewind below moves
 * the cursors before the start of the row - confirm callers never do that.
 *****************************************************************************/
VLC_TARGET
void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
{
    /* Destination cursor (16-bit pixels) and source plane cursors */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    bool  b_hscale;                         /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;  /* destination pitch minus its visible pitch */
    int         i_rewind;     /* pixels the last, partial block is moved back */
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_filter->p_sys->p_offset;
    int *       p_offset;

    /* Padding at the end of each source row, for luma and for chroma */
    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_filter->fmt_in.video.i_width,
               p_filter->fmt_in.video.i_height,
               p_filter->fmt_out.video.i_width,
               p_filter->fmt_out.video.i_height,
               &b_hscale, &i_vscale, p_offset_start );


    /*
     * Perform conversion
     */
    /* Start the vertical counter from the output height when enlarging,
     * from the input height otherwise */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_filter->fmt_out.video.i_height :
                    p_filter->fmt_in.video.i_height;

#ifdef SSE2

    /* Pixels missing to reach the next multiple of 16 (0 when the width is
     * already a multiple of 16) */
    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    /* Convert into the intermediate buffer when scaling horizontally,
     * directly into the picture otherwise */
    p_buffer = b_hscale ? p_buffer_start : p_pic;
    /* Aligned path only if both pitches, the luma pointer and the output
     * pointer are all multiples of 16 */
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((intptr_t)p_y)|
                    ((intptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            /* Full blocks: each pass advances 16 luma samples, 8 samples of
             * each chroma plane and 16 output pixels */
            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_16_ALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_16_ALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                /* Step back so that the last block ends exactly on the last
                 * pixel of the row; it can no longer be assumed aligned */
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

                SSE2_CALL (
                    SSE2_INIT_16_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_16_UNALIGNED
                );
                /* p_buffer is not advanced here: it is reassigned at the
                 * bottom of the row loop */
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            /* Skip the source row padding. Chroma padding is skipped after
             * odd rows only; NOTE(review): the chroma cursors are presumably
             * rewound after even rows inside SCALE_HEIGHT - confirm */
            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            /* Output cursor for the next row */
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            /* Full blocks: each pass advances 16 luma samples, 8 samples of
             * each chroma plane and 16 output pixels */
            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
            {
                SSE2_CALL(
                    SSE2_INIT_16_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_16_UNALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                /* Step back so that the last block ends exactly on the last
                 * pixel of the row */
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

                SSE2_CALL(
                    SSE2_INIT_16_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_16_UNALIGNED
                );
                /* p_buffer is not advanced here: it is reassigned at the
                 * top of the row loop */
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 2 );

            /* Skip the source row padding (chroma after odd rows only) */
            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#else /* SSE2 */

    /* Pixels missing to reach the next multiple of 8 (0 when the width is
     * already a multiple of 8) */
    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;

    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        /* Convert into the intermediate buffer when scaling horizontally,
         * directly into the picture otherwise */
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        /* Full blocks: each pass advances 8 luma samples, 4 samples of each
         * chroma plane and 8 output pixels */
        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
        {
            MMX_CALL (
                MMX_INIT_16
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_16
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            /* Step back so that the last block ends exactly on the last
             * pixel of the row */
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

            MMX_CALL (
                MMX_INIT_16
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_16
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 2 );

        /* Skip the source row padding (chroma after odd rows only) */
        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
    /* re-enable FPU registers */
    MMX_END;

#endif /* SSE2 */
}
 561
 562 VLC_TARGET
 563 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
 564                                             picture_t *p_dest )
 565 {
 566     /* We got this one from the old arguments */
 567     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
 568     uint8_t  *p_y   = p_src->Y_PIXELS;
 569     uint8_t  *p_u   = p_src->U_PIXELS;
 570     uint8_t  *p_v   = p_src->V_PIXELS;
 571
 572     bool  b_hscale;                         /* horizontal scaling type */
 573     unsigned int i_vscale;                          /* vertical scaling type */
 574     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 575
 576     int         i_right_margin;
 577     int         i_rewind;
 578     int         i_scale_count;                       /* scale modulo counter */
 579     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 580     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
 581     /* Conversion buffer pointer */
 582     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
 583     uint32_t *  p_buffer;
 584
 585     /* Offset array pointer */
 586     int *       p_offset_start = p_filter->p_sys->p_offset;
 587     int *       p_offset;
 588
 589     const int i_source_margin = p_src->p[0].i_pitch
 590                                  - p_src->p[0].i_visible_pitch;
 591     const int i_source_margin_c = p_src->p[1].i_pitch
 592                                  - p_src->p[1].i_visible_pitch;
 593
 594     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 595
 596     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 597      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 598      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 599     SetOffset( p_filter->fmt_in.video.i_width,
 600                p_filter->fmt_in.video.i_height,
 601                p_filter->fmt_out.video.i_width,
 602                p_filter->fmt_out.video.i_height,
 603                &b_hscale, &i_vscale, p_offset_start );
 604
 605     /*
 606      * Perform conversion
 607      */
 608     i_scale_count = ( i_vscale == 1 ) ?
 609                     p_filter->fmt_out.video.i_height :
 610                     p_filter->fmt_in.video.i_height;
 611
 612 #ifdef SSE2
 613
 614     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 615
 616     /*
 617     ** SSE2 128 bits fetch/store instructions are faster
 618     ** if memory access is 16 bytes aligned
 619     */
 620
 621     p_buffer = b_hscale ? p_buffer_start : p_pic;
 622     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 623                     p_dest->p->i_pitch|
 624                     ((intptr_t)p_y)|
 625                     ((intptr_t)p_buffer))) )
 626     {
 627         /* use faster SSE2 aligned fetch and store */
 628         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 629         {
 630             p_pic_start = p_pic;
 631
 632             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 633             {
 634                 SSE2_CALL (
 635                     SSE2_INIT_32_ALIGNED
 636                     SSE2_YUV_MUL
 637                     SSE2_YUV_ADD
 638                     SSE2_UNPACK_32_ARGB_ALIGNED
 639                 );
 640                 p_y += 16;
 641                 p_u += 8;
 642                 p_v += 8;
 643                 p_buffer += 16;
 644             }
 645
 646             /* Here we do some unaligned reads and duplicate conversions, but
 647              * at least we have all the pixels */
 648             if( i_rewind )
 649             {
 650                 p_y -= i_rewind;
 651                 p_u -= i_rewind >> 1;
 652                 p_v -= i_rewind >> 1;
 653                 p_buffer -= i_rewind;
 654                 SSE2_CALL (
 655                     SSE2_INIT_32_UNALIGNED
 656                     SSE2_YUV_MUL
 657                     SSE2_YUV_ADD
 658                     SSE2_UNPACK_32_ARGB_UNALIGNED
 659                 );
 660                 p_y += 16;
 661                 p_u += 4;
 662                 p_v += 4;
 663             }
 664             SCALE_WIDTH;
 665             SCALE_HEIGHT( 420, 4 );
 666
 667             p_y += i_source_margin;
 668             if( i_y % 2 )
 669             {
 670                 p_u += i_source_margin_c;
 671                 p_v += i_source_margin_c;
 672             }
 673             p_buffer = b_hscale ? p_buffer_start : p_pic;
 674         }
 675     }
 676     else
 677     {
 678         /* use slower SSE2 unaligned fetch and store */
 679         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 680         {
 681             p_pic_start = p_pic;
 682             p_buffer = b_hscale ? p_buffer_start : p_pic;
 683
 684             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 685             {
 686                 SSE2_CALL (
 687                     SSE2_INIT_32_UNALIGNED
 688                     SSE2_YUV_MUL
 689                     SSE2_YUV_ADD
 690                     SSE2_UNPACK_32_ARGB_UNALIGNED
 691                 );
 692                 p_y += 16;
 693                 p_u += 8;
 694                 p_v += 8;
 695                 p_buffer += 16;
 696             }
 697
 698             /* Here we do some unaligned reads and duplicate conversions, but
 699              * at least we have all the pixels */
 700             if( i_rewind )
 701             {
 702                 p_y -= i_rewind;
 703                 p_u -= i_rewind >> 1;
 704                 p_v -= i_rewind >> 1;
 705                 p_buffer -= i_rewind;
 706                 SSE2_CALL (
 707                     SSE2_INIT_32_UNALIGNED
 708                     SSE2_YUV_MUL
 709                     SSE2_YUV_ADD
 710                     SSE2_UNPACK_32_ARGB_UNALIGNED
 711                 );
 712                 p_y += 16;
 713                 p_u += 8;
 714                 p_v += 8;
 715             }
 716             SCALE_WIDTH;
 717             SCALE_HEIGHT( 420, 4 );
 718
 719             p_y += i_source_margin;
 720             if( i_y % 2 )
 721             {
 722                 p_u += i_source_margin_c;
 723                 p_v += i_source_margin_c;
 724             }
 725             p_buffer = b_hscale ? p_buffer_start : p_pic;
 726         }
 727     }
 728
 729     /* make sure all SSE2 stores are visible thereafter */
 730     SSE2_END;
 731
 732 #else
 733
 734     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 735
 736     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 737     {
 738         p_pic_start = p_pic;
 739         p_buffer = b_hscale ? p_buffer_start : p_pic;
 740
 741         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 742         {
 743             MMX_CALL (
 744                 MMX_INIT_32
 745                 MMX_YUV_MUL
 746                 MMX_YUV_ADD
 747                 MMX_UNPACK_32_ARGB
 748             );
 749             p_y += 8;
 750             p_u += 4;
 751             p_v += 4;
 752             p_buffer += 8;
 753         }
 754
 755         /* Here we do some unaligned reads and duplicate conversions, but
 756          * at least we have all the pixels */
 757         if( i_rewind )
 758         {
 759             p_y -= i_rewind;
 760             p_u -= i_rewind >> 1;
 761             p_v -= i_rewind >> 1;
 762             p_buffer -= i_rewind;
 763             MMX_CALL (
 764                 MMX_INIT_32
 765                 MMX_YUV_MUL
 766                 MMX_YUV_ADD
 767                 MMX_UNPACK_32_ARGB
 768             );
 769             p_y += 8;
 770             p_u += 4;
 771             p_v += 4;
 772             p_buffer += 8;
 773         }
 774         SCALE_WIDTH;
 775         SCALE_HEIGHT( 420, 4 );
 776
 777         p_y += i_source_margin;
 778         if( i_y % 2 )
 779         {
 780             p_u += i_source_margin_c;
 781             p_v += i_source_margin_c;
 782         }
 783     }
 784
 785     /* re-enable FPU registers */
 786     MMX_END;
 787
 788 #endif
 789 }
 790
 791 VLC_TARGET
 792 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 793 {
 794     /* We got this one from the old arguments */
 795     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
 796     uint8_t  *p_y   = p_src->Y_PIXELS;
 797     uint8_t  *p_u   = p_src->U_PIXELS;
 798     uint8_t  *p_v   = p_src->V_PIXELS;
 799
 800     bool  b_hscale;                         /* horizontal scaling type */
 801     unsigned int i_vscale;                          /* vertical scaling type */
 802     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 803
 804     int         i_right_margin;
 805     int         i_rewind;
 806     int         i_scale_count;                       /* scale modulo counter */
 807     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 808     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
 809     /* Conversion buffer pointer */
 810     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
 811     uint32_t *  p_buffer;
 812
 813     /* Offset array pointer */
 814     int *       p_offset_start = p_filter->p_sys->p_offset;
 815     int *       p_offset;
 816
 817     const int i_source_margin = p_src->p[0].i_pitch
 818                                  - p_src->p[0].i_visible_pitch;
 819     const int i_source_margin_c = p_src->p[1].i_pitch
 820                                  - p_src->p[1].i_visible_pitch;
 821
 822     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 823
 824     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 825      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 826      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 827     SetOffset( p_filter->fmt_in.video.i_width,
 828                p_filter->fmt_in.video.i_height,
 829                p_filter->fmt_out.video.i_width,
 830                p_filter->fmt_out.video.i_height,
 831                &b_hscale, &i_vscale, p_offset_start );
 832
 833     /*
 834      * Perform conversion
 835      */
 836     i_scale_count = ( i_vscale == 1 ) ?
 837                     p_filter->fmt_out.video.i_height :
 838                     p_filter->fmt_in.video.i_height;
 839
 840 #ifdef SSE2
 841
 842     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 843
 844     /*
 845     ** SSE2 128 bits fetch/store instructions are faster
 846     ** if memory access is 16 bytes aligned
 847     */
 848
 849     p_buffer = b_hscale ? p_buffer_start : p_pic;
 850     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 851                     p_dest->p->i_pitch|
 852                     ((intptr_t)p_y)|
 853                     ((intptr_t)p_buffer))) )
 854     {
 855         /* use faster SSE2 aligned fetch and store */
 856         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 857         {
 858             p_pic_start = p_pic;
 859
 860             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 861             {
 862                 SSE2_CALL (
 863                     SSE2_INIT_32_ALIGNED
 864                     SSE2_YUV_MUL
 865                     SSE2_YUV_ADD
 866                     SSE2_UNPACK_32_RGBA_ALIGNED
 867                 );
 868                 p_y += 16;
 869                 p_u += 8;
 870                 p_v += 8;
 871                 p_buffer += 16;
 872             }
 873
 874             /* Here we do some unaligned reads and duplicate conversions, but
 875              * at least we have all the pixels */
 876             if( i_rewind )
 877             {
 878                 p_y -= i_rewind;
 879                 p_u -= i_rewind >> 1;
 880                 p_v -= i_rewind >> 1;
 881                 p_buffer -= i_rewind;
 882                 SSE2_CALL (
 883                     SSE2_INIT_32_UNALIGNED
 884                     SSE2_YUV_MUL
 885                     SSE2_YUV_ADD
 886                     SSE2_UNPACK_32_RGBA_UNALIGNED
 887                 );
 888                 p_y += 16;
 889                 p_u += 4;
 890                 p_v += 4;
 891             }
 892             SCALE_WIDTH;
 893             SCALE_HEIGHT( 420, 4 );
 894
 895             p_y += i_source_margin;
 896             if( i_y % 2 )
 897             {
 898                 p_u += i_source_margin_c;
 899                 p_v += i_source_margin_c;
 900             }
 901             p_buffer = b_hscale ? p_buffer_start : p_pic;
 902         }
 903     }
 904     else
 905     {
 906         /* use slower SSE2 unaligned fetch and store */
 907         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 908         {
 909             p_pic_start = p_pic;
 910             p_buffer = b_hscale ? p_buffer_start : p_pic;
 911
 912             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 913             {
 914                 SSE2_CALL (
 915                     SSE2_INIT_32_UNALIGNED
 916                     SSE2_YUV_MUL
 917                     SSE2_YUV_ADD
 918                     SSE2_UNPACK_32_RGBA_UNALIGNED
 919                 );
 920                 p_y += 16;
 921                 p_u += 8;
 922                 p_v += 8;
 923                 p_buffer += 16;
 924             }
 925
 926             /* Here we do some unaligned reads and duplicate conversions, but
 927              * at least we have all the pixels */
 928             if( i_rewind )
 929             {
 930                 p_y -= i_rewind;
 931                 p_u -= i_rewind >> 1;
 932                 p_v -= i_rewind >> 1;
 933                 p_buffer -= i_rewind;
 934                 SSE2_CALL (
 935                     SSE2_INIT_32_UNALIGNED
 936                     SSE2_YUV_MUL
 937                     SSE2_YUV_ADD
 938                     SSE2_UNPACK_32_RGBA_UNALIGNED
 939                 );
 940                 p_y += 16;
 941                 p_u += 8;
 942                 p_v += 8;
 943             }
 944             SCALE_WIDTH;
 945             SCALE_HEIGHT( 420, 4 );
 946
 947             p_y += i_source_margin;
 948             if( i_y % 2 )
 949             {
 950                 p_u += i_source_margin_c;
 951                 p_v += i_source_margin_c;
 952             }
 953             p_buffer = b_hscale ? p_buffer_start : p_pic;
 954         }
 955     }
 956
 957     /* make sure all SSE2 stores are visible thereafter */
 958     SSE2_END;
 959
 960 #else
 961
 962     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 963
 964     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 965     {
 966         p_pic_start = p_pic;
 967         p_buffer = b_hscale ? p_buffer_start : p_pic;
 968
 969         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 970         {
 971             MMX_CALL (
 972                 MMX_INIT_32
 973                 MMX_YUV_MUL
 974                 MMX_YUV_ADD
 975                 MMX_UNPACK_32_RGBA
 976             );
 977             p_y += 8;
 978             p_u += 4;
 979             p_v += 4;
 980             p_buffer += 8;
 981         }
 982
 983         /* Here we do some unaligned reads and duplicate conversions, but
 984          * at least we have all the pixels */
 985         if( i_rewind )
 986         {
 987             p_y -= i_rewind;
 988             p_u -= i_rewind >> 1;
 989             p_v -= i_rewind >> 1;
 990             p_buffer -= i_rewind;
 991             MMX_CALL (
 992                 MMX_INIT_32
 993                 MMX_YUV_MUL
 994                 MMX_YUV_ADD
 995                 MMX_UNPACK_32_RGBA
 996             );
 997             p_y += 8;
 998             p_u += 4;
 999             p_v += 4;
1000             p_buffer += 8;
1001         }
1002         SCALE_WIDTH;
1003         SCALE_HEIGHT( 420, 4 );
1004
1005         p_y += i_source_margin;
1006         if( i_y % 2 )
1007         {
1008             p_u += i_source_margin_c;
1009             p_v += i_source_margin_c;
1010         }
1011     }
1012
1013     /* re-enable FPU registers */
1014     MMX_END;
1015
1016 #endif
1017 }
1018
1019 VLC_TARGET
1020 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1021 {
1022     /* We got this one from the old arguments */
1023     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1024     uint8_t  *p_y   = p_src->Y_PIXELS;
1025     uint8_t  *p_u   = p_src->U_PIXELS;
1026     uint8_t  *p_v   = p_src->V_PIXELS;
1027
1028     bool  b_hscale;                         /* horizontal scaling type */
1029     unsigned int i_vscale;                          /* vertical scaling type */
1030     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1031
1032     int         i_right_margin;
1033     int         i_rewind;
1034     int         i_scale_count;                       /* scale modulo counter */
1035     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1036     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1037     /* Conversion buffer pointer */
1038     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1039     uint32_t *  p_buffer;
1040
1041     /* Offset array pointer */
1042     int *       p_offset_start = p_filter->p_sys->p_offset;
1043     int *       p_offset;
1044
1045     const int i_source_margin = p_src->p[0].i_pitch
1046                                  - p_src->p[0].i_visible_pitch;
1047     const int i_source_margin_c = p_src->p[1].i_pitch
1048                                  - p_src->p[1].i_visible_pitch;
1049
1050     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1051
1052     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1053      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1054      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1055     SetOffset( p_filter->fmt_in.video.i_width,
1056                p_filter->fmt_in.video.i_height,
1057                p_filter->fmt_out.video.i_width,
1058                p_filter->fmt_out.video.i_height,
1059                &b_hscale, &i_vscale, p_offset_start );
1060
1061     /*
1062      * Perform conversion
1063      */
1064     i_scale_count = ( i_vscale == 1 ) ?
1065                     p_filter->fmt_out.video.i_height :
1066                     p_filter->fmt_in.video.i_height;
1067
1068 #ifdef SSE2
1069
1070     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1071
1072     /*
1073     ** SSE2 128 bits fetch/store instructions are faster
1074     ** if memory access is 16 bytes aligned
1075     */
1076
1077     p_buffer = b_hscale ? p_buffer_start : p_pic;
1078     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1079                     p_dest->p->i_pitch|
1080                     ((intptr_t)p_y)|
1081                     ((intptr_t)p_buffer))) )
1082     {
1083         /* use faster SSE2 aligned fetch and store */
1084         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1085         {
1086             p_pic_start = p_pic;
1087
1088             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1089             {
1090                 SSE2_CALL (
1091                     SSE2_INIT_32_ALIGNED
1092                     SSE2_YUV_MUL
1093                     SSE2_YUV_ADD
1094                     SSE2_UNPACK_32_BGRA_ALIGNED
1095                 );
1096                 p_y += 16;
1097                 p_u += 8;
1098                 p_v += 8;
1099                 p_buffer += 16;
1100             }
1101
1102             /* Here we do some unaligned reads and duplicate conversions, but
1103              * at least we have all the pixels */
1104             if( i_rewind )
1105             {
1106                 p_y -= i_rewind;
1107                 p_u -= i_rewind >> 1;
1108                 p_v -= i_rewind >> 1;
1109                 p_buffer -= i_rewind;
1110                 SSE2_CALL (
1111                     SSE2_INIT_32_UNALIGNED
1112                     SSE2_YUV_MUL
1113                     SSE2_YUV_ADD
1114                     SSE2_UNPACK_32_BGRA_UNALIGNED
1115                 );
1116                 p_y += 16;
1117                 p_u += 4;
1118                 p_v += 4;
1119             }
1120             SCALE_WIDTH;
1121             SCALE_HEIGHT( 420, 4 );
1122
1123             p_y += i_source_margin;
1124             if( i_y % 2 )
1125             {
1126                 p_u += i_source_margin_c;
1127                 p_v += i_source_margin_c;
1128             }
1129             p_buffer = b_hscale ? p_buffer_start : p_pic;
1130         }
1131     }
1132     else
1133     {
1134         /* use slower SSE2 unaligned fetch and store */
1135         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1136         {
1137             p_pic_start = p_pic;
1138             p_buffer = b_hscale ? p_buffer_start : p_pic;
1139
1140             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1141             {
1142                 SSE2_CALL (
1143                     SSE2_INIT_32_UNALIGNED
1144                     SSE2_YUV_MUL
1145                     SSE2_YUV_ADD
1146                     SSE2_UNPACK_32_BGRA_UNALIGNED
1147                 );
1148                 p_y += 16;
1149                 p_u += 8;
1150                 p_v += 8;
1151                 p_buffer += 16;
1152             }
1153
1154             /* Here we do some unaligned reads and duplicate conversions, but
1155              * at least we have all the pixels */
1156             if( i_rewind )
1157             {
1158                 p_y -= i_rewind;
1159                 p_u -= i_rewind >> 1;
1160                 p_v -= i_rewind >> 1;
1161                 p_buffer -= i_rewind;
1162                 SSE2_CALL (
1163                     SSE2_INIT_32_UNALIGNED
1164                     SSE2_YUV_MUL
1165                     SSE2_YUV_ADD
1166                     SSE2_UNPACK_32_BGRA_UNALIGNED
1167                 );
1168                 p_y += 16;
1169                 p_u += 8;
1170                 p_v += 8;
1171             }
1172             SCALE_WIDTH;
1173             SCALE_HEIGHT( 420, 4 );
1174
1175             p_y += i_source_margin;
1176             if( i_y % 2 )
1177             {
1178                 p_u += i_source_margin_c;
1179                 p_v += i_source_margin_c;
1180             }
1181             p_buffer = b_hscale ? p_buffer_start : p_pic;
1182         }
1183     }
1184
1185 #else
1186
1187     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1188
1189     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1190     {
1191         p_pic_start = p_pic;
1192         p_buffer = b_hscale ? p_buffer_start : p_pic;
1193
1194         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1195         {
1196             MMX_CALL (
1197                 MMX_INIT_32
1198                 MMX_YUV_MUL
1199                 MMX_YUV_ADD
1200                 MMX_UNPACK_32_BGRA
1201             );
1202             p_y += 8;
1203             p_u += 4;
1204             p_v += 4;
1205             p_buffer += 8;
1206         }
1207
1208         /* Here we do some unaligned reads and duplicate conversions, but
1209          * at least we have all the pixels */
1210         if( i_rewind )
1211         {
1212             p_y -= i_rewind;
1213             p_u -= i_rewind >> 1;
1214             p_v -= i_rewind >> 1;
1215             p_buffer -= i_rewind;
1216             MMX_CALL (
1217                 MMX_INIT_32
1218                 MMX_YUV_MUL
1219                 MMX_YUV_ADD
1220                 MMX_UNPACK_32_BGRA
1221             );
1222             p_y += 8;
1223             p_u += 4;
1224             p_v += 4;
1225             p_buffer += 8;
1226         }
1227         SCALE_WIDTH;
1228         SCALE_HEIGHT( 420, 4 );
1229
1230         p_y += i_source_margin;
1231         if( i_y % 2 )
1232         {
1233             p_u += i_source_margin_c;
1234             p_v += i_source_margin_c;
1235         }
1236     }
1237
1238     /* re-enable FPU registers */
1239     MMX_END;
1240
1241 #endif
1242 }
1243
1244 VLC_TARGET
1245 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1246 {
1247     /* We got this one from the old arguments */
1248     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1249     uint8_t  *p_y   = p_src->Y_PIXELS;
1250     uint8_t  *p_u   = p_src->U_PIXELS;
1251     uint8_t  *p_v   = p_src->V_PIXELS;
1252
1253     bool  b_hscale;                         /* horizontal scaling type */
1254     unsigned int i_vscale;                          /* vertical scaling type */
1255     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1256
1257     int         i_right_margin;
1258     int         i_rewind;
1259     int         i_scale_count;                       /* scale modulo counter */
1260     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1261     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1262     /* Conversion buffer pointer */
1263     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1264     uint32_t *  p_buffer;
1265
1266     /* Offset array pointer */
1267     int *       p_offset_start = p_filter->p_sys->p_offset;
1268     int *       p_offset;
1269
1270     const int i_source_margin = p_src->p[0].i_pitch
1271                                  - p_src->p[0].i_visible_pitch;
1272     const int i_source_margin_c = p_src->p[1].i_pitch
1273                                  - p_src->p[1].i_visible_pitch;
1274
1275     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1276
1277     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1278      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1279      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1280     SetOffset( p_filter->fmt_in.video.i_width,
1281                p_filter->fmt_in.video.i_height,
1282                p_filter->fmt_out.video.i_width,
1283                p_filter->fmt_out.video.i_height,
1284                &b_hscale, &i_vscale, p_offset_start );
1285
1286     /*
1287      * Perform conversion
1288      */
1289     i_scale_count = ( i_vscale == 1 ) ?
1290                     p_filter->fmt_out.video.i_height :
1291                     p_filter->fmt_in.video.i_height;
1292
1293 #ifdef SSE2
1294
1295     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1296
1297     /*
1298     ** SSE2 128 bits fetch/store instructions are faster
1299     ** if memory access is 16 bytes aligned
1300     */
1301
1302     p_buffer = b_hscale ? p_buffer_start : p_pic;
1303     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1304                     p_dest->p->i_pitch|
1305                     ((intptr_t)p_y)|
1306                     ((intptr_t)p_buffer))) )
1307     {
1308         /* use faster SSE2 aligned fetch and store */
1309         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1310         {
1311             p_pic_start = p_pic;
1312
1313             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1314             {
1315                 SSE2_CALL (
1316                     SSE2_INIT_32_ALIGNED
1317                     SSE2_YUV_MUL
1318                     SSE2_YUV_ADD
1319                     SSE2_UNPACK_32_ABGR_ALIGNED
1320                 );
1321                 p_y += 16;
1322                 p_u += 8;
1323                 p_v += 8;
1324                 p_buffer += 16;
1325             }
1326
1327             /* Here we do some unaligned reads and duplicate conversions, but
1328              * at least we have all the pixels */
1329             if( i_rewind )
1330             {
1331                 p_y -= i_rewind;
1332                 p_u -= i_rewind >> 1;
1333                 p_v -= i_rewind >> 1;
1334                 p_buffer -= i_rewind;
1335                 SSE2_CALL (
1336                     SSE2_INIT_32_UNALIGNED
1337                     SSE2_YUV_MUL
1338                     SSE2_YUV_ADD
1339                     SSE2_UNPACK_32_ABGR_UNALIGNED
1340                 );
1341                 p_y += 16;
1342                 p_u += 4;
1343                 p_v += 4;
1344             }
1345             SCALE_WIDTH;
1346             SCALE_HEIGHT( 420, 4 );
1347
1348             p_y += i_source_margin;
1349             if( i_y % 2 )
1350             {
1351                 p_u += i_source_margin_c;
1352                 p_v += i_source_margin_c;
1353             }
1354             p_buffer = b_hscale ? p_buffer_start : p_pic;
1355         }
1356     }
1357     else
1358     {
1359         /* use slower SSE2 unaligned fetch and store */
1360         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1361         {
1362             p_pic_start = p_pic;
1363             p_buffer = b_hscale ? p_buffer_start : p_pic;
1364
1365             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1366             {
1367                 SSE2_CALL (
1368                     SSE2_INIT_32_UNALIGNED
1369                     SSE2_YUV_MUL
1370                     SSE2_YUV_ADD
1371                     SSE2_UNPACK_32_ABGR_UNALIGNED
1372                 );
1373                 p_y += 16;
1374                 p_u += 8;
1375                 p_v += 8;
1376                 p_buffer += 16;
1377             }
1378
1379             /* Here we do some unaligned reads and duplicate conversions, but
1380              * at least we have all the pixels */
1381             if( i_rewind )
1382             {
1383                 p_y -= i_rewind;
1384                 p_u -= i_rewind >> 1;
1385                 p_v -= i_rewind >> 1;
1386                 p_buffer -= i_rewind;
1387                 SSE2_CALL (
1388                     SSE2_INIT_32_UNALIGNED
1389                     SSE2_YUV_MUL
1390                     SSE2_YUV_ADD
1391                     SSE2_UNPACK_32_ABGR_UNALIGNED
1392                 );
1393                 p_y += 16;
1394                 p_u += 8;
1395                 p_v += 8;
1396             }
1397             SCALE_WIDTH;
1398             SCALE_HEIGHT( 420, 4 );
1399
1400             p_y += i_source_margin;
1401             if( i_y % 2 )
1402             {
1403                 p_u += i_source_margin_c;
1404                 p_v += i_source_margin_c;
1405             }
1406             p_buffer = b_hscale ? p_buffer_start : p_pic;
1407         }
1408     }
1409
1410 #else
1411
1412     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1413
1414     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1415     {
1416         p_pic_start = p_pic;
1417         p_buffer = b_hscale ? p_buffer_start : p_pic;
1418
1419         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1420         {
1421             MMX_CALL (
1422                 MMX_INIT_32
1423                 MMX_YUV_MUL
1424                 MMX_YUV_ADD
1425                 MMX_UNPACK_32_ABGR
1426             );
1427             p_y += 8;
1428             p_u += 4;
1429             p_v += 4;
1430             p_buffer += 8;
1431         }
1432
1433         /* Here we do some unaligned reads and duplicate conversions, but
1434          * at least we have all the pixels */
1435         if( i_rewind )
1436         {
1437             p_y -= i_rewind;
1438             p_u -= i_rewind >> 1;
1439             p_v -= i_rewind >> 1;
1440             p_buffer -= i_rewind;
1441             MMX_CALL (
1442                 MMX_INIT_32
1443                 MMX_YUV_MUL
1444                 MMX_YUV_ADD
1445                 MMX_UNPACK_32_ABGR
1446             );
1447             p_y += 8;
1448             p_u += 4;
1449             p_v += 4;
1450             p_buffer += 8;
1451         }
1452         SCALE_WIDTH;
1453         SCALE_HEIGHT( 420, 4 );
1454
1455         p_y += i_source_margin;
1456         if( i_y % 2 )
1457         {
1458             p_u += i_source_margin_c;
1459             p_v += i_source_margin_c;
1460         }
1461     }
1462
1463     /* re-enable FPU registers */
1464     MMX_END;
1465
1466 #endif
1467 }