]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16_x86.c
mediacodec: skip prerolled frames
[vlc] / modules / video_chroma / i420_rgb16_x86.c
1 /*****************************************************************************
2  * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 VLC authors and VideoLAN
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 #ifdef HAVE_CONFIG_H
26 # include "config.h"
27 #endif
28
29 #include <vlc_common.h>
30 #include <vlc_filter.h>
31 #include <vlc_cpu.h>
32
33 #include "i420_rgb.h"
34 #ifdef SSE2
35 # include "i420_rgb_sse2.h"
36 # define VLC_TARGET VLC_SSE
37 #else
38 # include "i420_rgb_mmx.h"
39 # define VLC_TARGET VLC_MMX
40 #endif
41
/*****************************************************************************
 * SetOffset: build offset array for conversion functions
 *****************************************************************************
 * Compares the source size with the destination (picture) size, reports
 * which kinds of scaling are needed and, when the widths differ, fills the
 * horizontal offset array.
 *
 * i_width, i_height:          source size
 * i_pic_width, i_pic_height:  destination size
 * pb_hscale: receives false when both widths are equal (p_offset is then
 *            left untouched), true otherwise
 * pi_vscale: receives 0 when both heights are equal, 1 when the destination
 *            is taller, (unsigned int)-1 when it is shorter
 * p_offset:  receives i_pic_width entries when the widths differ. Each entry
 *            is a source increment for one destination pixel: 0 or 1 when
 *            enlarging, 1 or more when reducing.
 *
 * Non-positive widths produce no offset entries instead of running a
 * countdown loop that would write past the end of p_offset.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    /*
     * Prepare horizontal offset array
     */
    if( i_pic_width == i_width )
    {   /* No horizontal scaling: YUV conversion is done directly to picture */
        *pb_hscale = false;
    }
    else if( i_pic_width > i_width )
    {   /* Horizontal extension: some source pixels are repeated (offset 0) */
        int i_scale_count = i_pic_width;

        *pb_hscale = true;
        for( int i_x = 0; i_x < i_width; i_x++ )
        {
            /* Stay on the same source pixel while the error term is positive */
            while( (i_scale_count -= i_width) > 0 )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_scale_count += i_pic_width;
        }
    }
    else
    {   /* Horizontal reduction: some source pixels are skipped (offset > 1) */
        int i_scale_count = i_pic_width;

        *pb_hscale = true;
        for( int i_x = 0; i_x < i_pic_width; i_x++ )
        {
            int i_step = 1;

            /* Skip one more source pixel while the error term is positive */
            while( (i_scale_count -= i_pic_width) > 0 )
            {
                i_step++;
            }
            *p_offset++ = i_step;
            i_scale_count += i_width;
        }
    }

    /*
     * Set vertical scaling indicator
     */
    if( i_pic_height == i_height )
        *pi_vscale = 0;
    else if( i_pic_height > i_height )
        *pi_vscale = 1;
    else
        *pi_vscale = (unsigned int)-1;   /* same value the callers' switch sees as -1 */
}
101
/*****************************************************************************
 * I420_R5G5B5: convert a planar YUV 4:2:0 picture to 16-bit pixels
 *****************************************************************************
 * Reads the Y, U and V planes of p_src and writes 16-bit pixels to the first
 * plane of p_dest, one source line per loop iteration. The pixel packing is
 * done by the *_UNPACK_15 macros (defined outside this file; presumably
 * 5-5-5 RGB -- confirm in i420_rgb_sse2.h / i420_rgb_mmx.h).
 *
 * Each line is converted either straight into the destination or, when
 * SetOffset() reports horizontal scaling, into p_filter->p_sys->p_buffer.
 * SCALE_WIDTH and SCALE_HEIGHT (macros defined elsewhere) then finish the
 * line; they presumably use the locals that look unused here (p_offset,
 * p_pic_start, i_right_margin, i_scale_count, i_chroma_width) -- confirm
 * against i420_rgb.h before renaming or removing any of them.
 *
 * The SSE2 build handles 16 pixels per step, the MMX build 8.
 *****************************************************************************/
102 VLC_TARGET
103 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
104 {
105     /* Destination pixels (16 bits each) and the three source planes */
106     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
107     uint8_t  *p_y   = p_src->Y_PIXELS;
108     uint8_t  *p_u   = p_src->U_PIXELS;
109     uint8_t  *p_v   = p_src->V_PIXELS;
110
111     bool  b_hscale;                         /* horizontal scaling type */
112     unsigned int i_vscale;                          /* vertical scaling type */
113     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
114
115     int         i_right_margin;      /* destination pitch minus visible pitch */
116     int         i_rewind;  /* pixels missing to reach a full SIMD group, or 0 */
117     int         i_scale_count;                       /* scale modulo counter */
118     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
119     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
120
121     /* Conversion buffer pointer */
122     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
123     uint16_t *  p_buffer;
124
125     /* Offset array pointer */
126     int *       p_offset_start = p_filter->p_sys->p_offset;
127     int *       p_offset;
128
129     const int i_source_margin = p_src->p[0].i_pitch
130                                  - p_src->p[0].i_visible_pitch;
131     const int i_source_margin_c = p_src->p[1].i_pitch
132                                  - p_src->p[1].i_visible_pitch;
133
134     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
135
136     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
137      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
138      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
139     SetOffset( p_filter->fmt_in.video.i_width,
140                p_filter->fmt_in.video.i_height,
141                p_filter->fmt_out.video.i_width,
142                p_filter->fmt_out.video.i_height,
143                &b_hscale, &i_vscale, p_offset_start );
144
145
146     /*
147      * Perform conversion: the counter starts at the output height when the
148      * picture is enlarged vertically, at the input height otherwise
149      */
149     i_scale_count = ( i_vscale == 1 ) ?
150                     p_filter->fmt_out.video.i_height :
151                     p_filter->fmt_in.video.i_height;
152
153 #ifdef SSE2
154
155     i_rewind = (-p_filter->fmt_in.video.i_width) & 15; /* 0 when width % 16 == 0 */
156
157     /*
158     ** SSE2 128 bits fetch/store instructions are faster
159     ** if memory access is 16 bytes aligned
160     */
161
162     p_buffer = b_hscale ? p_buffer_start : p_pic;
163     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
164                     p_dest->p->i_pitch|
165                     ((intptr_t)p_y)|
166                     ((intptr_t)p_buffer))) )
167     {
168         /* use faster SSE2 aligned fetch and store */
169         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
170         {
171             p_pic_start = p_pic;
172
173             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
174             {
175                 SSE2_CALL (
176                     SSE2_INIT_16_ALIGNED
177                     SSE2_YUV_MUL
178                     SSE2_YUV_ADD
179                     SSE2_UNPACK_15_ALIGNED
180                 );
181                 p_y += 16;
182                 p_u += 8;
183                 p_v += 8;
184                 p_buffer += 16;
185             }
186             /* Width is not a multiple of 16: step back i_rewind pixels and
187              * convert one last 16-pixel group with the unaligned macros,
188              * re-converting pixels that were already done */
188             if( i_rewind )
189             {
190                 p_y -= i_rewind;
191                 p_u -= i_rewind >> 1;
192                 p_v -= i_rewind >> 1;
193                 p_buffer -= i_rewind;
194
195                 SSE2_CALL (
196                     SSE2_INIT_16_UNALIGNED
197                     SSE2_YUV_MUL
198                     SSE2_YUV_ADD
199                     SSE2_UNPACK_15_UNALIGNED
200                 );
201                 p_y += 16;
202                 p_u += 8;
203                 p_v += 8;
204             }
205             SCALE_WIDTH;
206             SCALE_HEIGHT( 420, 2 );
207
208             p_y += i_source_margin;
209             if( i_y % 2 ) /* chroma margin is added after odd lines only */
210             {
211                 p_u += i_source_margin_c;
212                 p_v += i_source_margin_c;
213             }
214             p_buffer = b_hscale ? p_buffer_start : p_pic; /* target of next line */
215         }
216     }
217     else
218     {
219         /* use slower SSE2 unaligned fetch and store */
220         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
221         {
222             p_pic_start = p_pic;
223             p_buffer = b_hscale ? p_buffer_start : p_pic;
224
225             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
226             {
227                 SSE2_CALL (
228                     SSE2_INIT_16_UNALIGNED
229                     SSE2_YUV_MUL
230                     SSE2_YUV_ADD
231                     SSE2_UNPACK_15_UNALIGNED
232                 );
233                 p_y += 16;
234                 p_u += 8;
235                 p_v += 8;
236                 p_buffer += 16;
237             }
238             /* Width is not a multiple of 16: step back i_rewind pixels and
239              * convert one last 16-pixel group, re-converting pixels that
240              * were already done */
240             if( i_rewind )
241             {
242                 p_y -= i_rewind;
243                 p_u -= i_rewind >> 1;
244                 p_v -= i_rewind >> 1;
245                 p_buffer -= i_rewind;
246
247                 SSE2_CALL (
248                     SSE2_INIT_16_UNALIGNED
249                     SSE2_YUV_MUL
250                     SSE2_YUV_ADD
251                     SSE2_UNPACK_15_UNALIGNED
252                 );
253                 p_y += 16;
254                 p_u += 8;
255                 p_v += 8;
256             }
257             SCALE_WIDTH;
258             SCALE_HEIGHT( 420, 2 );
259
260             p_y += i_source_margin;
261             if( i_y % 2 ) /* chroma margin is added after odd lines only */
262             {
263                 p_u += i_source_margin_c;
264                 p_v += i_source_margin_c;
265             }
266             p_buffer = b_hscale ? p_buffer_start : p_pic;
267         }
268     }
269
270     /* make sure all SSE2 stores are visible thereafter */
271     SSE2_END;
272
273 #else /* SSE2 */
274
275     i_rewind = (-p_filter->fmt_in.video.i_width) & 7; /* 0 when width % 8 == 0 */
276
277     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
278     {
279         p_pic_start = p_pic;
280         p_buffer = b_hscale ? p_buffer_start : p_pic;
281
282         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
283         {
284             MMX_CALL (
285                 MMX_INIT_16
286                 MMX_YUV_MUL
287                 MMX_YUV_ADD
288                 MMX_UNPACK_15
289             );
290             p_y += 8;
291             p_u += 4;
292             p_v += 4;
293             p_buffer += 8;
294         }
295
296         /* Width is not a multiple of 8: step back i_rewind pixels and convert
297          * one last 8-pixel group, re-converting pixels that were already done */
298         if( i_rewind )
299         {
300             p_y -= i_rewind;
301             p_u -= i_rewind >> 1;
302             p_v -= i_rewind >> 1;
303             p_buffer -= i_rewind;
304
305             MMX_CALL (
306                 MMX_INIT_16
307                 MMX_YUV_MUL
308                 MMX_YUV_ADD
309                 MMX_UNPACK_15
310             );
311             p_y += 8;
312             p_u += 4;
313             p_v += 4;
314             p_buffer += 8;
315         }
316         SCALE_WIDTH;
317         SCALE_HEIGHT( 420, 2 );
318
319         p_y += i_source_margin;
320         if( i_y % 2 ) /* chroma margin is added after odd lines only */
321         {
322             p_u += i_source_margin_c;
323             p_v += i_source_margin_c;
324         }
325     }
326     /* re-enable FPU registers */
327     MMX_END;
328
329 #endif /* SSE2 */
330 }
331
/*****************************************************************************
 * I420_R5G6B5: convert a planar YUV 4:2:0 picture to 16-bit pixels
 *****************************************************************************
 * Same structure as I420_R5G5B5, but the pixels are packed by the
 * *_UNPACK_16 macros (defined outside this file; presumably 5-6-5 RGB --
 * confirm in i420_rgb_sse2.h / i420_rgb_mmx.h).
 *
 * Each source line is converted either straight into the first plane of
 * p_dest or, when SetOffset() reports horizontal scaling, into
 * p_filter->p_sys->p_buffer. SCALE_WIDTH and SCALE_HEIGHT (macros defined
 * elsewhere) then finish the line; they presumably use the locals that look
 * unused here (p_offset, p_pic_start, i_right_margin, i_scale_count,
 * i_chroma_width) -- confirm against i420_rgb.h before touching them.
 *
 * The SSE2 build handles 16 pixels per step, the MMX build 8.
 *****************************************************************************/
332 VLC_TARGET
333 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
334 {
335     /* Destination pixels (16 bits each) and the three source planes */
336     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
337     uint8_t  *p_y   = p_src->Y_PIXELS;
338     uint8_t  *p_u   = p_src->U_PIXELS;
339     uint8_t  *p_v   = p_src->V_PIXELS;
340
341     bool  b_hscale;                         /* horizontal scaling type */
342     unsigned int i_vscale;                          /* vertical scaling type */
343     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
344
345     int         i_right_margin;      /* destination pitch minus visible pitch */
346     int         i_rewind;  /* pixels missing to reach a full SIMD group, or 0 */
347     int         i_scale_count;                       /* scale modulo counter */
348     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
349     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
350
351     /* Conversion buffer pointer */
352     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
353     uint16_t *  p_buffer;
354
355     /* Offset array pointer */
356     int *       p_offset_start = p_filter->p_sys->p_offset;
357     int *       p_offset;
358
359     const int i_source_margin = p_src->p[0].i_pitch
360                                  - p_src->p[0].i_visible_pitch;
361     const int i_source_margin_c = p_src->p[1].i_pitch
362                                  - p_src->p[1].i_visible_pitch;
363
364     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
365
366     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
367      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
368      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
369     SetOffset( p_filter->fmt_in.video.i_width,
370                p_filter->fmt_in.video.i_height,
371                p_filter->fmt_out.video.i_width,
372                p_filter->fmt_out.video.i_height,
373                &b_hscale, &i_vscale, p_offset_start );
374
375
376     /*
377      * Perform conversion: the counter starts at the output height when the
378      * picture is enlarged vertically, at the input height otherwise
379      */
379     i_scale_count = ( i_vscale == 1 ) ?
380                     p_filter->fmt_out.video.i_height :
381                     p_filter->fmt_in.video.i_height;
382
383 #ifdef SSE2
384
385     i_rewind = (-p_filter->fmt_in.video.i_width) & 15; /* 0 when width % 16 == 0 */
386
387     /*
388     ** SSE2 128 bits fetch/store instructions are faster
389     ** if memory access is 16 bytes aligned
390     */
391
392     p_buffer = b_hscale ? p_buffer_start : p_pic;
393     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
394                     p_dest->p->i_pitch|
395                     ((intptr_t)p_y)|
396                     ((intptr_t)p_buffer))) )
397     {
398         /* use faster SSE2 aligned fetch and store */
399         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
400         {
401             p_pic_start = p_pic;
402
403             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
404             {
405                 SSE2_CALL (
406                     SSE2_INIT_16_ALIGNED
407                     SSE2_YUV_MUL
408                     SSE2_YUV_ADD
409                     SSE2_UNPACK_16_ALIGNED
410                 );
411                 p_y += 16;
412                 p_u += 8;
413                 p_v += 8;
414                 p_buffer += 16;
415             }
416             /* Width is not a multiple of 16: step back i_rewind pixels and
417              * convert one last 16-pixel group with the unaligned macros,
418              * re-converting pixels that were already done */
418             if( i_rewind )
419             {
420                 p_y -= i_rewind;
421                 p_u -= i_rewind >> 1;
422                 p_v -= i_rewind >> 1;
423                 p_buffer -= i_rewind;
424
425                 SSE2_CALL (
426                     SSE2_INIT_16_UNALIGNED
427                     SSE2_YUV_MUL
428                     SSE2_YUV_ADD
429                     SSE2_UNPACK_16_UNALIGNED
430                 );
431                 p_y += 16;
432                 p_u += 8;
433                 p_v += 8;
434             }
435             SCALE_WIDTH;
436             SCALE_HEIGHT( 420, 2 );
437
438             p_y += i_source_margin;
439             if( i_y % 2 ) /* chroma margin is added after odd lines only */
440             {
441                 p_u += i_source_margin_c;
442                 p_v += i_source_margin_c;
443             }
444             p_buffer = b_hscale ? p_buffer_start : p_pic; /* target of next line */
445         }
446     }
447     else
448     {
449         /* use slower SSE2 unaligned fetch and store */
450         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
451         {
452             p_pic_start = p_pic;
453             p_buffer = b_hscale ? p_buffer_start : p_pic;
454
455             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
456             {
457                 SSE2_CALL(
458                     SSE2_INIT_16_UNALIGNED
459                     SSE2_YUV_MUL
460                     SSE2_YUV_ADD
461                     SSE2_UNPACK_16_UNALIGNED
462                 );
463                 p_y += 16;
464                 p_u += 8;
465                 p_v += 8;
466                 p_buffer += 16;
467             }
468             /* Width is not a multiple of 16: step back i_rewind pixels and
469              * convert one last 16-pixel group, re-converting pixels that
470              * were already done */
470             if( i_rewind )
471             {
472                 p_y -= i_rewind;
473                 p_u -= i_rewind >> 1;
474                 p_v -= i_rewind >> 1;
475                 p_buffer -= i_rewind;
476
477                 SSE2_CALL(
478                     SSE2_INIT_16_UNALIGNED
479                     SSE2_YUV_MUL
480                     SSE2_YUV_ADD
481                     SSE2_UNPACK_16_UNALIGNED
482                 );
483                 p_y += 16;
484                 p_u += 8;
485                 p_v += 8;
486             }
487             SCALE_WIDTH;
488             SCALE_HEIGHT( 420, 2 );
489
490             p_y += i_source_margin;
491             if( i_y % 2 ) /* chroma margin is added after odd lines only */
492             {
493                 p_u += i_source_margin_c;
494                 p_v += i_source_margin_c;
495             }
496             p_buffer = b_hscale ? p_buffer_start : p_pic;
497         }
498     }
499
500     /* make sure all SSE2 stores are visible thereafter */
501     SSE2_END;
502
503 #else /* SSE2 */
504
505     i_rewind = (-p_filter->fmt_in.video.i_width) & 7; /* 0 when width % 8 == 0 */
506
507     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
508     {
509         p_pic_start = p_pic;
510         p_buffer = b_hscale ? p_buffer_start : p_pic;
511
512         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
513         {
514             MMX_CALL (
515                 MMX_INIT_16
516                 MMX_YUV_MUL
517                 MMX_YUV_ADD
518                 MMX_UNPACK_16
519             );
520             p_y += 8;
521             p_u += 4;
522             p_v += 4;
523             p_buffer += 8;
524         }
525
526         /* Width is not a multiple of 8: step back i_rewind pixels and convert
527          * one last 8-pixel group, re-converting pixels that were already done */
528         if( i_rewind )
529         {
530             p_y -= i_rewind;
531             p_u -= i_rewind >> 1;
532             p_v -= i_rewind >> 1;
533             p_buffer -= i_rewind;
534
535             MMX_CALL (
536                 MMX_INIT_16
537                 MMX_YUV_MUL
538                 MMX_YUV_ADD
539                 MMX_UNPACK_16
540             );
541             p_y += 8;
542             p_u += 4;
543             p_v += 4;
544             p_buffer += 8;
545         }
546         SCALE_WIDTH;
547         SCALE_HEIGHT( 420, 2 );
548
549         p_y += i_source_margin;
550         if( i_y % 2 ) /* chroma margin is added after odd lines only */
551         {
552             p_u += i_source_margin_c;
553             p_v += i_source_margin_c;
554         }
555     }
556     /* re-enable FPU registers */
557     MMX_END;
558
559 #endif /* SSE2 */
560 }
561
562 VLC_TARGET
563 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
564                                             picture_t *p_dest )
565 {
566     /* We got this one from the old arguments */
567     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
568     uint8_t  *p_y   = p_src->Y_PIXELS;
569     uint8_t  *p_u   = p_src->U_PIXELS;
570     uint8_t  *p_v   = p_src->V_PIXELS;
571
572     bool  b_hscale;                         /* horizontal scaling type */
573     unsigned int i_vscale;                          /* vertical scaling type */
574     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
575
576     int         i_right_margin;
577     int         i_rewind;
578     int         i_scale_count;                       /* scale modulo counter */
579     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
580     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
581     /* Conversion buffer pointer */
582     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
583     uint32_t *  p_buffer;
584
585     /* Offset array pointer */
586     int *       p_offset_start = p_filter->p_sys->p_offset;
587     int *       p_offset;
588
589     const int i_source_margin = p_src->p[0].i_pitch
590                                  - p_src->p[0].i_visible_pitch;
591     const int i_source_margin_c = p_src->p[1].i_pitch
592                                  - p_src->p[1].i_visible_pitch;
593
594     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
595
596     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
597      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
598      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
599     SetOffset( p_filter->fmt_in.video.i_width,
600                p_filter->fmt_in.video.i_height,
601                p_filter->fmt_out.video.i_width,
602                p_filter->fmt_out.video.i_height,
603                &b_hscale, &i_vscale, p_offset_start );
604
605     /*
606      * Perform conversion
607      */
608     i_scale_count = ( i_vscale == 1 ) ?
609                     p_filter->fmt_out.video.i_height :
610                     p_filter->fmt_in.video.i_height;
611
612 #ifdef SSE2
613
614     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
615
616     /*
617     ** SSE2 128 bits fetch/store instructions are faster
618     ** if memory access is 16 bytes aligned
619     */
620
621     p_buffer = b_hscale ? p_buffer_start : p_pic;
622     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
623                     p_dest->p->i_pitch|
624                     ((intptr_t)p_y)|
625                     ((intptr_t)p_buffer))) )
626     {
627         /* use faster SSE2 aligned fetch and store */
628         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
629         {
630             p_pic_start = p_pic;
631
632             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
633             {
634                 SSE2_CALL (
635                     SSE2_INIT_32_ALIGNED
636                     SSE2_YUV_MUL
637                     SSE2_YUV_ADD
638                     SSE2_UNPACK_32_ARGB_ALIGNED
639                 );
640                 p_y += 16;
641                 p_u += 8;
642                 p_v += 8;
643                 p_buffer += 16;
644             }
645
646             /* Here we do some unaligned reads and duplicate conversions, but
647              * at least we have all the pixels */
648             if( i_rewind )
649             {
650                 p_y -= i_rewind;
651                 p_u -= i_rewind >> 1;
652                 p_v -= i_rewind >> 1;
653                 p_buffer -= i_rewind;
654                 SSE2_CALL (
655                     SSE2_INIT_32_UNALIGNED
656                     SSE2_YUV_MUL
657                     SSE2_YUV_ADD
658                     SSE2_UNPACK_32_ARGB_UNALIGNED
659                 );
660                 p_y += 16;
661                 p_u += 4;
662                 p_v += 4;
663             }
664             SCALE_WIDTH;
665             SCALE_HEIGHT( 420, 4 );
666
667             p_y += i_source_margin;
668             if( i_y % 2 )
669             {
670                 p_u += i_source_margin_c;
671                 p_v += i_source_margin_c;
672             }
673             p_buffer = b_hscale ? p_buffer_start : p_pic;
674         }
675     }
676     else
677     {
678         /* use slower SSE2 unaligned fetch and store */
679         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
680         {
681             p_pic_start = p_pic;
682             p_buffer = b_hscale ? p_buffer_start : p_pic;
683
684             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
685             {
686                 SSE2_CALL (
687                     SSE2_INIT_32_UNALIGNED
688                     SSE2_YUV_MUL
689                     SSE2_YUV_ADD
690                     SSE2_UNPACK_32_ARGB_UNALIGNED
691                 );
692                 p_y += 16;
693                 p_u += 8;
694                 p_v += 8;
695                 p_buffer += 16;
696             }
697
698             /* Here we do some unaligned reads and duplicate conversions, but
699              * at least we have all the pixels */
700             if( i_rewind )
701             {
702                 p_y -= i_rewind;
703                 p_u -= i_rewind >> 1;
704                 p_v -= i_rewind >> 1;
705                 p_buffer -= i_rewind;
706                 SSE2_CALL (
707                     SSE2_INIT_32_UNALIGNED
708                     SSE2_YUV_MUL
709                     SSE2_YUV_ADD
710                     SSE2_UNPACK_32_ARGB_UNALIGNED
711                 );
712                 p_y += 16;
713                 p_u += 8;
714                 p_v += 8;
715             }
716             SCALE_WIDTH;
717             SCALE_HEIGHT( 420, 4 );
718
719             p_y += i_source_margin;
720             if( i_y % 2 )
721             {
722                 p_u += i_source_margin_c;
723                 p_v += i_source_margin_c;
724             }
725             p_buffer = b_hscale ? p_buffer_start : p_pic;
726         }
727     }
728
729     /* make sure all SSE2 stores are visible thereafter */
730     SSE2_END;
731
732 #else
733
734     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
735
736     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
737     {
738         p_pic_start = p_pic;
739         p_buffer = b_hscale ? p_buffer_start : p_pic;
740
741         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
742         {
743             MMX_CALL (
744                 MMX_INIT_32
745                 MMX_YUV_MUL
746                 MMX_YUV_ADD
747                 MMX_UNPACK_32_ARGB
748             );
749             p_y += 8;
750             p_u += 4;
751             p_v += 4;
752             p_buffer += 8;
753         }
754
755         /* Here we do some unaligned reads and duplicate conversions, but
756          * at least we have all the pixels */
757         if( i_rewind )
758         {
759             p_y -= i_rewind;
760             p_u -= i_rewind >> 1;
761             p_v -= i_rewind >> 1;
762             p_buffer -= i_rewind;
763             MMX_CALL (
764                 MMX_INIT_32
765                 MMX_YUV_MUL
766                 MMX_YUV_ADD
767                 MMX_UNPACK_32_ARGB
768             );
769             p_y += 8;
770             p_u += 4;
771             p_v += 4;
772             p_buffer += 8;
773         }
774         SCALE_WIDTH;
775         SCALE_HEIGHT( 420, 4 );
776
777         p_y += i_source_margin;
778         if( i_y % 2 )
779         {
780             p_u += i_source_margin_c;
781             p_v += i_source_margin_c;
782         }
783     }
784
785     /* re-enable FPU registers */
786     MMX_END;
787
788 #endif
789 }
790
791 VLC_TARGET
792 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
793 {
794     /* We got this one from the old arguments */
795     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
796     uint8_t  *p_y   = p_src->Y_PIXELS;
797     uint8_t  *p_u   = p_src->U_PIXELS;
798     uint8_t  *p_v   = p_src->V_PIXELS;
799
800     bool  b_hscale;                         /* horizontal scaling type */
801     unsigned int i_vscale;                          /* vertical scaling type */
802     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
803
804     int         i_right_margin;
805     int         i_rewind;
806     int         i_scale_count;                       /* scale modulo counter */
807     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
808     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
809     /* Conversion buffer pointer */
810     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
811     uint32_t *  p_buffer;
812
813     /* Offset array pointer */
814     int *       p_offset_start = p_filter->p_sys->p_offset;
815     int *       p_offset;
816
817     const int i_source_margin = p_src->p[0].i_pitch
818                                  - p_src->p[0].i_visible_pitch;
819     const int i_source_margin_c = p_src->p[1].i_pitch
820                                  - p_src->p[1].i_visible_pitch;
821
822     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
823
824     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
825      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
826      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
827     SetOffset( p_filter->fmt_in.video.i_width,
828                p_filter->fmt_in.video.i_height,
829                p_filter->fmt_out.video.i_width,
830                p_filter->fmt_out.video.i_height,
831                &b_hscale, &i_vscale, p_offset_start );
832
833     /*
834      * Perform conversion
835      */
836     i_scale_count = ( i_vscale == 1 ) ?
837                     p_filter->fmt_out.video.i_height :
838                     p_filter->fmt_in.video.i_height;
839
840 #ifdef SSE2
841
842     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
843
844     /*
845     ** SSE2 128 bits fetch/store instructions are faster
846     ** if memory access is 16 bytes aligned
847     */
848
849     p_buffer = b_hscale ? p_buffer_start : p_pic;
850     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
851                     p_dest->p->i_pitch|
852                     ((intptr_t)p_y)|
853                     ((intptr_t)p_buffer))) )
854     {
855         /* use faster SSE2 aligned fetch and store */
856         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
857         {
858             p_pic_start = p_pic;
859
860             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
861             {
862                 SSE2_CALL (
863                     SSE2_INIT_32_ALIGNED
864                     SSE2_YUV_MUL
865                     SSE2_YUV_ADD
866                     SSE2_UNPACK_32_RGBA_ALIGNED
867                 );
868                 p_y += 16;
869                 p_u += 8;
870                 p_v += 8;
871                 p_buffer += 16;
872             }
873
874             /* Here we do some unaligned reads and duplicate conversions, but
875              * at least we have all the pixels */
876             if( i_rewind )
877             {
878                 p_y -= i_rewind;
879                 p_u -= i_rewind >> 1;
880                 p_v -= i_rewind >> 1;
881                 p_buffer -= i_rewind;
882                 SSE2_CALL (
883                     SSE2_INIT_32_UNALIGNED
884                     SSE2_YUV_MUL
885                     SSE2_YUV_ADD
886                     SSE2_UNPACK_32_RGBA_UNALIGNED
887                 );
888                 p_y += 16;
889                 p_u += 4;
890                 p_v += 4;
891             }
892             SCALE_WIDTH;
893             SCALE_HEIGHT( 420, 4 );
894
895             p_y += i_source_margin;
896             if( i_y % 2 )
897             {
898                 p_u += i_source_margin_c;
899                 p_v += i_source_margin_c;
900             }
901             p_buffer = b_hscale ? p_buffer_start : p_pic;
902         }
903     }
904     else
905     {
906         /* use slower SSE2 unaligned fetch and store */
907         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
908         {
909             p_pic_start = p_pic;
910             p_buffer = b_hscale ? p_buffer_start : p_pic;
911
912             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
913             {
914                 SSE2_CALL (
915                     SSE2_INIT_32_UNALIGNED
916                     SSE2_YUV_MUL
917                     SSE2_YUV_ADD
918                     SSE2_UNPACK_32_RGBA_UNALIGNED
919                 );
920                 p_y += 16;
921                 p_u += 8;
922                 p_v += 8;
923                 p_buffer += 16;
924             }
925
926             /* Here we do some unaligned reads and duplicate conversions, but
927              * at least we have all the pixels */
928             if( i_rewind )
929             {
930                 p_y -= i_rewind;
931                 p_u -= i_rewind >> 1;
932                 p_v -= i_rewind >> 1;
933                 p_buffer -= i_rewind;
934                 SSE2_CALL (
935                     SSE2_INIT_32_UNALIGNED
936                     SSE2_YUV_MUL
937                     SSE2_YUV_ADD
938                     SSE2_UNPACK_32_RGBA_UNALIGNED
939                 );
940                 p_y += 16;
941                 p_u += 8;
942                 p_v += 8;
943             }
944             SCALE_WIDTH;
945             SCALE_HEIGHT( 420, 4 );
946
947             p_y += i_source_margin;
948             if( i_y % 2 )
949             {
950                 p_u += i_source_margin_c;
951                 p_v += i_source_margin_c;
952             }
953             p_buffer = b_hscale ? p_buffer_start : p_pic;
954         }
955     }
956
957     /* make sure all SSE2 stores are visible thereafter */
958     SSE2_END;
959
960 #else
961
962     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
963
964     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
965     {
966         p_pic_start = p_pic;
967         p_buffer = b_hscale ? p_buffer_start : p_pic;
968
969         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
970         {
971             MMX_CALL (
972                 MMX_INIT_32
973                 MMX_YUV_MUL
974                 MMX_YUV_ADD
975                 MMX_UNPACK_32_RGBA
976             );
977             p_y += 8;
978             p_u += 4;
979             p_v += 4;
980             p_buffer += 8;
981         }
982
983         /* Here we do some unaligned reads and duplicate conversions, but
984          * at least we have all the pixels */
985         if( i_rewind )
986         {
987             p_y -= i_rewind;
988             p_u -= i_rewind >> 1;
989             p_v -= i_rewind >> 1;
990             p_buffer -= i_rewind;
991             MMX_CALL (
992                 MMX_INIT_32
993                 MMX_YUV_MUL
994                 MMX_YUV_ADD
995                 MMX_UNPACK_32_RGBA
996             );
997             p_y += 8;
998             p_u += 4;
999             p_v += 4;
1000             p_buffer += 8;
1001         }
1002         SCALE_WIDTH;
1003         SCALE_HEIGHT( 420, 4 );
1004
1005         p_y += i_source_margin;
1006         if( i_y % 2 )
1007         {
1008             p_u += i_source_margin_c;
1009             p_v += i_source_margin_c;
1010         }
1011     }
1012
1013     /* re-enable FPU registers */
1014     MMX_END;
1015
1016 #endif
1017 }
1018
1019 VLC_TARGET
1020 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1021 {
1022     /* We got this one from the old arguments */
1023     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1024     uint8_t  *p_y   = p_src->Y_PIXELS;
1025     uint8_t  *p_u   = p_src->U_PIXELS;
1026     uint8_t  *p_v   = p_src->V_PIXELS;
1027
1028     bool  b_hscale;                         /* horizontal scaling type */
1029     unsigned int i_vscale;                          /* vertical scaling type */
1030     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1031
1032     int         i_right_margin;
1033     int         i_rewind;
1034     int         i_scale_count;                       /* scale modulo counter */
1035     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1036     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1037     /* Conversion buffer pointer */
1038     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1039     uint32_t *  p_buffer;
1040
1041     /* Offset array pointer */
1042     int *       p_offset_start = p_filter->p_sys->p_offset;
1043     int *       p_offset;
1044
1045     const int i_source_margin = p_src->p[0].i_pitch
1046                                  - p_src->p[0].i_visible_pitch;
1047     const int i_source_margin_c = p_src->p[1].i_pitch
1048                                  - p_src->p[1].i_visible_pitch;
1049
1050     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1051
1052     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1053      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1054      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1055     SetOffset( p_filter->fmt_in.video.i_width,
1056                p_filter->fmt_in.video.i_height,
1057                p_filter->fmt_out.video.i_width,
1058                p_filter->fmt_out.video.i_height,
1059                &b_hscale, &i_vscale, p_offset_start );
1060
1061     /*
1062      * Perform conversion
1063      */
1064     i_scale_count = ( i_vscale == 1 ) ?
1065                     p_filter->fmt_out.video.i_height :
1066                     p_filter->fmt_in.video.i_height;
1067
1068 #ifdef SSE2
1069
1070     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1071
1072     /*
1073     ** SSE2 128 bits fetch/store instructions are faster
1074     ** if memory access is 16 bytes aligned
1075     */
1076
1077     p_buffer = b_hscale ? p_buffer_start : p_pic;
1078     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1079                     p_dest->p->i_pitch|
1080                     ((intptr_t)p_y)|
1081                     ((intptr_t)p_buffer))) )
1082     {
1083         /* use faster SSE2 aligned fetch and store */
1084         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1085         {
1086             p_pic_start = p_pic;
1087
1088             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1089             {
1090                 SSE2_CALL (
1091                     SSE2_INIT_32_ALIGNED
1092                     SSE2_YUV_MUL
1093                     SSE2_YUV_ADD
1094                     SSE2_UNPACK_32_BGRA_ALIGNED
1095                 );
1096                 p_y += 16;
1097                 p_u += 8;
1098                 p_v += 8;
1099                 p_buffer += 16;
1100             }
1101
1102             /* Here we do some unaligned reads and duplicate conversions, but
1103              * at least we have all the pixels */
1104             if( i_rewind )
1105             {
1106                 p_y -= i_rewind;
1107                 p_u -= i_rewind >> 1;
1108                 p_v -= i_rewind >> 1;
1109                 p_buffer -= i_rewind;
1110                 SSE2_CALL (
1111                     SSE2_INIT_32_UNALIGNED
1112                     SSE2_YUV_MUL
1113                     SSE2_YUV_ADD
1114                     SSE2_UNPACK_32_BGRA_UNALIGNED
1115                 );
1116                 p_y += 16;
1117                 p_u += 4;
1118                 p_v += 4;
1119             }
1120             SCALE_WIDTH;
1121             SCALE_HEIGHT( 420, 4 );
1122
1123             p_y += i_source_margin;
1124             if( i_y % 2 )
1125             {
1126                 p_u += i_source_margin_c;
1127                 p_v += i_source_margin_c;
1128             }
1129             p_buffer = b_hscale ? p_buffer_start : p_pic;
1130         }
1131     }
1132     else
1133     {
1134         /* use slower SSE2 unaligned fetch and store */
1135         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1136         {
1137             p_pic_start = p_pic;
1138             p_buffer = b_hscale ? p_buffer_start : p_pic;
1139
1140             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1141             {
1142                 SSE2_CALL (
1143                     SSE2_INIT_32_UNALIGNED
1144                     SSE2_YUV_MUL
1145                     SSE2_YUV_ADD
1146                     SSE2_UNPACK_32_BGRA_UNALIGNED
1147                 );
1148                 p_y += 16;
1149                 p_u += 8;
1150                 p_v += 8;
1151                 p_buffer += 16;
1152             }
1153
1154             /* Here we do some unaligned reads and duplicate conversions, but
1155              * at least we have all the pixels */
1156             if( i_rewind )
1157             {
1158                 p_y -= i_rewind;
1159                 p_u -= i_rewind >> 1;
1160                 p_v -= i_rewind >> 1;
1161                 p_buffer -= i_rewind;
1162                 SSE2_CALL (
1163                     SSE2_INIT_32_UNALIGNED
1164                     SSE2_YUV_MUL
1165                     SSE2_YUV_ADD
1166                     SSE2_UNPACK_32_BGRA_UNALIGNED
1167                 );
1168                 p_y += 16;
1169                 p_u += 8;
1170                 p_v += 8;
1171             }
1172             SCALE_WIDTH;
1173             SCALE_HEIGHT( 420, 4 );
1174
1175             p_y += i_source_margin;
1176             if( i_y % 2 )
1177             {
1178                 p_u += i_source_margin_c;
1179                 p_v += i_source_margin_c;
1180             }
1181             p_buffer = b_hscale ? p_buffer_start : p_pic;
1182         }
1183     }
1184
1185 #else
1186
1187     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1188
1189     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1190     {
1191         p_pic_start = p_pic;
1192         p_buffer = b_hscale ? p_buffer_start : p_pic;
1193
1194         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1195         {
1196             MMX_CALL (
1197                 MMX_INIT_32
1198                 MMX_YUV_MUL
1199                 MMX_YUV_ADD
1200                 MMX_UNPACK_32_BGRA
1201             );
1202             p_y += 8;
1203             p_u += 4;
1204             p_v += 4;
1205             p_buffer += 8;
1206         }
1207
1208         /* Here we do some unaligned reads and duplicate conversions, but
1209          * at least we have all the pixels */
1210         if( i_rewind )
1211         {
1212             p_y -= i_rewind;
1213             p_u -= i_rewind >> 1;
1214             p_v -= i_rewind >> 1;
1215             p_buffer -= i_rewind;
1216             MMX_CALL (
1217                 MMX_INIT_32
1218                 MMX_YUV_MUL
1219                 MMX_YUV_ADD
1220                 MMX_UNPACK_32_BGRA
1221             );
1222             p_y += 8;
1223             p_u += 4;
1224             p_v += 4;
1225             p_buffer += 8;
1226         }
1227         SCALE_WIDTH;
1228         SCALE_HEIGHT( 420, 4 );
1229
1230         p_y += i_source_margin;
1231         if( i_y % 2 )
1232         {
1233             p_u += i_source_margin_c;
1234             p_v += i_source_margin_c;
1235         }
1236     }
1237
1238     /* re-enable FPU registers */
1239     MMX_END;
1240
1241 #endif
1242 }
1243
1244 VLC_TARGET
1245 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1246 {
1247     /* We got this one from the old arguments */
1248     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1249     uint8_t  *p_y   = p_src->Y_PIXELS;
1250     uint8_t  *p_u   = p_src->U_PIXELS;
1251     uint8_t  *p_v   = p_src->V_PIXELS;
1252
1253     bool  b_hscale;                         /* horizontal scaling type */
1254     unsigned int i_vscale;                          /* vertical scaling type */
1255     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1256
1257     int         i_right_margin;
1258     int         i_rewind;
1259     int         i_scale_count;                       /* scale modulo counter */
1260     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1261     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1262     /* Conversion buffer pointer */
1263     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1264     uint32_t *  p_buffer;
1265
1266     /* Offset array pointer */
1267     int *       p_offset_start = p_filter->p_sys->p_offset;
1268     int *       p_offset;
1269
1270     const int i_source_margin = p_src->p[0].i_pitch
1271                                  - p_src->p[0].i_visible_pitch;
1272     const int i_source_margin_c = p_src->p[1].i_pitch
1273                                  - p_src->p[1].i_visible_pitch;
1274
1275     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1276
1277     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1278      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1279      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1280     SetOffset( p_filter->fmt_in.video.i_width,
1281                p_filter->fmt_in.video.i_height,
1282                p_filter->fmt_out.video.i_width,
1283                p_filter->fmt_out.video.i_height,
1284                &b_hscale, &i_vscale, p_offset_start );
1285
1286     /*
1287      * Perform conversion
1288      */
1289     i_scale_count = ( i_vscale == 1 ) ?
1290                     p_filter->fmt_out.video.i_height :
1291                     p_filter->fmt_in.video.i_height;
1292
1293 #ifdef SSE2
1294
1295     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1296
1297     /*
1298     ** SSE2 128 bits fetch/store instructions are faster
1299     ** if memory access is 16 bytes aligned
1300     */
1301
1302     p_buffer = b_hscale ? p_buffer_start : p_pic;
1303     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1304                     p_dest->p->i_pitch|
1305                     ((intptr_t)p_y)|
1306                     ((intptr_t)p_buffer))) )
1307     {
1308         /* use faster SSE2 aligned fetch and store */
1309         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1310         {
1311             p_pic_start = p_pic;
1312
1313             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1314             {
1315                 SSE2_CALL (
1316                     SSE2_INIT_32_ALIGNED
1317                     SSE2_YUV_MUL
1318                     SSE2_YUV_ADD
1319                     SSE2_UNPACK_32_ABGR_ALIGNED
1320                 );
1321                 p_y += 16;
1322                 p_u += 8;
1323                 p_v += 8;
1324                 p_buffer += 16;
1325             }
1326
1327             /* Here we do some unaligned reads and duplicate conversions, but
1328              * at least we have all the pixels */
1329             if( i_rewind )
1330             {
1331                 p_y -= i_rewind;
1332                 p_u -= i_rewind >> 1;
1333                 p_v -= i_rewind >> 1;
1334                 p_buffer -= i_rewind;
1335                 SSE2_CALL (
1336                     SSE2_INIT_32_UNALIGNED
1337                     SSE2_YUV_MUL
1338                     SSE2_YUV_ADD
1339                     SSE2_UNPACK_32_ABGR_UNALIGNED
1340                 );
1341                 p_y += 16;
1342                 p_u += 4;
1343                 p_v += 4;
1344             }
1345             SCALE_WIDTH;
1346             SCALE_HEIGHT( 420, 4 );
1347
1348             p_y += i_source_margin;
1349             if( i_y % 2 )
1350             {
1351                 p_u += i_source_margin_c;
1352                 p_v += i_source_margin_c;
1353             }
1354             p_buffer = b_hscale ? p_buffer_start : p_pic;
1355         }
1356     }
1357     else
1358     {
1359         /* use slower SSE2 unaligned fetch and store */
1360         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1361         {
1362             p_pic_start = p_pic;
1363             p_buffer = b_hscale ? p_buffer_start : p_pic;
1364
1365             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1366             {
1367                 SSE2_CALL (
1368                     SSE2_INIT_32_UNALIGNED
1369                     SSE2_YUV_MUL
1370                     SSE2_YUV_ADD
1371                     SSE2_UNPACK_32_ABGR_UNALIGNED
1372                 );
1373                 p_y += 16;
1374                 p_u += 8;
1375                 p_v += 8;
1376                 p_buffer += 16;
1377             }
1378
1379             /* Here we do some unaligned reads and duplicate conversions, but
1380              * at least we have all the pixels */
1381             if( i_rewind )
1382             {
1383                 p_y -= i_rewind;
1384                 p_u -= i_rewind >> 1;
1385                 p_v -= i_rewind >> 1;
1386                 p_buffer -= i_rewind;
1387                 SSE2_CALL (
1388                     SSE2_INIT_32_UNALIGNED
1389                     SSE2_YUV_MUL
1390                     SSE2_YUV_ADD
1391                     SSE2_UNPACK_32_ABGR_UNALIGNED
1392                 );
1393                 p_y += 16;
1394                 p_u += 8;
1395                 p_v += 8;
1396             }
1397             SCALE_WIDTH;
1398             SCALE_HEIGHT( 420, 4 );
1399
1400             p_y += i_source_margin;
1401             if( i_y % 2 )
1402             {
1403                 p_u += i_source_margin_c;
1404                 p_v += i_source_margin_c;
1405             }
1406             p_buffer = b_hscale ? p_buffer_start : p_pic;
1407         }
1408     }
1409
1410 #else
1411
1412     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1413
1414     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1415     {
1416         p_pic_start = p_pic;
1417         p_buffer = b_hscale ? p_buffer_start : p_pic;
1418
1419         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1420         {
1421             MMX_CALL (
1422                 MMX_INIT_32
1423                 MMX_YUV_MUL
1424                 MMX_YUV_ADD
1425                 MMX_UNPACK_32_ABGR
1426             );
1427             p_y += 8;
1428             p_u += 4;
1429             p_v += 4;
1430             p_buffer += 8;
1431         }
1432
1433         /* Here we do some unaligned reads and duplicate conversions, but
1434          * at least we have all the pixels */
1435         if( i_rewind )
1436         {
1437             p_y -= i_rewind;
1438             p_u -= i_rewind >> 1;
1439             p_v -= i_rewind >> 1;
1440             p_buffer -= i_rewind;
1441             MMX_CALL (
1442                 MMX_INIT_32
1443                 MMX_YUV_MUL
1444                 MMX_YUV_ADD
1445                 MMX_UNPACK_32_ABGR
1446             );
1447             p_y += 8;
1448             p_u += 4;
1449             p_v += 4;
1450             p_buffer += 8;
1451         }
1452         SCALE_WIDTH;
1453         SCALE_HEIGHT( 420, 4 );
1454
1455         p_y += i_source_margin;
1456         if( i_y % 2 )
1457         {
1458             p_u += i_source_margin_c;
1459             p_v += i_source_margin_c;
1460         }
1461     }
1462
1463     /* re-enable FPU registers */
1464     MMX_END;
1465
1466 #endif
1467 }