]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
i420_rgb: clobber lists for MMX and SSE2
[vlc] / modules / video_chroma / i420_rgb16.c
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_filter.h>
35 #include <vlc_cpu.h>
36
37 #include "i420_rgb.h"
38 #if defined (MODULE_NAME_IS_i420_rgb)
39 #   include "i420_rgb_c.h"
40 #   define VLC_TARGET
41 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
42 #   include "../mmx/i420_rgb_mmx.h"
43 #   define VLC_TARGET VLC_MMX
44 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
45 #   include "../mmx/i420_rgb_mmx.h"
46 #   define VLC_TARGET VLC_SSE
47 #endif
48
/* Forward declaration of the scaling set-up helper; its definition is not in
 * this part of the file. As used by the converters below, the arguments are,
 * in order: input width, input height, output width, output height, then
 * the addresses of the horizontal-scaling flag and the vertical-scaling type
 * (both read by the callers after the call), and the offset table taken from
 * p_sys->p_offset (presumably filled in by SetOffset -- TODO confirm). */
49 static void SetOffset( int, int, int, int, bool *,
50                        unsigned int *, int * );
51
52 #if defined (MODULE_NAME_IS_i420_rgb)
53 /*****************************************************************************
54  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
55  *****************************************************************************
56  * Horizontal alignment needed:
57  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
58  *  - output: 1 pixel (2 bytes), margins allowed
59  * Vertical alignment needed:
60  *  - input: 2 lines (2 Y lines, 1 U/V line)
61  *  - output: 1 line
62  *****************************************************************************/
/*
 * Convert the planar YUV 4:2:0 picture p_src into the 16 bits-per-pixel
 * picture p_dest, using the dithering tables set up below.
 *
 * p_filter: supplies the input/output video dimensions, the output
 *           i_rrshift value, and p_sys (p_rgb16 lookup table, p_buffer
 *           line buffer, p_offset scaling offsets).
 * p_src:    source picture; its Y, U and V planes are read.
 * p_dest:   destination picture; its first plane is written as uint16_t.
 *
 * NOTE(review): CONVERT_*_PIXEL_DITHER, SCALE_WIDTH and SCALE_HEIGHT are
 * macros defined outside this file. They presumably use the locals declared
 * here by name (several are never referenced directly in this function) and
 * advance p_y/p_u/p_v/p_buffer/p_pic -- confirm before renaming or
 * reordering anything.
 */
63 void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src,
64                                                 picture_t *p_dest )
65 {
66     /* We got this one from the old arguments */
67     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
68     uint8_t  *p_y   = p_src->Y_PIXELS;
69     uint8_t  *p_u   = p_src->U_PIXELS;
70     uint8_t  *p_v   = p_src->V_PIXELS;
71
72     bool   b_hscale;                        /* horizontal scaling type */
73     unsigned int i_vscale;                          /* vertical scaling type */
74     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
75     unsigned int i_real_y;                                          /* y % 4 */
76
77     int         i_right_margin;
78     int         i_rewind;
79     int         i_scale_count;                       /* scale modulo counter */
80     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
81     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
82     int         i_uval, i_vval;                           /* U and V samples */
83     int         i_red, i_green, i_blue;          /* U and V modified samples */
84     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
85     uint16_t *  p_ybase;                     /* Y dependent conversion table */
86
87     /* Conversion buffer pointer */
88     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
89     uint16_t *  p_buffer;
90
91     /* Offset array pointer */
92     int *       p_offset_start = p_filter->p_sys->p_offset;
93     int *       p_offset;
94
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane (p[0]) and a chroma plane (p[1]) */
95     const int i_source_margin = p_src->p[0].i_pitch
96                                  - p_src->p[0].i_visible_pitch;
97     const int i_source_margin_c = p_src->p[1].i_pitch
98                                  - p_src->p[1].i_visible_pitch;
99
100     /* The dithering matrices */
101     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
102     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
103     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
104     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
105
    /* Scale every dither entry up by (SHIFT - 4 + i_rrshift) bits; SHIFT is
     * defined outside this file */
106     for(i_x = 0; i_x < 4; i_x++)
107     {
108         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
109         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
110         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
111         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
112     }
113
    /* Destination line padding in bytes */
114     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
    /* Number of pixels by which the input width falls short of a multiple
     * of 8 (0 when it already is one) */
115     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
116
117     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
118      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
119      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
120     SetOffset( p_filter->fmt_in.video.i_width,
121                p_filter->fmt_in.video.i_height,
122                p_filter->fmt_out.video.i_width,
123                p_filter->fmt_out.video.i_height,
124                &b_hscale, &i_vscale, p_offset_start );
125
126     /*
127      * Perform conversion
128      */
    /* Starting value of the vertical scaling counter: output height when
     * i_vscale is 1, input height otherwise (presumably consumed by
     * SCALE_HEIGHT -- TODO confirm) */
129     i_scale_count = ( i_vscale == 1 ) ?
130                     p_filter->fmt_out.video.i_height :
131                     p_filter->fmt_in.video.i_height;
    /* One iteration per input line */
132     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
133     {
134         i_real_y = i_y & 0x3;
135         p_pic_start = p_pic;
        /* Convert into the intermediate line buffer when horizontal scaling
         * is needed, otherwise straight into the destination picture */
136         p_buffer = b_hscale ? p_buffer_start : p_pic;
137
        /* Full groups of 8 pixels; the dither table is switched before each
         * of the 8 conversions, cycling dither10..dither13 twice */
138         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
139         {
140             int *p_dither = dither10;
141             CONVERT_YUV_PIXEL_DITHER(2);
142             p_dither = dither11;
143             CONVERT_Y_PIXEL_DITHER(2);
144             p_dither = dither12;
145             CONVERT_YUV_PIXEL_DITHER(2);
146             p_dither = dither13;
147             CONVERT_Y_PIXEL_DITHER(2);
148             p_dither = dither10;
149             CONVERT_YUV_PIXEL_DITHER(2);
150             p_dither = dither11;
151             CONVERT_Y_PIXEL_DITHER(2);
152             p_dither = dither12;
153             CONVERT_YUV_PIXEL_DITHER(2);
154             p_dither = dither13;
155             CONVERT_Y_PIXEL_DITHER(2);
156         }
157
158         /* Here we do some unaligned reads and duplicate conversions, but
159          * at least we have all the pixels */
160         if( i_rewind )
161         {
162             int *p_dither = dither10;
            /* Step back so that one more 8-pixel group ends exactly at the
             * end of the line; chroma is stepped back by half as much */
163             p_y -= i_rewind;
164             p_u -= i_rewind >> 1;
165             p_v -= i_rewind >> 1;
166             p_buffer -= i_rewind;
167             CONVERT_YUV_PIXEL_DITHER(2);
168             p_dither = dither11;
169             CONVERT_Y_PIXEL_DITHER(2);
170             p_dither = dither12;
171             CONVERT_YUV_PIXEL_DITHER(2);
172             p_dither = dither13;
173             CONVERT_Y_PIXEL_DITHER(2);
174             p_dither = dither10;
175             CONVERT_YUV_PIXEL_DITHER(2);
176             p_dither = dither11;
177             CONVERT_Y_PIXEL_DITHER(2);
178             p_dither = dither12;
179             CONVERT_YUV_PIXEL_DITHER(2);
180             p_dither = dither13;
181             CONVERT_Y_PIXEL_DITHER(2);
182         }
183         SCALE_WIDTH;
184         SCALE_HEIGHT( 420, 2 );
185
        /* Skip the source line padding; in 4:2:0 one U/V line serves two
         * Y lines, so the chroma padding is skipped only after odd lines */
186         p_y += i_source_margin;
187         if( i_y % 2 )
188         {
189             p_u += i_source_margin_c;
190             p_v += i_source_margin_c;
191         }
192     }
193 }
194 #endif
195
196 /*****************************************************************************
197  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
198  *****************************************************************************
199  * Horizontal alignment needed:
200  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
201  *  - output: 1 pixel (2 bytes), margins allowed
202  * Vertical alignment needed:
203  *  - input: 2 lines (2 Y lines, 1 U/V line)
204  *  - output: 1 line
205  *****************************************************************************/
206
207 #if defined (MODULE_NAME_IS_i420_rgb)
208
/*
 * Convert the planar YUV 4:2:0 picture p_src into the 16 bits-per-pixel
 * picture p_dest (plain C path, no dithering).
 *
 * p_filter: supplies the input/output video dimensions and p_sys
 *           (p_rgb16 lookup table, p_buffer line buffer, p_offset offsets).
 * p_src:    source picture; its Y, U and V planes are read.
 * p_dest:   destination picture; its first plane is written as uint16_t.
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are macros defined outside this file. They presumably use the
 * locals declared here by name (several are never referenced directly in
 * this function) and advance p_y/p_u/p_v/p_buffer/p_pic -- confirm before
 * renaming or reordering anything.
 */
209 void I420_RGB16( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
210 {
211     /* We got this one from the old arguments */
212     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
213     uint8_t  *p_y   = p_src->Y_PIXELS;
214     uint8_t  *p_u   = p_src->U_PIXELS;
215     uint8_t  *p_v   = p_src->V_PIXELS;
216
217     bool  b_hscale;                         /* horizontal scaling type */
218     unsigned int i_vscale;                          /* vertical scaling type */
219     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
220
221     int         i_right_margin;
222     int         i_rewind;
223     int         i_scale_count;                       /* scale modulo counter */
224     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
225     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
226     int         i_uval, i_vval;                           /* U and V samples */
227     int         i_red, i_green, i_blue;          /* U and V modified samples */
228     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
229     uint16_t *  p_ybase;                     /* Y dependent conversion table */
230
231     /* Conversion buffer pointer */
232     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
233     uint16_t *  p_buffer;
234
235     /* Offset array pointer */
236     int *       p_offset_start = p_filter->p_sys->p_offset;
237     int *       p_offset;
238
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane (p[0]) and a chroma plane (p[1]) */
239     const int i_source_margin = p_src->p[0].i_pitch
240                                  - p_src->p[0].i_visible_pitch;
241     const int i_source_margin_c = p_src->p[1].i_pitch
242                                  - p_src->p[1].i_visible_pitch;
243
    /* Destination line padding in bytes */
244     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
    /* Number of pixels by which the input width falls short of a multiple
     * of 8 (0 when it already is one) */
245     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
246
247     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
248      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
249      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
250     SetOffset( p_filter->fmt_in.video.i_width,
251                p_filter->fmt_in.video.i_height,
252                p_filter->fmt_out.video.i_width,
253                p_filter->fmt_out.video.i_height,
254                &b_hscale, &i_vscale, p_offset_start );
255
256     /*
257      * Perform conversion
258      */
    /* Starting value of the vertical scaling counter: output height when
     * i_vscale is 1, input height otherwise (presumably consumed by
     * SCALE_HEIGHT -- TODO confirm) */
259     i_scale_count = ( i_vscale == 1 ) ?
260                     p_filter->fmt_out.video.i_height :
261                     p_filter->fmt_in.video.i_height;
    /* One iteration per input line */
262     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
263     {
264         p_pic_start = p_pic;
        /* Convert into the intermediate line buffer when horizontal scaling
         * is needed, otherwise straight into the destination picture */
265         p_buffer = b_hscale ? p_buffer_start : p_pic;
266
        /* Full groups of 8 pixels: four YUV/Y macro pairs per iteration */
267         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
268         {
269             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
270             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
271             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
272             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
273         }
274
275         /* Here we do some unaligned reads and duplicate conversions, but
276          * at least we have all the pixels */
277         if( i_rewind )
278         {
            /* Step back so that one more 8-pixel group ends exactly at the
             * end of the line; chroma is stepped back by half as much */
279             p_y -= i_rewind;
280             p_u -= i_rewind >> 1;
281             p_v -= i_rewind >> 1;
282             p_buffer -= i_rewind;
283
284             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
285             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
286             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
287             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
288         }
289         SCALE_WIDTH;
290         SCALE_HEIGHT( 420, 2 );
291
        /* Skip the source line padding; in 4:2:0 one U/V line serves two
         * Y lines, so the chroma padding is skipped only after odd lines */
292         p_y += i_source_margin;
293         if( i_y % 2 )
294         {
295             p_u += i_source_margin_c;
296             p_v += i_source_margin_c;
297         }
298     }
299 }
300
301 #else // ! defined (MODULE_NAME_IS_i420_rgb)
302
/*
 * Convert the planar YUV 4:2:0 picture p_src into the 16-bit-per-pixel
 * picture p_dest using the SIMD macro sequences ending in *_UNPACK_15
 * (presumably producing a 5-5-5 RGB layout, as the function name suggests --
 * TODO confirm against the macro definitions).
 *
 * Built only when the module is not the plain-C i420_rgb one. The body uses
 * the SSE2 code (16 pixels per step) when MODULE_NAME_IS_i420_rgb_sse2 is
 * defined and the MMX code (8 pixels per step) otherwise.
 *
 * p_filter: supplies the input/output video dimensions and p_sys
 *           (p_buffer line buffer, p_offset offsets).
 * p_src:    source picture; its Y, U and V planes are read.
 * p_dest:   destination picture; its first plane is written as uint16_t.
 *
 * NOTE(review): SSE2_CALL/MMX_CALL and their argument macros, SCALE_WIDTH
 * and SCALE_HEIGHT are defined outside this file and presumably refer to the
 * locals declared here by name -- confirm before renaming or reordering.
 */
303 VLC_TARGET
304 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
305 {
306     /* We got this one from the old arguments */
307     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
308     uint8_t  *p_y   = p_src->Y_PIXELS;
309     uint8_t  *p_u   = p_src->U_PIXELS;
310     uint8_t  *p_v   = p_src->V_PIXELS;
311
312     bool  b_hscale;                         /* horizontal scaling type */
313     unsigned int i_vscale;                          /* vertical scaling type */
314     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
315
316     int         i_right_margin;
317     int         i_rewind;
318     int         i_scale_count;                       /* scale modulo counter */
319     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
320     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
321
322     /* Conversion buffer pointer */
323     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
324     uint16_t *  p_buffer;
325
326     /* Offset array pointer */
327     int *       p_offset_start = p_filter->p_sys->p_offset;
328     int *       p_offset;
329
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane (p[0]) and a chroma plane (p[1]) */
330     const int i_source_margin = p_src->p[0].i_pitch
331                                  - p_src->p[0].i_visible_pitch;
332     const int i_source_margin_c = p_src->p[1].i_pitch
333                                  - p_src->p[1].i_visible_pitch;
334
    /* Destination line padding in bytes */
335     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
336
337     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
338      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
339      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
340     SetOffset( p_filter->fmt_in.video.i_width,
341                p_filter->fmt_in.video.i_height,
342                p_filter->fmt_out.video.i_width,
343                p_filter->fmt_out.video.i_height,
344                &b_hscale, &i_vscale, p_offset_start );
345
346
347     /*
348      * Perform conversion
349      */
    /* Starting value of the vertical scaling counter: output height when
     * i_vscale is 1, input height otherwise (presumably consumed by
     * SCALE_HEIGHT -- TODO confirm) */
350     i_scale_count = ( i_vscale == 1 ) ?
351                     p_filter->fmt_out.video.i_height :
352                     p_filter->fmt_in.video.i_height;
353
354 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
355
    /* Number of pixels by which the input width falls short of a multiple
     * of 16 (0 when it already is one) */
356     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
357
358     /*
359     ** SSE2 128 bits fetch/store instructions are faster
360     ** if memory access is 16 bytes aligned
361     */
362
363     p_buffer = b_hscale ? p_buffer_start : p_pic;
    /* Aligned path only if the Y pitch, the destination pitch, the Y pointer
     * and the output pointer are all multiples of 16 */
364     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
365                     p_dest->p->i_pitch|
366                     ((intptr_t)p_y)|
367                     ((intptr_t)p_buffer))) )
368     {
369         /* use faster SSE2 aligned fetch and store */
370         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
371         {
372             p_pic_start = p_pic;
373
            /* Full groups of 16 pixels (16 Y bytes, 8 U and 8 V bytes) */
374             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
375             {
376                 SSE2_CALL (
377                     SSE2_INIT_16_ALIGNED
378                     SSE2_YUV_MUL
379                     SSE2_YUV_ADD
380                     SSE2_UNPACK_15_ALIGNED
381                 );
382                 p_y += 16;
383                 p_u += 8;
384                 p_v += 8;
385                 p_buffer += 16;
386             }
387             /* Here we do some unaligned reads and duplicate conversions, but
388              * at least we have all the pixels */
389             if( i_rewind )
390             {
                /* Step back so that one more 16-pixel group ends exactly at
                 * the end of the line; the unaligned macros are used because
                 * the stepped-back pointers need not be 16-byte aligned */
391                 p_y -= i_rewind;
392                 p_u -= i_rewind >> 1;
393                 p_v -= i_rewind >> 1;
394                 p_buffer -= i_rewind;
395
396                 SSE2_CALL (
397                     SSE2_INIT_16_UNALIGNED
398                     SSE2_YUV_MUL
399                     SSE2_YUV_ADD
400                     SSE2_UNPACK_15_UNALIGNED
401                 );
                /* p_buffer is not advanced here; it is reset at the bottom
                 * of this line iteration */
402                 p_y += 16;
403                 p_u += 8;
404                 p_v += 8;
405             }
406             SCALE_WIDTH;
407             SCALE_HEIGHT( 420, 2 );
408
            /* Skip the source line padding; in 4:2:0 one U/V line serves two
             * Y lines, so the chroma padding is skipped only after odd lines */
409             p_y += i_source_margin;
410             if( i_y % 2 )
411             {
412                 p_u += i_source_margin_c;
413                 p_v += i_source_margin_c;
414             }
            /* Output position for the next line */
415             p_buffer = b_hscale ? p_buffer_start : p_pic;
416         }
417     }
418     else
419     {
420         /* use slower SSE2 unaligned fetch and store */
421         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
422         {
423             p_pic_start = p_pic;
424             p_buffer = b_hscale ? p_buffer_start : p_pic;
425
            /* Full groups of 16 pixels (16 Y bytes, 8 U and 8 V bytes) */
426             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
427             {
428                 SSE2_CALL (
429                     SSE2_INIT_16_UNALIGNED
430                     SSE2_YUV_MUL
431                     SSE2_YUV_ADD
432                     SSE2_UNPACK_15_UNALIGNED
433                 );
434                 p_y += 16;
435                 p_u += 8;
436                 p_v += 8;
437                 p_buffer += 16;
438             }
439             /* Here we do some unaligned reads and duplicate conversions, but
440              * at least we have all the pixels */
441             if( i_rewind )
442             {
                /* Step back so that one more 16-pixel group ends exactly at
                 * the end of the line */
443                 p_y -= i_rewind;
444                 p_u -= i_rewind >> 1;
445                 p_v -= i_rewind >> 1;
446                 p_buffer -= i_rewind;
447
448                 SSE2_CALL (
449                     SSE2_INIT_16_UNALIGNED
450                     SSE2_YUV_MUL
451                     SSE2_YUV_ADD
452                     SSE2_UNPACK_15_UNALIGNED
453                 );
                /* p_buffer is not advanced here; it is reset at the bottom
                 * of this line iteration and again at the top of the next */
454                 p_y += 16;
455                 p_u += 8;
456                 p_v += 8;
457             }
458             SCALE_WIDTH;
459             SCALE_HEIGHT( 420, 2 );
460
            /* Skip the source line padding; chroma only after odd lines */
461             p_y += i_source_margin;
462             if( i_y % 2 )
463             {
464                 p_u += i_source_margin_c;
465                 p_v += i_source_margin_c;
466             }
467             p_buffer = b_hscale ? p_buffer_start : p_pic;
468         }
469     }
470
471     /* make sure all SSE2 stores are visible thereafter */
472     SSE2_END;
473
474 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
475
    /* Number of pixels by which the input width falls short of a multiple
     * of 8 (0 when it already is one) */
476     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
477
    /* One iteration per input line */
478     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
479     {
480         p_pic_start = p_pic;
481         p_buffer = b_hscale ? p_buffer_start : p_pic;
482
        /* Full groups of 8 pixels (8 Y bytes, 4 U and 4 V bytes) */
483         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
484         {
485             MMX_CALL (
486                 MMX_INIT_16
487                 MMX_YUV_MUL
488                 MMX_YUV_ADD
489                 MMX_UNPACK_15
490             );
491             p_y += 8;
492             p_u += 4;
493             p_v += 4;
494             p_buffer += 8;
495         }
496
497         /* Here we do some unaligned reads and duplicate conversions, but
498          * at least we have all the pixels */
499         if( i_rewind )
500         {
            /* Step back so that one more 8-pixel group ends exactly at the
             * end of the line */
501             p_y -= i_rewind;
502             p_u -= i_rewind >> 1;
503             p_v -= i_rewind >> 1;
504             p_buffer -= i_rewind;
505
506             MMX_CALL (
507                 MMX_INIT_16
508                 MMX_YUV_MUL
509                 MMX_YUV_ADD
510                 MMX_UNPACK_15
511             );
512             p_y += 8;
513             p_u += 4;
514             p_v += 4;
515             p_buffer += 8;
516         }
517         SCALE_WIDTH;
518         SCALE_HEIGHT( 420, 2 );
519
        /* Skip the source line padding; chroma only after odd lines */
520         p_y += i_source_margin;
521         if( i_y % 2 )
522         {
523             p_u += i_source_margin_c;
524             p_v += i_source_margin_c;
525         }
526     }
527     /* re-enable FPU registers */
528     MMX_END;
529
530 #endif
531 }
532
/*
 * Convert the planar YUV 4:2:0 picture p_src into the 16-bit-per-pixel
 * picture p_dest using the SIMD macro sequences ending in *_UNPACK_16
 * (presumably producing a 5-6-5 RGB layout, as the function name suggests --
 * TODO confirm against the macro definitions).
 *
 * Built only when the module is not the plain-C i420_rgb one. The body uses
 * the SSE2 code (16 pixels per step) when MODULE_NAME_IS_i420_rgb_sse2 is
 * defined and the MMX code (8 pixels per step) otherwise.
 *
 * p_filter: supplies the input/output video dimensions and p_sys
 *           (p_buffer line buffer, p_offset offsets).
 * p_src:    source picture; its Y, U and V planes are read.
 * p_dest:   destination picture; its first plane is written as uint16_t.
 *
 * NOTE(review): SSE2_CALL/MMX_CALL and their argument macros, SCALE_WIDTH
 * and SCALE_HEIGHT are defined outside this file and presumably refer to the
 * locals declared here by name -- confirm before renaming or reordering.
 */
533 VLC_TARGET
534 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
535 {
536     /* We got this one from the old arguments */
537     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
538     uint8_t  *p_y   = p_src->Y_PIXELS;
539     uint8_t  *p_u   = p_src->U_PIXELS;
540     uint8_t  *p_v   = p_src->V_PIXELS;
541
542     bool  b_hscale;                         /* horizontal scaling type */
543     unsigned int i_vscale;                          /* vertical scaling type */
544     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
545
546     int         i_right_margin;
547     int         i_rewind;
548     int         i_scale_count;                       /* scale modulo counter */
549     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
550     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
551
552     /* Conversion buffer pointer */
553     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
554     uint16_t *  p_buffer;
555
556     /* Offset array pointer */
557     int *       p_offset_start = p_filter->p_sys->p_offset;
558     int *       p_offset;
559
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane (p[0]) and a chroma plane (p[1]) */
560     const int i_source_margin = p_src->p[0].i_pitch
561                                  - p_src->p[0].i_visible_pitch;
562     const int i_source_margin_c = p_src->p[1].i_pitch
563                                  - p_src->p[1].i_visible_pitch;
564
    /* Destination line padding in bytes */
565     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
566
567     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
568      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
569      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
570     SetOffset( p_filter->fmt_in.video.i_width,
571                p_filter->fmt_in.video.i_height,
572                p_filter->fmt_out.video.i_width,
573                p_filter->fmt_out.video.i_height,
574                &b_hscale, &i_vscale, p_offset_start );
575
576
577     /*
578      * Perform conversion
579      */
    /* Starting value of the vertical scaling counter: output height when
     * i_vscale is 1, input height otherwise (presumably consumed by
     * SCALE_HEIGHT -- TODO confirm) */
580     i_scale_count = ( i_vscale == 1 ) ?
581                     p_filter->fmt_out.video.i_height :
582                     p_filter->fmt_in.video.i_height;
583
584 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
585
    /* Number of pixels by which the input width falls short of a multiple
     * of 16 (0 when it already is one) */
586     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
587
588     /*
589     ** SSE2 128 bits fetch/store instructions are faster
590     ** if memory access is 16 bytes aligned
591     */
592
593     p_buffer = b_hscale ? p_buffer_start : p_pic;
    /* Aligned path only if the Y pitch, the destination pitch, the Y pointer
     * and the output pointer are all multiples of 16 */
594     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
595                     p_dest->p->i_pitch|
596                     ((intptr_t)p_y)|
597                     ((intptr_t)p_buffer))) )
598     {
599         /* use faster SSE2 aligned fetch and store */
600         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
601         {
602             p_pic_start = p_pic;
603
            /* Full groups of 16 pixels (16 Y bytes, 8 U and 8 V bytes) */
604             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
605             {
606                 SSE2_CALL (
607                     SSE2_INIT_16_ALIGNED
608                     SSE2_YUV_MUL
609                     SSE2_YUV_ADD
610                     SSE2_UNPACK_16_ALIGNED
611                 );
612                 p_y += 16;
613                 p_u += 8;
614                 p_v += 8;
615                 p_buffer += 16;
616             }
617             /* Here we do some unaligned reads and duplicate conversions, but
618              * at least we have all the pixels */
619             if( i_rewind )
620             {
                /* Step back so that one more 16-pixel group ends exactly at
                 * the end of the line; the unaligned macros are used because
                 * the stepped-back pointers need not be 16-byte aligned */
621                 p_y -= i_rewind;
622                 p_u -= i_rewind >> 1;
623                 p_v -= i_rewind >> 1;
624                 p_buffer -= i_rewind;
625
626                 SSE2_CALL (
627                     SSE2_INIT_16_UNALIGNED
628                     SSE2_YUV_MUL
629                     SSE2_YUV_ADD
630                     SSE2_UNPACK_16_UNALIGNED
631                 );
                /* p_buffer is not advanced here; it is reset at the bottom
                 * of this line iteration */
632                 p_y += 16;
633                 p_u += 8;
634                 p_v += 8;
635             }
636             SCALE_WIDTH;
637             SCALE_HEIGHT( 420, 2 );
638
            /* Skip the source line padding; in 4:2:0 one U/V line serves two
             * Y lines, so the chroma padding is skipped only after odd lines */
639             p_y += i_source_margin;
640             if( i_y % 2 )
641             {
642                 p_u += i_source_margin_c;
643                 p_v += i_source_margin_c;
644             }
            /* Output position for the next line */
645             p_buffer = b_hscale ? p_buffer_start : p_pic;
646         }
647     }
648     else
649     {
650         /* use slower SSE2 unaligned fetch and store */
651         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
652         {
653             p_pic_start = p_pic;
654             p_buffer = b_hscale ? p_buffer_start : p_pic;
655
            /* Full groups of 16 pixels (16 Y bytes, 8 U and 8 V bytes) */
656             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
657             {
658                 SSE2_CALL(
659                     SSE2_INIT_16_UNALIGNED
660                     SSE2_YUV_MUL
661                     SSE2_YUV_ADD
662                     SSE2_UNPACK_16_UNALIGNED
663                 );
664                 p_y += 16;
665                 p_u += 8;
666                 p_v += 8;
667                 p_buffer += 16;
668             }
669             /* Here we do some unaligned reads and duplicate conversions, but
670              * at least we have all the pixels */
671             if( i_rewind )
672             {
                /* Step back so that one more 16-pixel group ends exactly at
                 * the end of the line */
673                 p_y -= i_rewind;
674                 p_u -= i_rewind >> 1;
675                 p_v -= i_rewind >> 1;
676                 p_buffer -= i_rewind;
677
678                 SSE2_CALL(
679                     SSE2_INIT_16_UNALIGNED
680                     SSE2_YUV_MUL
681                     SSE2_YUV_ADD
682                     SSE2_UNPACK_16_UNALIGNED
683                 );
                /* p_buffer is not advanced here; it is reset at the bottom
                 * of this line iteration and again at the top of the next */
684                 p_y += 16;
685                 p_u += 8;
686                 p_v += 8;
687             }
688             SCALE_WIDTH;
689             SCALE_HEIGHT( 420, 2 );
690
            /* Skip the source line padding; chroma only after odd lines */
691             p_y += i_source_margin;
692             if( i_y % 2 )
693             {
694                 p_u += i_source_margin_c;
695                 p_v += i_source_margin_c;
696             }
697             p_buffer = b_hscale ? p_buffer_start : p_pic;
698         }
699     }
700
701     /* make sure all SSE2 stores are visible thereafter */
702     SSE2_END;
703
704 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
705
    /* Number of pixels by which the input width falls short of a multiple
     * of 8 (0 when it already is one) */
706     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
707
    /* One iteration per input line */
708     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
709     {
710         p_pic_start = p_pic;
711         p_buffer = b_hscale ? p_buffer_start : p_pic;
712
        /* Full groups of 8 pixels (8 Y bytes, 4 U and 4 V bytes) */
713         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
714         {
715             MMX_CALL (
716                 MMX_INIT_16
717                 MMX_YUV_MUL
718                 MMX_YUV_ADD
719                 MMX_UNPACK_16
720             );
721             p_y += 8;
722             p_u += 4;
723             p_v += 4;
724             p_buffer += 8;
725         }
726
727         /* Here we do some unaligned reads and duplicate conversions, but
728          * at least we have all the pixels */
729         if( i_rewind )
730         {
            /* Step back so that one more 8-pixel group ends exactly at the
             * end of the line */
731             p_y -= i_rewind;
732             p_u -= i_rewind >> 1;
733             p_v -= i_rewind >> 1;
734             p_buffer -= i_rewind;
735
736             MMX_CALL (
737                 MMX_INIT_16
738                 MMX_YUV_MUL
739                 MMX_YUV_ADD
740                 MMX_UNPACK_16
741             );
742             p_y += 8;
743             p_u += 4;
744             p_v += 4;
745             p_buffer += 8;
746         }
747         SCALE_WIDTH;
748         SCALE_HEIGHT( 420, 2 );
749
        /* Skip the source line padding; chroma only after odd lines */
750         p_y += i_source_margin;
751         if( i_y % 2 )
752         {
753             p_u += i_source_margin_c;
754             p_v += i_source_margin_c;
755         }
756     }
757     /* re-enable FPU registers */
758     MMX_END;
759
760 #endif
761 }
762
763 #endif
764
765 /*****************************************************************************
766  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
767  *****************************************************************************
768  * Horizontal alignment needed:
769  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
770  *  - output: 1 pixel (4 bytes), margins allowed
771  * Vertical alignment needed:
772  *  - input: 2 lines (2 Y lines, 1 U/V line)
773  *  - output: 1 line
774  *****************************************************************************/
775
776 #if defined (MODULE_NAME_IS_i420_rgb)
777
/*
 * Convert the planar YUV 4:2:0 picture p_src into the 32 bits-per-pixel
 * picture p_dest (plain C path).
 *
 * p_filter: supplies the input/output video dimensions and p_sys
 *           (p_rgb32 lookup table, p_buffer line buffer, p_offset offsets).
 * p_src:    source picture; its Y, U and V planes are read.
 * p_dest:   destination picture; its first plane is written as uint32_t.
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are macros defined outside this file. They presumably use the
 * locals declared here by name (several are never referenced directly in
 * this function) and advance p_y/p_u/p_v/p_buffer/p_pic -- confirm before
 * renaming or reordering anything.
 */
778 void I420_RGB32( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
779 {
780     /* We got this one from the old arguments */
781     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
782     uint8_t  *p_y   = p_src->Y_PIXELS;
783     uint8_t  *p_u   = p_src->U_PIXELS;
784     uint8_t  *p_v   = p_src->V_PIXELS;
785
786     bool  b_hscale;                         /* horizontal scaling type */
787     unsigned int i_vscale;                          /* vertical scaling type */
788     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
789
790     int         i_right_margin;
791     int         i_rewind;
792     int         i_scale_count;                       /* scale modulo counter */
793     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
794     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
795     int         i_uval, i_vval;                           /* U and V samples */
796     int         i_red, i_green, i_blue;          /* U and V modified samples */
797     uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
798     uint32_t *  p_ybase;                     /* Y dependent conversion table */
799
800     /* Conversion buffer pointer */
801     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
802     uint32_t *  p_buffer;
803
804     /* Offset array pointer */
805     int *       p_offset_start = p_filter->p_sys->p_offset;
806     int *       p_offset;
807
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane (p[0]) and a chroma plane (p[1]) */
808     const int i_source_margin = p_src->p[0].i_pitch
809                                  - p_src->p[0].i_visible_pitch;
810     const int i_source_margin_c = p_src->p[1].i_pitch
811                                  - p_src->p[1].i_visible_pitch;
812
    /* Destination line padding in bytes */
813     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
    /* Number of pixels by which the input width falls short of a multiple
     * of 8 (0 when it already is one) */
814     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
815
816     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
817      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
818      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
819     SetOffset( p_filter->fmt_in.video.i_width,
820                p_filter->fmt_in.video.i_height,
821                p_filter->fmt_out.video.i_width,
822                p_filter->fmt_out.video.i_height,
823                &b_hscale, &i_vscale, p_offset_start );
824
825     /*
826      * Perform conversion
827      */
    /* Starting value of the vertical scaling counter: output height when
     * i_vscale is 1, input height otherwise (presumably consumed by
     * SCALE_HEIGHT -- TODO confirm) */
828     i_scale_count = ( i_vscale == 1 ) ?
829                     p_filter->fmt_out.video.i_height :
830                     p_filter->fmt_in.video.i_height;
    /* One iteration per input line */
831     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
832     {
833         p_pic_start = p_pic;
        /* Convert into the intermediate line buffer when horizontal scaling
         * is needed, otherwise straight into the destination picture */
834         p_buffer = b_hscale ? p_buffer_start : p_pic;
835
        /* Full groups of 8 pixels: four YUV/Y macro pairs per iteration */
836         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
837         {
838             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
839             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
840             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
841             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
842         }
843
844         /* Here we do some unaligned reads and duplicate conversions, but
845          * at least we have all the pixels */
846         if( i_rewind )
847         {
            /* Step back so that one more 8-pixel group ends exactly at the
             * end of the line; chroma is stepped back by half as much */
848             p_y -= i_rewind;
849             p_u -= i_rewind >> 1;
850             p_v -= i_rewind >> 1;
851             p_buffer -= i_rewind;
852             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
853             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
854             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
855             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
856         }
857         SCALE_WIDTH;
858         SCALE_HEIGHT( 420, 4 );
859
        /* Skip the source line padding; in 4:2:0 one U/V line serves two
         * Y lines, so the chroma padding is skipped only after odd lines */
860         p_y += i_source_margin;
861         if( i_y % 2 )
862         {
863             p_u += i_source_margin_c;
864             p_v += i_source_margin_c;
865         }
866     }
867 }
868
869 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
870
871 VLC_TARGET
872 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
873                                             picture_t *p_dest )
874 {
875     /* We got this one from the old arguments */
876     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
877     uint8_t  *p_y   = p_src->Y_PIXELS;
878     uint8_t  *p_u   = p_src->U_PIXELS;
879     uint8_t  *p_v   = p_src->V_PIXELS;
880
881     bool  b_hscale;                         /* horizontal scaling type */
882     unsigned int i_vscale;                          /* vertical scaling type */
883     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
884
885     int         i_right_margin;
886     int         i_rewind;
887     int         i_scale_count;                       /* scale modulo counter */
888     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
889     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
890     /* Conversion buffer pointer */
891     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
892     uint32_t *  p_buffer;
893
894     /* Offset array pointer */
895     int *       p_offset_start = p_filter->p_sys->p_offset;
896     int *       p_offset;
897
898     const int i_source_margin = p_src->p[0].i_pitch
899                                  - p_src->p[0].i_visible_pitch;
900     const int i_source_margin_c = p_src->p[1].i_pitch
901                                  - p_src->p[1].i_visible_pitch;
902
903     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
904
905     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
906      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
907      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
908     SetOffset( p_filter->fmt_in.video.i_width,
909                p_filter->fmt_in.video.i_height,
910                p_filter->fmt_out.video.i_width,
911                p_filter->fmt_out.video.i_height,
912                &b_hscale, &i_vscale, p_offset_start );
913
914     /*
915      * Perform conversion
916      */
917     i_scale_count = ( i_vscale == 1 ) ?
918                     p_filter->fmt_out.video.i_height :
919                     p_filter->fmt_in.video.i_height;
920
921 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
922
923     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
924
925     /*
926     ** SSE2 128 bits fetch/store instructions are faster
927     ** if memory access is 16 bytes aligned
928     */
929
930     p_buffer = b_hscale ? p_buffer_start : p_pic;
931     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
932                     p_dest->p->i_pitch|
933                     ((intptr_t)p_y)|
934                     ((intptr_t)p_buffer))) )
935     {
936         /* use faster SSE2 aligned fetch and store */
937         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
938         {
939             p_pic_start = p_pic;
940
941             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
942             {
943                 SSE2_CALL (
944                     SSE2_INIT_32_ALIGNED
945                     SSE2_YUV_MUL
946                     SSE2_YUV_ADD
947                     SSE2_UNPACK_32_ARGB_ALIGNED
948                 );
949                 p_y += 16;
950                 p_u += 8;
951                 p_v += 8;
952                 p_buffer += 16;
953             }
954
955             /* Here we do some unaligned reads and duplicate conversions, but
956              * at least we have all the pixels */
957             if( i_rewind )
958             {
959                 p_y -= i_rewind;
960                 p_u -= i_rewind >> 1;
961                 p_v -= i_rewind >> 1;
962                 p_buffer -= i_rewind;
963                 SSE2_CALL (
964                     SSE2_INIT_32_UNALIGNED
965                     SSE2_YUV_MUL
966                     SSE2_YUV_ADD
967                     SSE2_UNPACK_32_ARGB_UNALIGNED
968                 );
969                 p_y += 16;
970                 p_u += 4;
971                 p_v += 4;
972             }
973             SCALE_WIDTH;
974             SCALE_HEIGHT( 420, 4 );
975
976             p_y += i_source_margin;
977             if( i_y % 2 )
978             {
979                 p_u += i_source_margin_c;
980                 p_v += i_source_margin_c;
981             }
982             p_buffer = b_hscale ? p_buffer_start : p_pic;
983         }
984     }
985     else
986     {
987         /* use slower SSE2 unaligned fetch and store */
988         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
989         {
990             p_pic_start = p_pic;
991             p_buffer = b_hscale ? p_buffer_start : p_pic;
992
993             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
994             {
995                 SSE2_CALL (
996                     SSE2_INIT_32_UNALIGNED
997                     SSE2_YUV_MUL
998                     SSE2_YUV_ADD
999                     SSE2_UNPACK_32_ARGB_UNALIGNED
1000                 );
1001                 p_y += 16;
1002                 p_u += 8;
1003                 p_v += 8;
1004                 p_buffer += 16;
1005             }
1006
1007             /* Here we do some unaligned reads and duplicate conversions, but
1008              * at least we have all the pixels */
1009             if( i_rewind )
1010             {
1011                 p_y -= i_rewind;
1012                 p_u -= i_rewind >> 1;
1013                 p_v -= i_rewind >> 1;
1014                 p_buffer -= i_rewind;
1015                 SSE2_CALL (
1016                     SSE2_INIT_32_UNALIGNED
1017                     SSE2_YUV_MUL
1018                     SSE2_YUV_ADD
1019                     SSE2_UNPACK_32_ARGB_UNALIGNED
1020                 );
1021                 p_y += 16;
1022                 p_u += 8;
1023                 p_v += 8;
1024             }
1025             SCALE_WIDTH;
1026             SCALE_HEIGHT( 420, 4 );
1027
1028             p_y += i_source_margin;
1029             if( i_y % 2 )
1030             {
1031                 p_u += i_source_margin_c;
1032                 p_v += i_source_margin_c;
1033             }
1034             p_buffer = b_hscale ? p_buffer_start : p_pic;
1035         }
1036     }
1037
1038     /* make sure all SSE2 stores are visible thereafter */
1039     SSE2_END;
1040
1041 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1042
1043     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1044
1045     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1046     {
1047         p_pic_start = p_pic;
1048         p_buffer = b_hscale ? p_buffer_start : p_pic;
1049
1050         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1051         {
1052             MMX_CALL (
1053                 MMX_INIT_32
1054                 MMX_YUV_MUL
1055                 MMX_YUV_ADD
1056                 MMX_UNPACK_32_ARGB
1057             );
1058             p_y += 8;
1059             p_u += 4;
1060             p_v += 4;
1061             p_buffer += 8;
1062         }
1063
1064         /* Here we do some unaligned reads and duplicate conversions, but
1065          * at least we have all the pixels */
1066         if( i_rewind )
1067         {
1068             p_y -= i_rewind;
1069             p_u -= i_rewind >> 1;
1070             p_v -= i_rewind >> 1;
1071             p_buffer -= i_rewind;
1072             MMX_CALL (
1073                 MMX_INIT_32
1074                 MMX_YUV_MUL
1075                 MMX_YUV_ADD
1076                 MMX_UNPACK_32_ARGB
1077             );
1078             p_y += 8;
1079             p_u += 4;
1080             p_v += 4;
1081             p_buffer += 8;
1082         }
1083         SCALE_WIDTH;
1084         SCALE_HEIGHT( 420, 4 );
1085
1086         p_y += i_source_margin;
1087         if( i_y % 2 )
1088         {
1089             p_u += i_source_margin_c;
1090             p_v += i_source_margin_c;
1091         }
1092     }
1093
1094     /* re-enable FPU registers */
1095     MMX_END;
1096
1097 #endif
1098 }
1099
1100 VLC_TARGET
1101 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1102 {
1103     /* We got this one from the old arguments */
1104     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1105     uint8_t  *p_y   = p_src->Y_PIXELS;
1106     uint8_t  *p_u   = p_src->U_PIXELS;
1107     uint8_t  *p_v   = p_src->V_PIXELS;
1108
1109     bool  b_hscale;                         /* horizontal scaling type */
1110     unsigned int i_vscale;                          /* vertical scaling type */
1111     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1112
1113     int         i_right_margin;
1114     int         i_rewind;
1115     int         i_scale_count;                       /* scale modulo counter */
1116     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1117     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1118     /* Conversion buffer pointer */
1119     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1120     uint32_t *  p_buffer;
1121
1122     /* Offset array pointer */
1123     int *       p_offset_start = p_filter->p_sys->p_offset;
1124     int *       p_offset;
1125
1126     const int i_source_margin = p_src->p[0].i_pitch
1127                                  - p_src->p[0].i_visible_pitch;
1128     const int i_source_margin_c = p_src->p[1].i_pitch
1129                                  - p_src->p[1].i_visible_pitch;
1130
1131     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1132
1133     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1134      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1135      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1136     SetOffset( p_filter->fmt_in.video.i_width,
1137                p_filter->fmt_in.video.i_height,
1138                p_filter->fmt_out.video.i_width,
1139                p_filter->fmt_out.video.i_height,
1140                &b_hscale, &i_vscale, p_offset_start );
1141
1142     /*
1143      * Perform conversion
1144      */
1145     i_scale_count = ( i_vscale == 1 ) ?
1146                     p_filter->fmt_out.video.i_height :
1147                     p_filter->fmt_in.video.i_height;
1148
1149 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1150
1151     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1152
1153     /*
1154     ** SSE2 128 bits fetch/store instructions are faster
1155     ** if memory access is 16 bytes aligned
1156     */
1157
1158     p_buffer = b_hscale ? p_buffer_start : p_pic;
1159     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1160                     p_dest->p->i_pitch|
1161                     ((intptr_t)p_y)|
1162                     ((intptr_t)p_buffer))) )
1163     {
1164         /* use faster SSE2 aligned fetch and store */
1165         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1166         {
1167             p_pic_start = p_pic;
1168
1169             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1170             {
1171                 SSE2_CALL (
1172                     SSE2_INIT_32_ALIGNED
1173                     SSE2_YUV_MUL
1174                     SSE2_YUV_ADD
1175                     SSE2_UNPACK_32_RGBA_ALIGNED
1176                 );
1177                 p_y += 16;
1178                 p_u += 8;
1179                 p_v += 8;
1180                 p_buffer += 16;
1181             }
1182
1183             /* Here we do some unaligned reads and duplicate conversions, but
1184              * at least we have all the pixels */
1185             if( i_rewind )
1186             {
1187                 p_y -= i_rewind;
1188                 p_u -= i_rewind >> 1;
1189                 p_v -= i_rewind >> 1;
1190                 p_buffer -= i_rewind;
1191                 SSE2_CALL (
1192                     SSE2_INIT_32_UNALIGNED
1193                     SSE2_YUV_MUL
1194                     SSE2_YUV_ADD
1195                     SSE2_UNPACK_32_RGBA_UNALIGNED
1196                 );
1197                 p_y += 16;
1198                 p_u += 4;
1199                 p_v += 4;
1200             }
1201             SCALE_WIDTH;
1202             SCALE_HEIGHT( 420, 4 );
1203
1204             p_y += i_source_margin;
1205             if( i_y % 2 )
1206             {
1207                 p_u += i_source_margin_c;
1208                 p_v += i_source_margin_c;
1209             }
1210             p_buffer = b_hscale ? p_buffer_start : p_pic;
1211         }
1212     }
1213     else
1214     {
1215         /* use slower SSE2 unaligned fetch and store */
1216         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1217         {
1218             p_pic_start = p_pic;
1219             p_buffer = b_hscale ? p_buffer_start : p_pic;
1220
1221             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1222             {
1223                 SSE2_CALL (
1224                     SSE2_INIT_32_UNALIGNED
1225                     SSE2_YUV_MUL
1226                     SSE2_YUV_ADD
1227                     SSE2_UNPACK_32_RGBA_UNALIGNED
1228                 );
1229                 p_y += 16;
1230                 p_u += 8;
1231                 p_v += 8;
1232                 p_buffer += 16;
1233             }
1234
1235             /* Here we do some unaligned reads and duplicate conversions, but
1236              * at least we have all the pixels */
1237             if( i_rewind )
1238             {
1239                 p_y -= i_rewind;
1240                 p_u -= i_rewind >> 1;
1241                 p_v -= i_rewind >> 1;
1242                 p_buffer -= i_rewind;
1243                 SSE2_CALL (
1244                     SSE2_INIT_32_UNALIGNED
1245                     SSE2_YUV_MUL
1246                     SSE2_YUV_ADD
1247                     SSE2_UNPACK_32_RGBA_UNALIGNED
1248                 );
1249                 p_y += 16;
1250                 p_u += 8;
1251                 p_v += 8;
1252             }
1253             SCALE_WIDTH;
1254             SCALE_HEIGHT( 420, 4 );
1255
1256             p_y += i_source_margin;
1257             if( i_y % 2 )
1258             {
1259                 p_u += i_source_margin_c;
1260                 p_v += i_source_margin_c;
1261             }
1262             p_buffer = b_hscale ? p_buffer_start : p_pic;
1263         }
1264     }
1265
1266     /* make sure all SSE2 stores are visible thereafter */
1267     SSE2_END;
1268
1269 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1270
1271     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1272
1273     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1274     {
1275         p_pic_start = p_pic;
1276         p_buffer = b_hscale ? p_buffer_start : p_pic;
1277
1278         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1279         {
1280             MMX_CALL (
1281                 MMX_INIT_32
1282                 MMX_YUV_MUL
1283                 MMX_YUV_ADD
1284                 MMX_UNPACK_32_RGBA
1285             );
1286             p_y += 8;
1287             p_u += 4;
1288             p_v += 4;
1289             p_buffer += 8;
1290         }
1291
1292         /* Here we do some unaligned reads and duplicate conversions, but
1293          * at least we have all the pixels */
1294         if( i_rewind )
1295         {
1296             p_y -= i_rewind;
1297             p_u -= i_rewind >> 1;
1298             p_v -= i_rewind >> 1;
1299             p_buffer -= i_rewind;
1300             MMX_CALL (
1301                 MMX_INIT_32
1302                 MMX_YUV_MUL
1303                 MMX_YUV_ADD
1304                 MMX_UNPACK_32_RGBA
1305             );
1306             p_y += 8;
1307             p_u += 4;
1308             p_v += 4;
1309             p_buffer += 8;
1310         }
1311         SCALE_WIDTH;
1312         SCALE_HEIGHT( 420, 4 );
1313
1314         p_y += i_source_margin;
1315         if( i_y % 2 )
1316         {
1317             p_u += i_source_margin_c;
1318             p_v += i_source_margin_c;
1319         }
1320     }
1321
1322     /* re-enable FPU registers */
1323     MMX_END;
1324
1325 #endif
1326 }
1327
1328 VLC_TARGET
1329 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1330 {
1331     /* We got this one from the old arguments */
1332     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1333     uint8_t  *p_y   = p_src->Y_PIXELS;
1334     uint8_t  *p_u   = p_src->U_PIXELS;
1335     uint8_t  *p_v   = p_src->V_PIXELS;
1336
1337     bool  b_hscale;                         /* horizontal scaling type */
1338     unsigned int i_vscale;                          /* vertical scaling type */
1339     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1340
1341     int         i_right_margin;
1342     int         i_rewind;
1343     int         i_scale_count;                       /* scale modulo counter */
1344     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1345     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1346     /* Conversion buffer pointer */
1347     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1348     uint32_t *  p_buffer;
1349
1350     /* Offset array pointer */
1351     int *       p_offset_start = p_filter->p_sys->p_offset;
1352     int *       p_offset;
1353
1354     const int i_source_margin = p_src->p[0].i_pitch
1355                                  - p_src->p[0].i_visible_pitch;
1356     const int i_source_margin_c = p_src->p[1].i_pitch
1357                                  - p_src->p[1].i_visible_pitch;
1358
1359     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1360
1361     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1362      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1363      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1364     SetOffset( p_filter->fmt_in.video.i_width,
1365                p_filter->fmt_in.video.i_height,
1366                p_filter->fmt_out.video.i_width,
1367                p_filter->fmt_out.video.i_height,
1368                &b_hscale, &i_vscale, p_offset_start );
1369
1370     /*
1371      * Perform conversion
1372      */
1373     i_scale_count = ( i_vscale == 1 ) ?
1374                     p_filter->fmt_out.video.i_height :
1375                     p_filter->fmt_in.video.i_height;
1376
1377 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1378
1379     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1380
1381     /*
1382     ** SSE2 128 bits fetch/store instructions are faster
1383     ** if memory access is 16 bytes aligned
1384     */
1385
1386     p_buffer = b_hscale ? p_buffer_start : p_pic;
1387     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1388                     p_dest->p->i_pitch|
1389                     ((intptr_t)p_y)|
1390                     ((intptr_t)p_buffer))) )
1391     {
1392         /* use faster SSE2 aligned fetch and store */
1393         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1394         {
1395             p_pic_start = p_pic;
1396
1397             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1398             {
1399                 SSE2_CALL (
1400                     SSE2_INIT_32_ALIGNED
1401                     SSE2_YUV_MUL
1402                     SSE2_YUV_ADD
1403                     SSE2_UNPACK_32_BGRA_ALIGNED
1404                 );
1405                 p_y += 16;
1406                 p_u += 8;
1407                 p_v += 8;
1408                 p_buffer += 16;
1409             }
1410
1411             /* Here we do some unaligned reads and duplicate conversions, but
1412              * at least we have all the pixels */
1413             if( i_rewind )
1414             {
1415                 p_y -= i_rewind;
1416                 p_u -= i_rewind >> 1;
1417                 p_v -= i_rewind >> 1;
1418                 p_buffer -= i_rewind;
1419                 SSE2_CALL (
1420                     SSE2_INIT_32_UNALIGNED
1421                     SSE2_YUV_MUL
1422                     SSE2_YUV_ADD
1423                     SSE2_UNPACK_32_BGRA_UNALIGNED
1424                 );
1425                 p_y += 16;
1426                 p_u += 4;
1427                 p_v += 4;
1428             }
1429             SCALE_WIDTH;
1430             SCALE_HEIGHT( 420, 4 );
1431
1432             p_y += i_source_margin;
1433             if( i_y % 2 )
1434             {
1435                 p_u += i_source_margin_c;
1436                 p_v += i_source_margin_c;
1437             }
1438             p_buffer = b_hscale ? p_buffer_start : p_pic;
1439         }
1440     }
1441     else
1442     {
1443         /* use slower SSE2 unaligned fetch and store */
1444         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1445         {
1446             p_pic_start = p_pic;
1447             p_buffer = b_hscale ? p_buffer_start : p_pic;
1448
1449             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1450             {
1451                 SSE2_CALL (
1452                     SSE2_INIT_32_UNALIGNED
1453                     SSE2_YUV_MUL
1454                     SSE2_YUV_ADD
1455                     SSE2_UNPACK_32_BGRA_UNALIGNED
1456                 );
1457                 p_y += 16;
1458                 p_u += 8;
1459                 p_v += 8;
1460                 p_buffer += 16;
1461             }
1462
1463             /* Here we do some unaligned reads and duplicate conversions, but
1464              * at least we have all the pixels */
1465             if( i_rewind )
1466             {
1467                 p_y -= i_rewind;
1468                 p_u -= i_rewind >> 1;
1469                 p_v -= i_rewind >> 1;
1470                 p_buffer -= i_rewind;
1471                 SSE2_CALL (
1472                     SSE2_INIT_32_UNALIGNED
1473                     SSE2_YUV_MUL
1474                     SSE2_YUV_ADD
1475                     SSE2_UNPACK_32_BGRA_UNALIGNED
1476                 );
1477                 p_y += 16;
1478                 p_u += 8;
1479                 p_v += 8;
1480             }
1481             SCALE_WIDTH;
1482             SCALE_HEIGHT( 420, 4 );
1483
1484             p_y += i_source_margin;
1485             if( i_y % 2 )
1486             {
1487                 p_u += i_source_margin_c;
1488                 p_v += i_source_margin_c;
1489             }
1490             p_buffer = b_hscale ? p_buffer_start : p_pic;
1491         }
1492     }
1493
1494 #else
1495
1496     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1497
1498     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1499     {
1500         p_pic_start = p_pic;
1501         p_buffer = b_hscale ? p_buffer_start : p_pic;
1502
1503         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1504         {
1505             MMX_CALL (
1506                 MMX_INIT_32
1507                 MMX_YUV_MUL
1508                 MMX_YUV_ADD
1509                 MMX_UNPACK_32_BGRA
1510             );
1511             p_y += 8;
1512             p_u += 4;
1513             p_v += 4;
1514             p_buffer += 8;
1515         }
1516
1517         /* Here we do some unaligned reads and duplicate conversions, but
1518          * at least we have all the pixels */
1519         if( i_rewind )
1520         {
1521             p_y -= i_rewind;
1522             p_u -= i_rewind >> 1;
1523             p_v -= i_rewind >> 1;
1524             p_buffer -= i_rewind;
1525             MMX_CALL (
1526                 MMX_INIT_32
1527                 MMX_YUV_MUL
1528                 MMX_YUV_ADD
1529                 MMX_UNPACK_32_BGRA
1530             );
1531             p_y += 8;
1532             p_u += 4;
1533             p_v += 4;
1534             p_buffer += 8;
1535         }
1536         SCALE_WIDTH;
1537         SCALE_HEIGHT( 420, 4 );
1538
1539         p_y += i_source_margin;
1540         if( i_y % 2 )
1541         {
1542             p_u += i_source_margin_c;
1543             p_v += i_source_margin_c;
1544         }
1545     }
1546
1547     /* re-enable FPU registers */
1548     MMX_END;
1549
1550 #endif
1551 }
1552
1553 VLC_TARGET
1554 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1555 {
1556     /* We got this one from the old arguments */
1557     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1558     uint8_t  *p_y   = p_src->Y_PIXELS;
1559     uint8_t  *p_u   = p_src->U_PIXELS;
1560     uint8_t  *p_v   = p_src->V_PIXELS;
1561
1562     bool  b_hscale;                         /* horizontal scaling type */
1563     unsigned int i_vscale;                          /* vertical scaling type */
1564     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1565
1566     int         i_right_margin;
1567     int         i_rewind;
1568     int         i_scale_count;                       /* scale modulo counter */
1569     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1570     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1571     /* Conversion buffer pointer */
1572     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1573     uint32_t *  p_buffer;
1574
1575     /* Offset array pointer */
1576     int *       p_offset_start = p_filter->p_sys->p_offset;
1577     int *       p_offset;
1578
1579     const int i_source_margin = p_src->p[0].i_pitch
1580                                  - p_src->p[0].i_visible_pitch;
1581     const int i_source_margin_c = p_src->p[1].i_pitch
1582                                  - p_src->p[1].i_visible_pitch;
1583
1584     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1585
1586     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1587      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1588      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1589     SetOffset( p_filter->fmt_in.video.i_width,
1590                p_filter->fmt_in.video.i_height,
1591                p_filter->fmt_out.video.i_width,
1592                p_filter->fmt_out.video.i_height,
1593                &b_hscale, &i_vscale, p_offset_start );
1594
1595     /*
1596      * Perform conversion
1597      */
1598     i_scale_count = ( i_vscale == 1 ) ?
1599                     p_filter->fmt_out.video.i_height :
1600                     p_filter->fmt_in.video.i_height;
1601
1602 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1603
1604     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1605
1606     /*
1607     ** SSE2 128 bits fetch/store instructions are faster
1608     ** if memory access is 16 bytes aligned
1609     */
1610
1611     p_buffer = b_hscale ? p_buffer_start : p_pic;
1612     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1613                     p_dest->p->i_pitch|
1614                     ((intptr_t)p_y)|
1615                     ((intptr_t)p_buffer))) )
1616     {
1617         /* use faster SSE2 aligned fetch and store */
1618         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1619         {
1620             p_pic_start = p_pic;
1621
1622             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1623             {
1624                 SSE2_CALL (
1625                     SSE2_INIT_32_ALIGNED
1626                     SSE2_YUV_MUL
1627                     SSE2_YUV_ADD
1628                     SSE2_UNPACK_32_ABGR_ALIGNED
1629                 );
1630                 p_y += 16;
1631                 p_u += 8;
1632                 p_v += 8;
1633                 p_buffer += 16;
1634             }
1635
1636             /* Here we do some unaligned reads and duplicate conversions, but
1637              * at least we have all the pixels */
1638             if( i_rewind )
1639             {
1640                 p_y -= i_rewind;
1641                 p_u -= i_rewind >> 1;
1642                 p_v -= i_rewind >> 1;
1643                 p_buffer -= i_rewind;
1644                 SSE2_CALL (
1645                     SSE2_INIT_32_UNALIGNED
1646                     SSE2_YUV_MUL
1647                     SSE2_YUV_ADD
1648                     SSE2_UNPACK_32_ABGR_UNALIGNED
1649                 );
1650                 p_y += 16;
1651                 p_u += 4;
1652                 p_v += 4;
1653             }
1654             SCALE_WIDTH;
1655             SCALE_HEIGHT( 420, 4 );
1656
1657             p_y += i_source_margin;
1658             if( i_y % 2 )
1659             {
1660                 p_u += i_source_margin_c;
1661                 p_v += i_source_margin_c;
1662             }
1663             p_buffer = b_hscale ? p_buffer_start : p_pic;
1664         }
1665     }
1666     else
1667     {
1668         /* use slower SSE2 unaligned fetch and store */
1669         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1670         {
1671             p_pic_start = p_pic;
1672             p_buffer = b_hscale ? p_buffer_start : p_pic;
1673
1674             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1675             {
1676                 SSE2_CALL (
1677                     SSE2_INIT_32_UNALIGNED
1678                     SSE2_YUV_MUL
1679                     SSE2_YUV_ADD
1680                     SSE2_UNPACK_32_ABGR_UNALIGNED
1681                 );
1682                 p_y += 16;
1683                 p_u += 8;
1684                 p_v += 8;
1685                 p_buffer += 16;
1686             }
1687
1688             /* Here we do some unaligned reads and duplicate conversions, but
1689              * at least we have all the pixels */
1690             if( i_rewind )
1691             {
1692                 p_y -= i_rewind;
1693                 p_u -= i_rewind >> 1;
1694                 p_v -= i_rewind >> 1;
1695                 p_buffer -= i_rewind;
1696                 SSE2_CALL (
1697                     SSE2_INIT_32_UNALIGNED
1698                     SSE2_YUV_MUL
1699                     SSE2_YUV_ADD
1700                     SSE2_UNPACK_32_ABGR_UNALIGNED
1701                 );
1702                 p_y += 16;
1703                 p_u += 8;
1704                 p_v += 8;
1705             }
1706             SCALE_WIDTH;
1707             SCALE_HEIGHT( 420, 4 );
1708
1709             p_y += i_source_margin;
1710             if( i_y % 2 )
1711             {
1712                 p_u += i_source_margin_c;
1713                 p_v += i_source_margin_c;
1714             }
1715             p_buffer = b_hscale ? p_buffer_start : p_pic;
1716         }
1717     }
1718
1719 #else
1720
1721     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1722
1723     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1724     {
1725         p_pic_start = p_pic;
1726         p_buffer = b_hscale ? p_buffer_start : p_pic;
1727
1728         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1729         {
1730             MMX_CALL (
1731                 MMX_INIT_32
1732                 MMX_YUV_MUL
1733                 MMX_YUV_ADD
1734                 MMX_UNPACK_32_ABGR
1735             );
1736             p_y += 8;
1737             p_u += 4;
1738             p_v += 4;
1739             p_buffer += 8;
1740         }
1741
1742         /* Here we do some unaligned reads and duplicate conversions, but
1743          * at least we have all the pixels */
1744         if( i_rewind )
1745         {
1746             p_y -= i_rewind;
1747             p_u -= i_rewind >> 1;
1748             p_v -= i_rewind >> 1;
1749             p_buffer -= i_rewind;
1750             MMX_CALL (
1751                 MMX_INIT_32
1752                 MMX_YUV_MUL
1753                 MMX_YUV_ADD
1754                 MMX_UNPACK_32_ABGR
1755             );
1756             p_y += 8;
1757             p_u += 4;
1758             p_v += 4;
1759             p_buffer += 8;
1760         }
1761         SCALE_WIDTH;
1762         SCALE_HEIGHT( 420, 4 );
1763
1764         p_y += i_source_margin;
1765         if( i_y % 2 )
1766         {
1767             p_u += i_source_margin_c;
1768             p_v += i_source_margin_c;
1769         }
1770     }
1771
1772     /* re-enable FPU registers */
1773     MMX_END;
1774
1775 #endif
1776 }
1777
1778 #endif
1779
1780 /* Following functions are local */
1781
/*****************************************************************************
 * SetOffset: fill the horizontal offset table and the scaling indicators
 *****************************************************************************
 * Compares the source size (i_width x i_height) with the destination size
 * (i_pic_width x i_pic_height) and reports the result through the output
 * pointers:
 *  - *pb_hscale is 0 when both widths are equal, 1 otherwise;
 *  - *pi_vscale is 0 when both heights are equal, 1 when the destination is
 *    taller, and -1 (converted to unsigned int) when it is shorter;
 *  - p_offset is left untouched when the widths are equal. Otherwise, for
 *    positive widths, it receives i_pic_width entries:
 *      . destination wider: each entry is 0 or 1, with exactly i_width
 *        entries equal to 1;
 *      . destination narrower: each entry is >= 1 and the entries add up
 *        to i_width.
 *    How the entries are consumed is up to the conversion functions.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    int i_left;                            /* outer loop iterations remaining */
    int i_error;                                 /* running error accumulator */
    int i_index = 0;                        /* next slot to write in p_offset */

    /*
     * Horizontal part: the flag only depends on whether the widths differ
     */
    *pb_hscale = ( i_pic_width != i_width );

    if( i_pic_width > i_width )
    {
        /* Horizontal extension: one pass per source pixel, emitting a run
         * of zeroes followed by a single one */
        i_error = i_pic_width;
        i_left = i_width;
        while( i_left-- )
        {
            i_error -= i_width;
            while( i_error > 0 )
            {
                p_offset[i_index++] = 0;
                i_error -= i_width;
            }
            p_offset[i_index++] = 1;
            i_error += i_pic_width;
        }
    }
    else if( i_pic_width < i_width )
    {
        /* Horizontal reduction: one pass per destination pixel, emitting
         * a step that starts at one and grows while the error is positive */
        i_error = i_width;
        i_left = i_pic_width;
        while( i_left-- )
        {
            int i_step = 1;

            i_error -= i_pic_width;
            while( i_error > 0 )
            {
                i_step++;
                i_error -= i_pic_width;
            }
            p_offset[i_index++] = i_step;
            i_error += i_width;
        }
    }
    /* else: same width, the offset table is not used and not written */

    /*
     * Vertical part: only the direction of the scaling is reported
     */
    if( i_pic_height > i_height )
    {
        *pi_vscale = 1;
    }
    else if( i_pic_height < i_height )
    {
        *pi_vscale = -1;
    }
    else
    {
        *pi_vscale = 0;
    }
}
1851