]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
2f37e48d4f8b06361cbf772d00dc25c832c7dabc
[vlc] / modules / video_chroma / i420_rgb16.c
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc/vlc.h>
34 #include <vlc_vout.h>
35
36 #include "i420_rgb.h"
37 #if defined (MODULE_NAME_IS_i420_rgb)
38 #   include "i420_rgb_c.h"
39 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
40 #   include "i420_rgb_mmx.h"
41 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
42 #   include "i420_rgb_mmx.h"
43 #endif
44
/* Forward declaration; the definition is not in this part of the file.
 * The callers below pass, in order: render width, render height, output
 * width, output height, then the addresses of the horizontal scaling flag
 * and of the vertical scaling type (both uninitialised before the call, so
 * they are outputs), and finally the horizontal offset table (presumably
 * filled by SetOffset -- TODO confirm against the definition). */
45 static void SetOffset( int, int, int, int, vlc_bool_t *,
46                        unsigned int *, int * );
47
48 #if defined (MODULE_NAME_IS_i420_rgb)
49 /*****************************************************************************
50  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp with dithering
51  *****************************************************************************
52  * Horizontal alignment needed:
53  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
54  *  - output: 1 pixel (2 bytes), margins allowed
55  * Vertical alignment needed:
56  *  - input: 2 lines (2 Y lines, 1 U/V line)
57  *  - output: 1 line
58  *****************************************************************************/
59 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
60                                                       picture_t *p_dest )
61 {
       /* Reads the Y, U and V planes of p_src and writes 16-bit pixels to
        * the first plane of p_dest, walking the source line by line and
        * cycling through the four 4-entry dither tables declared below.
        * The CONVERT_*_DITHER, SCALE_WIDTH and SCALE_HEIGHT macros are
        * defined outside this chunk; they presumably read and advance the
        * locals declared here by name (TODO confirm in i420_rgb_c.h), so
        * do not rename or drop a local without checking the macros. */
62     /* We got this one from the old arguments */
63     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
64     uint8_t  *p_y   = p_src->Y_PIXELS;
65     uint8_t  *p_u   = p_src->U_PIXELS;
66     uint8_t  *p_v   = p_src->V_PIXELS;
67
68     vlc_bool_t   b_hscale;                        /* horizontal scaling type */
69     unsigned int i_vscale;                          /* vertical scaling type */
70     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
71     unsigned int i_real_y;                                          /* y % 4 */
72
73     int         i_right_margin;
74     int         i_rewind;
75     int         i_scale_count;                       /* scale modulo counter */
76     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
77     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
78     int         i_uval, i_vval;                           /* U and V samples */
79     int         i_red, i_green, i_blue; /* R, G, B terms; presumably set by the CONVERT_* macros */
80     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
81     uint16_t *  p_ybase;                     /* Y dependent conversion table */
82
83     /* Conversion buffer pointer */
84     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
85     uint16_t *  p_buffer;
86
87     /* Offset array pointer */
88     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
89     int *       p_offset;
90
       /* Bytes between the visible end of a source line and the start of
        * the next one, for the luma plane and for plane 1 (chroma) */
91     const int i_source_margin = p_src->p[0].i_pitch
92                                  - p_src->p[0].i_visible_pitch;
93     const int i_source_margin_c = p_src->p[1].i_pitch
94                                  - p_src->p[1].i_visible_pitch;
95
96     /* The dithering matrices */
97     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
98     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
99     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
100     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
101
        /* Scale every dither entry by the same amount,
         * SHIFT - 4 + output.i_rrshift (SHIFT is defined outside this chunk) */
102     for(i_x = 0; i_x < 4; i_x++)
103     {
104         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
105         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
106         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
107         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
108     }
109
110     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
111
        /* When the width is not a multiple of 8, i_rewind is the number of
         * pixels the pointers are moved back by so that one extra group of 8
         * can be converted for the end of the line */
112     if( p_vout->render.i_width & 7 )
113     {
114         i_rewind = 8 - ( p_vout->render.i_width & 7 );
115     }
116     else
117     {
118         i_rewind = 0;
119     }
120
121     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
122      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
123      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
124     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
125                p_vout->output.i_width, p_vout->output.i_height,
126                &b_hscale, &i_vscale, p_offset_start );
127
128     /*
129      * Perform conversion
130      */
        /* The scale counter starts at the output height when i_vscale is 1,
         * at the render height otherwise (presumably consumed by SCALE_HEIGHT) */
131     i_scale_count = ( i_vscale == 1 ) ?
132                     p_vout->output.i_height : p_vout->render.i_height;
133     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
134     {
135         i_real_y = i_y & 0x3;
136         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * needed, straight into the destination line otherwise */
137         p_buffer = b_hscale ? p_buffer_start : p_pic;
138
            /* One iteration per group of 8 pixels; the dither table changes
             * for every pixel and repeats after 4 */
139         for ( i_x = p_vout->render.i_width / 8; i_x--; )
140         {
141             int *p_dither = dither10;
142             CONVERT_YUV_PIXEL_DITHER(2);
143             p_dither = dither11;
144             CONVERT_Y_PIXEL_DITHER(2);
145             p_dither = dither12;
146             CONVERT_YUV_PIXEL_DITHER(2);
147             p_dither = dither13;
148             CONVERT_Y_PIXEL_DITHER(2);
149             p_dither = dither10;
150             CONVERT_YUV_PIXEL_DITHER(2);
151             p_dither = dither11;
152             CONVERT_Y_PIXEL_DITHER(2);
153             p_dither = dither12;
154             CONVERT_YUV_PIXEL_DITHER(2);
155             p_dither = dither13;
156             CONVERT_Y_PIXEL_DITHER(2);
157         }
158
159         /* Here we do some unaligned reads and duplicate conversions, but
160          * at least we have all the pixels */
161         if( i_rewind )
162         {
163             int *p_dither = dither10;
                /* Luma and output step back i_rewind pixels, chroma half of
                 * that, then the same 8-pixel sequence as above is run once */
164             p_y -= i_rewind;
165             p_u -= i_rewind >> 1;
166             p_v -= i_rewind >> 1;
167             p_buffer -= i_rewind;
168             CONVERT_YUV_PIXEL_DITHER(2);
169             p_dither = dither11;
170             CONVERT_Y_PIXEL_DITHER(2);
171             p_dither = dither12;
172             CONVERT_YUV_PIXEL_DITHER(2);
173             p_dither = dither13;
174             CONVERT_Y_PIXEL_DITHER(2);
175             p_dither = dither10;
176             CONVERT_YUV_PIXEL_DITHER(2);
177             p_dither = dither11;
178             CONVERT_Y_PIXEL_DITHER(2);
179             p_dither = dither12;
180             CONVERT_YUV_PIXEL_DITHER(2);
181             p_dither = dither13;
182             CONVERT_Y_PIXEL_DITHER(2);
183         }
184         SCALE_WIDTH;
185         SCALE_HEIGHT( 420, 2 );
186
            /* Skip the source line padding; the chroma padding is only added
             * after odd-numbered lines */
187         p_y += i_source_margin;
188         if( i_y % 2 )
189         {
190             p_u += i_source_margin_c;
191             p_v += i_source_margin_c;
192         }
193     }
194 }
195 #endif
196
197 /*****************************************************************************
198  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
199  *****************************************************************************
200  * Horizontal alignment needed:
201  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
202  *  - output: 1 pixel (2 bytes), margins allowed
203  * Vertical alignment needed:
204  *  - input: 2 lines (2 Y lines, 1 U/V line)
205  *  - output: 1 line
206  *****************************************************************************/
207
208 #if defined (MODULE_NAME_IS_i420_rgb)
209
210 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
211                                             picture_t *p_dest )
212 {
        /* Reads the Y, U and V planes of p_src and writes 16-bit pixels to
         * the first plane of p_dest, walking the source line by line.  The
         * CONVERT_*_PIXEL, SCALE_WIDTH and SCALE_HEIGHT macros are defined
         * outside this chunk; they presumably read and advance the locals
         * declared here by name (TODO confirm in i420_rgb_c.h), so do not
         * rename or drop a local without checking the macros. */
213     /* We got this one from the old arguments */
214     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
215     uint8_t  *p_y   = p_src->Y_PIXELS;
216     uint8_t  *p_u   = p_src->U_PIXELS;
217     uint8_t  *p_v   = p_src->V_PIXELS;
218
219     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
220     unsigned int i_vscale;                          /* vertical scaling type */
221     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
222
223     int         i_right_margin;
224     int         i_rewind;
225     int         i_scale_count;                       /* scale modulo counter */
226     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
227     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
228     int         i_uval, i_vval;                           /* U and V samples */
229     int         i_red, i_green, i_blue; /* R, G, B terms; presumably set by the CONVERT_* macros */
230     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
231     uint16_t *  p_ybase;                     /* Y dependent conversion table */
232
233     /* Conversion buffer pointer */
234     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
235     uint16_t *  p_buffer;
236
237     /* Offset array pointer */
238     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
239     int *       p_offset;
240
        /* Bytes between the visible end of a source line and the start of
         * the next one, for the luma plane and for plane 1 (chroma) */
241     const int i_source_margin = p_src->p[0].i_pitch
242                                  - p_src->p[0].i_visible_pitch;
243     const int i_source_margin_c = p_src->p[1].i_pitch
244                                  - p_src->p[1].i_visible_pitch;
245
246     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
247
        /* When the width is not a multiple of 8, i_rewind is the number of
         * pixels the pointers are moved back by so that one extra group of 8
         * can be converted for the end of the line */
248     if( p_vout->render.i_width & 7 )
249     {
250         i_rewind = 8 - ( p_vout->render.i_width & 7 );
251     }
252     else
253     {
254         i_rewind = 0;
255     }
256
257     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
258      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
259      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
260     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
261                p_vout->output.i_width, p_vout->output.i_height,
262                &b_hscale, &i_vscale, p_offset_start );
263
264     /*
265      * Perform conversion
266      */
        /* The scale counter starts at the output height when i_vscale is 1,
         * at the render height otherwise (presumably consumed by SCALE_HEIGHT) */
267     i_scale_count = ( i_vscale == 1 ) ?
268                     p_vout->output.i_height : p_vout->render.i_height;
269     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
270     {
271         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * needed, straight into the destination line otherwise */
272         p_buffer = b_hscale ? p_buffer_start : p_pic;
273
            /* One iteration per group of 8 pixels (four macro pairs) */
274         for ( i_x = p_vout->render.i_width / 8; i_x--; )
275         {
276             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
277             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
278             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
279             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
280         }
281
282         /* Here we do some unaligned reads and duplicate conversions, but
283          * at least we have all the pixels */
284         if( i_rewind )
285         {
                /* Luma and output step back i_rewind pixels, chroma half of
                 * that, then the same 8-pixel sequence as above is run once */
286             p_y -= i_rewind;
287             p_u -= i_rewind >> 1;
288             p_v -= i_rewind >> 1;
289             p_buffer -= i_rewind;
290
291             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
292             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
293             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
294             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
295         }
296         SCALE_WIDTH;
297         SCALE_HEIGHT( 420, 2 );
298
            /* Skip the source line padding; the chroma padding is only added
             * after odd-numbered lines */
299         p_y += i_source_margin;
300         if( i_y % 2 )
301         {
302             p_u += i_source_margin_c;
303             p_v += i_source_margin_c;
304         }
305     }
306 }
307
308 #else // ! defined (MODULE_NAME_IS_i420_rgb)
309
310 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
311                                             picture_t *p_dest )
312 {
        /* Reads the Y, U and V planes of p_src and writes 16-bit pixels to
         * the first plane of p_dest.  The per-group work is done by the
         * SSE2_* or MMX_* macro sequences selected at compile time, ending
         * with the *_UNPACK_15* step.  Those macros, like SCALE_WIDTH and
         * SCALE_HEIGHT, are defined outside this chunk and presumably refer
         * to the locals declared here by name (TODO confirm in
         * i420_rgb_mmx.h), so do not rename or drop a local without
         * checking them. */
313     /* We got this one from the old arguments */
314     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
315     uint8_t  *p_y   = p_src->Y_PIXELS;
316     uint8_t  *p_u   = p_src->U_PIXELS;
317     uint8_t  *p_v   = p_src->V_PIXELS;
318
319     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
320     unsigned int i_vscale;                          /* vertical scaling type */
321     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
322
323     int         i_right_margin;
324     int         i_rewind;
325     int         i_scale_count;                       /* scale modulo counter */
326     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
327     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
328
329     /* Conversion buffer pointer */
330     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
331     uint16_t *  p_buffer;
332
333     /* Offset array pointer */
334     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
335     int *       p_offset;
336
        /* Bytes between the visible end of a source line and the start of
         * the next one, for the luma plane and for plane 1 (chroma) */
337     const int i_source_margin = p_src->p[0].i_pitch
338                                  - p_src->p[0].i_visible_pitch;
339     const int i_source_margin_c = p_src->p[1].i_pitch
340                                  - p_src->p[1].i_visible_pitch;
341
342     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
343
344     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
345      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
346      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
347     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
348                p_vout->output.i_width, p_vout->output.i_height,
349                &b_hscale, &i_vscale, p_offset_start );
350
351
352     /*
353      * Perform conversion
354      */
        /* The scale counter starts at the output height when i_vscale is 1,
         * at the render height otherwise (presumably consumed by SCALE_HEIGHT) */
355     i_scale_count = ( i_vscale == 1 ) ?
356                     p_vout->output.i_height : p_vout->render.i_height;
357
358 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
359
        /* SSE2 works on groups of 16 pixels: when the width is not a multiple
         * of 16, i_rewind is how far the pointers step back so that one extra
         * group of 16 ends on the last pixel of the line */
360     if( p_vout->render.i_width & 15 )
361     {
362         i_rewind = 16 - ( p_vout->render.i_width & 15 );
363     }
364     else
365     {
366         i_rewind = 0;
367     }
368
369     /*
370     ** SSE2 128 bits fetch/store instructions are faster
371     ** if memory access is 16 bytes aligned
372     */
373
374     p_buffer = b_hscale ? p_buffer_start : p_pic;
        /* The OR of both pitches and both start addresses has its low 4 bits
         * clear only when every one of them is a multiple of 16 */
375     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
376                     p_dest->p->i_pitch|
377                     ((intptr_t)p_y)|
378                     ((intptr_t)p_buffer))) )
379     {
380         /* use faster SSE2 aligned fetch and store */
381         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
382         {
383             p_pic_start = p_pic;
384
385             for ( i_x = p_vout->render.i_width/16; i_x--; )
386             {
387                 SSE2_CALL (
388                     SSE2_INIT_16_ALIGNED
389                     SSE2_YUV_MUL
390                     SSE2_YUV_ADD
391                     SSE2_UNPACK_15_ALIGNED
392                 );
393                 p_y += 16;
394                 p_u += 8;
395                 p_v += 8;
396                 p_buffer += 16;
397             }
398             /* Here we do some unaligned reads and duplicate conversions, but
399              * at least we have all the pixels */
400             if( i_rewind )
401             {
                    /* After stepping back the pointers are in general no
                     * longer 16-byte aligned, hence the UNALIGNED macros even
                     * in this aligned branch */
402                 p_y -= i_rewind;
403                 p_u -= i_rewind >> 1;
404                 p_v -= i_rewind >> 1;
405                 p_buffer -= i_rewind;
406
407                 SSE2_CALL (
408                     SSE2_INIT_16_UNALIGNED
409                     SSE2_YUV_MUL
410                     SSE2_YUV_ADD
411                     SSE2_UNPACK_15_UNALIGNED
412                 );
                    /* p_buffer is not advanced here (the MMX branch below
                     * does advance it); it is reassigned at the end of this
                     * line iteration */
413                 p_y += 16;
414                 p_u += 8;
415                 p_v += 8;
416             }
417             SCALE_WIDTH;
418             SCALE_HEIGHT( 420, 2 );
419
                /* Skip the source line padding; the chroma padding is only
                 * added after odd-numbered lines */
420             p_y += i_source_margin;
421             if( i_y % 2 )
422             {
423                 p_u += i_source_margin_c;
424                 p_v += i_source_margin_c;
425             }
                /* Output target for the next line */
426             p_buffer = b_hscale ? p_buffer_start : p_pic;
427         }
428     }
429     else
430     {
431         /* use slower SSE2 unaligned fetch and store */
432         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
433         {
434             p_pic_start = p_pic;
435             p_buffer = b_hscale ? p_buffer_start : p_pic;
436
437             for ( i_x = p_vout->render.i_width/16; i_x--; )
438             {
439                 SSE2_CALL (
440                     SSE2_INIT_16_UNALIGNED
441                     SSE2_YUV_MUL
442                     SSE2_YUV_ADD
443                     SSE2_UNPACK_15_UNALIGNED
444                 );
445                 p_y += 16;
446                 p_u += 8;
447                 p_v += 8;
448                 p_buffer += 16;
449             }
450             /* Here we do some unaligned reads and duplicate conversions, but
451              * at least we have all the pixels */
452             if( i_rewind )
453             {
454                 p_y -= i_rewind;
455                 p_u -= i_rewind >> 1;
456                 p_v -= i_rewind >> 1;
457                 p_buffer -= i_rewind;
458
459                 SSE2_CALL (
460                     SSE2_INIT_16_UNALIGNED
461                     SSE2_YUV_MUL
462                     SSE2_YUV_ADD
463                     SSE2_UNPACK_15_UNALIGNED
464                 );
465                 p_y += 16;
466                 p_u += 8;
467                 p_v += 8;
468             }
469             SCALE_WIDTH;
470             SCALE_HEIGHT( 420, 2 );
471
                /* Skip the source line padding; the chroma padding is only
                 * added after odd-numbered lines */
472             p_y += i_source_margin;
473             if( i_y % 2 )
474             {
475                 p_u += i_source_margin_c;
476                 p_v += i_source_margin_c;
477             }
                /* Same assignment as at the top of the loop body */
478             p_buffer = b_hscale ? p_buffer_start : p_pic;
479         }
480     }
481
482     /* make sure all SSE2 stores are visible thereafter */
483     SSE2_END;
484
485 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
486
        /* MMX works on groups of 8 pixels: when the width is not a multiple
         * of 8, i_rewind is how far the pointers step back so that one extra
         * group of 8 ends on the last pixel of the line */
487     if( p_vout->render.i_width & 7 )
488     {
489         i_rewind = 8 - ( p_vout->render.i_width & 7 );
490     }
491     else
492     {
493         i_rewind = 0;
494     }
495
496     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
497     {
498         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * needed, straight into the destination line otherwise */
499         p_buffer = b_hscale ? p_buffer_start : p_pic;
500
501         for ( i_x = p_vout->render.i_width / 8; i_x--; )
502         {
503             MMX_CALL (
504                 MMX_INIT_16
505                 MMX_YUV_MUL
506                 MMX_YUV_ADD
507                 MMX_UNPACK_15
508             );
509             p_y += 8;
510             p_u += 4;
511             p_v += 4;
512             p_buffer += 8;
513         }
514
515         /* Here we do some unaligned reads and duplicate conversions, but
516          * at least we have all the pixels */
517         if( i_rewind )
518         {
519             p_y -= i_rewind;
520             p_u -= i_rewind >> 1;
521             p_v -= i_rewind >> 1;
522             p_buffer -= i_rewind;
523
524             MMX_CALL (
525                 MMX_INIT_16
526                 MMX_YUV_MUL
527                 MMX_YUV_ADD
528                 MMX_UNPACK_15
529             );
530             p_y += 8;
531             p_u += 4;
532             p_v += 4;
533             p_buffer += 8;
534         }
535         SCALE_WIDTH;
536         SCALE_HEIGHT( 420, 2 );
537
            /* Skip the source line padding; the chroma padding is only added
             * after odd-numbered lines */
538         p_y += i_source_margin;
539         if( i_y % 2 )
540         {
541             p_u += i_source_margin_c;
542             p_v += i_source_margin_c;
543         }
544     }
545     /* re-enable FPU registers */
546     MMX_END;
547
548 #endif
549 }
550
551 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
552                                             picture_t *p_dest )
553 {
        /* Reads the Y, U and V planes of p_src and writes 16-bit pixels to
         * the first plane of p_dest.  The per-group work is done by the
         * SSE2_* or MMX_* macro sequences selected at compile time, ending
         * with the *_UNPACK_16* step.  Those macros, like SCALE_WIDTH and
         * SCALE_HEIGHT, are defined outside this chunk and presumably refer
         * to the locals declared here by name (TODO confirm in
         * i420_rgb_mmx.h), so do not rename or drop a local without
         * checking them. */
554     /* We got this one from the old arguments */
555     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
556     uint8_t  *p_y   = p_src->Y_PIXELS;
557     uint8_t  *p_u   = p_src->U_PIXELS;
558     uint8_t  *p_v   = p_src->V_PIXELS;
559
560     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
561     unsigned int i_vscale;                          /* vertical scaling type */
562     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
563
564     int         i_right_margin;
565     int         i_rewind;
566     int         i_scale_count;                       /* scale modulo counter */
567     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
568     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
569
570     /* Conversion buffer pointer */
571     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
572     uint16_t *  p_buffer;
573
574     /* Offset array pointer */
575     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
576     int *       p_offset;
577
        /* Bytes between the visible end of a source line and the start of
         * the next one, for the luma plane and for plane 1 (chroma) */
578     const int i_source_margin = p_src->p[0].i_pitch
579                                  - p_src->p[0].i_visible_pitch;
580     const int i_source_margin_c = p_src->p[1].i_pitch
581                                  - p_src->p[1].i_visible_pitch;
582
583     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
584
585     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
586      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
587      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
588     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
589                p_vout->output.i_width, p_vout->output.i_height,
590                &b_hscale, &i_vscale, p_offset_start );
591
592
593     /*
594      * Perform conversion
595      */
        /* The scale counter starts at the output height when i_vscale is 1,
         * at the render height otherwise (presumably consumed by SCALE_HEIGHT) */
596     i_scale_count = ( i_vscale == 1 ) ?
597                     p_vout->output.i_height : p_vout->render.i_height;
598
599 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
600
        /* SSE2 works on groups of 16 pixels: when the width is not a multiple
         * of 16, i_rewind is how far the pointers step back so that one extra
         * group of 16 ends on the last pixel of the line */
601     if( p_vout->render.i_width & 15 )
602     {
603         i_rewind = 16 - ( p_vout->render.i_width & 15 );
604     }
605     else
606     {
607         i_rewind = 0;
608     }
609
610     /*
611     ** SSE2 128 bits fetch/store instructions are faster
612     ** if memory access is 16 bytes aligned
613     */
614
615     p_buffer = b_hscale ? p_buffer_start : p_pic;
        /* The OR of both pitches and both start addresses has its low 4 bits
         * clear only when every one of them is a multiple of 16 */
616     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
617                     p_dest->p->i_pitch|
618                     ((intptr_t)p_y)|
619                     ((intptr_t)p_buffer))) )
620     {
621         /* use faster SSE2 aligned fetch and store */
622         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
623         {
624             p_pic_start = p_pic;
625
626             for ( i_x = p_vout->render.i_width/16; i_x--; )
627             {
628                 SSE2_CALL (
629                     SSE2_INIT_16_ALIGNED
630                     SSE2_YUV_MUL
631                     SSE2_YUV_ADD
632                     SSE2_UNPACK_16_ALIGNED
633                 );
634                 p_y += 16;
635                 p_u += 8;
636                 p_v += 8;
637                 p_buffer += 16;
638             }
639             /* Here we do some unaligned reads and duplicate conversions, but
640              * at least we have all the pixels */
641             if( i_rewind )
642             {
                    /* After stepping back the pointers are in general no
                     * longer 16-byte aligned, hence the UNALIGNED macros even
                     * in this aligned branch */
643                 p_y -= i_rewind;
644                 p_u -= i_rewind >> 1;
645                 p_v -= i_rewind >> 1;
646                 p_buffer -= i_rewind;
647
648                 SSE2_CALL (
649                     SSE2_INIT_16_UNALIGNED
650                     SSE2_YUV_MUL
651                     SSE2_YUV_ADD
652                     SSE2_UNPACK_16_UNALIGNED
653                 );
                    /* p_buffer is not advanced here (the MMX branch below
                     * does advance it); it is reassigned at the end of this
                     * line iteration */
654                 p_y += 16;
655                 p_u += 8;
656                 p_v += 8;
657             }
658             SCALE_WIDTH;
659             SCALE_HEIGHT( 420, 2 );
660
                /* Skip the source line padding; the chroma padding is only
                 * added after odd-numbered lines */
661             p_y += i_source_margin;
662             if( i_y % 2 )
663             {
664                 p_u += i_source_margin_c;
665                 p_v += i_source_margin_c;
666             }
                /* Output target for the next line */
667             p_buffer = b_hscale ? p_buffer_start : p_pic;
668         }
669     }
670     else
671     {
672         /* use slower SSE2 unaligned fetch and store */
673         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
674         {
675             p_pic_start = p_pic;
676             p_buffer = b_hscale ? p_buffer_start : p_pic;
677
678             for ( i_x = p_vout->render.i_width/16; i_x--; )
679             {
680                 SSE2_CALL(
681                     SSE2_INIT_16_UNALIGNED
682                     SSE2_YUV_MUL
683                     SSE2_YUV_ADD
684                     SSE2_UNPACK_16_UNALIGNED
685                 );
686                 p_y += 16;
687                 p_u += 8;
688                 p_v += 8;
689                 p_buffer += 16;
690             }
691             /* Here we do some unaligned reads and duplicate conversions, but
692              * at least we have all the pixels */
693             if( i_rewind )
694             {
695                 p_y -= i_rewind;
696                 p_u -= i_rewind >> 1;
697                 p_v -= i_rewind >> 1;
698                 p_buffer -= i_rewind;
699
700                 SSE2_CALL(
701                     SSE2_INIT_16_UNALIGNED
702                     SSE2_YUV_MUL
703                     SSE2_YUV_ADD
704                     SSE2_UNPACK_16_UNALIGNED
705                 );
706                 p_y += 16;
707                 p_u += 8;
708                 p_v += 8;
709             }
710             SCALE_WIDTH;
711             SCALE_HEIGHT( 420, 2 );
712
                /* Skip the source line padding; the chroma padding is only
                 * added after odd-numbered lines */
713             p_y += i_source_margin;
714             if( i_y % 2 )
715             {
716                 p_u += i_source_margin_c;
717                 p_v += i_source_margin_c;
718             }
                /* Same assignment as at the top of the loop body */
719             p_buffer = b_hscale ? p_buffer_start : p_pic;
720         }
721     }
722
723     /* make sure all SSE2 stores are visible thereafter */
724     SSE2_END;
725
726 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
727
        /* MMX works on groups of 8 pixels: when the width is not a multiple
         * of 8, i_rewind is how far the pointers step back so that one extra
         * group of 8 ends on the last pixel of the line */
728     if( p_vout->render.i_width & 7 )
729     {
730         i_rewind = 8 - ( p_vout->render.i_width & 7 );
731     }
732     else
733     {
734         i_rewind = 0;
735     }
736
737     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
738     {
739         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * needed, straight into the destination line otherwise */
740         p_buffer = b_hscale ? p_buffer_start : p_pic;
741
742         for ( i_x = p_vout->render.i_width / 8; i_x--; )
743         {
744             MMX_CALL (
745                 MMX_INIT_16
746                 MMX_YUV_MUL
747                 MMX_YUV_ADD
748                 MMX_UNPACK_16
749             );
750             p_y += 8;
751             p_u += 4;
752             p_v += 4;
753             p_buffer += 8;
754         }
755
756         /* Here we do some unaligned reads and duplicate conversions, but
757          * at least we have all the pixels */
758         if( i_rewind )
759         {
760             p_y -= i_rewind;
761             p_u -= i_rewind >> 1;
762             p_v -= i_rewind >> 1;
763             p_buffer -= i_rewind;
764
765             MMX_CALL (
766                 MMX_INIT_16
767                 MMX_YUV_MUL
768                 MMX_YUV_ADD
769                 MMX_UNPACK_16
770             );
771             p_y += 8;
772             p_u += 4;
773             p_v += 4;
774             p_buffer += 8;
775         }
776         SCALE_WIDTH;
777         SCALE_HEIGHT( 420, 2 );
778
            /* Skip the source line padding; the chroma padding is only added
             * after odd-numbered lines */
779         p_y += i_source_margin;
780         if( i_y % 2 )
781         {
782             p_u += i_source_margin_c;
783             p_v += i_source_margin_c;
784         }
785     }
786     /* re-enable FPU registers */
787     MMX_END;
788
789 #endif
790 }
791
792 #endif
793
794 /*****************************************************************************
795  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
796  *****************************************************************************
797  * Horizontal alignment needed:
798  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
799  *  - output: 1 pixel (4 bytes), margins allowed
800  * Vertical alignment needed:
801  *  - input: 2 lines (2 Y lines, 1 U/V line)
802  *  - output: 1 line
803  *****************************************************************************/
804
805 #if defined (MODULE_NAME_IS_i420_rgb)
806
807 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
808                                             picture_t *p_dest )
809 {
        /* Reads the Y, U and V planes of p_src and writes 32-bit pixels to
         * the first plane of p_dest, walking the source line by line.  The
         * CONVERT_*_PIXEL, SCALE_WIDTH and SCALE_HEIGHT macros are defined
         * outside this chunk; they presumably read and advance the locals
         * declared here by name (TODO confirm in i420_rgb_c.h), so do not
         * rename or drop a local without checking the macros. */
810     /* We got this one from the old arguments */
811     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
812     uint8_t  *p_y   = p_src->Y_PIXELS;
813     uint8_t  *p_u   = p_src->U_PIXELS;
814     uint8_t  *p_v   = p_src->V_PIXELS;
815
816     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
817     unsigned int i_vscale;                          /* vertical scaling type */
818     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
819
820     int         i_right_margin;
821     int         i_rewind;
822     int         i_scale_count;                       /* scale modulo counter */
823     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
824     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
825     int         i_uval, i_vval;                           /* U and V samples */
826     int         i_red, i_green, i_blue; /* R, G, B terms; presumably set by the CONVERT_* macros */
827     uint32_t *  p_yuv = p_vout->chroma.p_sys->p_rgb32;
828     uint32_t *  p_ybase;                     /* Y dependent conversion table */
829
830     /* Conversion buffer pointer */
831     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
832     uint32_t *  p_buffer;
833
834     /* Offset array pointer */
835     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
836     int *       p_offset;
837
        /* Bytes between the visible end of a source line and the start of
         * the next one, for the luma plane and for plane 1 (chroma) */
838     const int i_source_margin = p_src->p[0].i_pitch
839                                  - p_src->p[0].i_visible_pitch;
840     const int i_source_margin_c = p_src->p[1].i_pitch
841                                  - p_src->p[1].i_visible_pitch;
842
843     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
844
        /* When the width is not a multiple of 8, i_rewind is the number of
         * pixels the pointers are moved back by so that one extra group of 8
         * can be converted for the end of the line */
845     if( p_vout->render.i_width & 7 )
846     {
847         i_rewind = 8 - ( p_vout->render.i_width & 7 );
848     }
849     else
850     {
851         i_rewind = 0;
852     }
853
854     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
855      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
856      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
857     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
858                p_vout->output.i_width, p_vout->output.i_height,
859                &b_hscale, &i_vscale, p_offset_start );
860
861     /*
862      * Perform conversion
863      */
        /* The scale counter starts at the output height when i_vscale is 1,
         * at the render height otherwise (presumably consumed by SCALE_HEIGHT) */
864     i_scale_count = ( i_vscale == 1 ) ?
865                     p_vout->output.i_height : p_vout->render.i_height;
866     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
867     {
868         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * needed, straight into the destination line otherwise */
869         p_buffer = b_hscale ? p_buffer_start : p_pic;
870
            /* One iteration per group of 8 pixels (four macro pairs) */
871         for ( i_x = p_vout->render.i_width / 8; i_x--; )
872         {
873             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
874             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
875             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
876             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
877         }
878
879         /* Here we do some unaligned reads and duplicate conversions, but
880          * at least we have all the pixels */
881         if( i_rewind )
882         {
                /* Luma and output step back i_rewind pixels, chroma half of
                 * that, then the same 8-pixel sequence as above is run once */
883             p_y -= i_rewind;
884             p_u -= i_rewind >> 1;
885             p_v -= i_rewind >> 1;
886             p_buffer -= i_rewind;
887             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
888             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
889             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
890             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
891         }
892         SCALE_WIDTH;
893         SCALE_HEIGHT( 420, 4 );
894
            /* Skip the source line padding; the chroma padding is only added
             * after odd-numbered lines */
895         p_y += i_source_margin;
896         if( i_y % 2 )
897         {
898             p_u += i_source_margin_c;
899             p_v += i_source_margin_c;
900         }
901     }
902 }
903
904 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
905
906 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
907                                             picture_t *p_dest )
908 {
909     /* We got this one from the old arguments */
910     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
911     uint8_t  *p_y   = p_src->Y_PIXELS;
912     uint8_t  *p_u   = p_src->U_PIXELS;
913     uint8_t  *p_v   = p_src->V_PIXELS;
914
915     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
916     unsigned int i_vscale;                          /* vertical scaling type */
917     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
918
919     int         i_right_margin;
920     int         i_rewind;
921     int         i_scale_count;                       /* scale modulo counter */
922     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
923     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
924     /* Conversion buffer pointer */
925     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
926     uint32_t *  p_buffer;
927
928     /* Offset array pointer */
929     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
930     int *       p_offset;
931
932     const int i_source_margin = p_src->p[0].i_pitch
933                                  - p_src->p[0].i_visible_pitch;
934     const int i_source_margin_c = p_src->p[1].i_pitch
935                                  - p_src->p[1].i_visible_pitch;
936
937     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
938
939     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
940      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
941      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
942     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
943                p_vout->output.i_width, p_vout->output.i_height,
944                &b_hscale, &i_vscale, p_offset_start );
945
946     /*
947      * Perform conversion
948      */
949     i_scale_count = ( i_vscale == 1 ) ?
950                     p_vout->output.i_height : p_vout->render.i_height;
951
952 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
953
954     if( p_vout->render.i_width & 15 )
955     {
956         i_rewind = 16 - ( p_vout->render.i_width & 15 );
957     }
958     else
959     {
960         i_rewind = 0;
961     }
962
963     /*
964     ** SSE2 128 bits fetch/store instructions are faster
965     ** if memory access is 16 bytes aligned
966     */
967
968     p_buffer = b_hscale ? p_buffer_start : p_pic;
969     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
970                     p_dest->p->i_pitch|
971                     ((intptr_t)p_y)|
972                     ((intptr_t)p_buffer))) )
973     {
974         /* use faster SSE2 aligned fetch and store */
975         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
976         {
977             p_pic_start = p_pic;
978
979             for ( i_x = p_vout->render.i_width / 16; i_x--; )
980             {
981                 SSE2_CALL (
982                     SSE2_INIT_32_ALIGNED
983                     SSE2_YUV_MUL
984                     SSE2_YUV_ADD
985                     SSE2_UNPACK_32_ARGB_ALIGNED
986                 );
987                 p_y += 16;
988                 p_u += 8;
989                 p_v += 8;
990                 p_buffer += 16;
991             }
992
993             /* Here we do some unaligned reads and duplicate conversions, but
994              * at least we have all the pixels */
995             if( i_rewind )
996             {
997                 p_y -= i_rewind;
998                 p_u -= i_rewind >> 1;
999                 p_v -= i_rewind >> 1;
1000                 p_buffer -= i_rewind;
1001                 SSE2_CALL (
1002                     SSE2_INIT_32_UNALIGNED
1003                     SSE2_YUV_MUL
1004                     SSE2_YUV_ADD
1005                     SSE2_UNPACK_32_ARGB_UNALIGNED
1006                 );
1007                 p_y += 16;
1008                 p_u += 4;
1009                 p_v += 4;
1010             }
1011             SCALE_WIDTH;
1012             SCALE_HEIGHT( 420, 4 );
1013
1014             p_y += i_source_margin;
1015             if( i_y % 2 )
1016             {
1017                 p_u += i_source_margin_c;
1018                 p_v += i_source_margin_c;
1019             }
1020             p_buffer = b_hscale ? p_buffer_start : p_pic;
1021         }
1022     }
1023     else
1024     {
1025         /* use slower SSE2 unaligned fetch and store */
1026         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1027         {
1028             p_pic_start = p_pic;
1029             p_buffer = b_hscale ? p_buffer_start : p_pic;
1030
1031             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1032             {
1033                 SSE2_CALL (
1034                     SSE2_INIT_32_UNALIGNED
1035                     SSE2_YUV_MUL
1036                     SSE2_YUV_ADD
1037                     SSE2_UNPACK_32_ARGB_UNALIGNED
1038                 );
1039                 p_y += 16;
1040                 p_u += 8;
1041                 p_v += 8;
1042                 p_buffer += 16;
1043             }
1044
1045             /* Here we do some unaligned reads and duplicate conversions, but
1046              * at least we have all the pixels */
1047             if( i_rewind )
1048             {
1049                 p_y -= i_rewind;
1050                 p_u -= i_rewind >> 1;
1051                 p_v -= i_rewind >> 1;
1052                 p_buffer -= i_rewind;
1053                 SSE2_CALL (
1054                     SSE2_INIT_32_UNALIGNED
1055                     SSE2_YUV_MUL
1056                     SSE2_YUV_ADD
1057                     SSE2_UNPACK_32_ARGB_UNALIGNED
1058                 );
1059                 p_y += 16;
1060                 p_u += 8;
1061                 p_v += 8;
1062             }
1063             SCALE_WIDTH;
1064             SCALE_HEIGHT( 420, 4 );
1065
1066             p_y += i_source_margin;
1067             if( i_y % 2 )
1068             {
1069                 p_u += i_source_margin_c;
1070                 p_v += i_source_margin_c;
1071             }
1072             p_buffer = b_hscale ? p_buffer_start : p_pic;
1073         }
1074     }
1075
1076     /* make sure all SSE2 stores are visible thereafter */
1077     SSE2_END;
1078
1079 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1080
1081     if( p_vout->render.i_width & 7 )
1082     {
1083         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1084     }
1085     else
1086     {
1087         i_rewind = 0;
1088     }
1089
1090     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1091     {
1092         p_pic_start = p_pic;
1093         p_buffer = b_hscale ? p_buffer_start : p_pic;
1094
1095         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1096         {
1097             MMX_CALL (
1098                 MMX_INIT_32
1099                 MMX_YUV_MUL
1100                 MMX_YUV_ADD
1101                 MMX_UNPACK_32_ARGB
1102             );
1103             p_y += 8;
1104             p_u += 4;
1105             p_v += 4;
1106             p_buffer += 8;
1107         }
1108
1109         /* Here we do some unaligned reads and duplicate conversions, but
1110          * at least we have all the pixels */
1111         if( i_rewind )
1112         {
1113             p_y -= i_rewind;
1114             p_u -= i_rewind >> 1;
1115             p_v -= i_rewind >> 1;
1116             p_buffer -= i_rewind;
1117             MMX_CALL (
1118                 MMX_INIT_32
1119                 MMX_YUV_MUL
1120                 MMX_YUV_ADD
1121                 MMX_UNPACK_32_ARGB
1122             );
1123             p_y += 8;
1124             p_u += 4;
1125             p_v += 4;
1126             p_buffer += 8;
1127         }
1128         SCALE_WIDTH;
1129         SCALE_HEIGHT( 420, 4 );
1130
1131         p_y += i_source_margin;
1132         if( i_y % 2 )
1133         {
1134             p_u += i_source_margin_c;
1135             p_v += i_source_margin_c;
1136         }
1137     }
1138
1139     /* re-enable FPU registers */
1140     MMX_END;
1141
1142 #endif
1143 }
1144
1145 void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
1146                                             picture_t *p_dest )
1147 {
1148     /* We got this one from the old arguments */
1149     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1150     uint8_t  *p_y   = p_src->Y_PIXELS;
1151     uint8_t  *p_u   = p_src->U_PIXELS;
1152     uint8_t  *p_v   = p_src->V_PIXELS;
1153
1154     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1155     unsigned int i_vscale;                          /* vertical scaling type */
1156     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1157
1158     int         i_right_margin;
1159     int         i_rewind;
1160     int         i_scale_count;                       /* scale modulo counter */
1161     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1162     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1163     /* Conversion buffer pointer */
1164     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1165     uint32_t *  p_buffer;
1166
1167     /* Offset array pointer */
1168     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1169     int *       p_offset;
1170
1171     const int i_source_margin = p_src->p[0].i_pitch
1172                                  - p_src->p[0].i_visible_pitch;
1173     const int i_source_margin_c = p_src->p[1].i_pitch
1174                                  - p_src->p[1].i_visible_pitch;
1175
1176     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1177
1178     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1179      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1180      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1181     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1182                p_vout->output.i_width, p_vout->output.i_height,
1183                &b_hscale, &i_vscale, p_offset_start );
1184
1185     /*
1186      * Perform conversion
1187      */
1188     i_scale_count = ( i_vscale == 1 ) ?
1189                     p_vout->output.i_height : p_vout->render.i_height;
1190
1191 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1192
1193     if( p_vout->render.i_width & 15 )
1194     {
1195         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1196     }
1197     else
1198     {
1199         i_rewind = 0;
1200     }
1201
1202     /*
1203     ** SSE2 128 bits fetch/store instructions are faster
1204     ** if memory access is 16 bytes aligned
1205     */
1206
1207     p_buffer = b_hscale ? p_buffer_start : p_pic;
1208     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1209                     p_dest->p->i_pitch|
1210                     ((intptr_t)p_y)|
1211                     ((intptr_t)p_buffer))) )
1212     {
1213         /* use faster SSE2 aligned fetch and store */
1214         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1215         {
1216             p_pic_start = p_pic;
1217
1218             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1219             {
1220                 SSE2_CALL (
1221                     SSE2_INIT_32_ALIGNED
1222                     SSE2_YUV_MUL
1223                     SSE2_YUV_ADD
1224                     SSE2_UNPACK_32_RGBA_ALIGNED
1225                 );
1226                 p_y += 16;
1227                 p_u += 8;
1228                 p_v += 8;
1229                 p_buffer += 16;
1230             }
1231
1232             /* Here we do some unaligned reads and duplicate conversions, but
1233              * at least we have all the pixels */
1234             if( i_rewind )
1235             {
1236                 p_y -= i_rewind;
1237                 p_u -= i_rewind >> 1;
1238                 p_v -= i_rewind >> 1;
1239                 p_buffer -= i_rewind;
1240                 SSE2_CALL (
1241                     SSE2_INIT_32_UNALIGNED
1242                     SSE2_YUV_MUL
1243                     SSE2_YUV_ADD
1244                     SSE2_UNPACK_32_RGBA_UNALIGNED
1245                 );
1246                 p_y += 16;
1247                 p_u += 4;
1248                 p_v += 4;
1249             }
1250             SCALE_WIDTH;
1251             SCALE_HEIGHT( 420, 4 );
1252
1253             p_y += i_source_margin;
1254             if( i_y % 2 )
1255             {
1256                 p_u += i_source_margin_c;
1257                 p_v += i_source_margin_c;
1258             }
1259             p_buffer = b_hscale ? p_buffer_start : p_pic;
1260         }
1261     }
1262     else
1263     {
1264         /* use slower SSE2 unaligned fetch and store */
1265         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1266         {
1267             p_pic_start = p_pic;
1268             p_buffer = b_hscale ? p_buffer_start : p_pic;
1269
1270             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1271             {
1272                 SSE2_CALL (
1273                     SSE2_INIT_32_UNALIGNED
1274                     SSE2_YUV_MUL
1275                     SSE2_YUV_ADD
1276                     SSE2_UNPACK_32_RGBA_UNALIGNED
1277                 );
1278                 p_y += 16;
1279                 p_u += 8;
1280                 p_v += 8;
1281                 p_buffer += 16;
1282             }
1283
1284             /* Here we do some unaligned reads and duplicate conversions, but
1285              * at least we have all the pixels */
1286             if( i_rewind )
1287             {
1288                 p_y -= i_rewind;
1289                 p_u -= i_rewind >> 1;
1290                 p_v -= i_rewind >> 1;
1291                 p_buffer -= i_rewind;
1292                 SSE2_CALL (
1293                     SSE2_INIT_32_UNALIGNED
1294                     SSE2_YUV_MUL
1295                     SSE2_YUV_ADD
1296                     SSE2_UNPACK_32_RGBA_UNALIGNED
1297                 );
1298                 p_y += 16;
1299                 p_u += 8;
1300                 p_v += 8;
1301             }
1302             SCALE_WIDTH;
1303             SCALE_HEIGHT( 420, 4 );
1304
1305             p_y += i_source_margin;
1306             if( i_y % 2 )
1307             {
1308                 p_u += i_source_margin_c;
1309                 p_v += i_source_margin_c;
1310             }
1311             p_buffer = b_hscale ? p_buffer_start : p_pic;
1312         }
1313     }
1314
1315     /* make sure all SSE2 stores are visible thereafter */
1316     SSE2_END;
1317
1318 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1319
1320     if( p_vout->render.i_width & 7 )
1321     {
1322         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1323     }
1324     else
1325     {
1326         i_rewind = 0;
1327     }
1328
1329     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1330     {
1331         p_pic_start = p_pic;
1332         p_buffer = b_hscale ? p_buffer_start : p_pic;
1333
1334         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1335         {
1336             MMX_CALL (
1337                 MMX_INIT_32
1338                 MMX_YUV_MUL
1339                 MMX_YUV_ADD
1340                 MMX_UNPACK_32_RGBA
1341             );
1342             p_y += 8;
1343             p_u += 4;
1344             p_v += 4;
1345             p_buffer += 8;
1346         }
1347
1348         /* Here we do some unaligned reads and duplicate conversions, but
1349          * at least we have all the pixels */
1350         if( i_rewind )
1351         {
1352             p_y -= i_rewind;
1353             p_u -= i_rewind >> 1;
1354             p_v -= i_rewind >> 1;
1355             p_buffer -= i_rewind;
1356             MMX_CALL (
1357                 MMX_INIT_32
1358                 MMX_YUV_MUL
1359                 MMX_YUV_ADD
1360                 MMX_UNPACK_32_RGBA
1361             );
1362             p_y += 8;
1363             p_u += 4;
1364             p_v += 4;
1365             p_buffer += 8;
1366         }
1367         SCALE_WIDTH;
1368         SCALE_HEIGHT( 420, 4 );
1369
1370         p_y += i_source_margin;
1371         if( i_y % 2 )
1372         {
1373             p_u += i_source_margin_c;
1374             p_v += i_source_margin_c;
1375         }
1376     }
1377
1378     /* re-enable FPU registers */
1379     MMX_END;
1380
1381 #endif
1382 }
1383
1384 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1385                                             picture_t *p_dest )
1386 {
1387     /* We got this one from the old arguments */
1388     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1389     uint8_t  *p_y   = p_src->Y_PIXELS;
1390     uint8_t  *p_u   = p_src->U_PIXELS;
1391     uint8_t  *p_v   = p_src->V_PIXELS;
1392
1393     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1394     unsigned int i_vscale;                          /* vertical scaling type */
1395     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1396
1397     int         i_right_margin;
1398     int         i_rewind;
1399     int         i_scale_count;                       /* scale modulo counter */
1400     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1401     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1402     /* Conversion buffer pointer */
1403     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1404     uint32_t *  p_buffer;
1405
1406     /* Offset array pointer */
1407     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1408     int *       p_offset;
1409
1410     const int i_source_margin = p_src->p[0].i_pitch
1411                                  - p_src->p[0].i_visible_pitch;
1412     const int i_source_margin_c = p_src->p[1].i_pitch
1413                                  - p_src->p[1].i_visible_pitch;
1414
1415     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1416
1417     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1418      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1419      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1420     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1421                p_vout->output.i_width, p_vout->output.i_height,
1422                &b_hscale, &i_vscale, p_offset_start );
1423
1424     /*
1425      * Perform conversion
1426      */
1427     i_scale_count = ( i_vscale == 1 ) ?
1428                     p_vout->output.i_height : p_vout->render.i_height;
1429
1430 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1431
1432     if( p_vout->render.i_width & 15 )
1433     {
1434         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1435     }
1436     else
1437     {
1438         i_rewind = 0;
1439     }
1440
1441     /*
1442     ** SSE2 128 bits fetch/store instructions are faster
1443     ** if memory access is 16 bytes aligned
1444     */
1445
1446     p_buffer = b_hscale ? p_buffer_start : p_pic;
1447     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1448                     p_dest->p->i_pitch|
1449                     ((intptr_t)p_y)|
1450                     ((intptr_t)p_buffer))) )
1451     {
1452         /* use faster SSE2 aligned fetch and store */
1453         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1454         {
1455             p_pic_start = p_pic;
1456
1457             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1458             {
1459                 SSE2_CALL (
1460                     SSE2_INIT_32_ALIGNED
1461                     SSE2_YUV_MUL
1462                     SSE2_YUV_ADD
1463                     SSE2_UNPACK_32_BGRA_ALIGNED
1464                 );
1465                 p_y += 16;
1466                 p_u += 8;
1467                 p_v += 8;
1468                 p_buffer += 16;
1469             }
1470
1471             /* Here we do some unaligned reads and duplicate conversions, but
1472              * at least we have all the pixels */
1473             if( i_rewind )
1474             {
1475                 p_y -= i_rewind;
1476                 p_u -= i_rewind >> 1;
1477                 p_v -= i_rewind >> 1;
1478                 p_buffer -= i_rewind;
1479                 SSE2_CALL (
1480                     SSE2_INIT_32_UNALIGNED
1481                     SSE2_YUV_MUL
1482                     SSE2_YUV_ADD
1483                     SSE2_UNPACK_32_BGRA_UNALIGNED
1484                 );
1485                 p_y += 16;
1486                 p_u += 4;
1487                 p_v += 4;
1488             }
1489             SCALE_WIDTH;
1490             SCALE_HEIGHT( 420, 4 );
1491
1492             p_y += i_source_margin;
1493             if( i_y % 2 )
1494             {
1495                 p_u += i_source_margin_c;
1496                 p_v += i_source_margin_c;
1497             }
1498             p_buffer = b_hscale ? p_buffer_start : p_pic;
1499         }
1500     }
1501     else
1502     {
1503         /* use slower SSE2 unaligned fetch and store */
1504         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1505         {
1506             p_pic_start = p_pic;
1507             p_buffer = b_hscale ? p_buffer_start : p_pic;
1508
1509             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1510             {
1511                 SSE2_CALL (
1512                     SSE2_INIT_32_UNALIGNED
1513                     SSE2_YUV_MUL
1514                     SSE2_YUV_ADD
1515                     SSE2_UNPACK_32_BGRA_UNALIGNED
1516                 );
1517                 p_y += 16;
1518                 p_u += 8;
1519                 p_v += 8;
1520                 p_buffer += 16;
1521             }
1522
1523             /* Here we do some unaligned reads and duplicate conversions, but
1524              * at least we have all the pixels */
1525             if( i_rewind )
1526             {
1527                 p_y -= i_rewind;
1528                 p_u -= i_rewind >> 1;
1529                 p_v -= i_rewind >> 1;
1530                 p_buffer -= i_rewind;
1531                 SSE2_CALL (
1532                     SSE2_INIT_32_UNALIGNED
1533                     SSE2_YUV_MUL
1534                     SSE2_YUV_ADD
1535                     SSE2_UNPACK_32_BGRA_UNALIGNED
1536                 );
1537                 p_y += 16;
1538                 p_u += 8;
1539                 p_v += 8;
1540             }
1541             SCALE_WIDTH;
1542             SCALE_HEIGHT( 420, 4 );
1543
1544             p_y += i_source_margin;
1545             if( i_y % 2 )
1546             {
1547                 p_u += i_source_margin_c;
1548                 p_v += i_source_margin_c;
1549             }
1550             p_buffer = b_hscale ? p_buffer_start : p_pic;
1551         }
1552     }
1553
1554 #else
1555
1556     if( p_vout->render.i_width & 7 )
1557     {
1558         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1559     }
1560     else
1561     {
1562         i_rewind = 0;
1563     }
1564
1565     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1566     {
1567         p_pic_start = p_pic;
1568         p_buffer = b_hscale ? p_buffer_start : p_pic;
1569
1570         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1571         {
1572             MMX_CALL (
1573                 MMX_INIT_32
1574                 MMX_YUV_MUL
1575                 MMX_YUV_ADD
1576                 MMX_UNPACK_32_BGRA
1577             );
1578             p_y += 8;
1579             p_u += 4;
1580             p_v += 4;
1581             p_buffer += 8;
1582         }
1583
1584         /* Here we do some unaligned reads and duplicate conversions, but
1585          * at least we have all the pixels */
1586         if( i_rewind )
1587         {
1588             p_y -= i_rewind;
1589             p_u -= i_rewind >> 1;
1590             p_v -= i_rewind >> 1;
1591             p_buffer -= i_rewind;
1592             MMX_CALL (
1593                 MMX_INIT_32
1594                 MMX_YUV_MUL
1595                 MMX_YUV_ADD
1596                 MMX_UNPACK_32_BGRA
1597             );
1598             p_y += 8;
1599             p_u += 4;
1600             p_v += 4;
1601             p_buffer += 8;
1602         }
1603         SCALE_WIDTH;
1604         SCALE_HEIGHT( 420, 4 );
1605
1606         p_y += i_source_margin;
1607         if( i_y % 2 )
1608         {
1609             p_u += i_source_margin_c;
1610             p_v += i_source_margin_c;
1611         }
1612     }
1613
1614     /* re-enable FPU registers */
1615     MMX_END;
1616
1617 #endif
1618 }
1619
1620 void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
1621                                             picture_t *p_dest )
1622 {
1623     /* We got this one from the old arguments */
1624     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1625     uint8_t  *p_y   = p_src->Y_PIXELS;
1626     uint8_t  *p_u   = p_src->U_PIXELS;
1627     uint8_t  *p_v   = p_src->V_PIXELS;
1628
1629     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1630     unsigned int i_vscale;                          /* vertical scaling type */
1631     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1632
1633     int         i_right_margin;
1634     int         i_rewind;
1635     int         i_scale_count;                       /* scale modulo counter */
1636     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1637     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1638     /* Conversion buffer pointer */
1639     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1640     uint32_t *  p_buffer;
1641
1642     /* Offset array pointer */
1643     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1644     int *       p_offset;
1645
1646     const int i_source_margin = p_src->p[0].i_pitch
1647                                  - p_src->p[0].i_visible_pitch;
1648     const int i_source_margin_c = p_src->p[1].i_pitch
1649                                  - p_src->p[1].i_visible_pitch;
1650
1651     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1652
1653     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1654      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1655      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1656     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1657                p_vout->output.i_width, p_vout->output.i_height,
1658                &b_hscale, &i_vscale, p_offset_start );
1659
1660     /*
1661      * Perform conversion
1662      */
1663     i_scale_count = ( i_vscale == 1 ) ?
1664                     p_vout->output.i_height : p_vout->render.i_height;
1665
1666 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1667
1668     if( p_vout->render.i_width & 15 )
1669     {
1670         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1671     }
1672     else
1673     {
1674         i_rewind = 0;
1675     }
1676
1677     /*
1678     ** SSE2 128 bits fetch/store instructions are faster
1679     ** if memory access is 16 bytes aligned
1680     */
1681
1682     p_buffer = b_hscale ? p_buffer_start : p_pic;
1683     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1684                     p_dest->p->i_pitch|
1685                     ((intptr_t)p_y)|
1686                     ((intptr_t)p_buffer))) )
1687     {
1688         /* use faster SSE2 aligned fetch and store */
1689         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1690         {
1691             p_pic_start = p_pic;
1692
1693             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1694             {
1695                 SSE2_CALL (
1696                     SSE2_INIT_32_ALIGNED
1697                     SSE2_YUV_MUL
1698                     SSE2_YUV_ADD
1699                     SSE2_UNPACK_32_ABGR_ALIGNED
1700                 );
1701                 p_y += 16;
1702                 p_u += 8;
1703                 p_v += 8;
1704                 p_buffer += 16;
1705             }
1706
1707             /* Here we do some unaligned reads and duplicate conversions, but
1708              * at least we have all the pixels */
1709             if( i_rewind )
1710             {
1711                 p_y -= i_rewind;
1712                 p_u -= i_rewind >> 1;
1713                 p_v -= i_rewind >> 1;
1714                 p_buffer -= i_rewind;
1715                 SSE2_CALL (
1716                     SSE2_INIT_32_UNALIGNED
1717                     SSE2_YUV_MUL
1718                     SSE2_YUV_ADD
1719                     SSE2_UNPACK_32_ABGR_UNALIGNED
1720                 );
1721                 p_y += 16;
1722                 p_u += 4;
1723                 p_v += 4;
1724             }
1725             SCALE_WIDTH;
1726             SCALE_HEIGHT( 420, 4 );
1727
1728             p_y += i_source_margin;
1729             if( i_y % 2 )
1730             {
1731                 p_u += i_source_margin_c;
1732                 p_v += i_source_margin_c;
1733             }
1734             p_buffer = b_hscale ? p_buffer_start : p_pic;
1735         }
1736     }
1737     else
1738     {
1739         /* use slower SSE2 unaligned fetch and store */
1740         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1741         {
1742             p_pic_start = p_pic;
1743             p_buffer = b_hscale ? p_buffer_start : p_pic;
1744
1745             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1746             {
1747                 SSE2_CALL (
1748                     SSE2_INIT_32_UNALIGNED
1749                     SSE2_YUV_MUL
1750                     SSE2_YUV_ADD
1751                     SSE2_UNPACK_32_ABGR_UNALIGNED
1752                 );
1753                 p_y += 16;
1754                 p_u += 8;
1755                 p_v += 8;
1756                 p_buffer += 16;
1757             }
1758
1759             /* Here we do some unaligned reads and duplicate conversions, but
1760              * at least we have all the pixels */
1761             if( i_rewind )
1762             {
1763                 p_y -= i_rewind;
1764                 p_u -= i_rewind >> 1;
1765                 p_v -= i_rewind >> 1;
1766                 p_buffer -= i_rewind;
1767                 SSE2_CALL (
1768                     SSE2_INIT_32_UNALIGNED
1769                     SSE2_YUV_MUL
1770                     SSE2_YUV_ADD
1771                     SSE2_UNPACK_32_ABGR_UNALIGNED
1772                 );
1773                 p_y += 16;
1774                 p_u += 8;
1775                 p_v += 8;
1776             }
1777             SCALE_WIDTH;
1778             SCALE_HEIGHT( 420, 4 );
1779
1780             p_y += i_source_margin;
1781             if( i_y % 2 )
1782             {
1783                 p_u += i_source_margin_c;
1784                 p_v += i_source_margin_c;
1785             }
1786             p_buffer = b_hscale ? p_buffer_start : p_pic;
1787         }
1788     }
1789
1790 #else
1791
1792     if( p_vout->render.i_width & 7 )
1793     {
1794         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1795     }
1796     else
1797     {
1798         i_rewind = 0;
1799     }
1800
1801     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1802     {
1803         p_pic_start = p_pic;
1804         p_buffer = b_hscale ? p_buffer_start : p_pic;
1805
1806         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1807         {
1808             MMX_CALL (
1809                 MMX_INIT_32
1810                 MMX_YUV_MUL
1811                 MMX_YUV_ADD
1812                 MMX_UNPACK_32_ABGR
1813             );
1814             p_y += 8;
1815             p_u += 4;
1816             p_v += 4;
1817             p_buffer += 8;
1818         }
1819
1820         /* Here we do some unaligned reads and duplicate conversions, but
1821          * at least we have all the pixels */
1822         if( i_rewind )
1823         {
1824             p_y -= i_rewind;
1825             p_u -= i_rewind >> 1;
1826             p_v -= i_rewind >> 1;
1827             p_buffer -= i_rewind;
1828             MMX_CALL (
1829                 MMX_INIT_32
1830                 MMX_YUV_MUL
1831                 MMX_YUV_ADD
1832                 MMX_UNPACK_32_ABGR
1833             );
1834             p_y += 8;
1835             p_u += 4;
1836             p_v += 4;
1837             p_buffer += 8;
1838         }
1839         SCALE_WIDTH;
1840         SCALE_HEIGHT( 420, 4 );
1841
1842         p_y += i_source_margin;
1843         if( i_y % 2 )
1844         {
1845             p_u += i_source_margin_c;
1846             p_v += i_source_margin_c;
1847         }
1848     }
1849
1850     /* re-enable FPU registers */
1851     MMX_END;
1852
1853 #endif
1854 }
1855
1856 #endif
1857
1858 /* Following functions are local */
1859
1860 /*****************************************************************************
1861  * SetOffset: build offset array for conversion functions
1862  *****************************************************************************
1863  * This function will build an offset array used in later conversion functions.
1864  * It will also set horizontal and vertical scaling indicators.
1865  *****************************************************************************/
1866 static void SetOffset( int i_width, int i_height, int i_pic_width,
1867                        int i_pic_height, vlc_bool_t *pb_hscale,
1868                        unsigned int *pi_vscale, int *p_offset )
1869 {
1870     int i_x;                                    /* x position in destination */
1871     int i_scale_count;                                     /* modulo counter */
1872
1873     /*
1874      * Prepare horizontal offset array
1875      */
1876     if( i_pic_width - i_width == 0 )
1877     {
1878         /* No horizontal scaling: YUV conversion is done directly to picture */
1879         *pb_hscale = 0;
1880     }
1881     else if( i_pic_width - i_width > 0 )
1882     {
1883         /* Prepare scaling array for horizontal extension */
1884         *pb_hscale = 1;
1885         i_scale_count = i_pic_width;
1886         for( i_x = i_width; i_x--; )
1887         {
1888             while( (i_scale_count -= i_width) > 0 )
1889             {
1890                 *p_offset++ = 0;
1891             }
1892             *p_offset++ = 1;
1893             i_scale_count += i_pic_width;
1894         }
1895     }
1896     else /* if( i_pic_width - i_width < 0 ) */
1897     {
1898         /* Prepare scaling array for horizontal reduction */
1899         *pb_hscale = 1;
1900         i_scale_count = i_width;
1901         for( i_x = i_pic_width; i_x--; )
1902         {
1903             *p_offset = 1;
1904             while( (i_scale_count -= i_pic_width) > 0 )
1905             {
1906                 *p_offset += 1;
1907             }
1908             p_offset++;
1909             i_scale_count += i_width;
1910         }
1911     }
1912
1913     /*
1914      * Set vertical scaling indicator
1915      */
1916     if( i_pic_height - i_height == 0 )
1917     {
1918         *pi_vscale = 0;
1919     }
1920     else if( i_pic_height - i_height > 0 )
1921     {
1922         *pi_vscale = 1;
1923     }
1924     else /* if( i_pic_height - i_height < 0 ) */
1925     {
1926         *pi_vscale = -1;
1927     }
1928 }
1929