]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
Use var_Inherit* instead of var_CreateGet*.
[vlc] / modules / video_chroma / i420_rgb16.c
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_filter.h>
35
36 #include "i420_rgb.h"
37 #if defined (MODULE_NAME_IS_i420_rgb)
38 #   include "i420_rgb_c.h"
39 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
40 #   include "../mmx/i420_rgb_mmx.h"
41 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
42 #   include "../mmx/i420_rgb_mmx.h"
43 #endif
44
/* Forward declaration; the definition is not in this part of the file.
 * The converters below call it with the input width and height, the output
 * width and height, then pointers to the horizontal-scaling flag, the
 * vertical-scaling type and the start of the offset table. */
45 static void SetOffset( int, int, int, int, bool *,
46                        unsigned int *, int * );
47
48 #if defined (MODULE_NAME_IS_i420_rgb)
49 /*****************************************************************************
50  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
51  *****************************************************************************
52  * Horizontal alignment needed:
53  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
54  *  - output: 1 pixel (2 bytes), margins allowed
55  * Vertical alignment needed:
56  *  - input: 2 lines (2 Y lines, 1 U/V line)
57  *  - output: 1 line
    *
    * p_filter: supplies the input/output dimensions, the output i_rrshift and,
    *           through p_sys, the p_rgb16 table, the conversion buffer and the
    *           offset table.
    * p_src:    source picture; its Y, U and V planes are read.
    * p_dest:   destination picture; 16-bit pixels are written to its first
    *           plane.
    *
    * NOTE(review): the CONVERT_*_DITHER, SCALE_WIDTH and SCALE_HEIGHT macros
    * are defined outside this file view. Several locals declared below are
    * never read in the visible code, so the macros presumably use them by
    * name; do not rename or remove them without checking the macros.
58  *****************************************************************************/
59 void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src,
60                                                 picture_t *p_dest )
61 {
62     /* Destination pixels and source Y, U and V plane pointers */
63     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
64     uint8_t  *p_y   = p_src->Y_PIXELS;
65     uint8_t  *p_u   = p_src->U_PIXELS;
66     uint8_t  *p_v   = p_src->V_PIXELS;
67
68     bool   b_hscale;                        /* horizontal scaling type */
69     unsigned int i_vscale;                          /* vertical scaling type */
70     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
71     unsigned int i_real_y;                                          /* y % 4 */
72
73     int         i_right_margin;
74     int         i_rewind;
75     int         i_scale_count;                       /* scale modulo counter */
76     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
77     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
78     int         i_uval, i_vval;                           /* U and V samples */
79     int         i_red, i_green, i_blue;          /* U and V modified samples */
80     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
81     uint16_t *  p_ybase;                     /* Y dependent conversion table */
82
83     /* Conversion buffer pointer */
84     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
85     uint16_t *  p_buffer;
86
87     /* Offset array pointer */
88     int *       p_offset_start = p_filter->p_sys->p_offset;
89     int *       p_offset;
90
       /* Bytes between the visible end of a source row and the start of the
        * next one, for the luma plane and for the first chroma plane */
91     const int i_source_margin = p_src->p[0].i_pitch
92                                  - p_src->p[0].i_visible_pitch;
93     const int i_source_margin_c = p_src->p[1].i_pitch
94                                  - p_src->p[1].i_visible_pitch;
95
96     /* The dithering matrices */
97     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
98     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
99     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
100     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
101
        /* Scale every dither value by (SHIFT - 4 + i_rrshift) bits; SHIFT is
         * defined outside this function */
102     for(i_x = 0; i_x < 4; i_x++)
103     {
104         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
105         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
106         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
107         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
108     }
109
110     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
        /* Number of pixels missing to reach the next multiple of 8
         * (0 when the input width is already a multiple of 8) */
111     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
112
113     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
114      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
115      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
116     SetOffset( p_filter->fmt_in.video.i_width,
117                p_filter->fmt_in.video.i_height,
118                p_filter->fmt_out.video.i_width,
119                p_filter->fmt_out.video.i_height,
120                &b_hscale, &i_vscale, p_offset_start );
121
122     /*
123      * Perform conversion
124      */
125     i_scale_count = ( i_vscale == 1 ) ?
126                     p_filter->fmt_out.video.i_height :
127                     p_filter->fmt_in.video.i_height;
128     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
129     {
130         i_real_y = i_y & 0x3;
131         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * requested, otherwise straight into the destination row */
132         p_buffer = b_hscale ? p_buffer_start : p_pic;
133
            /* One iteration per group of 8 input pixels; the four dither
             * tables are cycled through twice per group */
134         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
135         {
136             int *p_dither = dither10;
137             CONVERT_YUV_PIXEL_DITHER(2);
138             p_dither = dither11;
139             CONVERT_Y_PIXEL_DITHER(2);
140             p_dither = dither12;
141             CONVERT_YUV_PIXEL_DITHER(2);
142             p_dither = dither13;
143             CONVERT_Y_PIXEL_DITHER(2);
144             p_dither = dither10;
145             CONVERT_YUV_PIXEL_DITHER(2);
146             p_dither = dither11;
147             CONVERT_Y_PIXEL_DITHER(2);
148             p_dither = dither12;
149             CONVERT_YUV_PIXEL_DITHER(2);
150             p_dither = dither13;
151             CONVERT_Y_PIXEL_DITHER(2);
152         }
153
154         /* Here we do some unaligned reads and duplicate conversions, but
155          * at least we have all the pixels */
156         if( i_rewind )
157         {
158             int *p_dither = dither10;
                /* Step back i_rewind luma samples / output pixels and half as
                 * many chroma samples, then convert one more group of 8 */
159             p_y -= i_rewind;
160             p_u -= i_rewind >> 1;
161             p_v -= i_rewind >> 1;
162             p_buffer -= i_rewind;
163             CONVERT_YUV_PIXEL_DITHER(2);
164             p_dither = dither11;
165             CONVERT_Y_PIXEL_DITHER(2);
166             p_dither = dither12;
167             CONVERT_YUV_PIXEL_DITHER(2);
168             p_dither = dither13;
169             CONVERT_Y_PIXEL_DITHER(2);
170             p_dither = dither10;
171             CONVERT_YUV_PIXEL_DITHER(2);
172             p_dither = dither11;
173             CONVERT_Y_PIXEL_DITHER(2);
174             p_dither = dither12;
175             CONVERT_YUV_PIXEL_DITHER(2);
176             p_dither = dither13;
177             CONVERT_Y_PIXEL_DITHER(2);
178         }
179         SCALE_WIDTH;
180         SCALE_HEIGHT( 420, 2 );
181
            /* Skip the source row padding; the chroma pointers only skip it
             * after odd rows */
182         p_y += i_source_margin;
183         if( i_y % 2 )
184         {
185             p_u += i_source_margin_c;
186             p_v += i_source_margin_c;
187         }
188     }
189 }
190 #endif
191
192 /*****************************************************************************
193  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
194  *****************************************************************************
195  * Horizontal alignment needed:
196  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
197  *  - output: 1 pixel (2 bytes), margins allowed
198  * Vertical alignment needed:
199  *  - input: 2 lines (2 Y lines, 1 U/V line)
200  *  - output: 1 line
201  *****************************************************************************/
202
203 #if defined (MODULE_NAME_IS_i420_rgb)
204
/*
 * I420_RGB16: convert the Y, U and V planes of p_src into 16-bit pixels
 * written to the first plane of p_dest, using the p_rgb16 table, conversion
 * buffer and offset table held in p_filter->p_sys.
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are macros defined outside this file view. Several locals
 * declared below are never read in the visible code, so the macros
 * presumably use them by name; do not rename or remove them without
 * checking the macros.
 */
205 void I420_RGB16( filter_t *p_filter, picture_t *p_src,
206                                          picture_t *p_dest )
207 {
208     /* Destination pixels and source Y, U and V plane pointers */
209     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
210     uint8_t  *p_y   = p_src->Y_PIXELS;
211     uint8_t  *p_u   = p_src->U_PIXELS;
212     uint8_t  *p_v   = p_src->V_PIXELS;
213
214     bool  b_hscale;                         /* horizontal scaling type */
215     unsigned int i_vscale;                          /* vertical scaling type */
216     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
217
218     int         i_right_margin;
219     int         i_rewind;
220     int         i_scale_count;                       /* scale modulo counter */
221     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
222     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
223     int         i_uval, i_vval;                           /* U and V samples */
224     int         i_red, i_green, i_blue;          /* U and V modified samples */
225     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
226     uint16_t *  p_ybase;                     /* Y dependent conversion table */
227
228     /* Conversion buffer pointer */
229     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
230     uint16_t *  p_buffer;
231
232     /* Offset array pointer */
233     int *       p_offset_start = p_filter->p_sys->p_offset;
234     int *       p_offset;
235
        /* Bytes between the visible end of a source row and the start of the
         * next one, for the luma plane and for the first chroma plane */
236     const int i_source_margin = p_src->p[0].i_pitch
237                                  - p_src->p[0].i_visible_pitch;
238     const int i_source_margin_c = p_src->p[1].i_pitch
239                                  - p_src->p[1].i_visible_pitch;
240
241     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
        /* Number of pixels missing to reach the next multiple of 8
         * (0 when the input width is already a multiple of 8) */
242     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
243
244     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
245      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
246      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
247     SetOffset( p_filter->fmt_in.video.i_width,
248                p_filter->fmt_in.video.i_height,
249                p_filter->fmt_out.video.i_width,
250                p_filter->fmt_out.video.i_height,
251                &b_hscale, &i_vscale, p_offset_start );
252
253     /*
254      * Perform conversion
255      */
256     i_scale_count = ( i_vscale == 1 ) ?
257                     p_filter->fmt_out.video.i_height :
258                     p_filter->fmt_in.video.i_height;
259     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
260     {
261         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * requested, otherwise straight into the destination row */
262         p_buffer = b_hscale ? p_buffer_start : p_pic;
263
            /* One iteration per group of 8 input pixels */
264         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
265         {
266             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
267             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
268             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
269             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
270         }
271
272         /* Here we do some unaligned reads and duplicate conversions, but
273          * at least we have all the pixels */
274         if( i_rewind )
275         {
                /* Step back i_rewind luma samples / output pixels and half as
                 * many chroma samples, then convert one more group of 8 */
276             p_y -= i_rewind;
277             p_u -= i_rewind >> 1;
278             p_v -= i_rewind >> 1;
279             p_buffer -= i_rewind;
280
281             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
282             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
283             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
284             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
285         }
286         SCALE_WIDTH;
287         SCALE_HEIGHT( 420, 2 );
288
            /* Skip the source row padding; the chroma pointers only skip it
             * after odd rows */
289         p_y += i_source_margin;
290         if( i_y % 2 )
291         {
292             p_u += i_source_margin_c;
293             p_v += i_source_margin_c;
294         }
295     }
296 }
297
298 #else // ! defined (MODULE_NAME_IS_i420_rgb)
299
/*
 * I420_R5G5B5: convert the Y, U and V planes of p_src into 16-bit pixels
 * written to the first plane of p_dest, using the *_UNPACK_15 macro variants
 * (presumably a 5-5-5 RGB layout, judging by the name).
 *
 * Built only when MODULE_NAME_IS_i420_rgb is not defined. The body is chosen
 * at compile time: the SSE2 variant handles 16 pixels per iteration, the MMX
 * variant 8 pixels per iteration.
 *
 * NOTE(review): SSE2_CALL/MMX_CALL and their operand macros, SCALE_WIDTH and
 * SCALE_HEIGHT are defined outside this file view. Several locals declared
 * below are never read in the visible code, so the macros presumably use
 * them by name; do not rename or remove them without checking the macros.
 */
300 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src,
301                                           picture_t *p_dest )
302 {
303     /* Destination pixels and source Y, U and V plane pointers */
304     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
305     uint8_t  *p_y   = p_src->Y_PIXELS;
306     uint8_t  *p_u   = p_src->U_PIXELS;
307     uint8_t  *p_v   = p_src->V_PIXELS;
308
309     bool  b_hscale;                         /* horizontal scaling type */
310     unsigned int i_vscale;                          /* vertical scaling type */
311     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
312
313     int         i_right_margin;
314     int         i_rewind;
315     int         i_scale_count;                       /* scale modulo counter */
316     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
317     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
318
319     /* Conversion buffer pointer */
320     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
321     uint16_t *  p_buffer;
322
323     /* Offset array pointer */
324     int *       p_offset_start = p_filter->p_sys->p_offset;
325     int *       p_offset;
326
        /* Bytes between the visible end of a source row and the start of the
         * next one, for the luma plane and for the first chroma plane */
327     const int i_source_margin = p_src->p[0].i_pitch
328                                  - p_src->p[0].i_visible_pitch;
329     const int i_source_margin_c = p_src->p[1].i_pitch
330                                  - p_src->p[1].i_visible_pitch;
331
332     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
333
334     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
335      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
336      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
337     SetOffset( p_filter->fmt_in.video.i_width,
338                p_filter->fmt_in.video.i_height,
339                p_filter->fmt_out.video.i_width,
340                p_filter->fmt_out.video.i_height,
341                &b_hscale, &i_vscale, p_offset_start );
342
343
344     /*
345      * Perform conversion
346      */
347     i_scale_count = ( i_vscale == 1 ) ?
348                     p_filter->fmt_out.video.i_height :
349                     p_filter->fmt_in.video.i_height;
350
351 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
352
        /* Number of pixels missing to reach the next multiple of 16
         * (0 when the input width is already a multiple of 16) */
353     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
354
355     /*
356     ** SSE2 128 bits fetch/store instructions are faster
357     ** if memory access is 16 bytes aligned
358     */
359
360     p_buffer = b_hscale ? p_buffer_start : p_pic;
        /* Take the aligned path only when the luma pitch, the destination
         * pitch, the luma pointer and the output pointer are all multiples
         * of 16 */
361     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
362                     p_dest->p->i_pitch|
363                     ((intptr_t)p_y)|
364                     ((intptr_t)p_buffer))) )
365     {
366         /* use faster SSE2 aligned fetch and store */
367         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
368         {
369             p_pic_start = p_pic;
370
                /* One iteration per group of 16 input pixels */
371             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
372             {
373                 SSE2_CALL (
374                     SSE2_INIT_16_ALIGNED
375                     SSE2_YUV_MUL
376                     SSE2_YUV_ADD
377                     SSE2_UNPACK_15_ALIGNED
378                 );
379                 p_y += 16;
380                 p_u += 8;
381                 p_v += 8;
382                 p_buffer += 16;
383             }
384             /* Here we do some unaligned reads and duplicate conversions, but
385              * at least we have all the pixels */
386             if( i_rewind )
387             {
                    /* Step back so that the last group of 16 ends at the end
                     * of the row; the unaligned macro variants are used here */
388                 p_y -= i_rewind;
389                 p_u -= i_rewind >> 1;
390                 p_v -= i_rewind >> 1;
391                 p_buffer -= i_rewind;
392
393                 SSE2_CALL (
394                     SSE2_INIT_16_UNALIGNED
395                     SSE2_YUV_MUL
396                     SSE2_YUV_ADD
397                     SSE2_UNPACK_15_UNALIGNED
398                 );
                    /* p_buffer is not advanced here; it is reassigned at the
                     * bottom of the row loop */
399                 p_y += 16;
400                 p_u += 8;
401                 p_v += 8;
402             }
403             SCALE_WIDTH;
404             SCALE_HEIGHT( 420, 2 );
405
                /* Skip the source row padding; the chroma pointers only skip
                 * it after odd rows */
406             p_y += i_source_margin;
407             if( i_y % 2 )
408             {
409                 p_u += i_source_margin_c;
410                 p_v += i_source_margin_c;
411             }
412             p_buffer = b_hscale ? p_buffer_start : p_pic;
413         }
414     }
415     else
416     {
417         /* use slower SSE2 unaligned fetch and store */
418         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
419         {
420             p_pic_start = p_pic;
421             p_buffer = b_hscale ? p_buffer_start : p_pic;
422
                /* One iteration per group of 16 input pixels */
423             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
424             {
425                 SSE2_CALL (
426                     SSE2_INIT_16_UNALIGNED
427                     SSE2_YUV_MUL
428                     SSE2_YUV_ADD
429                     SSE2_UNPACK_15_UNALIGNED
430                 );
431                 p_y += 16;
432                 p_u += 8;
433                 p_v += 8;
434                 p_buffer += 16;
435             }
436             /* Here we do some unaligned reads and duplicate conversions, but
437              * at least we have all the pixels */
438             if( i_rewind )
439             {
                    /* Step back so that the last group of 16 ends at the end
                     * of the row */
440                 p_y -= i_rewind;
441                 p_u -= i_rewind >> 1;
442                 p_v -= i_rewind >> 1;
443                 p_buffer -= i_rewind;
444
445                 SSE2_CALL (
446                     SSE2_INIT_16_UNALIGNED
447                     SSE2_YUV_MUL
448                     SSE2_YUV_ADD
449                     SSE2_UNPACK_15_UNALIGNED
450                 );
451                 p_y += 16;
452                 p_u += 8;
453                 p_v += 8;
454             }
455             SCALE_WIDTH;
456             SCALE_HEIGHT( 420, 2 );
457
                /* Skip the source row padding; the chroma pointers only skip
                 * it after odd rows */
458             p_y += i_source_margin;
459             if( i_y % 2 )
460             {
461                 p_u += i_source_margin_c;
462                 p_v += i_source_margin_c;
463             }
464             p_buffer = b_hscale ? p_buffer_start : p_pic;
465         }
466     }
467
468     /* make sure all SSE2 stores are visible thereafter */
469     SSE2_END;
470
471 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
472
        /* Number of pixels missing to reach the next multiple of 8
         * (0 when the input width is already a multiple of 8) */
473     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
474
475     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
476     {
477         p_pic_start = p_pic;
478         p_buffer = b_hscale ? p_buffer_start : p_pic;
479
            /* One iteration per group of 8 input pixels */
480         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
481         {
482             MMX_CALL (
483                 MMX_INIT_16
484                 MMX_YUV_MUL
485                 MMX_YUV_ADD
486                 MMX_UNPACK_15
487             );
488             p_y += 8;
489             p_u += 4;
490             p_v += 4;
491             p_buffer += 8;
492         }
493
494         /* Here we do some unaligned reads and duplicate conversions, but
495          * at least we have all the pixels */
496         if( i_rewind )
497         {
                /* Step back so that the last group of 8 ends at the end of
                 * the row */
498             p_y -= i_rewind;
499             p_u -= i_rewind >> 1;
500             p_v -= i_rewind >> 1;
501             p_buffer -= i_rewind;
502
503             MMX_CALL (
504                 MMX_INIT_16
505                 MMX_YUV_MUL
506                 MMX_YUV_ADD
507                 MMX_UNPACK_15
508             );
509             p_y += 8;
510             p_u += 4;
511             p_v += 4;
512             p_buffer += 8;
513         }
514         SCALE_WIDTH;
515         SCALE_HEIGHT( 420, 2 );
516
            /* Skip the source row padding; the chroma pointers only skip it
             * after odd rows */
517         p_y += i_source_margin;
518         if( i_y % 2 )
519         {
520             p_u += i_source_margin_c;
521             p_v += i_source_margin_c;
522         }
523     }
524     /* re-enable FPU registers */
525     MMX_END;
526
527 #endif
528 }
529
/*
 * I420_R5G6B5: convert the Y, U and V planes of p_src into 16-bit pixels
 * written to the first plane of p_dest, using the *_UNPACK_16 macro variants
 * (presumably a 5-6-5 RGB layout, judging by the name).
 *
 * Built only when MODULE_NAME_IS_i420_rgb is not defined. The body is chosen
 * at compile time: the SSE2 variant handles 16 pixels per iteration, the MMX
 * variant 8 pixels per iteration.
 *
 * NOTE(review): SSE2_CALL/MMX_CALL and their operand macros, SCALE_WIDTH and
 * SCALE_HEIGHT are defined outside this file view. Several locals declared
 * below are never read in the visible code, so the macros presumably use
 * them by name; do not rename or remove them without checking the macros.
 */
530 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src,
531                                           picture_t *p_dest )
532 {
533     /* Destination pixels and source Y, U and V plane pointers */
534     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
535     uint8_t  *p_y   = p_src->Y_PIXELS;
536     uint8_t  *p_u   = p_src->U_PIXELS;
537     uint8_t  *p_v   = p_src->V_PIXELS;
538
539     bool  b_hscale;                         /* horizontal scaling type */
540     unsigned int i_vscale;                          /* vertical scaling type */
541     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
542
543     int         i_right_margin;
544     int         i_rewind;
545     int         i_scale_count;                       /* scale modulo counter */
546     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
547     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
548
549     /* Conversion buffer pointer */
550     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
551     uint16_t *  p_buffer;
552
553     /* Offset array pointer */
554     int *       p_offset_start = p_filter->p_sys->p_offset;
555     int *       p_offset;
556
        /* Bytes between the visible end of a source row and the start of the
         * next one, for the luma plane and for the first chroma plane */
557     const int i_source_margin = p_src->p[0].i_pitch
558                                  - p_src->p[0].i_visible_pitch;
559     const int i_source_margin_c = p_src->p[1].i_pitch
560                                  - p_src->p[1].i_visible_pitch;
561
562     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
563
564     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
565      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
566      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
567     SetOffset( p_filter->fmt_in.video.i_width,
568                p_filter->fmt_in.video.i_height,
569                p_filter->fmt_out.video.i_width,
570                p_filter->fmt_out.video.i_height,
571                &b_hscale, &i_vscale, p_offset_start );
572
573
574     /*
575      * Perform conversion
576      */
577     i_scale_count = ( i_vscale == 1 ) ?
578                     p_filter->fmt_out.video.i_height :
579                     p_filter->fmt_in.video.i_height;
580
581 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
582
        /* Number of pixels missing to reach the next multiple of 16
         * (0 when the input width is already a multiple of 16) */
583     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
584
585     /*
586     ** SSE2 128 bits fetch/store instructions are faster
587     ** if memory access is 16 bytes aligned
588     */
589
590     p_buffer = b_hscale ? p_buffer_start : p_pic;
        /* Take the aligned path only when the luma pitch, the destination
         * pitch, the luma pointer and the output pointer are all multiples
         * of 16 */
591     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
592                     p_dest->p->i_pitch|
593                     ((intptr_t)p_y)|
594                     ((intptr_t)p_buffer))) )
595     {
596         /* use faster SSE2 aligned fetch and store */
597         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
598         {
599             p_pic_start = p_pic;
600
                /* One iteration per group of 16 input pixels */
601             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
602             {
603                 SSE2_CALL (
604                     SSE2_INIT_16_ALIGNED
605                     SSE2_YUV_MUL
606                     SSE2_YUV_ADD
607                     SSE2_UNPACK_16_ALIGNED
608                 );
609                 p_y += 16;
610                 p_u += 8;
611                 p_v += 8;
612                 p_buffer += 16;
613             }
614             /* Here we do some unaligned reads and duplicate conversions, but
615              * at least we have all the pixels */
616             if( i_rewind )
617             {
                    /* Step back so that the last group of 16 ends at the end
                     * of the row; the unaligned macro variants are used here */
618                 p_y -= i_rewind;
619                 p_u -= i_rewind >> 1;
620                 p_v -= i_rewind >> 1;
621                 p_buffer -= i_rewind;
622
623                 SSE2_CALL (
624                     SSE2_INIT_16_UNALIGNED
625                     SSE2_YUV_MUL
626                     SSE2_YUV_ADD
627                     SSE2_UNPACK_16_UNALIGNED
628                 );
                    /* p_buffer is not advanced here; it is reassigned at the
                     * bottom of the row loop */
629                 p_y += 16;
630                 p_u += 8;
631                 p_v += 8;
632             }
633             SCALE_WIDTH;
634             SCALE_HEIGHT( 420, 2 );
635
                /* Skip the source row padding; the chroma pointers only skip
                 * it after odd rows */
636             p_y += i_source_margin;
637             if( i_y % 2 )
638             {
639                 p_u += i_source_margin_c;
640                 p_v += i_source_margin_c;
641             }
642             p_buffer = b_hscale ? p_buffer_start : p_pic;
643         }
644     }
645     else
646     {
647         /* use slower SSE2 unaligned fetch and store */
648         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
649         {
650             p_pic_start = p_pic;
651             p_buffer = b_hscale ? p_buffer_start : p_pic;
652
                /* One iteration per group of 16 input pixels */
653             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
654             {
655                 SSE2_CALL(
656                     SSE2_INIT_16_UNALIGNED
657                     SSE2_YUV_MUL
658                     SSE2_YUV_ADD
659                     SSE2_UNPACK_16_UNALIGNED
660                 );
661                 p_y += 16;
662                 p_u += 8;
663                 p_v += 8;
664                 p_buffer += 16;
665             }
666             /* Here we do some unaligned reads and duplicate conversions, but
667              * at least we have all the pixels */
668             if( i_rewind )
669             {
                    /* Step back so that the last group of 16 ends at the end
                     * of the row */
670                 p_y -= i_rewind;
671                 p_u -= i_rewind >> 1;
672                 p_v -= i_rewind >> 1;
673                 p_buffer -= i_rewind;
674
675                 SSE2_CALL(
676                     SSE2_INIT_16_UNALIGNED
677                     SSE2_YUV_MUL
678                     SSE2_YUV_ADD
679                     SSE2_UNPACK_16_UNALIGNED
680                 );
681                 p_y += 16;
682                 p_u += 8;
683                 p_v += 8;
684             }
685             SCALE_WIDTH;
686             SCALE_HEIGHT( 420, 2 );
687
                /* Skip the source row padding; the chroma pointers only skip
                 * it after odd rows */
688             p_y += i_source_margin;
689             if( i_y % 2 )
690             {
691                 p_u += i_source_margin_c;
692                 p_v += i_source_margin_c;
693             }
694             p_buffer = b_hscale ? p_buffer_start : p_pic;
695         }
696     }
697
698     /* make sure all SSE2 stores are visible thereafter */
699     SSE2_END;
700
701 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
702
        /* Number of pixels missing to reach the next multiple of 8
         * (0 when the input width is already a multiple of 8) */
703     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
704
705     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
706     {
707         p_pic_start = p_pic;
708         p_buffer = b_hscale ? p_buffer_start : p_pic;
709
            /* One iteration per group of 8 input pixels */
710         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
711         {
712             MMX_CALL (
713                 MMX_INIT_16
714                 MMX_YUV_MUL
715                 MMX_YUV_ADD
716                 MMX_UNPACK_16
717             );
718             p_y += 8;
719             p_u += 4;
720             p_v += 4;
721             p_buffer += 8;
722         }
723
724         /* Here we do some unaligned reads and duplicate conversions, but
725          * at least we have all the pixels */
726         if( i_rewind )
727         {
                /* Step back so that the last group of 8 ends at the end of
                 * the row */
728             p_y -= i_rewind;
729             p_u -= i_rewind >> 1;
730             p_v -= i_rewind >> 1;
731             p_buffer -= i_rewind;
732
733             MMX_CALL (
734                 MMX_INIT_16
735                 MMX_YUV_MUL
736                 MMX_YUV_ADD
737                 MMX_UNPACK_16
738             );
739             p_y += 8;
740             p_u += 4;
741             p_v += 4;
742             p_buffer += 8;
743         }
744         SCALE_WIDTH;
745         SCALE_HEIGHT( 420, 2 );
746
            /* Skip the source row padding; the chroma pointers only skip it
             * after odd rows */
747         p_y += i_source_margin;
748         if( i_y % 2 )
749         {
750             p_u += i_source_margin_c;
751             p_v += i_source_margin_c;
752         }
753     }
754     /* re-enable FPU registers */
755     MMX_END;
756
757 #endif
758 }
759
760 #endif
761
762 /*****************************************************************************
763  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
764  *****************************************************************************
765  * Horizontal alignment needed:
766  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
767  *  - output: 1 pixel (4 bytes), margins allowed
768  * Vertical alignment needed:
769  *  - input: 2 lines (2 Y lines, 1 U/V line)
770  *  - output: 1 line
771  *****************************************************************************/
772
773 #if defined (MODULE_NAME_IS_i420_rgb)
774
/*
 * I420_RGB32: convert the Y, U and V planes of p_src into 32-bit pixels
 * written to the first plane of p_dest, using the p_rgb32 table, conversion
 * buffer and offset table held in p_filter->p_sys.
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are macros defined outside this file view. Several locals
 * declared below are never read in the visible code, so the macros
 * presumably use them by name; do not rename or remove them without
 * checking the macros.
 */
775 void I420_RGB32( filter_t *p_filter, picture_t *p_src,
776                                          picture_t *p_dest )
777 {
778     /* Destination pixels and source Y, U and V plane pointers */
779     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
780     uint8_t  *p_y   = p_src->Y_PIXELS;
781     uint8_t  *p_u   = p_src->U_PIXELS;
782     uint8_t  *p_v   = p_src->V_PIXELS;
783
784     bool  b_hscale;                         /* horizontal scaling type */
785     unsigned int i_vscale;                          /* vertical scaling type */
786     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
787
788     int         i_right_margin;
789     int         i_rewind;
790     int         i_scale_count;                       /* scale modulo counter */
791     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
792     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
793     int         i_uval, i_vval;                           /* U and V samples */
794     int         i_red, i_green, i_blue;          /* U and V modified samples */
795     uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
796     uint32_t *  p_ybase;                     /* Y dependent conversion table */
797
798     /* Conversion buffer pointer */
799     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
800     uint32_t *  p_buffer;
801
802     /* Offset array pointer */
803     int *       p_offset_start = p_filter->p_sys->p_offset;
804     int *       p_offset;
805
        /* Bytes between the visible end of a source row and the start of the
         * next one, for the luma plane and for the first chroma plane */
806     const int i_source_margin = p_src->p[0].i_pitch
807                                  - p_src->p[0].i_visible_pitch;
808     const int i_source_margin_c = p_src->p[1].i_pitch
809                                  - p_src->p[1].i_visible_pitch;
810
811     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
        /* Number of pixels missing to reach the next multiple of 8
         * (0 when the input width is already a multiple of 8) */
812     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
813
814     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
815      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
816      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
817     SetOffset( p_filter->fmt_in.video.i_width,
818                p_filter->fmt_in.video.i_height,
819                p_filter->fmt_out.video.i_width,
820                p_filter->fmt_out.video.i_height,
821                &b_hscale, &i_vscale, p_offset_start );
822
823     /*
824      * Perform conversion
825      */
826     i_scale_count = ( i_vscale == 1 ) ?
827                     p_filter->fmt_out.video.i_height :
828                     p_filter->fmt_in.video.i_height;
829     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
830     {
831         p_pic_start = p_pic;
            /* Convert into the intermediate buffer when horizontal scaling is
             * requested, otherwise straight into the destination row */
832         p_buffer = b_hscale ? p_buffer_start : p_pic;
833
            /* One iteration per group of 8 input pixels */
834         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
835         {
836             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
837             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
838             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
839             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
840         }
841
842         /* Here we do some unaligned reads and duplicate conversions, but
843          * at least we have all the pixels */
844         if( i_rewind )
845         {
                /* Step back i_rewind luma samples / output pixels and half as
                 * many chroma samples, then convert one more group of 8 */
846             p_y -= i_rewind;
847             p_u -= i_rewind >> 1;
848             p_v -= i_rewind >> 1;
849             p_buffer -= i_rewind;
850             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
851             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
852             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
853             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
854         }
855         SCALE_WIDTH;
856         SCALE_HEIGHT( 420, 4 );
857
            /* Skip the source row padding; the chroma pointers only skip it
             * after odd rows */
858         p_y += i_source_margin;
859         if( i_y % 2 )
860         {
861             p_u += i_source_margin_c;
862             p_v += i_source_margin_c;
863         }
864     }
865 }
866
867 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
868
869 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
870                                             picture_t *p_dest )
871 {
872     /* We got this one from the old arguments */
873     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
874     uint8_t  *p_y   = p_src->Y_PIXELS;
875     uint8_t  *p_u   = p_src->U_PIXELS;
876     uint8_t  *p_v   = p_src->V_PIXELS;
877
878     bool  b_hscale;                         /* horizontal scaling type */
879     unsigned int i_vscale;                          /* vertical scaling type */
880     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
881
882     int         i_right_margin;
883     int         i_rewind;
884     int         i_scale_count;                       /* scale modulo counter */
885     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
886     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
887     /* Conversion buffer pointer */
888     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
889     uint32_t *  p_buffer;
890
891     /* Offset array pointer */
892     int *       p_offset_start = p_filter->p_sys->p_offset;
893     int *       p_offset;
894
895     const int i_source_margin = p_src->p[0].i_pitch
896                                  - p_src->p[0].i_visible_pitch;
897     const int i_source_margin_c = p_src->p[1].i_pitch
898                                  - p_src->p[1].i_visible_pitch;
899
900     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
901
902     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
903      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
904      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
905     SetOffset( p_filter->fmt_in.video.i_width,
906                p_filter->fmt_in.video.i_height,
907                p_filter->fmt_out.video.i_width,
908                p_filter->fmt_out.video.i_height,
909                &b_hscale, &i_vscale, p_offset_start );
910
911     /*
912      * Perform conversion
913      */
914     i_scale_count = ( i_vscale == 1 ) ?
915                     p_filter->fmt_out.video.i_height :
916                     p_filter->fmt_in.video.i_height;
917
918 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
919
920     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
921
922     /*
923     ** SSE2 128 bits fetch/store instructions are faster
924     ** if memory access is 16 bytes aligned
925     */
926
927     p_buffer = b_hscale ? p_buffer_start : p_pic;
928     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
929                     p_dest->p->i_pitch|
930                     ((intptr_t)p_y)|
931                     ((intptr_t)p_buffer))) )
932     {
933         /* use faster SSE2 aligned fetch and store */
934         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
935         {
936             p_pic_start = p_pic;
937
938             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
939             {
940                 SSE2_CALL (
941                     SSE2_INIT_32_ALIGNED
942                     SSE2_YUV_MUL
943                     SSE2_YUV_ADD
944                     SSE2_UNPACK_32_ARGB_ALIGNED
945                 );
946                 p_y += 16;
947                 p_u += 8;
948                 p_v += 8;
949                 p_buffer += 16;
950             }
951
952             /* Here we do some unaligned reads and duplicate conversions, but
953              * at least we have all the pixels */
954             if( i_rewind )
955             {
956                 p_y -= i_rewind;
957                 p_u -= i_rewind >> 1;
958                 p_v -= i_rewind >> 1;
959                 p_buffer -= i_rewind;
960                 SSE2_CALL (
961                     SSE2_INIT_32_UNALIGNED
962                     SSE2_YUV_MUL
963                     SSE2_YUV_ADD
964                     SSE2_UNPACK_32_ARGB_UNALIGNED
965                 );
966                 p_y += 16;
967                 p_u += 4;
968                 p_v += 4;
969             }
970             SCALE_WIDTH;
971             SCALE_HEIGHT( 420, 4 );
972
973             p_y += i_source_margin;
974             if( i_y % 2 )
975             {
976                 p_u += i_source_margin_c;
977                 p_v += i_source_margin_c;
978             }
979             p_buffer = b_hscale ? p_buffer_start : p_pic;
980         }
981     }
982     else
983     {
984         /* use slower SSE2 unaligned fetch and store */
985         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
986         {
987             p_pic_start = p_pic;
988             p_buffer = b_hscale ? p_buffer_start : p_pic;
989
990             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
991             {
992                 SSE2_CALL (
993                     SSE2_INIT_32_UNALIGNED
994                     SSE2_YUV_MUL
995                     SSE2_YUV_ADD
996                     SSE2_UNPACK_32_ARGB_UNALIGNED
997                 );
998                 p_y += 16;
999                 p_u += 8;
1000                 p_v += 8;
1001                 p_buffer += 16;
1002             }
1003
1004             /* Here we do some unaligned reads and duplicate conversions, but
1005              * at least we have all the pixels */
1006             if( i_rewind )
1007             {
1008                 p_y -= i_rewind;
1009                 p_u -= i_rewind >> 1;
1010                 p_v -= i_rewind >> 1;
1011                 p_buffer -= i_rewind;
1012                 SSE2_CALL (
1013                     SSE2_INIT_32_UNALIGNED
1014                     SSE2_YUV_MUL
1015                     SSE2_YUV_ADD
1016                     SSE2_UNPACK_32_ARGB_UNALIGNED
1017                 );
1018                 p_y += 16;
1019                 p_u += 8;
1020                 p_v += 8;
1021             }
1022             SCALE_WIDTH;
1023             SCALE_HEIGHT( 420, 4 );
1024
1025             p_y += i_source_margin;
1026             if( i_y % 2 )
1027             {
1028                 p_u += i_source_margin_c;
1029                 p_v += i_source_margin_c;
1030             }
1031             p_buffer = b_hscale ? p_buffer_start : p_pic;
1032         }
1033     }
1034
1035     /* make sure all SSE2 stores are visible thereafter */
1036     SSE2_END;
1037
1038 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1039
1040     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1041
1042     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1043     {
1044         p_pic_start = p_pic;
1045         p_buffer = b_hscale ? p_buffer_start : p_pic;
1046
1047         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1048         {
1049             MMX_CALL (
1050                 MMX_INIT_32
1051                 MMX_YUV_MUL
1052                 MMX_YUV_ADD
1053                 MMX_UNPACK_32_ARGB
1054             );
1055             p_y += 8;
1056             p_u += 4;
1057             p_v += 4;
1058             p_buffer += 8;
1059         }
1060
1061         /* Here we do some unaligned reads and duplicate conversions, but
1062          * at least we have all the pixels */
1063         if( i_rewind )
1064         {
1065             p_y -= i_rewind;
1066             p_u -= i_rewind >> 1;
1067             p_v -= i_rewind >> 1;
1068             p_buffer -= i_rewind;
1069             MMX_CALL (
1070                 MMX_INIT_32
1071                 MMX_YUV_MUL
1072                 MMX_YUV_ADD
1073                 MMX_UNPACK_32_ARGB
1074             );
1075             p_y += 8;
1076             p_u += 4;
1077             p_v += 4;
1078             p_buffer += 8;
1079         }
1080         SCALE_WIDTH;
1081         SCALE_HEIGHT( 420, 4 );
1082
1083         p_y += i_source_margin;
1084         if( i_y % 2 )
1085         {
1086             p_u += i_source_margin_c;
1087             p_v += i_source_margin_c;
1088         }
1089     }
1090
1091     /* re-enable FPU registers */
1092     MMX_END;
1093
1094 #endif
1095 }
1096
1097 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src,
1098                                             picture_t *p_dest )
1099 {
1100     /* We got this one from the old arguments */
1101     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1102     uint8_t  *p_y   = p_src->Y_PIXELS;
1103     uint8_t  *p_u   = p_src->U_PIXELS;
1104     uint8_t  *p_v   = p_src->V_PIXELS;
1105
1106     bool  b_hscale;                         /* horizontal scaling type */
1107     unsigned int i_vscale;                          /* vertical scaling type */
1108     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1109
1110     int         i_right_margin;
1111     int         i_rewind;
1112     int         i_scale_count;                       /* scale modulo counter */
1113     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1114     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1115     /* Conversion buffer pointer */
1116     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1117     uint32_t *  p_buffer;
1118
1119     /* Offset array pointer */
1120     int *       p_offset_start = p_filter->p_sys->p_offset;
1121     int *       p_offset;
1122
1123     const int i_source_margin = p_src->p[0].i_pitch
1124                                  - p_src->p[0].i_visible_pitch;
1125     const int i_source_margin_c = p_src->p[1].i_pitch
1126                                  - p_src->p[1].i_visible_pitch;
1127
1128     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1129
1130     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1131      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1132      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1133     SetOffset( p_filter->fmt_in.video.i_width,
1134                p_filter->fmt_in.video.i_height,
1135                p_filter->fmt_out.video.i_width,
1136                p_filter->fmt_out.video.i_height,
1137                &b_hscale, &i_vscale, p_offset_start );
1138
1139     /*
1140      * Perform conversion
1141      */
1142     i_scale_count = ( i_vscale == 1 ) ?
1143                     p_filter->fmt_out.video.i_height :
1144                     p_filter->fmt_in.video.i_height;
1145
1146 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1147
1148     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1149
1150     /*
1151     ** SSE2 128 bits fetch/store instructions are faster
1152     ** if memory access is 16 bytes aligned
1153     */
1154
1155     p_buffer = b_hscale ? p_buffer_start : p_pic;
1156     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1157                     p_dest->p->i_pitch|
1158                     ((intptr_t)p_y)|
1159                     ((intptr_t)p_buffer))) )
1160     {
1161         /* use faster SSE2 aligned fetch and store */
1162         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1163         {
1164             p_pic_start = p_pic;
1165
1166             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1167             {
1168                 SSE2_CALL (
1169                     SSE2_INIT_32_ALIGNED
1170                     SSE2_YUV_MUL
1171                     SSE2_YUV_ADD
1172                     SSE2_UNPACK_32_RGBA_ALIGNED
1173                 );
1174                 p_y += 16;
1175                 p_u += 8;
1176                 p_v += 8;
1177                 p_buffer += 16;
1178             }
1179
1180             /* Here we do some unaligned reads and duplicate conversions, but
1181              * at least we have all the pixels */
1182             if( i_rewind )
1183             {
1184                 p_y -= i_rewind;
1185                 p_u -= i_rewind >> 1;
1186                 p_v -= i_rewind >> 1;
1187                 p_buffer -= i_rewind;
1188                 SSE2_CALL (
1189                     SSE2_INIT_32_UNALIGNED
1190                     SSE2_YUV_MUL
1191                     SSE2_YUV_ADD
1192                     SSE2_UNPACK_32_RGBA_UNALIGNED
1193                 );
1194                 p_y += 16;
1195                 p_u += 4;
1196                 p_v += 4;
1197             }
1198             SCALE_WIDTH;
1199             SCALE_HEIGHT( 420, 4 );
1200
1201             p_y += i_source_margin;
1202             if( i_y % 2 )
1203             {
1204                 p_u += i_source_margin_c;
1205                 p_v += i_source_margin_c;
1206             }
1207             p_buffer = b_hscale ? p_buffer_start : p_pic;
1208         }
1209     }
1210     else
1211     {
1212         /* use slower SSE2 unaligned fetch and store */
1213         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1214         {
1215             p_pic_start = p_pic;
1216             p_buffer = b_hscale ? p_buffer_start : p_pic;
1217
1218             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1219             {
1220                 SSE2_CALL (
1221                     SSE2_INIT_32_UNALIGNED
1222                     SSE2_YUV_MUL
1223                     SSE2_YUV_ADD
1224                     SSE2_UNPACK_32_RGBA_UNALIGNED
1225                 );
1226                 p_y += 16;
1227                 p_u += 8;
1228                 p_v += 8;
1229                 p_buffer += 16;
1230             }
1231
1232             /* Here we do some unaligned reads and duplicate conversions, but
1233              * at least we have all the pixels */
1234             if( i_rewind )
1235             {
1236                 p_y -= i_rewind;
1237                 p_u -= i_rewind >> 1;
1238                 p_v -= i_rewind >> 1;
1239                 p_buffer -= i_rewind;
1240                 SSE2_CALL (
1241                     SSE2_INIT_32_UNALIGNED
1242                     SSE2_YUV_MUL
1243                     SSE2_YUV_ADD
1244                     SSE2_UNPACK_32_RGBA_UNALIGNED
1245                 );
1246                 p_y += 16;
1247                 p_u += 8;
1248                 p_v += 8;
1249             }
1250             SCALE_WIDTH;
1251             SCALE_HEIGHT( 420, 4 );
1252
1253             p_y += i_source_margin;
1254             if( i_y % 2 )
1255             {
1256                 p_u += i_source_margin_c;
1257                 p_v += i_source_margin_c;
1258             }
1259             p_buffer = b_hscale ? p_buffer_start : p_pic;
1260         }
1261     }
1262
1263     /* make sure all SSE2 stores are visible thereafter */
1264     SSE2_END;
1265
1266 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1267
1268     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1269
1270     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1271     {
1272         p_pic_start = p_pic;
1273         p_buffer = b_hscale ? p_buffer_start : p_pic;
1274
1275         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1276         {
1277             MMX_CALL (
1278                 MMX_INIT_32
1279                 MMX_YUV_MUL
1280                 MMX_YUV_ADD
1281                 MMX_UNPACK_32_RGBA
1282             );
1283             p_y += 8;
1284             p_u += 4;
1285             p_v += 4;
1286             p_buffer += 8;
1287         }
1288
1289         /* Here we do some unaligned reads and duplicate conversions, but
1290          * at least we have all the pixels */
1291         if( i_rewind )
1292         {
1293             p_y -= i_rewind;
1294             p_u -= i_rewind >> 1;
1295             p_v -= i_rewind >> 1;
1296             p_buffer -= i_rewind;
1297             MMX_CALL (
1298                 MMX_INIT_32
1299                 MMX_YUV_MUL
1300                 MMX_YUV_ADD
1301                 MMX_UNPACK_32_RGBA
1302             );
1303             p_y += 8;
1304             p_u += 4;
1305             p_v += 4;
1306             p_buffer += 8;
1307         }
1308         SCALE_WIDTH;
1309         SCALE_HEIGHT( 420, 4 );
1310
1311         p_y += i_source_margin;
1312         if( i_y % 2 )
1313         {
1314             p_u += i_source_margin_c;
1315             p_v += i_source_margin_c;
1316         }
1317     }
1318
1319     /* re-enable FPU registers */
1320     MMX_END;
1321
1322 #endif
1323 }
1324
1325 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src,
1326                                             picture_t *p_dest )
1327 {
1328     /* We got this one from the old arguments */
1329     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1330     uint8_t  *p_y   = p_src->Y_PIXELS;
1331     uint8_t  *p_u   = p_src->U_PIXELS;
1332     uint8_t  *p_v   = p_src->V_PIXELS;
1333
1334     bool  b_hscale;                         /* horizontal scaling type */
1335     unsigned int i_vscale;                          /* vertical scaling type */
1336     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1337
1338     int         i_right_margin;
1339     int         i_rewind;
1340     int         i_scale_count;                       /* scale modulo counter */
1341     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1342     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1343     /* Conversion buffer pointer */
1344     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1345     uint32_t *  p_buffer;
1346
1347     /* Offset array pointer */
1348     int *       p_offset_start = p_filter->p_sys->p_offset;
1349     int *       p_offset;
1350
1351     const int i_source_margin = p_src->p[0].i_pitch
1352                                  - p_src->p[0].i_visible_pitch;
1353     const int i_source_margin_c = p_src->p[1].i_pitch
1354                                  - p_src->p[1].i_visible_pitch;
1355
1356     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1357
1358     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1359      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1360      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1361     SetOffset( p_filter->fmt_in.video.i_width,
1362                p_filter->fmt_in.video.i_height,
1363                p_filter->fmt_out.video.i_width,
1364                p_filter->fmt_out.video.i_height,
1365                &b_hscale, &i_vscale, p_offset_start );
1366
1367     /*
1368      * Perform conversion
1369      */
1370     i_scale_count = ( i_vscale == 1 ) ?
1371                     p_filter->fmt_out.video.i_height :
1372                     p_filter->fmt_in.video.i_height;
1373
1374 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1375
1376     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1377
1378     /*
1379     ** SSE2 128 bits fetch/store instructions are faster
1380     ** if memory access is 16 bytes aligned
1381     */
1382
1383     p_buffer = b_hscale ? p_buffer_start : p_pic;
1384     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1385                     p_dest->p->i_pitch|
1386                     ((intptr_t)p_y)|
1387                     ((intptr_t)p_buffer))) )
1388     {
1389         /* use faster SSE2 aligned fetch and store */
1390         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1391         {
1392             p_pic_start = p_pic;
1393
1394             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1395             {
1396                 SSE2_CALL (
1397                     SSE2_INIT_32_ALIGNED
1398                     SSE2_YUV_MUL
1399                     SSE2_YUV_ADD
1400                     SSE2_UNPACK_32_BGRA_ALIGNED
1401                 );
1402                 p_y += 16;
1403                 p_u += 8;
1404                 p_v += 8;
1405                 p_buffer += 16;
1406             }
1407
1408             /* Here we do some unaligned reads and duplicate conversions, but
1409              * at least we have all the pixels */
1410             if( i_rewind )
1411             {
1412                 p_y -= i_rewind;
1413                 p_u -= i_rewind >> 1;
1414                 p_v -= i_rewind >> 1;
1415                 p_buffer -= i_rewind;
1416                 SSE2_CALL (
1417                     SSE2_INIT_32_UNALIGNED
1418                     SSE2_YUV_MUL
1419                     SSE2_YUV_ADD
1420                     SSE2_UNPACK_32_BGRA_UNALIGNED
1421                 );
1422                 p_y += 16;
1423                 p_u += 4;
1424                 p_v += 4;
1425             }
1426             SCALE_WIDTH;
1427             SCALE_HEIGHT( 420, 4 );
1428
1429             p_y += i_source_margin;
1430             if( i_y % 2 )
1431             {
1432                 p_u += i_source_margin_c;
1433                 p_v += i_source_margin_c;
1434             }
1435             p_buffer = b_hscale ? p_buffer_start : p_pic;
1436         }
1437     }
1438     else
1439     {
1440         /* use slower SSE2 unaligned fetch and store */
1441         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1442         {
1443             p_pic_start = p_pic;
1444             p_buffer = b_hscale ? p_buffer_start : p_pic;
1445
1446             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1447             {
1448                 SSE2_CALL (
1449                     SSE2_INIT_32_UNALIGNED
1450                     SSE2_YUV_MUL
1451                     SSE2_YUV_ADD
1452                     SSE2_UNPACK_32_BGRA_UNALIGNED
1453                 );
1454                 p_y += 16;
1455                 p_u += 8;
1456                 p_v += 8;
1457                 p_buffer += 16;
1458             }
1459
1460             /* Here we do some unaligned reads and duplicate conversions, but
1461              * at least we have all the pixels */
1462             if( i_rewind )
1463             {
1464                 p_y -= i_rewind;
1465                 p_u -= i_rewind >> 1;
1466                 p_v -= i_rewind >> 1;
1467                 p_buffer -= i_rewind;
1468                 SSE2_CALL (
1469                     SSE2_INIT_32_UNALIGNED
1470                     SSE2_YUV_MUL
1471                     SSE2_YUV_ADD
1472                     SSE2_UNPACK_32_BGRA_UNALIGNED
1473                 );
1474                 p_y += 16;
1475                 p_u += 8;
1476                 p_v += 8;
1477             }
1478             SCALE_WIDTH;
1479             SCALE_HEIGHT( 420, 4 );
1480
1481             p_y += i_source_margin;
1482             if( i_y % 2 )
1483             {
1484                 p_u += i_source_margin_c;
1485                 p_v += i_source_margin_c;
1486             }
1487             p_buffer = b_hscale ? p_buffer_start : p_pic;
1488         }
1489     }
1490
1491 #else
1492
1493     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1494
1495     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1496     {
1497         p_pic_start = p_pic;
1498         p_buffer = b_hscale ? p_buffer_start : p_pic;
1499
1500         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1501         {
1502             MMX_CALL (
1503                 MMX_INIT_32
1504                 MMX_YUV_MUL
1505                 MMX_YUV_ADD
1506                 MMX_UNPACK_32_BGRA
1507             );
1508             p_y += 8;
1509             p_u += 4;
1510             p_v += 4;
1511             p_buffer += 8;
1512         }
1513
1514         /* Here we do some unaligned reads and duplicate conversions, but
1515          * at least we have all the pixels */
1516         if( i_rewind )
1517         {
1518             p_y -= i_rewind;
1519             p_u -= i_rewind >> 1;
1520             p_v -= i_rewind >> 1;
1521             p_buffer -= i_rewind;
1522             MMX_CALL (
1523                 MMX_INIT_32
1524                 MMX_YUV_MUL
1525                 MMX_YUV_ADD
1526                 MMX_UNPACK_32_BGRA
1527             );
1528             p_y += 8;
1529             p_u += 4;
1530             p_v += 4;
1531             p_buffer += 8;
1532         }
1533         SCALE_WIDTH;
1534         SCALE_HEIGHT( 420, 4 );
1535
1536         p_y += i_source_margin;
1537         if( i_y % 2 )
1538         {
1539             p_u += i_source_margin_c;
1540             p_v += i_source_margin_c;
1541         }
1542     }
1543
1544     /* re-enable FPU registers */
1545     MMX_END;
1546
1547 #endif
1548 }
1549
1550 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src,
1551                                             picture_t *p_dest )
1552 {
1553     /* We got this one from the old arguments */
1554     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1555     uint8_t  *p_y   = p_src->Y_PIXELS;
1556     uint8_t  *p_u   = p_src->U_PIXELS;
1557     uint8_t  *p_v   = p_src->V_PIXELS;
1558
1559     bool  b_hscale;                         /* horizontal scaling type */
1560     unsigned int i_vscale;                          /* vertical scaling type */
1561     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1562
1563     int         i_right_margin;
1564     int         i_rewind;
1565     int         i_scale_count;                       /* scale modulo counter */
1566     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1567     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1568     /* Conversion buffer pointer */
1569     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1570     uint32_t *  p_buffer;
1571
1572     /* Offset array pointer */
1573     int *       p_offset_start = p_filter->p_sys->p_offset;
1574     int *       p_offset;
1575
1576     const int i_source_margin = p_src->p[0].i_pitch
1577                                  - p_src->p[0].i_visible_pitch;
1578     const int i_source_margin_c = p_src->p[1].i_pitch
1579                                  - p_src->p[1].i_visible_pitch;
1580
1581     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1582
1583     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1584      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1585      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1586     SetOffset( p_filter->fmt_in.video.i_width,
1587                p_filter->fmt_in.video.i_height,
1588                p_filter->fmt_out.video.i_width,
1589                p_filter->fmt_out.video.i_height,
1590                &b_hscale, &i_vscale, p_offset_start );
1591
1592     /*
1593      * Perform conversion
1594      */
1595     i_scale_count = ( i_vscale == 1 ) ?
1596                     p_filter->fmt_out.video.i_height :
1597                     p_filter->fmt_in.video.i_height;
1598
1599 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1600
1601     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1602
1603     /*
1604     ** SSE2 128 bits fetch/store instructions are faster
1605     ** if memory access is 16 bytes aligned
1606     */
1607
1608     p_buffer = b_hscale ? p_buffer_start : p_pic;
1609     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1610                     p_dest->p->i_pitch|
1611                     ((intptr_t)p_y)|
1612                     ((intptr_t)p_buffer))) )
1613     {
1614         /* use faster SSE2 aligned fetch and store */
1615         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1616         {
1617             p_pic_start = p_pic;
1618
1619             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1620             {
1621                 SSE2_CALL (
1622                     SSE2_INIT_32_ALIGNED
1623                     SSE2_YUV_MUL
1624                     SSE2_YUV_ADD
1625                     SSE2_UNPACK_32_ABGR_ALIGNED
1626                 );
1627                 p_y += 16;
1628                 p_u += 8;
1629                 p_v += 8;
1630                 p_buffer += 16;
1631             }
1632
1633             /* Here we do some unaligned reads and duplicate conversions, but
1634              * at least we have all the pixels */
1635             if( i_rewind )
1636             {
1637                 p_y -= i_rewind;
1638                 p_u -= i_rewind >> 1;
1639                 p_v -= i_rewind >> 1;
1640                 p_buffer -= i_rewind;
1641                 SSE2_CALL (
1642                     SSE2_INIT_32_UNALIGNED
1643                     SSE2_YUV_MUL
1644                     SSE2_YUV_ADD
1645                     SSE2_UNPACK_32_ABGR_UNALIGNED
1646                 );
1647                 p_y += 16;
1648                 p_u += 4;
1649                 p_v += 4;
1650             }
1651             SCALE_WIDTH;
1652             SCALE_HEIGHT( 420, 4 );
1653
1654             p_y += i_source_margin;
1655             if( i_y % 2 )
1656             {
1657                 p_u += i_source_margin_c;
1658                 p_v += i_source_margin_c;
1659             }
1660             p_buffer = b_hscale ? p_buffer_start : p_pic;
1661         }
1662     }
1663     else
1664     {
1665         /* use slower SSE2 unaligned fetch and store */
1666         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1667         {
1668             p_pic_start = p_pic;
1669             p_buffer = b_hscale ? p_buffer_start : p_pic;
1670
1671             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1672             {
1673                 SSE2_CALL (
1674                     SSE2_INIT_32_UNALIGNED
1675                     SSE2_YUV_MUL
1676                     SSE2_YUV_ADD
1677                     SSE2_UNPACK_32_ABGR_UNALIGNED
1678                 );
1679                 p_y += 16;
1680                 p_u += 8;
1681                 p_v += 8;
1682                 p_buffer += 16;
1683             }
1684
1685             /* Here we do some unaligned reads and duplicate conversions, but
1686              * at least we have all the pixels */
1687             if( i_rewind )
1688             {
1689                 p_y -= i_rewind;
1690                 p_u -= i_rewind >> 1;
1691                 p_v -= i_rewind >> 1;
1692                 p_buffer -= i_rewind;
1693                 SSE2_CALL (
1694                     SSE2_INIT_32_UNALIGNED
1695                     SSE2_YUV_MUL
1696                     SSE2_YUV_ADD
1697                     SSE2_UNPACK_32_ABGR_UNALIGNED
1698                 );
1699                 p_y += 16;
1700                 p_u += 8;
1701                 p_v += 8;
1702             }
1703             SCALE_WIDTH;
1704             SCALE_HEIGHT( 420, 4 );
1705
1706             p_y += i_source_margin;
1707             if( i_y % 2 )
1708             {
1709                 p_u += i_source_margin_c;
1710                 p_v += i_source_margin_c;
1711             }
1712             p_buffer = b_hscale ? p_buffer_start : p_pic;
1713         }
1714     }
1715
1716 #else
1717
1718     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1719
1720     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1721     {
1722         p_pic_start = p_pic;
1723         p_buffer = b_hscale ? p_buffer_start : p_pic;
1724
1725         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1726         {
1727             MMX_CALL (
1728                 MMX_INIT_32
1729                 MMX_YUV_MUL
1730                 MMX_YUV_ADD
1731                 MMX_UNPACK_32_ABGR
1732             );
1733             p_y += 8;
1734             p_u += 4;
1735             p_v += 4;
1736             p_buffer += 8;
1737         }
1738
1739         /* Here we do some unaligned reads and duplicate conversions, but
1740          * at least we have all the pixels */
1741         if( i_rewind )
1742         {
1743             p_y -= i_rewind;
1744             p_u -= i_rewind >> 1;
1745             p_v -= i_rewind >> 1;
1746             p_buffer -= i_rewind;
1747             MMX_CALL (
1748                 MMX_INIT_32
1749                 MMX_YUV_MUL
1750                 MMX_YUV_ADD
1751                 MMX_UNPACK_32_ABGR
1752             );
1753             p_y += 8;
1754             p_u += 4;
1755             p_v += 4;
1756             p_buffer += 8;
1757         }
1758         SCALE_WIDTH;
1759         SCALE_HEIGHT( 420, 4 );
1760
1761         p_y += i_source_margin;
1762         if( i_y % 2 )
1763         {
1764             p_u += i_source_margin_c;
1765             p_v += i_source_margin_c;
1766         }
1767     }
1768
1769     /* re-enable FPU registers */
1770     MMX_END;
1771
1772 #endif
1773 }
1774
1775 #endif
1776
1777 /* Following functions are local */
1778
/*****************************************************************************
 * SetOffset: compute scaling indicators and the horizontal offset table
 *****************************************************************************
 * Compares the source size (i_width x i_height) with the destination picture
 * size (i_pic_width x i_pic_height) and reports:
 *  - *pb_hscale: false when both widths are equal, true otherwise;
 *  - *pi_vscale: 0 when both heights are equal, 1 when the picture is taller
 *    than the source, -1 (converted to unsigned int) when it is shorter;
 *  - p_offset: left untouched when the widths are equal. When the picture is
 *    wider, every source column emits a run of zero or more 0 entries closed
 *    by a single 1. When the picture is narrower, every destination column
 *    gets one entry whose value is at least 1.
 * The table is filled with an integer error accumulator, so no division is
 * performed.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    const int i_dx = i_pic_width - i_width;    /* > 0: horizontal extension */
    const int i_dy = i_pic_height - i_height;    /* > 0: vertical extension */

    /*
     * Horizontal part: scaling flag, then the offset table if needed
     */
    *pb_hscale = ( i_dx != 0 );

    if( i_dx > 0 )
    {
        /* Extension: walk the source columns */
        int i_error = i_pic_width;                     /* error accumulator */
        int i_todo = i_width;                     /* source columns to emit */

        while( i_todo-- != 0 )
        {
            for( i_error -= i_width; i_error > 0; i_error -= i_width )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_error += i_pic_width;
        }
    }
    else if( i_dx < 0 )
    {
        /* Reduction: walk the destination columns */
        int i_error = i_width;                         /* error accumulator */
        int i_todo = i_pic_width;            /* destination columns to emit */

        while( i_todo-- != 0 )
        {
            int i_step = 1;

            for( i_error -= i_pic_width; i_error > 0; i_error -= i_pic_width )
            {
                i_step++;
            }
            *p_offset++ = i_step;
            i_error += i_width;
        }
    }

    /*
     * Vertical part: only the direction of the scaling is reported
     */
    *pi_vscale = ( i_dy == 0 ) ? 0 : ( ( i_dy > 0 ) ? 1 : -1 );
}
1848