]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
macosx: fixed menubar appearance in fullscreen mode by partially reverting [46c93c9cc...
[vlc] / modules / video_chroma / i420_rgb16.c
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc/vlc.h>
34 #include <vlc_filter.h>
35 #include <vlc_vout.h>
36
37 #include "i420_rgb.h"
38 #if defined (MODULE_NAME_IS_i420_rgb)
39 #   include "i420_rgb_c.h"
40 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
41 #   include "i420_rgb_mmx.h"
42 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
43 #   include "i420_rgb_mmx.h"
44 #endif
45
/* Forward declaration of SetOffset; its definition is not in the visible part
 * of this file. Every call site below passes, in order: input width, input
 * height, output width, output height, &b_hscale, &i_vscale and
 * p_offset_start (the offset table taken from p_filter->p_sys->p_offset). */
46 static void SetOffset( int, int, int, int, bool *,
47                        unsigned int *, int * );
48
49 #if defined (MODULE_NAME_IS_i420_rgb)
50 /*****************************************************************************
51  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
52  *****************************************************************************
53  * Horizontal alignment needed:
54  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
55  *  - output: 1 pixel (2 bytes), margins allowed
56  * Vertical alignment needed:
57  *  - input: 2 lines (2 Y lines, 1 U/V line)
58  *  - output: 1 line
59  *****************************************************************************/
/*
 * I420_RGB16_dither: convert the Y, U and V samples of p_src into 16-bit
 * pixels in p_dest, using the p_rgb16 table from p_filter->p_sys and four
 * 4-entry dither rows.
 *
 *  p_filter  supplies the fmt_in/fmt_out dimensions, fmt_out.video.i_rrshift
 *            and the p_sys data (p_rgb16, p_buffer, p_offset)
 *  p_src     source picture; Y_PIXELS/U_PIXELS/V_PIXELS and the pitches of
 *            planes 0 and 1 are read
 *  p_dest    destination picture; written through p_dest->p->p_pixels
 *
 * NOTE(review): CONVERT_YUV_PIXEL_DITHER, CONVERT_Y_PIXEL_DITHER, SCALE_WIDTH
 * and SCALE_HEIGHT are not defined in the visible part of this file. Several
 * locals (p_ybase, i_uval, i_vval, i_red, i_real_y, p_offset, i_scale_count,
 * i_chroma_width, ...) are never read by the visible statements, so the macros
 * presumably use them by name -- confirm before renaming or removing any.
 */
60 void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src,
61                                                 picture_t *p_dest )
62 {
63     /* Destination pixels (16 bits each) and the three source sample pointers */
64     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
65     uint8_t  *p_y   = p_src->Y_PIXELS;
66     uint8_t  *p_u   = p_src->U_PIXELS;
67     uint8_t  *p_v   = p_src->V_PIXELS;
68
69     bool   b_hscale;                        /* horizontal scaling type */
70     unsigned int i_vscale;                          /* vertical scaling type */
71     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
72     unsigned int i_real_y;                                          /* y % 4 */
73
74     int         i_right_margin;
75     int         i_rewind;
76     int         i_scale_count;                       /* scale modulo counter */
77     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
78     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
79     int         i_uval, i_vval;                           /* U and V samples */
80     int         i_red, i_green, i_blue;          /* U and V modified samples */
81     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
82     uint16_t *  p_ybase;                     /* Y dependent conversion table */
83
84     /* Conversion buffer pointer */
85     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
86     uint16_t *  p_buffer;
87
88     /* Offset array pointer */
89     int *       p_offset_start = p_filter->p_sys->p_offset;
90     int *       p_offset;
91
       /* Per-line padding of the source: pitch minus visible pitch, for plane 0
        * (added to p_y) and plane 1 (added to both p_u and p_v) */
92     const int i_source_margin = p_src->p[0].i_pitch
93                                  - p_src->p[0].i_visible_pitch;
94     const int i_source_margin_c = p_src->p[1].i_pitch
95                                  - p_src->p[1].i_visible_pitch;
96
97     /* The dithering matrices */
98     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
99     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
100     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
101     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
102
        /* Pre-shift every dither value left by (SHIFT - 4 + i_rrshift) bits;
         * i_x is only borrowed as the array index here */
103     for(i_x = 0; i_x < 4; i_x++)
104     {
105         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
106         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
107         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
108         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
109     }
110
        /* Destination padding: pitch minus visible pitch */
111     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
112
        /* When the width is not a multiple of 8, i_rewind is how many pixels
         * the last group of 8 must step back so that it ends on the line end */
113     if( p_filter->fmt_in.video.i_width & 7 )
114     {
115         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
116     }
117     else
118     {
119         i_rewind = 0;
120     }
121
122     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
123      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
124      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
125     SetOffset( p_filter->fmt_in.video.i_width,
126                p_filter->fmt_in.video.i_height,
127                p_filter->fmt_out.video.i_width,
128                p_filter->fmt_out.video.i_height,
129                &b_hscale, &i_vscale, p_offset_start );
130
131     /*
132      * Perform conversion
133      */
134     i_scale_count = ( i_vscale == 1 ) ?
135                     p_filter->fmt_out.video.i_height :
136                     p_filter->fmt_in.video.i_height;
        /* One iteration per input line */
137     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
138     {
139         i_real_y = i_y & 0x3;
140         p_pic_start = p_pic;
            /* Convert into the scratch buffer when b_hscale is set, otherwise
             * straight into the destination line */
141         p_buffer = b_hscale ? p_buffer_start : p_pic;
142
            /* width / 8 groups of 8 pixels: eight macro calls per group, with
             * p_dither cycling through dither10, 11, 12, 13 twice */
143         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
144         {
145             int *p_dither = dither10;
146             CONVERT_YUV_PIXEL_DITHER(2);
147             p_dither = dither11;
148             CONVERT_Y_PIXEL_DITHER(2);
149             p_dither = dither12;
150             CONVERT_YUV_PIXEL_DITHER(2);
151             p_dither = dither13;
152             CONVERT_Y_PIXEL_DITHER(2);
153             p_dither = dither10;
154             CONVERT_YUV_PIXEL_DITHER(2);
155             p_dither = dither11;
156             CONVERT_Y_PIXEL_DITHER(2);
157             p_dither = dither12;
158             CONVERT_YUV_PIXEL_DITHER(2);
159             p_dither = dither13;
160             CONVERT_Y_PIXEL_DITHER(2);
161         }
162
163         /* Here we do some unaligned reads and duplicate conversions, but
164          * at least we have all the pixels */
165         if( i_rewind )
166         {
167             int *p_dither = dither10;
                /* Step back i_rewind luma samples/output pixels and half as
                 * many chroma samples, then convert one more group of 8 */
168             p_y -= i_rewind;
169             p_u -= i_rewind >> 1;
170             p_v -= i_rewind >> 1;
171             p_buffer -= i_rewind;
172             CONVERT_YUV_PIXEL_DITHER(2);
173             p_dither = dither11;
174             CONVERT_Y_PIXEL_DITHER(2);
175             p_dither = dither12;
176             CONVERT_YUV_PIXEL_DITHER(2);
177             p_dither = dither13;
178             CONVERT_Y_PIXEL_DITHER(2);
179             p_dither = dither10;
180             CONVERT_YUV_PIXEL_DITHER(2);
181             p_dither = dither11;
182             CONVERT_Y_PIXEL_DITHER(2);
183             p_dither = dither12;
184             CONVERT_YUV_PIXEL_DITHER(2);
185             p_dither = dither13;
186             CONVERT_Y_PIXEL_DITHER(2);
187         }
188         SCALE_WIDTH;
189         SCALE_HEIGHT( 420, 2 );
190
            /* Skip the source padding: luma after every line, chroma only
             * after odd-numbered lines */
191         p_y += i_source_margin;
192         if( i_y % 2 )
193         {
194             p_u += i_source_margin_c;
195             p_v += i_source_margin_c;
196         }
197     }
198 }
199 #endif
200
201 /*****************************************************************************
202  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
203  *****************************************************************************
204  * Horizontal alignment needed:
205  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
206  *  - output: 1 pixel (2 bytes), margins allowed
207  * Vertical alignment needed:
208  *  - input: 2 lines (2 Y lines, 1 U/V line)
209  *  - output: 1 line
210  *****************************************************************************/
211
212 #if defined (MODULE_NAME_IS_i420_rgb)
213
/*
 * I420_RGB16: convert the Y, U and V samples of p_src into 16-bit pixels in
 * p_dest using the p_rgb16 table from p_filter->p_sys (no dithering). This
 * version is compiled only when MODULE_NAME_IS_i420_rgb is defined.
 *
 *  p_filter  supplies the fmt_in/fmt_out dimensions and the p_sys data
 *            (p_rgb16, p_buffer, p_offset)
 *  p_src     source picture; Y_PIXELS/U_PIXELS/V_PIXELS and the pitches of
 *            planes 0 and 1 are read
 *  p_dest    destination picture; written through p_dest->p->p_pixels
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are not defined in the visible part of this file. Locals such
 * as p_ybase, i_uval, i_red, p_offset and i_scale_count are never read by the
 * visible statements, so the macros presumably use them by name -- confirm
 * before renaming or removing any.
 */
214 void I420_RGB16( filter_t *p_filter, picture_t *p_src,
215                                          picture_t *p_dest )
216 {
217     /* Destination pixels (16 bits each) and the three source sample pointers */
218     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
219     uint8_t  *p_y   = p_src->Y_PIXELS;
220     uint8_t  *p_u   = p_src->U_PIXELS;
221     uint8_t  *p_v   = p_src->V_PIXELS;
222
223     bool  b_hscale;                         /* horizontal scaling type */
224     unsigned int i_vscale;                          /* vertical scaling type */
225     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
226
227     int         i_right_margin;
228     int         i_rewind;
229     int         i_scale_count;                       /* scale modulo counter */
230     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
231     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
232     int         i_uval, i_vval;                           /* U and V samples */
233     int         i_red, i_green, i_blue;          /* U and V modified samples */
234     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
235     uint16_t *  p_ybase;                     /* Y dependent conversion table */
236
237     /* Conversion buffer pointer */
238     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
239     uint16_t *  p_buffer;
240
241     /* Offset array pointer */
242     int *       p_offset_start = p_filter->p_sys->p_offset;
243     int *       p_offset;
244
        /* Per-line padding of the source: pitch minus visible pitch, for plane 0
         * (added to p_y) and plane 1 (added to both p_u and p_v) */
245     const int i_source_margin = p_src->p[0].i_pitch
246                                  - p_src->p[0].i_visible_pitch;
247     const int i_source_margin_c = p_src->p[1].i_pitch
248                                  - p_src->p[1].i_visible_pitch;
249
        /* Destination padding: pitch minus visible pitch */
250     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
251
        /* When the width is not a multiple of 8, i_rewind is how many pixels
         * the last group of 8 must step back so that it ends on the line end */
252     if( p_filter->fmt_in.video.i_width & 7 )
253     {
254         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
255     }
256     else
257     {
258         i_rewind = 0;
259     }
260
261     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
262      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
263      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
264     SetOffset( p_filter->fmt_in.video.i_width,
265                p_filter->fmt_in.video.i_height,
266                p_filter->fmt_out.video.i_width,
267                p_filter->fmt_out.video.i_height,
268                &b_hscale, &i_vscale, p_offset_start );
269
270     /*
271      * Perform conversion
272      */
273     i_scale_count = ( i_vscale == 1 ) ?
274                     p_filter->fmt_out.video.i_height :
275                     p_filter->fmt_in.video.i_height;
        /* One iteration per input line */
276     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
277     {
278         p_pic_start = p_pic;
            /* Convert into the scratch buffer when b_hscale is set, otherwise
             * straight into the destination line */
279         p_buffer = b_hscale ? p_buffer_start : p_pic;
280
            /* width / 8 groups, eight macro calls (4 YUV + 4 Y) per group */
281         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
282         {
283             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
284             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
285             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
286             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
287         }
288
289         /* Here we do some unaligned reads and duplicate conversions, but
290          * at least we have all the pixels */
291         if( i_rewind )
292         {
                /* Step back i_rewind luma samples/output pixels and half as
                 * many chroma samples, then convert one more group of 8 */
293             p_y -= i_rewind;
294             p_u -= i_rewind >> 1;
295             p_v -= i_rewind >> 1;
296             p_buffer -= i_rewind;
297
298             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
299             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
300             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
301             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
302         }
303         SCALE_WIDTH;
304         SCALE_HEIGHT( 420, 2 );
305
            /* Skip the source padding: luma after every line, chroma only
             * after odd-numbered lines */
306         p_y += i_source_margin;
307         if( i_y % 2 )
308         {
309             p_u += i_source_margin_c;
310             p_v += i_source_margin_c;
311         }
312     }
313 }
314
315 #else // ! defined (MODULE_NAME_IS_i420_rgb)
316
/*
 * I420_R5G5B5: 16-bit conversion used when MODULE_NAME_IS_i420_rgb is NOT
 * defined (this is the #else side). The body is selected at compile time:
 *   - MODULE_NAME_IS_i420_rgb_sse2: SSE2_* macros, 16 pixels per step
 *   - otherwise:                    MMX_* macros, 8 pixels per step
 * The output packing comes from the *_UNPACK_15* macros (presumably 5-5-5
 * bits per component, going by the function and macro names -- confirm in
 * i420_rgb_mmx.h).
 *
 *  p_filter  supplies the fmt_in/fmt_out dimensions and the p_sys data
 *            (p_buffer, p_offset)
 *  p_src     source picture; Y_PIXELS/U_PIXELS/V_PIXELS and the pitches of
 *            planes 0 and 1 are read
 *  p_dest    destination picture; written through p_dest->p->p_pixels
 *
 * NOTE(review): the SSE2_*, MMX_*, SCALE_WIDTH and SCALE_HEIGHT macros are not
 * defined in the visible part of this file and presumably use the locals
 * below (p_y, p_u, p_v, p_buffer, p_offset, i_scale_count, ...) by name --
 * confirm before renaming or removing any.
 */
317 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src,
318                                           picture_t *p_dest )
319 {
320     /* Destination pixels (16 bits each) and the three source sample pointers */
321     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
322     uint8_t  *p_y   = p_src->Y_PIXELS;
323     uint8_t  *p_u   = p_src->U_PIXELS;
324     uint8_t  *p_v   = p_src->V_PIXELS;
325
326     bool  b_hscale;                         /* horizontal scaling type */
327     unsigned int i_vscale;                          /* vertical scaling type */
328     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
329
330     int         i_right_margin;
331     int         i_rewind;
332     int         i_scale_count;                       /* scale modulo counter */
333     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
334     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
335
336     /* Conversion buffer pointer */
337     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
338     uint16_t *  p_buffer;
339
340     /* Offset array pointer */
341     int *       p_offset_start = p_filter->p_sys->p_offset;
342     int *       p_offset;
343
        /* Per-line padding of the source: pitch minus visible pitch, for plane 0
         * (added to p_y) and plane 1 (added to both p_u and p_v) */
344     const int i_source_margin = p_src->p[0].i_pitch
345                                  - p_src->p[0].i_visible_pitch;
346     const int i_source_margin_c = p_src->p[1].i_pitch
347                                  - p_src->p[1].i_visible_pitch;
348
        /* Destination padding: pitch minus visible pitch */
349     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
350
351     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
352      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
353      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
354     SetOffset( p_filter->fmt_in.video.i_width,
355                p_filter->fmt_in.video.i_height,
356                p_filter->fmt_out.video.i_width,
357                p_filter->fmt_out.video.i_height,
358                &b_hscale, &i_vscale, p_offset_start );
359
360
361     /*
362      * Perform conversion
363      */
364     i_scale_count = ( i_vscale == 1 ) ?
365                     p_filter->fmt_out.video.i_height :
366                     p_filter->fmt_in.video.i_height;
367
368 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
369
        /* SSE2 works on groups of 16 pixels: i_rewind is how far the last
         * group must step back when the width is not a multiple of 16 */
370     if( p_filter->fmt_in.video.i_width & 15 )
371     {
372         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
373     }
374     else
375     {
376         i_rewind = 0;
377     }
378
379     /*
380     ** SSE2 128 bits fetch/store instructions are faster
381     ** if memory access is 16 bytes aligned
382     */
383
        /* The aligned path is taken only when the Y pitch, the destination
         * pitch, the p_y address and the p_buffer address all have their low
         * four bits clear (i.e. are multiples of 16) */
384     p_buffer = b_hscale ? p_buffer_start : p_pic;
385     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
386                     p_dest->p->i_pitch|
387                     ((intptr_t)p_y)|
388                     ((intptr_t)p_buffer))) )
389     {
390         /* use faster SSE2 aligned fetch and store */
391         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
392         {
                /* p_buffer was set before the alignment test for the first
                 * line and is re-set at the bottom of this loop for the next */
393             p_pic_start = p_pic;
394
395             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
396             {
397                 SSE2_CALL (
398                     SSE2_INIT_16_ALIGNED
399                     SSE2_YUV_MUL
400                     SSE2_YUV_ADD
401                     SSE2_UNPACK_15_ALIGNED
402                 );
403                 p_y += 16;
404                 p_u += 8;
405                 p_v += 8;
406                 p_buffer += 16;
407             }
408             /* Here we do some unaligned reads and duplicate conversions, but
409              * at least we have all the pixels */
410             if( i_rewind )
411             {
412                 p_y -= i_rewind;
413                 p_u -= i_rewind >> 1;
414                 p_v -= i_rewind >> 1;
415                 p_buffer -= i_rewind;
416
                    /* The stepped-back pointers use the UNALIGNED macros even
                     * on this path */
417                 SSE2_CALL (
418                     SSE2_INIT_16_UNALIGNED
419                     SSE2_YUV_MUL
420                     SSE2_YUV_ADD
421                     SSE2_UNPACK_15_UNALIGNED
422                 );
                    /* p_buffer is not advanced here (the MMX variant below does
                     * advance it); it is reassigned at the end of the line */
423                 p_y += 16;
424                 p_u += 8;
425                 p_v += 8;
426             }
427             SCALE_WIDTH;
428             SCALE_HEIGHT( 420, 2 );
429
                /* Skip the source padding: luma after every line, chroma only
                 * after odd-numbered lines */
430             p_y += i_source_margin;
431             if( i_y % 2 )
432             {
433                 p_u += i_source_margin_c;
434                 p_v += i_source_margin_c;
435             }
436             p_buffer = b_hscale ? p_buffer_start : p_pic;
437         }
438     }
439     else
440     {
441         /* use slower SSE2 unaligned fetch and store */
442         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
443         {
444             p_pic_start = p_pic;
445             p_buffer = b_hscale ? p_buffer_start : p_pic;
446
447             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
448             {
449                 SSE2_CALL (
450                     SSE2_INIT_16_UNALIGNED
451                     SSE2_YUV_MUL
452                     SSE2_YUV_ADD
453                     SSE2_UNPACK_15_UNALIGNED
454                 );
455                 p_y += 16;
456                 p_u += 8;
457                 p_v += 8;
458                 p_buffer += 16;
459             }
460             /* Here we do some unaligned reads and duplicate conversions, but
461              * at least we have all the pixels */
462             if( i_rewind )
463             {
464                 p_y -= i_rewind;
465                 p_u -= i_rewind >> 1;
466                 p_v -= i_rewind >> 1;
467                 p_buffer -= i_rewind;
468
469                 SSE2_CALL (
470                     SSE2_INIT_16_UNALIGNED
471                     SSE2_YUV_MUL
472                     SSE2_YUV_ADD
473                     SSE2_UNPACK_15_UNALIGNED
474                 );
475                 p_y += 16;
476                 p_u += 8;
477                 p_v += 8;
478             }
479             SCALE_WIDTH;
480             SCALE_HEIGHT( 420, 2 );
481
                /* Skip the source padding: luma after every line, chroma only
                 * after odd-numbered lines */
482             p_y += i_source_margin;
483             if( i_y % 2 )
484             {
485                 p_u += i_source_margin_c;
486                 p_v += i_source_margin_c;
487             }
488             p_buffer = b_hscale ? p_buffer_start : p_pic;
489         }
490     }
491
492     /* make sure all SSE2 stores are visible thereafter */
493     SSE2_END;
494
495 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
496
        /* MMX works on groups of 8 pixels: i_rewind is how far the last group
         * must step back when the width is not a multiple of 8 */
497     if( p_filter->fmt_in.video.i_width & 7 )
498     {
499         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
500     }
501     else
502     {
503         i_rewind = 0;
504     }
505
        /* One iteration per input line */
506     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
507     {
508         p_pic_start = p_pic;
509         p_buffer = b_hscale ? p_buffer_start : p_pic;
510
511         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
512         {
513             MMX_CALL (
514                 MMX_INIT_16
515                 MMX_YUV_MUL
516                 MMX_YUV_ADD
517                 MMX_UNPACK_15
518             );
519             p_y += 8;
520             p_u += 4;
521             p_v += 4;
522             p_buffer += 8;
523         }
524
525         /* Here we do some unaligned reads and duplicate conversions, but
526          * at least we have all the pixels */
527         if( i_rewind )
528         {
529             p_y -= i_rewind;
530             p_u -= i_rewind >> 1;
531             p_v -= i_rewind >> 1;
532             p_buffer -= i_rewind;
533
534             MMX_CALL (
535                 MMX_INIT_16
536                 MMX_YUV_MUL
537                 MMX_YUV_ADD
538                 MMX_UNPACK_15
539             );
540             p_y += 8;
541             p_u += 4;
542             p_v += 4;
543             p_buffer += 8;
544         }
545         SCALE_WIDTH;
546         SCALE_HEIGHT( 420, 2 );
547
            /* Skip the source padding: luma after every line, chroma only
             * after odd-numbered lines */
548         p_y += i_source_margin;
549         if( i_y % 2 )
550         {
551             p_u += i_source_margin_c;
552             p_v += i_source_margin_c;
553         }
554     }
555     /* re-enable FPU registers */
556     MMX_END;
557
558 #endif
559 }
560
/*
 * I420_R5G6B5: same structure as I420_R5G5B5 in this file, except that the
 * output packing comes from the *_UNPACK_16* macros instead of *_UNPACK_15*
 * (presumably 5-6-5 bits per component, going by the function name --
 * confirm in i420_rgb_mmx.h). The body is selected at compile time:
 *   - MODULE_NAME_IS_i420_rgb_sse2: SSE2_* macros, 16 pixels per step
 *   - otherwise:                    MMX_* macros, 8 pixels per step
 *
 *  p_filter  supplies the fmt_in/fmt_out dimensions and the p_sys data
 *            (p_buffer, p_offset)
 *  p_src     source picture; Y_PIXELS/U_PIXELS/V_PIXELS and the pitches of
 *            planes 0 and 1 are read
 *  p_dest    destination picture; written through p_dest->p->p_pixels
 *
 * NOTE(review): the SSE2_*, MMX_*, SCALE_WIDTH and SCALE_HEIGHT macros are not
 * defined in the visible part of this file and presumably use the locals
 * below (p_y, p_u, p_v, p_buffer, p_offset, i_scale_count, ...) by name --
 * confirm before renaming or removing any.
 */
561 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src,
562                                           picture_t *p_dest )
563 {
564     /* Destination pixels (16 bits each) and the three source sample pointers */
565     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
566     uint8_t  *p_y   = p_src->Y_PIXELS;
567     uint8_t  *p_u   = p_src->U_PIXELS;
568     uint8_t  *p_v   = p_src->V_PIXELS;
569
570     bool  b_hscale;                         /* horizontal scaling type */
571     unsigned int i_vscale;                          /* vertical scaling type */
572     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
573
574     int         i_right_margin;
575     int         i_rewind;
576     int         i_scale_count;                       /* scale modulo counter */
577     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
578     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
579
580     /* Conversion buffer pointer */
581     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
582     uint16_t *  p_buffer;
583
584     /* Offset array pointer */
585     int *       p_offset_start = p_filter->p_sys->p_offset;
586     int *       p_offset;
587
        /* Per-line padding of the source: pitch minus visible pitch, for plane 0
         * (added to p_y) and plane 1 (added to both p_u and p_v) */
588     const int i_source_margin = p_src->p[0].i_pitch
589                                  - p_src->p[0].i_visible_pitch;
590     const int i_source_margin_c = p_src->p[1].i_pitch
591                                  - p_src->p[1].i_visible_pitch;
592
        /* Destination padding: pitch minus visible pitch */
593     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
594
595     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
596      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
597      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
598     SetOffset( p_filter->fmt_in.video.i_width,
599                p_filter->fmt_in.video.i_height,
600                p_filter->fmt_out.video.i_width,
601                p_filter->fmt_out.video.i_height,
602                &b_hscale, &i_vscale, p_offset_start );
603
604
605     /*
606      * Perform conversion
607      */
608     i_scale_count = ( i_vscale == 1 ) ?
609                     p_filter->fmt_out.video.i_height :
610                     p_filter->fmt_in.video.i_height;
611
612 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
613
        /* SSE2 works on groups of 16 pixels: i_rewind is how far the last
         * group must step back when the width is not a multiple of 16 */
614     if( p_filter->fmt_in.video.i_width & 15 )
615     {
616         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
617     }
618     else
619     {
620         i_rewind = 0;
621     }
622
623     /*
624     ** SSE2 128 bits fetch/store instructions are faster
625     ** if memory access is 16 bytes aligned
626     */
627
        /* The aligned path is taken only when the Y pitch, the destination
         * pitch, the p_y address and the p_buffer address all have their low
         * four bits clear (i.e. are multiples of 16) */
628     p_buffer = b_hscale ? p_buffer_start : p_pic;
629     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
630                     p_dest->p->i_pitch|
631                     ((intptr_t)p_y)|
632                     ((intptr_t)p_buffer))) )
633     {
634         /* use faster SSE2 aligned fetch and store */
635         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
636         {
                /* p_buffer was set before the alignment test for the first
                 * line and is re-set at the bottom of this loop for the next */
637             p_pic_start = p_pic;
638
639             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
640             {
641                 SSE2_CALL (
642                     SSE2_INIT_16_ALIGNED
643                     SSE2_YUV_MUL
644                     SSE2_YUV_ADD
645                     SSE2_UNPACK_16_ALIGNED
646                 );
647                 p_y += 16;
648                 p_u += 8;
649                 p_v += 8;
650                 p_buffer += 16;
651             }
652             /* Here we do some unaligned reads and duplicate conversions, but
653              * at least we have all the pixels */
654             if( i_rewind )
655             {
656                 p_y -= i_rewind;
657                 p_u -= i_rewind >> 1;
658                 p_v -= i_rewind >> 1;
659                 p_buffer -= i_rewind;
660
                    /* The stepped-back pointers use the UNALIGNED macros even
                     * on this path */
661                 SSE2_CALL (
662                     SSE2_INIT_16_UNALIGNED
663                     SSE2_YUV_MUL
664                     SSE2_YUV_ADD
665                     SSE2_UNPACK_16_UNALIGNED
666                 );
                    /* p_buffer is not advanced here (the MMX variant below does
                     * advance it); it is reassigned at the end of the line */
667                 p_y += 16;
668                 p_u += 8;
669                 p_v += 8;
670             }
671             SCALE_WIDTH;
672             SCALE_HEIGHT( 420, 2 );
673
                /* Skip the source padding: luma after every line, chroma only
                 * after odd-numbered lines */
674             p_y += i_source_margin;
675             if( i_y % 2 )
676             {
677                 p_u += i_source_margin_c;
678                 p_v += i_source_margin_c;
679             }
680             p_buffer = b_hscale ? p_buffer_start : p_pic;
681         }
682     }
683     else
684     {
685         /* use slower SSE2 unaligned fetch and store */
686         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
687         {
688             p_pic_start = p_pic;
689             p_buffer = b_hscale ? p_buffer_start : p_pic;
690
691             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
692             {
693                 SSE2_CALL(
694                     SSE2_INIT_16_UNALIGNED
695                     SSE2_YUV_MUL
696                     SSE2_YUV_ADD
697                     SSE2_UNPACK_16_UNALIGNED
698                 );
699                 p_y += 16;
700                 p_u += 8;
701                 p_v += 8;
702                 p_buffer += 16;
703             }
704             /* Here we do some unaligned reads and duplicate conversions, but
705              * at least we have all the pixels */
706             if( i_rewind )
707             {
708                 p_y -= i_rewind;
709                 p_u -= i_rewind >> 1;
710                 p_v -= i_rewind >> 1;
711                 p_buffer -= i_rewind;
712
713                 SSE2_CALL(
714                     SSE2_INIT_16_UNALIGNED
715                     SSE2_YUV_MUL
716                     SSE2_YUV_ADD
717                     SSE2_UNPACK_16_UNALIGNED
718                 );
719                 p_y += 16;
720                 p_u += 8;
721                 p_v += 8;
722             }
723             SCALE_WIDTH;
724             SCALE_HEIGHT( 420, 2 );
725
                /* Skip the source padding: luma after every line, chroma only
                 * after odd-numbered lines */
726             p_y += i_source_margin;
727             if( i_y % 2 )
728             {
729                 p_u += i_source_margin_c;
730                 p_v += i_source_margin_c;
731             }
732             p_buffer = b_hscale ? p_buffer_start : p_pic;
733         }
734     }
735
736     /* make sure all SSE2 stores are visible thereafter */
737     SSE2_END;
738
739 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
740
        /* MMX works on groups of 8 pixels: i_rewind is how far the last group
         * must step back when the width is not a multiple of 8 */
741     if( p_filter->fmt_in.video.i_width & 7 )
742     {
743         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
744     }
745     else
746     {
747         i_rewind = 0;
748     }
749
        /* One iteration per input line */
750     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
751     {
752         p_pic_start = p_pic;
753         p_buffer = b_hscale ? p_buffer_start : p_pic;
754
755         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
756         {
757             MMX_CALL (
758                 MMX_INIT_16
759                 MMX_YUV_MUL
760                 MMX_YUV_ADD
761                 MMX_UNPACK_16
762             );
763             p_y += 8;
764             p_u += 4;
765             p_v += 4;
766             p_buffer += 8;
767         }
768
769         /* Here we do some unaligned reads and duplicate conversions, but
770          * at least we have all the pixels */
771         if( i_rewind )
772         {
773             p_y -= i_rewind;
774             p_u -= i_rewind >> 1;
775             p_v -= i_rewind >> 1;
776             p_buffer -= i_rewind;
777
778             MMX_CALL (
779                 MMX_INIT_16
780                 MMX_YUV_MUL
781                 MMX_YUV_ADD
782                 MMX_UNPACK_16
783             );
784             p_y += 8;
785             p_u += 4;
786             p_v += 4;
787             p_buffer += 8;
788         }
789         SCALE_WIDTH;
790         SCALE_HEIGHT( 420, 2 );
791
            /* Skip the source padding: luma after every line, chroma only
             * after odd-numbered lines */
792         p_y += i_source_margin;
793         if( i_y % 2 )
794         {
795             p_u += i_source_margin_c;
796             p_v += i_source_margin_c;
797         }
798     }
799     /* re-enable FPU registers */
800     MMX_END;
801
802 #endif
803 }
804
805 #endif
806
807 /*****************************************************************************
808  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
809  *****************************************************************************
810  * Horizontal alignment needed:
811  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
812  *  - output: 1 pixel (4 bytes), margins allowed
813  * Vertical alignment needed:
814  *  - input: 2 lines (2 Y lines, 1 U/V line)
815  *  - output: 1 line
816  *****************************************************************************/
817
818 #if defined (MODULE_NAME_IS_i420_rgb)
819
820 void I420_RGB32( filter_t *p_filter, picture_t *p_src,
821                                          picture_t *p_dest )
822 {
823     /* We got this one from the old arguments */
824     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
825     uint8_t  *p_y   = p_src->Y_PIXELS;
826     uint8_t  *p_u   = p_src->U_PIXELS;
827     uint8_t  *p_v   = p_src->V_PIXELS;
828
829     bool  b_hscale;                         /* horizontal scaling type */
830     unsigned int i_vscale;                          /* vertical scaling type */
831     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
832
833     int         i_right_margin;
834     int         i_rewind;
835     int         i_scale_count;                       /* scale modulo counter */
836     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
837     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
838     int         i_uval, i_vval;                           /* U and V samples */
839     int         i_red, i_green, i_blue;          /* U and V modified samples */
840     uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
841     uint32_t *  p_ybase;                     /* Y dependant conversion table */
842
843     /* Conversion buffer pointer */
844     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
845     uint32_t *  p_buffer;
846
847     /* Offset array pointer */
848     int *       p_offset_start = p_filter->p_sys->p_offset;
849     int *       p_offset;
850
851     const int i_source_margin = p_src->p[0].i_pitch
852                                  - p_src->p[0].i_visible_pitch;
853     const int i_source_margin_c = p_src->p[1].i_pitch
854                                  - p_src->p[1].i_visible_pitch;
855
856     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
857
858     if( p_filter->fmt_in.video.i_width & 7 )
859     {
860         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
861     }
862     else
863     {
864         i_rewind = 0;
865     }
866
867     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
868      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
869      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
870     SetOffset( p_filter->fmt_in.video.i_width,
871                p_filter->fmt_in.video.i_height,
872                p_filter->fmt_out.video.i_width,
873                p_filter->fmt_out.video.i_height,
874                &b_hscale, &i_vscale, p_offset_start );
875
876     /*
877      * Perform conversion
878      */
879     i_scale_count = ( i_vscale == 1 ) ?
880                     p_filter->fmt_out.video.i_height :
881                     p_filter->fmt_in.video.i_height;
882     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
883     {
884         p_pic_start = p_pic;
885         p_buffer = b_hscale ? p_buffer_start : p_pic;
886
887         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
888         {
889             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
890             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
891             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
892             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
893         }
894
895         /* Here we do some unaligned reads and duplicate conversions, but
896          * at least we have all the pixels */
897         if( i_rewind )
898         {
899             p_y -= i_rewind;
900             p_u -= i_rewind >> 1;
901             p_v -= i_rewind >> 1;
902             p_buffer -= i_rewind;
903             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
904             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
905             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
906             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
907         }
908         SCALE_WIDTH;
909         SCALE_HEIGHT( 420, 4 );
910
911         p_y += i_source_margin;
912         if( i_y % 2 )
913         {
914             p_u += i_source_margin_c;
915             p_v += i_source_margin_c;
916         }
917     }
918 }
919
920 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
921
922 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
923                                             picture_t *p_dest )
924 {
925     /* We got this one from the old arguments */
926     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
927     uint8_t  *p_y   = p_src->Y_PIXELS;
928     uint8_t  *p_u   = p_src->U_PIXELS;
929     uint8_t  *p_v   = p_src->V_PIXELS;
930
931     bool  b_hscale;                         /* horizontal scaling type */
932     unsigned int i_vscale;                          /* vertical scaling type */
933     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
934
935     int         i_right_margin;
936     int         i_rewind;
937     int         i_scale_count;                       /* scale modulo counter */
938     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
939     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
940     /* Conversion buffer pointer */
941     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
942     uint32_t *  p_buffer;
943
944     /* Offset array pointer */
945     int *       p_offset_start = p_filter->p_sys->p_offset;
946     int *       p_offset;
947
948     const int i_source_margin = p_src->p[0].i_pitch
949                                  - p_src->p[0].i_visible_pitch;
950     const int i_source_margin_c = p_src->p[1].i_pitch
951                                  - p_src->p[1].i_visible_pitch;
952
953     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
954
955     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
956      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
957      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
958     SetOffset( p_filter->fmt_in.video.i_width,
959                p_filter->fmt_in.video.i_height,
960                p_filter->fmt_out.video.i_width,
961                p_filter->fmt_out.video.i_height,
962                &b_hscale, &i_vscale, p_offset_start );
963
964     /*
965      * Perform conversion
966      */
967     i_scale_count = ( i_vscale == 1 ) ?
968                     p_filter->fmt_out.video.i_height :
969                     p_filter->fmt_in.video.i_height;
970
971 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
972
973     if( p_filter->fmt_in.video.i_width & 15 )
974     {
975         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
976     }
977     else
978     {
979         i_rewind = 0;
980     }
981
982     /*
983     ** SSE2 128 bits fetch/store instructions are faster
984     ** if memory access is 16 bytes aligned
985     */
986
987     p_buffer = b_hscale ? p_buffer_start : p_pic;
988     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
989                     p_dest->p->i_pitch|
990                     ((intptr_t)p_y)|
991                     ((intptr_t)p_buffer))) )
992     {
993         /* use faster SSE2 aligned fetch and store */
994         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
995         {
996             p_pic_start = p_pic;
997
998             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
999             {
1000                 SSE2_CALL (
1001                     SSE2_INIT_32_ALIGNED
1002                     SSE2_YUV_MUL
1003                     SSE2_YUV_ADD
1004                     SSE2_UNPACK_32_ARGB_ALIGNED
1005                 );
1006                 p_y += 16;
1007                 p_u += 8;
1008                 p_v += 8;
1009                 p_buffer += 16;
1010             }
1011
1012             /* Here we do some unaligned reads and duplicate conversions, but
1013              * at least we have all the pixels */
1014             if( i_rewind )
1015             {
1016                 p_y -= i_rewind;
1017                 p_u -= i_rewind >> 1;
1018                 p_v -= i_rewind >> 1;
1019                 p_buffer -= i_rewind;
1020                 SSE2_CALL (
1021                     SSE2_INIT_32_UNALIGNED
1022                     SSE2_YUV_MUL
1023                     SSE2_YUV_ADD
1024                     SSE2_UNPACK_32_ARGB_UNALIGNED
1025                 );
1026                 p_y += 16;
1027                 p_u += 4;
1028                 p_v += 4;
1029             }
1030             SCALE_WIDTH;
1031             SCALE_HEIGHT( 420, 4 );
1032
1033             p_y += i_source_margin;
1034             if( i_y % 2 )
1035             {
1036                 p_u += i_source_margin_c;
1037                 p_v += i_source_margin_c;
1038             }
1039             p_buffer = b_hscale ? p_buffer_start : p_pic;
1040         }
1041     }
1042     else
1043     {
1044         /* use slower SSE2 unaligned fetch and store */
1045         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1046         {
1047             p_pic_start = p_pic;
1048             p_buffer = b_hscale ? p_buffer_start : p_pic;
1049
1050             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1051             {
1052                 SSE2_CALL (
1053                     SSE2_INIT_32_UNALIGNED
1054                     SSE2_YUV_MUL
1055                     SSE2_YUV_ADD
1056                     SSE2_UNPACK_32_ARGB_UNALIGNED
1057                 );
1058                 p_y += 16;
1059                 p_u += 8;
1060                 p_v += 8;
1061                 p_buffer += 16;
1062             }
1063
1064             /* Here we do some unaligned reads and duplicate conversions, but
1065              * at least we have all the pixels */
1066             if( i_rewind )
1067             {
1068                 p_y -= i_rewind;
1069                 p_u -= i_rewind >> 1;
1070                 p_v -= i_rewind >> 1;
1071                 p_buffer -= i_rewind;
1072                 SSE2_CALL (
1073                     SSE2_INIT_32_UNALIGNED
1074                     SSE2_YUV_MUL
1075                     SSE2_YUV_ADD
1076                     SSE2_UNPACK_32_ARGB_UNALIGNED
1077                 );
1078                 p_y += 16;
1079                 p_u += 8;
1080                 p_v += 8;
1081             }
1082             SCALE_WIDTH;
1083             SCALE_HEIGHT( 420, 4 );
1084
1085             p_y += i_source_margin;
1086             if( i_y % 2 )
1087             {
1088                 p_u += i_source_margin_c;
1089                 p_v += i_source_margin_c;
1090             }
1091             p_buffer = b_hscale ? p_buffer_start : p_pic;
1092         }
1093     }
1094
1095     /* make sure all SSE2 stores are visible thereafter */
1096     SSE2_END;
1097
1098 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1099
1100     if( p_filter->fmt_in.video.i_width & 7 )
1101     {
1102         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1103     }
1104     else
1105     {
1106         i_rewind = 0;
1107     }
1108
1109     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1110     {
1111         p_pic_start = p_pic;
1112         p_buffer = b_hscale ? p_buffer_start : p_pic;
1113
1114         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1115         {
1116             MMX_CALL (
1117                 MMX_INIT_32
1118                 MMX_YUV_MUL
1119                 MMX_YUV_ADD
1120                 MMX_UNPACK_32_ARGB
1121             );
1122             p_y += 8;
1123             p_u += 4;
1124             p_v += 4;
1125             p_buffer += 8;
1126         }
1127
1128         /* Here we do some unaligned reads and duplicate conversions, but
1129          * at least we have all the pixels */
1130         if( i_rewind )
1131         {
1132             p_y -= i_rewind;
1133             p_u -= i_rewind >> 1;
1134             p_v -= i_rewind >> 1;
1135             p_buffer -= i_rewind;
1136             MMX_CALL (
1137                 MMX_INIT_32
1138                 MMX_YUV_MUL
1139                 MMX_YUV_ADD
1140                 MMX_UNPACK_32_ARGB
1141             );
1142             p_y += 8;
1143             p_u += 4;
1144             p_v += 4;
1145             p_buffer += 8;
1146         }
1147         SCALE_WIDTH;
1148         SCALE_HEIGHT( 420, 4 );
1149
1150         p_y += i_source_margin;
1151         if( i_y % 2 )
1152         {
1153             p_u += i_source_margin_c;
1154             p_v += i_source_margin_c;
1155         }
1156     }
1157
1158     /* re-enable FPU registers */
1159     MMX_END;
1160
1161 #endif
1162 }
1163
1164 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src,
1165                                             picture_t *p_dest )
1166 {
1167     /* We got this one from the old arguments */
1168     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1169     uint8_t  *p_y   = p_src->Y_PIXELS;
1170     uint8_t  *p_u   = p_src->U_PIXELS;
1171     uint8_t  *p_v   = p_src->V_PIXELS;
1172
1173     bool  b_hscale;                         /* horizontal scaling type */
1174     unsigned int i_vscale;                          /* vertical scaling type */
1175     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1176
1177     int         i_right_margin;
1178     int         i_rewind;
1179     int         i_scale_count;                       /* scale modulo counter */
1180     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1181     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1182     /* Conversion buffer pointer */
1183     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1184     uint32_t *  p_buffer;
1185
1186     /* Offset array pointer */
1187     int *       p_offset_start = p_filter->p_sys->p_offset;
1188     int *       p_offset;
1189
1190     const int i_source_margin = p_src->p[0].i_pitch
1191                                  - p_src->p[0].i_visible_pitch;
1192     const int i_source_margin_c = p_src->p[1].i_pitch
1193                                  - p_src->p[1].i_visible_pitch;
1194
1195     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1196
1197     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1198      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1199      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1200     SetOffset( p_filter->fmt_in.video.i_width,
1201                p_filter->fmt_in.video.i_height,
1202                p_filter->fmt_out.video.i_width,
1203                p_filter->fmt_out.video.i_height,
1204                &b_hscale, &i_vscale, p_offset_start );
1205
1206     /*
1207      * Perform conversion
1208      */
1209     i_scale_count = ( i_vscale == 1 ) ?
1210                     p_filter->fmt_out.video.i_height :
1211                     p_filter->fmt_in.video.i_height;
1212
1213 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1214
1215     if( p_filter->fmt_in.video.i_width & 15 )
1216     {
1217         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
1218     }
1219     else
1220     {
1221         i_rewind = 0;
1222     }
1223
1224     /*
1225     ** SSE2 128 bits fetch/store instructions are faster
1226     ** if memory access is 16 bytes aligned
1227     */
1228
1229     p_buffer = b_hscale ? p_buffer_start : p_pic;
1230     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1231                     p_dest->p->i_pitch|
1232                     ((intptr_t)p_y)|
1233                     ((intptr_t)p_buffer))) )
1234     {
1235         /* use faster SSE2 aligned fetch and store */
1236         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1237         {
1238             p_pic_start = p_pic;
1239
1240             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1241             {
1242                 SSE2_CALL (
1243                     SSE2_INIT_32_ALIGNED
1244                     SSE2_YUV_MUL
1245                     SSE2_YUV_ADD
1246                     SSE2_UNPACK_32_RGBA_ALIGNED
1247                 );
1248                 p_y += 16;
1249                 p_u += 8;
1250                 p_v += 8;
1251                 p_buffer += 16;
1252             }
1253
1254             /* Here we do some unaligned reads and duplicate conversions, but
1255              * at least we have all the pixels */
1256             if( i_rewind )
1257             {
1258                 p_y -= i_rewind;
1259                 p_u -= i_rewind >> 1;
1260                 p_v -= i_rewind >> 1;
1261                 p_buffer -= i_rewind;
1262                 SSE2_CALL (
1263                     SSE2_INIT_32_UNALIGNED
1264                     SSE2_YUV_MUL
1265                     SSE2_YUV_ADD
1266                     SSE2_UNPACK_32_RGBA_UNALIGNED
1267                 );
1268                 p_y += 16;
1269                 p_u += 4;
1270                 p_v += 4;
1271             }
1272             SCALE_WIDTH;
1273             SCALE_HEIGHT( 420, 4 );
1274
1275             p_y += i_source_margin;
1276             if( i_y % 2 )
1277             {
1278                 p_u += i_source_margin_c;
1279                 p_v += i_source_margin_c;
1280             }
1281             p_buffer = b_hscale ? p_buffer_start : p_pic;
1282         }
1283     }
1284     else
1285     {
1286         /* use slower SSE2 unaligned fetch and store */
1287         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1288         {
1289             p_pic_start = p_pic;
1290             p_buffer = b_hscale ? p_buffer_start : p_pic;
1291
1292             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1293             {
1294                 SSE2_CALL (
1295                     SSE2_INIT_32_UNALIGNED
1296                     SSE2_YUV_MUL
1297                     SSE2_YUV_ADD
1298                     SSE2_UNPACK_32_RGBA_UNALIGNED
1299                 );
1300                 p_y += 16;
1301                 p_u += 8;
1302                 p_v += 8;
1303                 p_buffer += 16;
1304             }
1305
1306             /* Here we do some unaligned reads and duplicate conversions, but
1307              * at least we have all the pixels */
1308             if( i_rewind )
1309             {
1310                 p_y -= i_rewind;
1311                 p_u -= i_rewind >> 1;
1312                 p_v -= i_rewind >> 1;
1313                 p_buffer -= i_rewind;
1314                 SSE2_CALL (
1315                     SSE2_INIT_32_UNALIGNED
1316                     SSE2_YUV_MUL
1317                     SSE2_YUV_ADD
1318                     SSE2_UNPACK_32_RGBA_UNALIGNED
1319                 );
1320                 p_y += 16;
1321                 p_u += 8;
1322                 p_v += 8;
1323             }
1324             SCALE_WIDTH;
1325             SCALE_HEIGHT( 420, 4 );
1326
1327             p_y += i_source_margin;
1328             if( i_y % 2 )
1329             {
1330                 p_u += i_source_margin_c;
1331                 p_v += i_source_margin_c;
1332             }
1333             p_buffer = b_hscale ? p_buffer_start : p_pic;
1334         }
1335     }
1336
1337     /* make sure all SSE2 stores are visible thereafter */
1338     SSE2_END;
1339
1340 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1341
1342     if( p_filter->fmt_in.video.i_width & 7 )
1343     {
1344         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1345     }
1346     else
1347     {
1348         i_rewind = 0;
1349     }
1350
1351     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1352     {
1353         p_pic_start = p_pic;
1354         p_buffer = b_hscale ? p_buffer_start : p_pic;
1355
1356         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1357         {
1358             MMX_CALL (
1359                 MMX_INIT_32
1360                 MMX_YUV_MUL
1361                 MMX_YUV_ADD
1362                 MMX_UNPACK_32_RGBA
1363             );
1364             p_y += 8;
1365             p_u += 4;
1366             p_v += 4;
1367             p_buffer += 8;
1368         }
1369
1370         /* Here we do some unaligned reads and duplicate conversions, but
1371          * at least we have all the pixels */
1372         if( i_rewind )
1373         {
1374             p_y -= i_rewind;
1375             p_u -= i_rewind >> 1;
1376             p_v -= i_rewind >> 1;
1377             p_buffer -= i_rewind;
1378             MMX_CALL (
1379                 MMX_INIT_32
1380                 MMX_YUV_MUL
1381                 MMX_YUV_ADD
1382                 MMX_UNPACK_32_RGBA
1383             );
1384             p_y += 8;
1385             p_u += 4;
1386             p_v += 4;
1387             p_buffer += 8;
1388         }
1389         SCALE_WIDTH;
1390         SCALE_HEIGHT( 420, 4 );
1391
1392         p_y += i_source_margin;
1393         if( i_y % 2 )
1394         {
1395             p_u += i_source_margin_c;
1396             p_v += i_source_margin_c;
1397         }
1398     }
1399
1400     /* re-enable FPU registers */
1401     MMX_END;
1402
1403 #endif
1404 }
1405
1406 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src,
1407                                             picture_t *p_dest )
1408 {
1409     /* We got this one from the old arguments */
1410     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1411     uint8_t  *p_y   = p_src->Y_PIXELS;
1412     uint8_t  *p_u   = p_src->U_PIXELS;
1413     uint8_t  *p_v   = p_src->V_PIXELS;
1414
1415     bool  b_hscale;                         /* horizontal scaling type */
1416     unsigned int i_vscale;                          /* vertical scaling type */
1417     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1418
1419     int         i_right_margin;
1420     int         i_rewind;
1421     int         i_scale_count;                       /* scale modulo counter */
1422     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1423     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1424     /* Conversion buffer pointer */
1425     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1426     uint32_t *  p_buffer;
1427
1428     /* Offset array pointer */
1429     int *       p_offset_start = p_filter->p_sys->p_offset;
1430     int *       p_offset;
1431
1432     const int i_source_margin = p_src->p[0].i_pitch
1433                                  - p_src->p[0].i_visible_pitch;
1434     const int i_source_margin_c = p_src->p[1].i_pitch
1435                                  - p_src->p[1].i_visible_pitch;
1436
1437     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1438
1439     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1440      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1441      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1442     SetOffset( p_filter->fmt_in.video.i_width,
1443                p_filter->fmt_in.video.i_height,
1444                p_filter->fmt_out.video.i_width,
1445                p_filter->fmt_out.video.i_height,
1446                &b_hscale, &i_vscale, p_offset_start );
1447
1448     /*
1449      * Perform conversion
1450      */
1451     i_scale_count = ( i_vscale == 1 ) ?
1452                     p_filter->fmt_out.video.i_height :
1453                     p_filter->fmt_in.video.i_height;
1454
1455 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1456
1457     if( p_filter->fmt_in.video.i_width & 15 )
1458     {
1459         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
1460     }
1461     else
1462     {
1463         i_rewind = 0;
1464     }
1465
1466     /*
1467     ** SSE2 128 bits fetch/store instructions are faster
1468     ** if memory access is 16 bytes aligned
1469     */
1470
1471     p_buffer = b_hscale ? p_buffer_start : p_pic;
1472     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1473                     p_dest->p->i_pitch|
1474                     ((intptr_t)p_y)|
1475                     ((intptr_t)p_buffer))) )
1476     {
1477         /* use faster SSE2 aligned fetch and store */
1478         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1479         {
1480             p_pic_start = p_pic;
1481
1482             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1483             {
1484                 SSE2_CALL (
1485                     SSE2_INIT_32_ALIGNED
1486                     SSE2_YUV_MUL
1487                     SSE2_YUV_ADD
1488                     SSE2_UNPACK_32_BGRA_ALIGNED
1489                 );
1490                 p_y += 16;
1491                 p_u += 8;
1492                 p_v += 8;
1493                 p_buffer += 16;
1494             }
1495
1496             /* Here we do some unaligned reads and duplicate conversions, but
1497              * at least we have all the pixels */
1498             if( i_rewind )
1499             {
1500                 p_y -= i_rewind;
1501                 p_u -= i_rewind >> 1;
1502                 p_v -= i_rewind >> 1;
1503                 p_buffer -= i_rewind;
1504                 SSE2_CALL (
1505                     SSE2_INIT_32_UNALIGNED
1506                     SSE2_YUV_MUL
1507                     SSE2_YUV_ADD
1508                     SSE2_UNPACK_32_BGRA_UNALIGNED
1509                 );
1510                 p_y += 16;
1511                 p_u += 4;
1512                 p_v += 4;
1513             }
1514             SCALE_WIDTH;
1515             SCALE_HEIGHT( 420, 4 );
1516
1517             p_y += i_source_margin;
1518             if( i_y % 2 )
1519             {
1520                 p_u += i_source_margin_c;
1521                 p_v += i_source_margin_c;
1522             }
1523             p_buffer = b_hscale ? p_buffer_start : p_pic;
1524         }
1525     }
1526     else
1527     {
1528         /* use slower SSE2 unaligned fetch and store */
1529         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1530         {
1531             p_pic_start = p_pic;
1532             p_buffer = b_hscale ? p_buffer_start : p_pic;
1533
1534             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1535             {
1536                 SSE2_CALL (
1537                     SSE2_INIT_32_UNALIGNED
1538                     SSE2_YUV_MUL
1539                     SSE2_YUV_ADD
1540                     SSE2_UNPACK_32_BGRA_UNALIGNED
1541                 );
1542                 p_y += 16;
1543                 p_u += 8;
1544                 p_v += 8;
1545                 p_buffer += 16;
1546             }
1547
1548             /* Here we do some unaligned reads and duplicate conversions, but
1549              * at least we have all the pixels */
1550             if( i_rewind )
1551             {
1552                 p_y -= i_rewind;
1553                 p_u -= i_rewind >> 1;
1554                 p_v -= i_rewind >> 1;
1555                 p_buffer -= i_rewind;
1556                 SSE2_CALL (
1557                     SSE2_INIT_32_UNALIGNED
1558                     SSE2_YUV_MUL
1559                     SSE2_YUV_ADD
1560                     SSE2_UNPACK_32_BGRA_UNALIGNED
1561                 );
1562                 p_y += 16;
1563                 p_u += 8;
1564                 p_v += 8;
1565             }
1566             SCALE_WIDTH;
1567             SCALE_HEIGHT( 420, 4 );
1568
1569             p_y += i_source_margin;
1570             if( i_y % 2 )
1571             {
1572                 p_u += i_source_margin_c;
1573                 p_v += i_source_margin_c;
1574             }
1575             p_buffer = b_hscale ? p_buffer_start : p_pic;
1576         }
1577     }
1578
1579 #else
1580
1581     if( p_filter->fmt_in.video.i_width & 7 )
1582     {
1583         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1584     }
1585     else
1586     {
1587         i_rewind = 0;
1588     }
1589
1590     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1591     {
1592         p_pic_start = p_pic;
1593         p_buffer = b_hscale ? p_buffer_start : p_pic;
1594
1595         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1596         {
1597             MMX_CALL (
1598                 MMX_INIT_32
1599                 MMX_YUV_MUL
1600                 MMX_YUV_ADD
1601                 MMX_UNPACK_32_BGRA
1602             );
1603             p_y += 8;
1604             p_u += 4;
1605             p_v += 4;
1606             p_buffer += 8;
1607         }
1608
1609         /* Here we do some unaligned reads and duplicate conversions, but
1610          * at least we have all the pixels */
1611         if( i_rewind )
1612         {
1613             p_y -= i_rewind;
1614             p_u -= i_rewind >> 1;
1615             p_v -= i_rewind >> 1;
1616             p_buffer -= i_rewind;
1617             MMX_CALL (
1618                 MMX_INIT_32
1619                 MMX_YUV_MUL
1620                 MMX_YUV_ADD
1621                 MMX_UNPACK_32_BGRA
1622             );
1623             p_y += 8;
1624             p_u += 4;
1625             p_v += 4;
1626             p_buffer += 8;
1627         }
1628         SCALE_WIDTH;
1629         SCALE_HEIGHT( 420, 4 );
1630
1631         p_y += i_source_margin;
1632         if( i_y % 2 )
1633         {
1634             p_u += i_source_margin_c;
1635             p_v += i_source_margin_c;
1636         }
1637     }
1638
1639     /* re-enable FPU registers */
1640     MMX_END;
1641
1642 #endif
1643 }
1644
1645 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src,
1646                                             picture_t *p_dest )
1647 {
1648     /* We got this one from the old arguments */
1649     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1650     uint8_t  *p_y   = p_src->Y_PIXELS;
1651     uint8_t  *p_u   = p_src->U_PIXELS;
1652     uint8_t  *p_v   = p_src->V_PIXELS;
1653
1654     bool  b_hscale;                         /* horizontal scaling type */
1655     unsigned int i_vscale;                          /* vertical scaling type */
1656     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1657
1658     int         i_right_margin;
1659     int         i_rewind;
1660     int         i_scale_count;                       /* scale modulo counter */
1661     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1662     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1663     /* Conversion buffer pointer */
1664     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1665     uint32_t *  p_buffer;
1666
1667     /* Offset array pointer */
1668     int *       p_offset_start = p_filter->p_sys->p_offset;
1669     int *       p_offset;
1670
1671     const int i_source_margin = p_src->p[0].i_pitch
1672                                  - p_src->p[0].i_visible_pitch;
1673     const int i_source_margin_c = p_src->p[1].i_pitch
1674                                  - p_src->p[1].i_visible_pitch;
1675
1676     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1677
1678     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1679      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1680      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1681     SetOffset( p_filter->fmt_in.video.i_width,
1682                p_filter->fmt_in.video.i_height,
1683                p_filter->fmt_out.video.i_width,
1684                p_filter->fmt_out.video.i_height,
1685                &b_hscale, &i_vscale, p_offset_start );
1686
1687     /*
1688      * Perform conversion
1689      */
1690     i_scale_count = ( i_vscale == 1 ) ?
1691                     p_filter->fmt_out.video.i_height :
1692                     p_filter->fmt_in.video.i_height;
1693
1694 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1695
1696     if( p_filter->fmt_in.video.i_width & 15 )
1697     {
1698         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
1699     }
1700     else
1701     {
1702         i_rewind = 0;
1703     }
1704
1705     /*
1706     ** SSE2 128 bits fetch/store instructions are faster
1707     ** if memory access is 16 bytes aligned
1708     */
1709
1710     p_buffer = b_hscale ? p_buffer_start : p_pic;
1711     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1712                     p_dest->p->i_pitch|
1713                     ((intptr_t)p_y)|
1714                     ((intptr_t)p_buffer))) )
1715     {
1716         /* use faster SSE2 aligned fetch and store */
1717         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1718         {
1719             p_pic_start = p_pic;
1720
1721             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1722             {
1723                 SSE2_CALL (
1724                     SSE2_INIT_32_ALIGNED
1725                     SSE2_YUV_MUL
1726                     SSE2_YUV_ADD
1727                     SSE2_UNPACK_32_ABGR_ALIGNED
1728                 );
1729                 p_y += 16;
1730                 p_u += 8;
1731                 p_v += 8;
1732                 p_buffer += 16;
1733             }
1734
1735             /* Here we do some unaligned reads and duplicate conversions, but
1736              * at least we have all the pixels */
1737             if( i_rewind )
1738             {
1739                 p_y -= i_rewind;
1740                 p_u -= i_rewind >> 1;
1741                 p_v -= i_rewind >> 1;
1742                 p_buffer -= i_rewind;
1743                 SSE2_CALL (
1744                     SSE2_INIT_32_UNALIGNED
1745                     SSE2_YUV_MUL
1746                     SSE2_YUV_ADD
1747                     SSE2_UNPACK_32_ABGR_UNALIGNED
1748                 );
1749                 p_y += 16;
1750                 p_u += 4;
1751                 p_v += 4;
1752             }
1753             SCALE_WIDTH;
1754             SCALE_HEIGHT( 420, 4 );
1755
1756             p_y += i_source_margin;
1757             if( i_y % 2 )
1758             {
1759                 p_u += i_source_margin_c;
1760                 p_v += i_source_margin_c;
1761             }
1762             p_buffer = b_hscale ? p_buffer_start : p_pic;
1763         }
1764     }
1765     else
1766     {
1767         /* use slower SSE2 unaligned fetch and store */
1768         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1769         {
1770             p_pic_start = p_pic;
1771             p_buffer = b_hscale ? p_buffer_start : p_pic;
1772
1773             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1774             {
1775                 SSE2_CALL (
1776                     SSE2_INIT_32_UNALIGNED
1777                     SSE2_YUV_MUL
1778                     SSE2_YUV_ADD
1779                     SSE2_UNPACK_32_ABGR_UNALIGNED
1780                 );
1781                 p_y += 16;
1782                 p_u += 8;
1783                 p_v += 8;
1784                 p_buffer += 16;
1785             }
1786
1787             /* Here we do some unaligned reads and duplicate conversions, but
1788              * at least we have all the pixels */
1789             if( i_rewind )
1790             {
1791                 p_y -= i_rewind;
1792                 p_u -= i_rewind >> 1;
1793                 p_v -= i_rewind >> 1;
1794                 p_buffer -= i_rewind;
1795                 SSE2_CALL (
1796                     SSE2_INIT_32_UNALIGNED
1797                     SSE2_YUV_MUL
1798                     SSE2_YUV_ADD
1799                     SSE2_UNPACK_32_ABGR_UNALIGNED
1800                 );
1801                 p_y += 16;
1802                 p_u += 8;
1803                 p_v += 8;
1804             }
1805             SCALE_WIDTH;
1806             SCALE_HEIGHT( 420, 4 );
1807
1808             p_y += i_source_margin;
1809             if( i_y % 2 )
1810             {
1811                 p_u += i_source_margin_c;
1812                 p_v += i_source_margin_c;
1813             }
1814             p_buffer = b_hscale ? p_buffer_start : p_pic;
1815         }
1816     }
1817
1818 #else
1819
1820     if( p_filter->fmt_in.video.i_width & 7 )
1821     {
1822         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1823     }
1824     else
1825     {
1826         i_rewind = 0;
1827     }
1828
1829     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1830     {
1831         p_pic_start = p_pic;
1832         p_buffer = b_hscale ? p_buffer_start : p_pic;
1833
1834         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1835         {
1836             MMX_CALL (
1837                 MMX_INIT_32
1838                 MMX_YUV_MUL
1839                 MMX_YUV_ADD
1840                 MMX_UNPACK_32_ABGR
1841             );
1842             p_y += 8;
1843             p_u += 4;
1844             p_v += 4;
1845             p_buffer += 8;
1846         }
1847
1848         /* Here we do some unaligned reads and duplicate conversions, but
1849          * at least we have all the pixels */
1850         if( i_rewind )
1851         {
1852             p_y -= i_rewind;
1853             p_u -= i_rewind >> 1;
1854             p_v -= i_rewind >> 1;
1855             p_buffer -= i_rewind;
1856             MMX_CALL (
1857                 MMX_INIT_32
1858                 MMX_YUV_MUL
1859                 MMX_YUV_ADD
1860                 MMX_UNPACK_32_ABGR
1861             );
1862             p_y += 8;
1863             p_u += 4;
1864             p_v += 4;
1865             p_buffer += 8;
1866         }
1867         SCALE_WIDTH;
1868         SCALE_HEIGHT( 420, 4 );
1869
1870         p_y += i_source_margin;
1871         if( i_y % 2 )
1872         {
1873             p_u += i_source_margin_c;
1874             p_v += i_source_margin_c;
1875         }
1876     }
1877
1878     /* re-enable FPU registers */
1879     MMX_END;
1880
1881 #endif
1882 }
1883
1884 #endif
1885
1886 /* Following functions are local */
1887
/*****************************************************************************
 * SetOffset: compute scaling indicators and the horizontal offset table
 *****************************************************************************
 * i_width / i_height         : dimensions on the source side
 * i_pic_width / i_pic_height : dimensions on the destination side
 * pb_hscale : receives 0 when both widths are equal, 1 otherwise
 * pi_vscale : receives 0 (same height), 1 (destination taller) or -1
 *             (destination shorter); it is an unsigned int, so -1 is
 *             stored as UINT_MAX
 * p_offset  : table filled only when the widths differ:
 *             - enlargement: for every source pixel, zero or more 0 entries
 *               followed by a single 1;
 *             - reduction: one entry (>= 1) per destination pixel.
 *             NOTE(review): the entries are presumably the source advance
 *             applied after each destination pixel by the scaling macros;
 *             confirm against SCALE_WIDTH. The caller must supply an array
 *             large enough for every entry written here.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    int i_remaining;                          /* iterations left in the loop */
    int i_error;                                     /* running modulo error */

    /*
     * Horizontal part: the table is only needed when the widths differ
     */
    *pb_hscale = ( i_pic_width != i_width );

    if( i_pic_width > i_width )
    {
        /* Enlargement: walk the source pixels; each one is repeated (0
         * entries) until the accumulated error runs out, then a 1 entry
         * moves on to the next source pixel */
        i_error = i_pic_width;
        i_remaining = i_width;
        while( i_remaining-- )
        {
            for( i_error -= i_width; i_error > 0; i_error -= i_width )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_error += i_pic_width;
        }
    }
    else if( i_pic_width < i_width )
    {
        /* Reduction: walk the destination pixels; each entry counts how
         * many source pixels are consumed for that destination pixel */
        i_error = i_width;
        i_remaining = i_pic_width;
        while( i_remaining-- )
        {
            int i_step = 1;

            for( i_error -= i_pic_width; i_error > 0; i_error -= i_pic_width )
            {
                i_step++;
            }
            *p_offset++ = i_step;
            i_error += i_width;
        }
    }
    /* else: same width, the offset table is left untouched */

    /*
     * Vertical part: only the direction of the scaling is reported
     */
    if( i_pic_height > i_height )
    {
        *pi_vscale = 1;
    }
    else if( i_pic_height < i_height )
    {
        *pi_vscale = -1;
    }
    else
    {
        *pi_vscale = 0;
    }
}
1957