]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
Merge branch 1.0-bugfix
[vlc] / modules / video_chroma / i420_rgb16.c
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc/vlc.h>
34 #include <vlc_filter.h>
35
36 #include "i420_rgb.h"
37 #if defined (MODULE_NAME_IS_i420_rgb)
38 #   include "i420_rgb_c.h"
39 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
40 #   include "i420_rgb_mmx.h"
41 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
42 #   include "i420_rgb_mmx.h"
43 #endif
44
/* Forward declaration of the scaling set-up helper; its definition is not in
 * this part of the file. Each converter below calls it with the input width
 * and height, the output width and height, the addresses of its b_hscale and
 * i_vscale locals, and the filter's offset array pointer. */
45 static void SetOffset( int, int, int, int, bool *,
46                        unsigned int *, int * );
47
48 #if defined (MODULE_NAME_IS_i420_rgb)
49 /*****************************************************************************
50  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
51  *****************************************************************************
52  * Horizontal alignment needed:
53  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
54  *  - output: 1 pixel (2 bytes), margins allowed
55  * Vertical alignment needed:
56  *  - input: 2 lines (2 Y lines, 1 U/V line)
57  *  - output: 1 line
58  *****************************************************************************/
/*
 * I420_RGB16_dither: convert the planar YUV 4:2:0 picture p_src into the
 * 16 bits-per-pixel picture p_dest, using the p_rgb16 lookup table and four
 * pre-shifted dither tables.
 *
 * p_filter supplies the input/output dimensions, the output red shift
 * (i_rrshift) and the private conversion buffer, offset array and table.
 *
 * The CONVERT_*_DITHER, SCALE_WIDTH and SCALE_HEIGHT macros are defined in an
 * included header that is not visible here.
 * NOTE(review): they presumably read and update the locals of this function
 * by name (p_y, p_u, p_v, p_buffer, p_dither, i_real_y, ...) -- confirm
 * before renaming or removing any local.
 */
59 void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src,
60                                                 picture_t *p_dest )
61 {
62     /* Destination pixel pointer and the source Y, U and V plane pointers */
63     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
64     uint8_t  *p_y   = p_src->Y_PIXELS;
65     uint8_t  *p_u   = p_src->U_PIXELS;
66     uint8_t  *p_v   = p_src->V_PIXELS;
67
68     bool   b_hscale;                        /* horizontal scaling type */
69     unsigned int i_vscale;                          /* vertical scaling type */
70     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
71     unsigned int i_real_y;                                          /* y % 4 */
72
73     int         i_right_margin;     /* destination pitch minus visible pitch */
74     int         i_rewind;  /* pixels to step back before the last 8-pixel group */
75     int         i_scale_count;                       /* scale modulo counter */
76     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
77     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
78     int         i_uval, i_vval;                           /* U and V samples */
79     int         i_red, i_green, i_blue;          /* U and V modified samples */
80     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
81     uint16_t *  p_ybase;                     /* Y dependent conversion table */
82
83     /* Conversion buffer pointer */
84     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
85     uint16_t *  p_buffer;
86
87     /* Offset array pointer */
88     int *       p_offset_start = p_filter->p_sys->p_offset;
89     int *       p_offset;
90
91     const int i_source_margin = p_src->p[0].i_pitch
92                                  - p_src->p[0].i_visible_pitch;
93     const int i_source_margin_c = p_src->p[1].i_pitch
94                                  - p_src->p[1].i_visible_pitch;
95
96     /* The dithering tables: one table per pixel column (modulo 4) */
97     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
98     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
99     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
100     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
101
102     for(i_x = 0; i_x < 4; i_x++)     /* pre-shift every dither value once */
103     {
104         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
105         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
106         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
107         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
108     }
109
110     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
111
112     if( p_filter->fmt_in.video.i_width & 7 )  /* width not a multiple of 8 */
113     {
114         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
115     }
116     else
117     {
118         i_rewind = 0;
119     }
120
121     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
122      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
123      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
124     SetOffset( p_filter->fmt_in.video.i_width,
125                p_filter->fmt_in.video.i_height,
126                p_filter->fmt_out.video.i_width,
127                p_filter->fmt_out.video.i_height,
128                &b_hscale, &i_vscale, p_offset_start );
129
130     /*
131      * Perform conversion
132      */
133     i_scale_count = ( i_vscale == 1 ) ?
134                     p_filter->fmt_out.video.i_height :
135                     p_filter->fmt_in.video.i_height;
136     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
137     {
138         i_real_y = i_y & 0x3;       /* row index within a 4-line period */
139         p_pic_start = p_pic;
140         p_buffer = b_hscale ? p_buffer_start : p_pic; /* scratch buffer when scaling horizontally, else the picture itself */
141
142         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) /* one pass per full group of 8 pixels */
143         {
144             int *p_dither = dither10;
145             CONVERT_YUV_PIXEL_DITHER(2);
146             p_dither = dither11;
147             CONVERT_Y_PIXEL_DITHER(2);
148             p_dither = dither12;
149             CONVERT_YUV_PIXEL_DITHER(2);
150             p_dither = dither13;
151             CONVERT_Y_PIXEL_DITHER(2);
152             p_dither = dither10;
153             CONVERT_YUV_PIXEL_DITHER(2);
154             p_dither = dither11;
155             CONVERT_Y_PIXEL_DITHER(2);
156             p_dither = dither12;
157             CONVERT_YUV_PIXEL_DITHER(2);
158             p_dither = dither13;
159             CONVERT_Y_PIXEL_DITHER(2);
160         }
161
162         /* Here we do some unaligned reads and duplicate conversions, but
163          * at least we have all the pixels */
164         if( i_rewind )
165         {
166             int *p_dither = dither10;
167             p_y -= i_rewind;
168             p_u -= i_rewind >> 1;  /* chroma planes are half as wide as luma */
169             p_v -= i_rewind >> 1;
170             p_buffer -= i_rewind;
171             CONVERT_YUV_PIXEL_DITHER(2);
172             p_dither = dither11;
173             CONVERT_Y_PIXEL_DITHER(2);
174             p_dither = dither12;
175             CONVERT_YUV_PIXEL_DITHER(2);
176             p_dither = dither13;
177             CONVERT_Y_PIXEL_DITHER(2);
178             p_dither = dither10;
179             CONVERT_YUV_PIXEL_DITHER(2);
180             p_dither = dither11;
181             CONVERT_Y_PIXEL_DITHER(2);
182             p_dither = dither12;
183             CONVERT_YUV_PIXEL_DITHER(2);
184             p_dither = dither13;
185             CONVERT_Y_PIXEL_DITHER(2);
186         }
187         SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
188         SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
189
190         p_y += i_source_margin;  /* skip the padding after the visible luma line */
191         if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
192         {
193             p_u += i_source_margin_c;
194             p_v += i_source_margin_c;
195         }
196     }
197 }
198 #endif
199
200 /*****************************************************************************
201  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
202  *****************************************************************************
203  * Horizontal alignment needed:
204  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
205  *  - output: 1 pixel (2 bytes), margins allowed
206  * Vertical alignment needed:
207  *  - input: 2 lines (2 Y lines, 1 U/V line)
208  *  - output: 1 line
209  *****************************************************************************/
210
211 #if defined (MODULE_NAME_IS_i420_rgb)
212
/*
 * I420_RGB16: convert the planar YUV 4:2:0 picture p_src into the
 * 16 bits-per-pixel picture p_dest using the p_rgb16 lookup table, without
 * dithering.
 *
 * p_filter supplies the input/output dimensions and the private conversion
 * buffer, offset array and table.
 *
 * The CONVERT_*_PIXEL, SCALE_WIDTH and SCALE_HEIGHT macros are defined in an
 * included header that is not visible here.
 * NOTE(review): they presumably read and update the locals of this function
 * by name (p_y, p_u, p_v, p_buffer, p_yuv, p_ybase, ...) -- confirm before
 * renaming or removing any local.
 */
213 void I420_RGB16( filter_t *p_filter, picture_t *p_src,
214                                          picture_t *p_dest )
215 {
216     /* Destination pixel pointer and the source Y, U and V plane pointers */
217     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
218     uint8_t  *p_y   = p_src->Y_PIXELS;
219     uint8_t  *p_u   = p_src->U_PIXELS;
220     uint8_t  *p_v   = p_src->V_PIXELS;
221
222     bool  b_hscale;                         /* horizontal scaling type */
223     unsigned int i_vscale;                          /* vertical scaling type */
224     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
225
226     int         i_right_margin;     /* destination pitch minus visible pitch */
227     int         i_rewind;  /* pixels to step back before the last 8-pixel group */
228     int         i_scale_count;                       /* scale modulo counter */
229     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
230     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
231     int         i_uval, i_vval;                           /* U and V samples */
232     int         i_red, i_green, i_blue;          /* U and V modified samples */
233     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
234     uint16_t *  p_ybase;                     /* Y dependent conversion table */
235
236     /* Conversion buffer pointer */
237     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
238     uint16_t *  p_buffer;
239
240     /* Offset array pointer */
241     int *       p_offset_start = p_filter->p_sys->p_offset;
242     int *       p_offset;
243
244     const int i_source_margin = p_src->p[0].i_pitch
245                                  - p_src->p[0].i_visible_pitch;
246     const int i_source_margin_c = p_src->p[1].i_pitch
247                                  - p_src->p[1].i_visible_pitch;
248
249     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
250
251     if( p_filter->fmt_in.video.i_width & 7 )  /* width not a multiple of 8 */
252     {
253         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
254     }
255     else
256     {
257         i_rewind = 0;
258     }
259
260     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
261      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
262      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
263     SetOffset( p_filter->fmt_in.video.i_width,
264                p_filter->fmt_in.video.i_height,
265                p_filter->fmt_out.video.i_width,
266                p_filter->fmt_out.video.i_height,
267                &b_hscale, &i_vscale, p_offset_start );
268
269     /*
270      * Perform conversion
271      */
272     i_scale_count = ( i_vscale == 1 ) ?
273                     p_filter->fmt_out.video.i_height :
274                     p_filter->fmt_in.video.i_height;
275     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
276     {
277         p_pic_start = p_pic;
278         p_buffer = b_hscale ? p_buffer_start : p_pic; /* scratch buffer when scaling horizontally, else the picture itself */
279
280         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) /* one pass per full group of 8 pixels */
281         {
282             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
283             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
284             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
285             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
286         }
287
288         /* Here we do some unaligned reads and duplicate conversions, but
289          * at least we have all the pixels */
290         if( i_rewind )
291         {
292             p_y -= i_rewind;
293             p_u -= i_rewind >> 1;  /* chroma planes are half as wide as luma */
294             p_v -= i_rewind >> 1;
295             p_buffer -= i_rewind;
296
297             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
298             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
299             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
300             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
301         }
302         SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
303         SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
304
305         p_y += i_source_margin;  /* skip the padding after the visible luma line */
306         if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
307         {
308             p_u += i_source_margin_c;
309             p_v += i_source_margin_c;
310         }
311     }
312 }
313
314 #else // ! defined (MODULE_NAME_IS_i420_rgb)
315
/*
 * I420_R5G5B5: SIMD variant of the YUV 4:2:0 to 16 bits-per-pixel conversion.
 * It is compiled only when MODULE_NAME_IS_i420_rgb is not defined; the body
 * uses the SSE2 macros when MODULE_NAME_IS_i420_rgb_sse2 is defined and the
 * MMX macros otherwise.
 *
 * p_src is the planar source picture, p_dest the destination picture and
 * p_filter supplies the dimensions, the conversion buffer and offset array.
 *
 * The SSE2_*, MMX_*, SCALE_WIDTH and SCALE_HEIGHT macros come from included
 * headers that are not visible here.
 * NOTE(review): the *_UNPACK_15 step presumably packs pixels as 5-5-5, as the
 * function name suggests, and the macros presumably use the locals of this
 * function by name -- confirm in the macro headers.
 */
316 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src,
317                                           picture_t *p_dest )
318 {
319     /* Destination pixel pointer and the source Y, U and V plane pointers */
320     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
321     uint8_t  *p_y   = p_src->Y_PIXELS;
322     uint8_t  *p_u   = p_src->U_PIXELS;
323     uint8_t  *p_v   = p_src->V_PIXELS;
324
325     bool  b_hscale;                         /* horizontal scaling type */
326     unsigned int i_vscale;                          /* vertical scaling type */
327     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
328
329     int         i_right_margin;     /* destination pitch minus visible pitch */
330     int         i_rewind;  /* pixels to step back before the last pixel group */
331     int         i_scale_count;                       /* scale modulo counter */
332     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
333     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
334
335     /* Conversion buffer pointer */
336     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
337     uint16_t *  p_buffer;
338
339     /* Offset array pointer */
340     int *       p_offset_start = p_filter->p_sys->p_offset;
341     int *       p_offset;
342
343     const int i_source_margin = p_src->p[0].i_pitch
344                                  - p_src->p[0].i_visible_pitch;
345     const int i_source_margin_c = p_src->p[1].i_pitch
346                                  - p_src->p[1].i_visible_pitch;
347
348     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
349
350     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
351      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
352      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
353     SetOffset( p_filter->fmt_in.video.i_width,
354                p_filter->fmt_in.video.i_height,
355                p_filter->fmt_out.video.i_width,
356                p_filter->fmt_out.video.i_height,
357                &b_hscale, &i_vscale, p_offset_start );
358
359
360     /*
361      * Perform conversion
362      */
363     i_scale_count = ( i_vscale == 1 ) ?
364                     p_filter->fmt_out.video.i_height :
365                     p_filter->fmt_in.video.i_height;
366
367 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
368
369     if( p_filter->fmt_in.video.i_width & 15 ) /* width not a multiple of 16 */
370     {
371         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
372     }
373     else
374     {
375         i_rewind = 0;
376     }
377
378     /*
379     ** SSE2 128 bits fetch/store instructions are faster
380     ** if memory access is 16 bytes aligned
381     */
382
383     p_buffer = b_hscale ? p_buffer_start : p_pic;
384     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|   /* true only when both pitches and both addresses are multiples of 16 */
385                     p_dest->p->i_pitch|
386                     ((intptr_t)p_y)|
387                     ((intptr_t)p_buffer))) )
388     {
389         /* use faster SSE2 aligned fetch and store */
390         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
391         {
392             p_pic_start = p_pic;
393
394             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) /* one pass per full group of 16 pixels */
395             {
396                 SSE2_CALL (
397                     SSE2_INIT_16_ALIGNED
398                     SSE2_YUV_MUL
399                     SSE2_YUV_ADD
400                     SSE2_UNPACK_15_ALIGNED
401                 );
402                 p_y += 16;       /* advance by one group: 16 Y, 8 U, 8 V, 16 output pixels */
403                 p_u += 8;
404                 p_v += 8;
405                 p_buffer += 16;
406             }
407             /* Here we do some unaligned reads and duplicate conversions, but
408              * at least we have all the pixels */
409             if( i_rewind )
410             {
411                 p_y -= i_rewind;
412                 p_u -= i_rewind >> 1;
413                 p_v -= i_rewind >> 1;
414                 p_buffer -= i_rewind;
415
416                 SSE2_CALL (      /* unaligned variants are used for the rewound group */
417                     SSE2_INIT_16_UNALIGNED
418                     SSE2_YUV_MUL
419                     SSE2_YUV_ADD
420                     SSE2_UNPACK_15_UNALIGNED
421                 );
422                 p_y += 16;       /* p_buffer is not advanced here; it is reassigned below */
423                 p_u += 8;
424                 p_v += 8;
425             }
426             SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
427             SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
428
429             p_y += i_source_margin;  /* skip the padding after the visible luma line */
430             if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
431             {
432                 p_u += i_source_margin_c;
433                 p_v += i_source_margin_c;
434             }
435             p_buffer = b_hscale ? p_buffer_start : p_pic; /* output position for the next line */
436         }
437     }
438     else
439     {
440         /* use slower SSE2 unaligned fetch and store */
441         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
442         {
443             p_pic_start = p_pic;
444             p_buffer = b_hscale ? p_buffer_start : p_pic;
445
446             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) /* one pass per full group of 16 pixels */
447             {
448                 SSE2_CALL (
449                     SSE2_INIT_16_UNALIGNED
450                     SSE2_YUV_MUL
451                     SSE2_YUV_ADD
452                     SSE2_UNPACK_15_UNALIGNED
453                 );
454                 p_y += 16;       /* advance by one group: 16 Y, 8 U, 8 V, 16 output pixels */
455                 p_u += 8;
456                 p_v += 8;
457                 p_buffer += 16;
458             }
459             /* Here we do some unaligned reads and duplicate conversions, but
460              * at least we have all the pixels */
461             if( i_rewind )
462             {
463                 p_y -= i_rewind;
464                 p_u -= i_rewind >> 1;
465                 p_v -= i_rewind >> 1;
466                 p_buffer -= i_rewind;
467
468                 SSE2_CALL (
469                     SSE2_INIT_16_UNALIGNED
470                     SSE2_YUV_MUL
471                     SSE2_YUV_ADD
472                     SSE2_UNPACK_15_UNALIGNED
473                 );
474                 p_y += 16;       /* p_buffer is not advanced here; it is reassigned below */
475                 p_u += 8;
476                 p_v += 8;
477             }
478             SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
479             SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
480
481             p_y += i_source_margin;  /* skip the padding after the visible luma line */
482             if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
483             {
484                 p_u += i_source_margin_c;
485                 p_v += i_source_margin_c;
486             }
487             p_buffer = b_hscale ? p_buffer_start : p_pic; /* output position for the next line */
488         }
489     }
490
491     /* make sure all SSE2 stores are visible thereafter */
492     SSE2_END;
493
494 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
495
496     if( p_filter->fmt_in.video.i_width & 7 )  /* width not a multiple of 8 */
497     {
498         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
499     }
500     else
501     {
502         i_rewind = 0;
503     }
504
505     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
506     {
507         p_pic_start = p_pic;
508         p_buffer = b_hscale ? p_buffer_start : p_pic; /* scratch buffer when scaling horizontally, else the picture itself */
509
510         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) /* one pass per full group of 8 pixels */
511         {
512             MMX_CALL (
513                 MMX_INIT_16
514                 MMX_YUV_MUL
515                 MMX_YUV_ADD
516                 MMX_UNPACK_15
517             );
518             p_y += 8;            /* advance by one group: 8 Y, 4 U, 4 V, 8 output pixels */
519             p_u += 4;
520             p_v += 4;
521             p_buffer += 8;
522         }
523
524         /* Here we do some unaligned reads and duplicate conversions, but
525          * at least we have all the pixels */
526         if( i_rewind )
527         {
528             p_y -= i_rewind;
529             p_u -= i_rewind >> 1;
530             p_v -= i_rewind >> 1;
531             p_buffer -= i_rewind;
532
533             MMX_CALL (
534                 MMX_INIT_16
535                 MMX_YUV_MUL
536                 MMX_YUV_ADD
537                 MMX_UNPACK_15
538             );
539             p_y += 8;
540             p_u += 4;
541             p_v += 4;
542             p_buffer += 8;
543         }
544         SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
545         SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
546
547         p_y += i_source_margin;  /* skip the padding after the visible luma line */
548         if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
549         {
550             p_u += i_source_margin_c;
551             p_v += i_source_margin_c;
552         }
553     }
554     /* re-enable FPU registers */
555     MMX_END;
556
557 #endif
558 }
559
/*
 * I420_R5G6B5: SIMD variant of the YUV 4:2:0 to 16 bits-per-pixel conversion.
 * It is compiled only when MODULE_NAME_IS_i420_rgb is not defined; the body
 * uses the SSE2 macros when MODULE_NAME_IS_i420_rgb_sse2 is defined and the
 * MMX macros otherwise.
 *
 * p_src is the planar source picture, p_dest the destination picture and
 * p_filter supplies the dimensions, the conversion buffer and offset array.
 *
 * The SSE2_*, MMX_*, SCALE_WIDTH and SCALE_HEIGHT macros come from included
 * headers that are not visible here.
 * NOTE(review): the *_UNPACK_16 step presumably packs pixels as 5-6-5, as the
 * function name suggests, and the macros presumably use the locals of this
 * function by name -- confirm in the macro headers.
 */
560 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src,
561                                           picture_t *p_dest )
562 {
563     /* Destination pixel pointer and the source Y, U and V plane pointers */
564     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
565     uint8_t  *p_y   = p_src->Y_PIXELS;
566     uint8_t  *p_u   = p_src->U_PIXELS;
567     uint8_t  *p_v   = p_src->V_PIXELS;
568
569     bool  b_hscale;                         /* horizontal scaling type */
570     unsigned int i_vscale;                          /* vertical scaling type */
571     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
572
573     int         i_right_margin;     /* destination pitch minus visible pitch */
574     int         i_rewind;  /* pixels to step back before the last pixel group */
575     int         i_scale_count;                       /* scale modulo counter */
576     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
577     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
578
579     /* Conversion buffer pointer */
580     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
581     uint16_t *  p_buffer;
582
583     /* Offset array pointer */
584     int *       p_offset_start = p_filter->p_sys->p_offset;
585     int *       p_offset;
586
587     const int i_source_margin = p_src->p[0].i_pitch
588                                  - p_src->p[0].i_visible_pitch;
589     const int i_source_margin_c = p_src->p[1].i_pitch
590                                  - p_src->p[1].i_visible_pitch;
591
592     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
593
594     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
595      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
596      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
597     SetOffset( p_filter->fmt_in.video.i_width,
598                p_filter->fmt_in.video.i_height,
599                p_filter->fmt_out.video.i_width,
600                p_filter->fmt_out.video.i_height,
601                &b_hscale, &i_vscale, p_offset_start );
602
603
604     /*
605      * Perform conversion
606      */
607     i_scale_count = ( i_vscale == 1 ) ?
608                     p_filter->fmt_out.video.i_height :
609                     p_filter->fmt_in.video.i_height;
610
611 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
612
613     if( p_filter->fmt_in.video.i_width & 15 ) /* width not a multiple of 16 */
614     {
615         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
616     }
617     else
618     {
619         i_rewind = 0;
620     }
621
622     /*
623     ** SSE2 128 bits fetch/store instructions are faster
624     ** if memory access is 16 bytes aligned
625     */
626
627     p_buffer = b_hscale ? p_buffer_start : p_pic;
628     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|   /* true only when both pitches and both addresses are multiples of 16 */
629                     p_dest->p->i_pitch|
630                     ((intptr_t)p_y)|
631                     ((intptr_t)p_buffer))) )
632     {
633         /* use faster SSE2 aligned fetch and store */
634         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
635         {
636             p_pic_start = p_pic;
637
638             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) /* one pass per full group of 16 pixels */
639             {
640                 SSE2_CALL (
641                     SSE2_INIT_16_ALIGNED
642                     SSE2_YUV_MUL
643                     SSE2_YUV_ADD
644                     SSE2_UNPACK_16_ALIGNED
645                 );
646                 p_y += 16;       /* advance by one group: 16 Y, 8 U, 8 V, 16 output pixels */
647                 p_u += 8;
648                 p_v += 8;
649                 p_buffer += 16;
650             }
651             /* Here we do some unaligned reads and duplicate conversions, but
652              * at least we have all the pixels */
653             if( i_rewind )
654             {
655                 p_y -= i_rewind;
656                 p_u -= i_rewind >> 1;
657                 p_v -= i_rewind >> 1;
658                 p_buffer -= i_rewind;
659
660                 SSE2_CALL (      /* unaligned variants are used for the rewound group */
661                     SSE2_INIT_16_UNALIGNED
662                     SSE2_YUV_MUL
663                     SSE2_YUV_ADD
664                     SSE2_UNPACK_16_UNALIGNED
665                 );
666                 p_y += 16;       /* p_buffer is not advanced here; it is reassigned below */
667                 p_u += 8;
668                 p_v += 8;
669             }
670             SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
671             SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
672
673             p_y += i_source_margin;  /* skip the padding after the visible luma line */
674             if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
675             {
676                 p_u += i_source_margin_c;
677                 p_v += i_source_margin_c;
678             }
679             p_buffer = b_hscale ? p_buffer_start : p_pic; /* output position for the next line */
680         }
681     }
682     else
683     {
684         /* use slower SSE2 unaligned fetch and store */
685         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
686         {
687             p_pic_start = p_pic;
688             p_buffer = b_hscale ? p_buffer_start : p_pic;
689
690             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) /* one pass per full group of 16 pixels */
691             {
692                 SSE2_CALL(
693                     SSE2_INIT_16_UNALIGNED
694                     SSE2_YUV_MUL
695                     SSE2_YUV_ADD
696                     SSE2_UNPACK_16_UNALIGNED
697                 );
698                 p_y += 16;       /* advance by one group: 16 Y, 8 U, 8 V, 16 output pixels */
699                 p_u += 8;
700                 p_v += 8;
701                 p_buffer += 16;
702             }
703             /* Here we do some unaligned reads and duplicate conversions, but
704              * at least we have all the pixels */
705             if( i_rewind )
706             {
707                 p_y -= i_rewind;
708                 p_u -= i_rewind >> 1;
709                 p_v -= i_rewind >> 1;
710                 p_buffer -= i_rewind;
711
712                 SSE2_CALL(
713                     SSE2_INIT_16_UNALIGNED
714                     SSE2_YUV_MUL
715                     SSE2_YUV_ADD
716                     SSE2_UNPACK_16_UNALIGNED
717                 );
718                 p_y += 16;       /* p_buffer is not advanced here; it is reassigned below */
719                 p_u += 8;
720                 p_v += 8;
721             }
722             SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
723             SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
724
725             p_y += i_source_margin;  /* skip the padding after the visible luma line */
726             if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
727             {
728                 p_u += i_source_margin_c;
729                 p_v += i_source_margin_c;
730             }
731             p_buffer = b_hscale ? p_buffer_start : p_pic; /* output position for the next line */
732         }
733     }
734
735     /* make sure all SSE2 stores are visible thereafter */
736     SSE2_END;
737
738 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
739
740     if( p_filter->fmt_in.video.i_width & 7 )  /* width not a multiple of 8 */
741     {
742         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
743     }
744     else
745     {
746         i_rewind = 0;
747     }
748
749     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
750     {
751         p_pic_start = p_pic;
752         p_buffer = b_hscale ? p_buffer_start : p_pic; /* scratch buffer when scaling horizontally, else the picture itself */
753
754         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) /* one pass per full group of 8 pixels */
755         {
756             MMX_CALL (
757                 MMX_INIT_16
758                 MMX_YUV_MUL
759                 MMX_YUV_ADD
760                 MMX_UNPACK_16
761             );
762             p_y += 8;            /* advance by one group: 8 Y, 4 U, 4 V, 8 output pixels */
763             p_u += 4;
764             p_v += 4;
765             p_buffer += 8;
766         }
767
768         /* Here we do some unaligned reads and duplicate conversions, but
769          * at least we have all the pixels */
770         if( i_rewind )
771         {
772             p_y -= i_rewind;
773             p_u -= i_rewind >> 1;
774             p_v -= i_rewind >> 1;
775             p_buffer -= i_rewind;
776
777             MMX_CALL (
778                 MMX_INIT_16
779                 MMX_YUV_MUL
780                 MMX_YUV_ADD
781                 MMX_UNPACK_16
782             );
783             p_y += 8;
784             p_u += 4;
785             p_v += 4;
786             p_buffer += 8;
787         }
788         SCALE_WIDTH;  /* NOTE(review): presumably copies/rescales the line into p_pic -- confirm in the macro header */
789         SCALE_HEIGHT( 420, 2 ); /* NOTE(review): presumably handles vertical scaling and chroma line reuse -- confirm */
790
791         p_y += i_source_margin;  /* skip the padding after the visible luma line */
792         if( i_y % 2 )            /* chroma padding is skipped only after odd lines */
793         {
794             p_u += i_source_margin_c;
795             p_v += i_source_margin_c;
796         }
797     }
798     /* re-enable FPU registers */
799     MMX_END;
800
801 #endif
802 }
803
804 #endif
805
806 /*****************************************************************************
807  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
808  *****************************************************************************
809  * Horizontal alignment needed:
810  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
811  *  - output: 1 pixel (4 bytes), margins allowed
812  * Vertical alignment needed:
813  *  - input: 2 lines (2 Y lines, 1 U/V line)
814  *  - output: 1 line
815  *****************************************************************************/
816
817 #if defined (MODULE_NAME_IS_i420_rgb)
818
819 void I420_RGB32( filter_t *p_filter, picture_t *p_src,
820                                          picture_t *p_dest )
821 {
822     /* We got this one from the old arguments */
823     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
824     uint8_t  *p_y   = p_src->Y_PIXELS;
825     uint8_t  *p_u   = p_src->U_PIXELS;
826     uint8_t  *p_v   = p_src->V_PIXELS;
827
828     bool  b_hscale;                         /* horizontal scaling type */
829     unsigned int i_vscale;                          /* vertical scaling type */
830     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
831
832     int         i_right_margin;
833     int         i_rewind;
834     int         i_scale_count;                       /* scale modulo counter */
835     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
836     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
837     int         i_uval, i_vval;                           /* U and V samples */
838     int         i_red, i_green, i_blue;          /* U and V modified samples */
839     uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
840     uint32_t *  p_ybase;                     /* Y dependant conversion table */
841
842     /* Conversion buffer pointer */
843     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
844     uint32_t *  p_buffer;
845
846     /* Offset array pointer */
847     int *       p_offset_start = p_filter->p_sys->p_offset;
848     int *       p_offset;
849
850     const int i_source_margin = p_src->p[0].i_pitch
851                                  - p_src->p[0].i_visible_pitch;
852     const int i_source_margin_c = p_src->p[1].i_pitch
853                                  - p_src->p[1].i_visible_pitch;
854
855     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
856
857     if( p_filter->fmt_in.video.i_width & 7 )
858     {
859         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
860     }
861     else
862     {
863         i_rewind = 0;
864     }
865
866     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
867      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
868      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
869     SetOffset( p_filter->fmt_in.video.i_width,
870                p_filter->fmt_in.video.i_height,
871                p_filter->fmt_out.video.i_width,
872                p_filter->fmt_out.video.i_height,
873                &b_hscale, &i_vscale, p_offset_start );
874
875     /*
876      * Perform conversion
877      */
878     i_scale_count = ( i_vscale == 1 ) ?
879                     p_filter->fmt_out.video.i_height :
880                     p_filter->fmt_in.video.i_height;
881     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
882     {
883         p_pic_start = p_pic;
884         p_buffer = b_hscale ? p_buffer_start : p_pic;
885
886         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
887         {
888             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
889             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
890             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
891             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
892         }
893
894         /* Here we do some unaligned reads and duplicate conversions, but
895          * at least we have all the pixels */
896         if( i_rewind )
897         {
898             p_y -= i_rewind;
899             p_u -= i_rewind >> 1;
900             p_v -= i_rewind >> 1;
901             p_buffer -= i_rewind;
902             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
903             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
904             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
905             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
906         }
907         SCALE_WIDTH;
908         SCALE_HEIGHT( 420, 4 );
909
910         p_y += i_source_margin;
911         if( i_y % 2 )
912         {
913             p_u += i_source_margin_c;
914             p_v += i_source_margin_c;
915         }
916     }
917 }
918
919 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
920
921 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
922                                             picture_t *p_dest )
923 {
924     /* We got this one from the old arguments */
925     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
926     uint8_t  *p_y   = p_src->Y_PIXELS;
927     uint8_t  *p_u   = p_src->U_PIXELS;
928     uint8_t  *p_v   = p_src->V_PIXELS;
929
930     bool  b_hscale;                         /* horizontal scaling type */
931     unsigned int i_vscale;                          /* vertical scaling type */
932     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
933
934     int         i_right_margin;
935     int         i_rewind;
936     int         i_scale_count;                       /* scale modulo counter */
937     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
938     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
939     /* Conversion buffer pointer */
940     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
941     uint32_t *  p_buffer;
942
943     /* Offset array pointer */
944     int *       p_offset_start = p_filter->p_sys->p_offset;
945     int *       p_offset;
946
947     const int i_source_margin = p_src->p[0].i_pitch
948                                  - p_src->p[0].i_visible_pitch;
949     const int i_source_margin_c = p_src->p[1].i_pitch
950                                  - p_src->p[1].i_visible_pitch;
951
952     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
953
954     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
955      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
956      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
957     SetOffset( p_filter->fmt_in.video.i_width,
958                p_filter->fmt_in.video.i_height,
959                p_filter->fmt_out.video.i_width,
960                p_filter->fmt_out.video.i_height,
961                &b_hscale, &i_vscale, p_offset_start );
962
963     /*
964      * Perform conversion
965      */
966     i_scale_count = ( i_vscale == 1 ) ?
967                     p_filter->fmt_out.video.i_height :
968                     p_filter->fmt_in.video.i_height;
969
970 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
971
972     if( p_filter->fmt_in.video.i_width & 15 )
973     {
974         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
975     }
976     else
977     {
978         i_rewind = 0;
979     }
980
981     /*
982     ** SSE2 128 bits fetch/store instructions are faster
983     ** if memory access is 16 bytes aligned
984     */
985
986     p_buffer = b_hscale ? p_buffer_start : p_pic;
987     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
988                     p_dest->p->i_pitch|
989                     ((intptr_t)p_y)|
990                     ((intptr_t)p_buffer))) )
991     {
992         /* use faster SSE2 aligned fetch and store */
993         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
994         {
995             p_pic_start = p_pic;
996
997             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
998             {
999                 SSE2_CALL (
1000                     SSE2_INIT_32_ALIGNED
1001                     SSE2_YUV_MUL
1002                     SSE2_YUV_ADD
1003                     SSE2_UNPACK_32_ARGB_ALIGNED
1004                 );
1005                 p_y += 16;
1006                 p_u += 8;
1007                 p_v += 8;
1008                 p_buffer += 16;
1009             }
1010
1011             /* Here we do some unaligned reads and duplicate conversions, but
1012              * at least we have all the pixels */
1013             if( i_rewind )
1014             {
1015                 p_y -= i_rewind;
1016                 p_u -= i_rewind >> 1;
1017                 p_v -= i_rewind >> 1;
1018                 p_buffer -= i_rewind;
1019                 SSE2_CALL (
1020                     SSE2_INIT_32_UNALIGNED
1021                     SSE2_YUV_MUL
1022                     SSE2_YUV_ADD
1023                     SSE2_UNPACK_32_ARGB_UNALIGNED
1024                 );
1025                 p_y += 16;
1026                 p_u += 4;
1027                 p_v += 4;
1028             }
1029             SCALE_WIDTH;
1030             SCALE_HEIGHT( 420, 4 );
1031
1032             p_y += i_source_margin;
1033             if( i_y % 2 )
1034             {
1035                 p_u += i_source_margin_c;
1036                 p_v += i_source_margin_c;
1037             }
1038             p_buffer = b_hscale ? p_buffer_start : p_pic;
1039         }
1040     }
1041     else
1042     {
1043         /* use slower SSE2 unaligned fetch and store */
1044         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1045         {
1046             p_pic_start = p_pic;
1047             p_buffer = b_hscale ? p_buffer_start : p_pic;
1048
1049             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1050             {
1051                 SSE2_CALL (
1052                     SSE2_INIT_32_UNALIGNED
1053                     SSE2_YUV_MUL
1054                     SSE2_YUV_ADD
1055                     SSE2_UNPACK_32_ARGB_UNALIGNED
1056                 );
1057                 p_y += 16;
1058                 p_u += 8;
1059                 p_v += 8;
1060                 p_buffer += 16;
1061             }
1062
1063             /* Here we do some unaligned reads and duplicate conversions, but
1064              * at least we have all the pixels */
1065             if( i_rewind )
1066             {
1067                 p_y -= i_rewind;
1068                 p_u -= i_rewind >> 1;
1069                 p_v -= i_rewind >> 1;
1070                 p_buffer -= i_rewind;
1071                 SSE2_CALL (
1072                     SSE2_INIT_32_UNALIGNED
1073                     SSE2_YUV_MUL
1074                     SSE2_YUV_ADD
1075                     SSE2_UNPACK_32_ARGB_UNALIGNED
1076                 );
1077                 p_y += 16;
1078                 p_u += 8;
1079                 p_v += 8;
1080             }
1081             SCALE_WIDTH;
1082             SCALE_HEIGHT( 420, 4 );
1083
1084             p_y += i_source_margin;
1085             if( i_y % 2 )
1086             {
1087                 p_u += i_source_margin_c;
1088                 p_v += i_source_margin_c;
1089             }
1090             p_buffer = b_hscale ? p_buffer_start : p_pic;
1091         }
1092     }
1093
1094     /* make sure all SSE2 stores are visible thereafter */
1095     SSE2_END;
1096
1097 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1098
1099     if( p_filter->fmt_in.video.i_width & 7 )
1100     {
1101         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1102     }
1103     else
1104     {
1105         i_rewind = 0;
1106     }
1107
1108     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1109     {
1110         p_pic_start = p_pic;
1111         p_buffer = b_hscale ? p_buffer_start : p_pic;
1112
1113         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1114         {
1115             MMX_CALL (
1116                 MMX_INIT_32
1117                 MMX_YUV_MUL
1118                 MMX_YUV_ADD
1119                 MMX_UNPACK_32_ARGB
1120             );
1121             p_y += 8;
1122             p_u += 4;
1123             p_v += 4;
1124             p_buffer += 8;
1125         }
1126
1127         /* Here we do some unaligned reads and duplicate conversions, but
1128          * at least we have all the pixels */
1129         if( i_rewind )
1130         {
1131             p_y -= i_rewind;
1132             p_u -= i_rewind >> 1;
1133             p_v -= i_rewind >> 1;
1134             p_buffer -= i_rewind;
1135             MMX_CALL (
1136                 MMX_INIT_32
1137                 MMX_YUV_MUL
1138                 MMX_YUV_ADD
1139                 MMX_UNPACK_32_ARGB
1140             );
1141             p_y += 8;
1142             p_u += 4;
1143             p_v += 4;
1144             p_buffer += 8;
1145         }
1146         SCALE_WIDTH;
1147         SCALE_HEIGHT( 420, 4 );
1148
1149         p_y += i_source_margin;
1150         if( i_y % 2 )
1151         {
1152             p_u += i_source_margin_c;
1153             p_v += i_source_margin_c;
1154         }
1155     }
1156
1157     /* re-enable FPU registers */
1158     MMX_END;
1159
1160 #endif
1161 }
1162
1163 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src,
1164                                             picture_t *p_dest )
1165 {
1166     /* We got this one from the old arguments */
1167     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1168     uint8_t  *p_y   = p_src->Y_PIXELS;
1169     uint8_t  *p_u   = p_src->U_PIXELS;
1170     uint8_t  *p_v   = p_src->V_PIXELS;
1171
1172     bool  b_hscale;                         /* horizontal scaling type */
1173     unsigned int i_vscale;                          /* vertical scaling type */
1174     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1175
1176     int         i_right_margin;
1177     int         i_rewind;
1178     int         i_scale_count;                       /* scale modulo counter */
1179     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1180     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1181     /* Conversion buffer pointer */
1182     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1183     uint32_t *  p_buffer;
1184
1185     /* Offset array pointer */
1186     int *       p_offset_start = p_filter->p_sys->p_offset;
1187     int *       p_offset;
1188
1189     const int i_source_margin = p_src->p[0].i_pitch
1190                                  - p_src->p[0].i_visible_pitch;
1191     const int i_source_margin_c = p_src->p[1].i_pitch
1192                                  - p_src->p[1].i_visible_pitch;
1193
1194     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1195
1196     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1197      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1198      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1199     SetOffset( p_filter->fmt_in.video.i_width,
1200                p_filter->fmt_in.video.i_height,
1201                p_filter->fmt_out.video.i_width,
1202                p_filter->fmt_out.video.i_height,
1203                &b_hscale, &i_vscale, p_offset_start );
1204
1205     /*
1206      * Perform conversion
1207      */
1208     i_scale_count = ( i_vscale == 1 ) ?
1209                     p_filter->fmt_out.video.i_height :
1210                     p_filter->fmt_in.video.i_height;
1211
1212 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1213
1214     if( p_filter->fmt_in.video.i_width & 15 )
1215     {
1216         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
1217     }
1218     else
1219     {
1220         i_rewind = 0;
1221     }
1222
1223     /*
1224     ** SSE2 128 bits fetch/store instructions are faster
1225     ** if memory access is 16 bytes aligned
1226     */
1227
1228     p_buffer = b_hscale ? p_buffer_start : p_pic;
1229     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1230                     p_dest->p->i_pitch|
1231                     ((intptr_t)p_y)|
1232                     ((intptr_t)p_buffer))) )
1233     {
1234         /* use faster SSE2 aligned fetch and store */
1235         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1236         {
1237             p_pic_start = p_pic;
1238
1239             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1240             {
1241                 SSE2_CALL (
1242                     SSE2_INIT_32_ALIGNED
1243                     SSE2_YUV_MUL
1244                     SSE2_YUV_ADD
1245                     SSE2_UNPACK_32_RGBA_ALIGNED
1246                 );
1247                 p_y += 16;
1248                 p_u += 8;
1249                 p_v += 8;
1250                 p_buffer += 16;
1251             }
1252
1253             /* Here we do some unaligned reads and duplicate conversions, but
1254              * at least we have all the pixels */
1255             if( i_rewind )
1256             {
1257                 p_y -= i_rewind;
1258                 p_u -= i_rewind >> 1;
1259                 p_v -= i_rewind >> 1;
1260                 p_buffer -= i_rewind;
1261                 SSE2_CALL (
1262                     SSE2_INIT_32_UNALIGNED
1263                     SSE2_YUV_MUL
1264                     SSE2_YUV_ADD
1265                     SSE2_UNPACK_32_RGBA_UNALIGNED
1266                 );
1267                 p_y += 16;
1268                 p_u += 4;
1269                 p_v += 4;
1270             }
1271             SCALE_WIDTH;
1272             SCALE_HEIGHT( 420, 4 );
1273
1274             p_y += i_source_margin;
1275             if( i_y % 2 )
1276             {
1277                 p_u += i_source_margin_c;
1278                 p_v += i_source_margin_c;
1279             }
1280             p_buffer = b_hscale ? p_buffer_start : p_pic;
1281         }
1282     }
1283     else
1284     {
1285         /* use slower SSE2 unaligned fetch and store */
1286         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1287         {
1288             p_pic_start = p_pic;
1289             p_buffer = b_hscale ? p_buffer_start : p_pic;
1290
1291             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1292             {
1293                 SSE2_CALL (
1294                     SSE2_INIT_32_UNALIGNED
1295                     SSE2_YUV_MUL
1296                     SSE2_YUV_ADD
1297                     SSE2_UNPACK_32_RGBA_UNALIGNED
1298                 );
1299                 p_y += 16;
1300                 p_u += 8;
1301                 p_v += 8;
1302                 p_buffer += 16;
1303             }
1304
1305             /* Here we do some unaligned reads and duplicate conversions, but
1306              * at least we have all the pixels */
1307             if( i_rewind )
1308             {
1309                 p_y -= i_rewind;
1310                 p_u -= i_rewind >> 1;
1311                 p_v -= i_rewind >> 1;
1312                 p_buffer -= i_rewind;
1313                 SSE2_CALL (
1314                     SSE2_INIT_32_UNALIGNED
1315                     SSE2_YUV_MUL
1316                     SSE2_YUV_ADD
1317                     SSE2_UNPACK_32_RGBA_UNALIGNED
1318                 );
1319                 p_y += 16;
1320                 p_u += 8;
1321                 p_v += 8;
1322             }
1323             SCALE_WIDTH;
1324             SCALE_HEIGHT( 420, 4 );
1325
1326             p_y += i_source_margin;
1327             if( i_y % 2 )
1328             {
1329                 p_u += i_source_margin_c;
1330                 p_v += i_source_margin_c;
1331             }
1332             p_buffer = b_hscale ? p_buffer_start : p_pic;
1333         }
1334     }
1335
1336     /* make sure all SSE2 stores are visible thereafter */
1337     SSE2_END;
1338
1339 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1340
1341     if( p_filter->fmt_in.video.i_width & 7 )
1342     {
1343         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1344     }
1345     else
1346     {
1347         i_rewind = 0;
1348     }
1349
1350     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1351     {
1352         p_pic_start = p_pic;
1353         p_buffer = b_hscale ? p_buffer_start : p_pic;
1354
1355         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1356         {
1357             MMX_CALL (
1358                 MMX_INIT_32
1359                 MMX_YUV_MUL
1360                 MMX_YUV_ADD
1361                 MMX_UNPACK_32_RGBA
1362             );
1363             p_y += 8;
1364             p_u += 4;
1365             p_v += 4;
1366             p_buffer += 8;
1367         }
1368
1369         /* Here we do some unaligned reads and duplicate conversions, but
1370          * at least we have all the pixels */
1371         if( i_rewind )
1372         {
1373             p_y -= i_rewind;
1374             p_u -= i_rewind >> 1;
1375             p_v -= i_rewind >> 1;
1376             p_buffer -= i_rewind;
1377             MMX_CALL (
1378                 MMX_INIT_32
1379                 MMX_YUV_MUL
1380                 MMX_YUV_ADD
1381                 MMX_UNPACK_32_RGBA
1382             );
1383             p_y += 8;
1384             p_u += 4;
1385             p_v += 4;
1386             p_buffer += 8;
1387         }
1388         SCALE_WIDTH;
1389         SCALE_HEIGHT( 420, 4 );
1390
1391         p_y += i_source_margin;
1392         if( i_y % 2 )
1393         {
1394             p_u += i_source_margin_c;
1395             p_v += i_source_margin_c;
1396         }
1397     }
1398
1399     /* re-enable FPU registers */
1400     MMX_END;
1401
1402 #endif
1403 }
1404
1405 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src,
1406                                             picture_t *p_dest )
1407 {
1408     /* We got this one from the old arguments */
1409     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1410     uint8_t  *p_y   = p_src->Y_PIXELS;
1411     uint8_t  *p_u   = p_src->U_PIXELS;
1412     uint8_t  *p_v   = p_src->V_PIXELS;
1413
1414     bool  b_hscale;                         /* horizontal scaling type */
1415     unsigned int i_vscale;                          /* vertical scaling type */
1416     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1417
1418     int         i_right_margin;
1419     int         i_rewind;
1420     int         i_scale_count;                       /* scale modulo counter */
1421     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1422     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1423     /* Conversion buffer pointer */
1424     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1425     uint32_t *  p_buffer;
1426
1427     /* Offset array pointer */
1428     int *       p_offset_start = p_filter->p_sys->p_offset;
1429     int *       p_offset;
1430
1431     const int i_source_margin = p_src->p[0].i_pitch
1432                                  - p_src->p[0].i_visible_pitch;
1433     const int i_source_margin_c = p_src->p[1].i_pitch
1434                                  - p_src->p[1].i_visible_pitch;
1435
1436     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1437
1438     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1439      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1440      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1441     SetOffset( p_filter->fmt_in.video.i_width,
1442                p_filter->fmt_in.video.i_height,
1443                p_filter->fmt_out.video.i_width,
1444                p_filter->fmt_out.video.i_height,
1445                &b_hscale, &i_vscale, p_offset_start );
1446
1447     /*
1448      * Perform conversion
1449      */
1450     i_scale_count = ( i_vscale == 1 ) ?
1451                     p_filter->fmt_out.video.i_height :
1452                     p_filter->fmt_in.video.i_height;
1453
1454 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1455
1456     if( p_filter->fmt_in.video.i_width & 15 )
1457     {
1458         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
1459     }
1460     else
1461     {
1462         i_rewind = 0;
1463     }
1464
1465     /*
1466     ** SSE2 128 bits fetch/store instructions are faster
1467     ** if memory access is 16 bytes aligned
1468     */
1469
1470     p_buffer = b_hscale ? p_buffer_start : p_pic;
1471     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1472                     p_dest->p->i_pitch|
1473                     ((intptr_t)p_y)|
1474                     ((intptr_t)p_buffer))) )
1475     {
1476         /* use faster SSE2 aligned fetch and store */
1477         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1478         {
1479             p_pic_start = p_pic;
1480
1481             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1482             {
1483                 SSE2_CALL (
1484                     SSE2_INIT_32_ALIGNED
1485                     SSE2_YUV_MUL
1486                     SSE2_YUV_ADD
1487                     SSE2_UNPACK_32_BGRA_ALIGNED
1488                 );
1489                 p_y += 16;
1490                 p_u += 8;
1491                 p_v += 8;
1492                 p_buffer += 16;
1493             }
1494
1495             /* Here we do some unaligned reads and duplicate conversions, but
1496              * at least we have all the pixels */
1497             if( i_rewind )
1498             {
1499                 p_y -= i_rewind;
1500                 p_u -= i_rewind >> 1;
1501                 p_v -= i_rewind >> 1;
1502                 p_buffer -= i_rewind;
1503                 SSE2_CALL (
1504                     SSE2_INIT_32_UNALIGNED
1505                     SSE2_YUV_MUL
1506                     SSE2_YUV_ADD
1507                     SSE2_UNPACK_32_BGRA_UNALIGNED
1508                 );
1509                 p_y += 16;
1510                 p_u += 4;
1511                 p_v += 4;
1512             }
1513             SCALE_WIDTH;
1514             SCALE_HEIGHT( 420, 4 );
1515
1516             p_y += i_source_margin;
1517             if( i_y % 2 )
1518             {
1519                 p_u += i_source_margin_c;
1520                 p_v += i_source_margin_c;
1521             }
1522             p_buffer = b_hscale ? p_buffer_start : p_pic;
1523         }
1524     }
1525     else
1526     {
1527         /* use slower SSE2 unaligned fetch and store */
1528         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1529         {
1530             p_pic_start = p_pic;
1531             p_buffer = b_hscale ? p_buffer_start : p_pic;
1532
1533             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1534             {
1535                 SSE2_CALL (
1536                     SSE2_INIT_32_UNALIGNED
1537                     SSE2_YUV_MUL
1538                     SSE2_YUV_ADD
1539                     SSE2_UNPACK_32_BGRA_UNALIGNED
1540                 );
1541                 p_y += 16;
1542                 p_u += 8;
1543                 p_v += 8;
1544                 p_buffer += 16;
1545             }
1546
1547             /* Here we do some unaligned reads and duplicate conversions, but
1548              * at least we have all the pixels */
1549             if( i_rewind )
1550             {
1551                 p_y -= i_rewind;
1552                 p_u -= i_rewind >> 1;
1553                 p_v -= i_rewind >> 1;
1554                 p_buffer -= i_rewind;
1555                 SSE2_CALL (
1556                     SSE2_INIT_32_UNALIGNED
1557                     SSE2_YUV_MUL
1558                     SSE2_YUV_ADD
1559                     SSE2_UNPACK_32_BGRA_UNALIGNED
1560                 );
1561                 p_y += 16;
1562                 p_u += 8;
1563                 p_v += 8;
1564             }
1565             SCALE_WIDTH;
1566             SCALE_HEIGHT( 420, 4 );
1567
1568             p_y += i_source_margin;
1569             if( i_y % 2 )
1570             {
1571                 p_u += i_source_margin_c;
1572                 p_v += i_source_margin_c;
1573             }
1574             p_buffer = b_hscale ? p_buffer_start : p_pic;
1575         }
1576     }
1577
1578 #else
1579
1580     if( p_filter->fmt_in.video.i_width & 7 )
1581     {
1582         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1583     }
1584     else
1585     {
1586         i_rewind = 0;
1587     }
1588
1589     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1590     {
1591         p_pic_start = p_pic;
1592         p_buffer = b_hscale ? p_buffer_start : p_pic;
1593
1594         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1595         {
1596             MMX_CALL (
1597                 MMX_INIT_32
1598                 MMX_YUV_MUL
1599                 MMX_YUV_ADD
1600                 MMX_UNPACK_32_BGRA
1601             );
1602             p_y += 8;
1603             p_u += 4;
1604             p_v += 4;
1605             p_buffer += 8;
1606         }
1607
1608         /* Here we do some unaligned reads and duplicate conversions, but
1609          * at least we have all the pixels */
1610         if( i_rewind )
1611         {
1612             p_y -= i_rewind;
1613             p_u -= i_rewind >> 1;
1614             p_v -= i_rewind >> 1;
1615             p_buffer -= i_rewind;
1616             MMX_CALL (
1617                 MMX_INIT_32
1618                 MMX_YUV_MUL
1619                 MMX_YUV_ADD
1620                 MMX_UNPACK_32_BGRA
1621             );
1622             p_y += 8;
1623             p_u += 4;
1624             p_v += 4;
1625             p_buffer += 8;
1626         }
1627         SCALE_WIDTH;
1628         SCALE_HEIGHT( 420, 4 );
1629
1630         p_y += i_source_margin;
1631         if( i_y % 2 )
1632         {
1633             p_u += i_source_margin_c;
1634             p_v += i_source_margin_c;
1635         }
1636     }
1637
1638     /* re-enable FPU registers */
1639     MMX_END;
1640
1641 #endif
1642 }
1643
1644 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src,
1645                                             picture_t *p_dest )
1646 {
1647     /* We got this one from the old arguments */
1648     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1649     uint8_t  *p_y   = p_src->Y_PIXELS;
1650     uint8_t  *p_u   = p_src->U_PIXELS;
1651     uint8_t  *p_v   = p_src->V_PIXELS;
1652
1653     bool  b_hscale;                         /* horizontal scaling type */
1654     unsigned int i_vscale;                          /* vertical scaling type */
1655     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1656
1657     int         i_right_margin;
1658     int         i_rewind;
1659     int         i_scale_count;                       /* scale modulo counter */
1660     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1661     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1662     /* Conversion buffer pointer */
1663     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1664     uint32_t *  p_buffer;
1665
1666     /* Offset array pointer */
1667     int *       p_offset_start = p_filter->p_sys->p_offset;
1668     int *       p_offset;
1669
1670     const int i_source_margin = p_src->p[0].i_pitch
1671                                  - p_src->p[0].i_visible_pitch;
1672     const int i_source_margin_c = p_src->p[1].i_pitch
1673                                  - p_src->p[1].i_visible_pitch;
1674
1675     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1676
1677     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1678      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1679      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1680     SetOffset( p_filter->fmt_in.video.i_width,
1681                p_filter->fmt_in.video.i_height,
1682                p_filter->fmt_out.video.i_width,
1683                p_filter->fmt_out.video.i_height,
1684                &b_hscale, &i_vscale, p_offset_start );
1685
1686     /*
1687      * Perform conversion
1688      */
1689     i_scale_count = ( i_vscale == 1 ) ?
1690                     p_filter->fmt_out.video.i_height :
1691                     p_filter->fmt_in.video.i_height;
1692
1693 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1694
1695     if( p_filter->fmt_in.video.i_width & 15 )
1696     {
1697         i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 );
1698     }
1699     else
1700     {
1701         i_rewind = 0;
1702     }
1703
1704     /*
1705     ** SSE2 128 bits fetch/store instructions are faster
1706     ** if memory access is 16 bytes aligned
1707     */
1708
1709     p_buffer = b_hscale ? p_buffer_start : p_pic;
1710     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1711                     p_dest->p->i_pitch|
1712                     ((intptr_t)p_y)|
1713                     ((intptr_t)p_buffer))) )
1714     {
1715         /* use faster SSE2 aligned fetch and store */
1716         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1717         {
1718             p_pic_start = p_pic;
1719
1720             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1721             {
1722                 SSE2_CALL (
1723                     SSE2_INIT_32_ALIGNED
1724                     SSE2_YUV_MUL
1725                     SSE2_YUV_ADD
1726                     SSE2_UNPACK_32_ABGR_ALIGNED
1727                 );
1728                 p_y += 16;
1729                 p_u += 8;
1730                 p_v += 8;
1731                 p_buffer += 16;
1732             }
1733
1734             /* Here we do some unaligned reads and duplicate conversions, but
1735              * at least we have all the pixels */
1736             if( i_rewind )
1737             {
1738                 p_y -= i_rewind;
1739                 p_u -= i_rewind >> 1;
1740                 p_v -= i_rewind >> 1;
1741                 p_buffer -= i_rewind;
1742                 SSE2_CALL (
1743                     SSE2_INIT_32_UNALIGNED
1744                     SSE2_YUV_MUL
1745                     SSE2_YUV_ADD
1746                     SSE2_UNPACK_32_ABGR_UNALIGNED
1747                 );
1748                 p_y += 16;
1749                 p_u += 4;
1750                 p_v += 4;
1751             }
1752             SCALE_WIDTH;
1753             SCALE_HEIGHT( 420, 4 );
1754
1755             p_y += i_source_margin;
1756             if( i_y % 2 )
1757             {
1758                 p_u += i_source_margin_c;
1759                 p_v += i_source_margin_c;
1760             }
1761             p_buffer = b_hscale ? p_buffer_start : p_pic;
1762         }
1763     }
1764     else
1765     {
1766         /* use slower SSE2 unaligned fetch and store */
1767         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1768         {
1769             p_pic_start = p_pic;
1770             p_buffer = b_hscale ? p_buffer_start : p_pic;
1771
1772             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1773             {
1774                 SSE2_CALL (
1775                     SSE2_INIT_32_UNALIGNED
1776                     SSE2_YUV_MUL
1777                     SSE2_YUV_ADD
1778                     SSE2_UNPACK_32_ABGR_UNALIGNED
1779                 );
1780                 p_y += 16;
1781                 p_u += 8;
1782                 p_v += 8;
1783                 p_buffer += 16;
1784             }
1785
1786             /* Here we do some unaligned reads and duplicate conversions, but
1787              * at least we have all the pixels */
1788             if( i_rewind )
1789             {
1790                 p_y -= i_rewind;
1791                 p_u -= i_rewind >> 1;
1792                 p_v -= i_rewind >> 1;
1793                 p_buffer -= i_rewind;
1794                 SSE2_CALL (
1795                     SSE2_INIT_32_UNALIGNED
1796                     SSE2_YUV_MUL
1797                     SSE2_YUV_ADD
1798                     SSE2_UNPACK_32_ABGR_UNALIGNED
1799                 );
1800                 p_y += 16;
1801                 p_u += 8;
1802                 p_v += 8;
1803             }
1804             SCALE_WIDTH;
1805             SCALE_HEIGHT( 420, 4 );
1806
1807             p_y += i_source_margin;
1808             if( i_y % 2 )
1809             {
1810                 p_u += i_source_margin_c;
1811                 p_v += i_source_margin_c;
1812             }
1813             p_buffer = b_hscale ? p_buffer_start : p_pic;
1814         }
1815     }
1816
1817 #else
1818
1819     if( p_filter->fmt_in.video.i_width & 7 )
1820     {
1821         i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 );
1822     }
1823     else
1824     {
1825         i_rewind = 0;
1826     }
1827
1828     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1829     {
1830         p_pic_start = p_pic;
1831         p_buffer = b_hscale ? p_buffer_start : p_pic;
1832
1833         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1834         {
1835             MMX_CALL (
1836                 MMX_INIT_32
1837                 MMX_YUV_MUL
1838                 MMX_YUV_ADD
1839                 MMX_UNPACK_32_ABGR
1840             );
1841             p_y += 8;
1842             p_u += 4;
1843             p_v += 4;
1844             p_buffer += 8;
1845         }
1846
1847         /* Here we do some unaligned reads and duplicate conversions, but
1848          * at least we have all the pixels */
1849         if( i_rewind )
1850         {
1851             p_y -= i_rewind;
1852             p_u -= i_rewind >> 1;
1853             p_v -= i_rewind >> 1;
1854             p_buffer -= i_rewind;
1855             MMX_CALL (
1856                 MMX_INIT_32
1857                 MMX_YUV_MUL
1858                 MMX_YUV_ADD
1859                 MMX_UNPACK_32_ABGR
1860             );
1861             p_y += 8;
1862             p_u += 4;
1863             p_v += 4;
1864             p_buffer += 8;
1865         }
1866         SCALE_WIDTH;
1867         SCALE_HEIGHT( 420, 4 );
1868
1869         p_y += i_source_margin;
1870         if( i_y % 2 )
1871         {
1872             p_u += i_source_margin_c;
1873             p_v += i_source_margin_c;
1874         }
1875     }
1876
1877     /* re-enable FPU registers */
1878     MMX_END;
1879
1880 #endif
1881 }
1882
1883 #endif
1884
1885 /* Following functions are local */
1886
1887 /*****************************************************************************
1888  * SetOffset: build offset array for conversion functions
1889  *****************************************************************************
1890  * This function will build an offset array used in later conversion functions.
1891  * It will also set horizontal and vertical scaling indicators.
1892  *****************************************************************************/
/*
 * Parameters:
 *   i_width, i_height         dimensions of the source picture
 *   i_pic_width, i_pic_height dimensions of the output picture
 *   pb_hscale   [out] false when both widths are equal, true otherwise
 *   pi_vscale   [out] 0 when both heights are equal, 1 when the output is
 *               taller, (unsigned int)-1 when the output is shorter
 *   p_offset    [out] only written when the widths differ; exactly
 *               i_pic_width entries are stored:
 *                 - extension: each entry is 0 or 1, with i_width ones
 *                   spread evenly among them;
 *                 - reduction: each entry is >= 1 and the entries add up
 *                   to i_width.
 *               NOTE(review): presumably consumed by the SCALE_WIDTH macro
 *               as per-output-pixel source steps -- confirm in i420_rgb.h.
 *
 * Negative dimensions produce no offset entries instead of running the
 * fill loops out of bounds.
 */
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    int i_x;                                         /* pixel loop counter */
    int i_scale_count;                                   /* modulo counter */

    /*
     * Prepare horizontal offset array.
     * Widths are compared directly rather than through their difference,
     * so that extreme values cannot overflow the subtraction.
     */
    if( i_pic_width == i_width )
    {
        /* No horizontal scaling: YUV conversion is done directly to picture */
        *pb_hscale = false;
    }
    else if( i_pic_width > i_width )
    {
        /* Horizontal extension: walk the source pixels and emit, for each of
         * them, a run of 0 entries closed by a single 1. The error counter
         * spreads the repeated pixels evenly over the line. */
        *pb_hscale = true;
        i_scale_count = i_pic_width;
        for( i_x = 0; i_x < i_width; i_x++ )
        {
            while( (i_scale_count -= i_width) > 0 )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_scale_count += i_pic_width;
        }
    }
    else
    {
        /* Horizontal reduction: walk the output pixels and store, for each
         * of them, a step of at least 1. The bounded loop keeps a negative
         * i_pic_width from ever reaching the inner while. */
        *pb_hscale = true;
        i_scale_count = i_width;
        for( i_x = 0; i_x < i_pic_width; i_x++ )
        {
            int i_step = 1;

            while( (i_scale_count -= i_pic_width) > 0 )
            {
                i_step++;
            }
            *p_offset++ = i_step;
            i_scale_count += i_width;
        }
    }

    /*
     * Set vertical scaling indicator
     */
    if( i_pic_height == i_height )
    {
        *pi_vscale = 0;
    }
    else if( i_pic_height > i_height )
    {
        *pi_vscale = 1;
    }
    else
    {
        /* Same value the implicit conversion of -1 used to store. */
        *pi_vscale = (unsigned int)-1;
    }
}
1956