]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
video_chroma: added I420_ABGR32 support (mostly for opengl), some clean up as well
[vlc] / modules / video_chroma / i420_rgb16.c
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  * 
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28 #include <string.h>                                            /* strerror() */
29 #include <stdlib.h>                                      /* malloc(), free() */
#include <stdint.h>                                               /* intptr_t */
30
31 #include <vlc/vlc.h>
32 #include <vlc_vout.h>
33
34 #include "i420_rgb.h"
35 #if defined (MODULE_NAME_IS_i420_rgb)
36 #   include "i420_rgb_c.h"
37 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
38 #   include "i420_rgb_mmx.h"
39 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
40 #   include "i420_rgb_mmx.h"
41 #endif
42
43 static void SetOffset( int, int, int, int, vlc_bool_t *,
44                        unsigned int *, int * );
45
46 #if defined (MODULE_NAME_IS_i420_rgb)
47 /*****************************************************************************
48  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
49  *****************************************************************************
50  * Horizontal alignment needed:
51  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
52  *  - output: 1 pixel (2 bytes), margins allowed
53  * Vertical alignment needed:
54  *  - input: 2 lines (2 Y lines, 1 U/V line)
55  *  - output: 1 line
56  *****************************************************************************/
/*
 * I420_RGB16_dither: convert the I420 picture p_src into the 16 bpp RGB
 * picture p_dest, handing one of four 4-entry dither tables to the pixel
 * conversion macros through the local p_dither.
 *
 *  - p_vout: provides the render and output dimensions, output.i_rrshift
 *    (used to scale the dither tables) and the chroma private data
 *    (p_rgb16 table, conversion buffer, offset array).
 *  - p_src:  source picture; its Y, U and V planes are read.
 *  - p_dest: destination picture; its first plane is written as uint16_t.
 *
 * NOTE(review): CONVERT_YUV_PIXEL_DITHER, CONVERT_Y_PIXEL_DITHER, SCALE_WIDTH
 * and SCALE_HEIGHT are macros defined outside this file. Several locals
 * below are never referenced directly in this body and are presumably used
 * by those macros by name -- confirm before renaming or removing any.
 */
57 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
58                                                       picture_t *p_dest )
59 {
60     /* Destination pixels (viewed as 16-bit words) and the source planes */
61     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
62     uint8_t  *p_y   = p_src->Y_PIXELS;
63     uint8_t  *p_u   = p_src->U_PIXELS;
64     uint8_t  *p_v   = p_src->V_PIXELS;
65
66     vlc_bool_t   b_hscale;                        /* horizontal scaling type */
67     unsigned int i_vscale;                          /* vertical scaling type */
68     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
69     unsigned int i_real_y;                                          /* y % 4 */
70
71     int         i_right_margin;   /* destination pitch minus visible pitch */
72     int         i_rewind;      /* pixels to step back before the last group */
73     int         i_scale_count;                       /* scale modulo counter */
74     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
75     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
76     int         i_uval, i_vval;                           /* U and V samples */
77     int         i_red, i_green, i_blue;          /* U and V modified samples */
78     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
79     uint16_t *  p_ybase;                     /* Y dependent conversion table */
80
81     /* Conversion buffer pointer */
82     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
83     uint16_t *  p_buffer;
84
85     /* Offset array pointer */
86     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
87     int *       p_offset;
88
       /* Bytes of padding at the end of each luma / chroma source line */
89     const int i_source_margin = p_src->p[0].i_pitch
90                                  - p_src->p[0].i_visible_pitch;
91     const int i_source_margin_c = p_src->p[1].i_pitch
92                                  - p_src->p[1].i_visible_pitch;
93
94     /* The dithering matrices */
95     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
96     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
97     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
98     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
99
        /* Pre-scale every dither entry by (SHIFT - 4 + output.i_rrshift) bits */
100     for(i_x = 0; i_x < 4; i_x++)
101     {
102         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
103         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
104         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
105         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
106     }
107
108     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
109
        /* i_rewind is how many pixels the render width falls short of the next
         * multiple of 8 (0 when it already is a multiple of 8) */
110     if( p_vout->render.i_width & 7 )
111     {
112         i_rewind = 8 - ( p_vout->render.i_width & 7 );
113     }
114     else
115     {
116         i_rewind = 0;
117     }
118
119     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
120      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
121      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
        /* SetOffset (defined later in this file) receives the render and output
         * sizes and fills b_hscale, i_vscale and the offset array */
122     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
123                p_vout->output.i_width, p_vout->output.i_height,
124                &b_hscale, &i_vscale, p_offset_start );
125
126     /*
127      * Perform conversion
128      */
129     i_scale_count = ( i_vscale == 1 ) ?
130                     p_vout->output.i_height : p_vout->render.i_height;
131     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
132     {
133         i_real_y = i_y & 0x3;
134         p_pic_start = p_pic;
            /* With horizontal scaling the line is first converted into the
             * intermediate buffer, otherwise straight into the picture */
135         p_buffer = b_hscale ? p_buffer_start : p_pic;
136
            /* Whole groups of 8 pixels; the dither table cycles
             * 10, 11, 12, 13 over consecutive macro invocations */
137         for ( i_x = p_vout->render.i_width / 8; i_x--; )
138         {
139             int *p_dither = dither10;
140             CONVERT_YUV_PIXEL_DITHER(2);
141             p_dither = dither11;
142             CONVERT_Y_PIXEL_DITHER(2);
143             p_dither = dither12;
144             CONVERT_YUV_PIXEL_DITHER(2);
145             p_dither = dither13;
146             CONVERT_Y_PIXEL_DITHER(2);
147             p_dither = dither10;
148             CONVERT_YUV_PIXEL_DITHER(2);
149             p_dither = dither11;
150             CONVERT_Y_PIXEL_DITHER(2);
151             p_dither = dither12;
152             CONVERT_YUV_PIXEL_DITHER(2);
153             p_dither = dither13;
154             CONVERT_Y_PIXEL_DITHER(2);
155         }
156
157         /* Here we do some unaligned reads and duplicate conversions, but
158          * at least we have all the pixels */
159         if( i_rewind )
160         {
161             int *p_dither = dither10;
                /* Step back so that one more full group of 8 ends exactly at
                 * the last pixel of the line (chroma steps back half as far) */
162             p_y -= i_rewind;
163             p_u -= i_rewind >> 1;
164             p_v -= i_rewind >> 1;
165             p_buffer -= i_rewind;
166             CONVERT_YUV_PIXEL_DITHER(2);
167             p_dither = dither11;
168             CONVERT_Y_PIXEL_DITHER(2);
169             p_dither = dither12;
170             CONVERT_YUV_PIXEL_DITHER(2);
171             p_dither = dither13;
172             CONVERT_Y_PIXEL_DITHER(2);
173             p_dither = dither10;
174             CONVERT_YUV_PIXEL_DITHER(2);
175             p_dither = dither11;
176             CONVERT_Y_PIXEL_DITHER(2);
177             p_dither = dither12;
178             CONVERT_YUV_PIXEL_DITHER(2);
179             p_dither = dither13;
180             CONVERT_Y_PIXEL_DITHER(2);
181         }
182         SCALE_WIDTH;
183         SCALE_HEIGHT( 420, 2 );
184
            /* Skip the luma line padding after every line; the chroma padding
             * is skipped only after odd lines */
185         p_y += i_source_margin;
186         if( i_y % 2 )
187         {
188             p_u += i_source_margin_c;
189             p_v += i_source_margin_c;
190         }
191     }
192 }
193 #endif
194
195 /*****************************************************************************
196  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
197  *****************************************************************************
198  * Horizontal alignment needed:
199  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
200  *  - output: 1 pixel (2 bytes), margins allowed
201  * Vertical alignment needed:
202  *  - input: 2 lines (2 Y lines, 1 U/V line)
203  *  - output: 1 line
204  *****************************************************************************/
205
206 #if defined (MODULE_NAME_IS_i420_rgb)
207
/*
 * I420_RGB16: convert the I420 picture p_src into the 16 bpp RGB picture
 * p_dest (plain C path, compiled only when MODULE_NAME_IS_i420_rgb is
 * defined).
 *
 *  - p_vout: provides the render and output dimensions and the chroma
 *    private data (p_rgb16 table, conversion buffer, offset array).
 *  - p_src:  source picture; its Y, U and V planes are read.
 *  - p_dest: destination picture; its first plane is written as uint16_t.
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are macros defined outside this file. Several locals below
 * are never referenced directly in this body and are presumably used by
 * those macros by name -- confirm before renaming or removing any.
 */
208 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
209                                             picture_t *p_dest )
210 {
211     /* Destination pixels (viewed as 16-bit words) and the source planes */
212     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
213     uint8_t  *p_y   = p_src->Y_PIXELS;
214     uint8_t  *p_u   = p_src->U_PIXELS;
215     uint8_t  *p_v   = p_src->V_PIXELS;
216
217     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
218     unsigned int i_vscale;                          /* vertical scaling type */
219     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
220
221     int         i_right_margin;   /* destination pitch minus visible pitch */
222     int         i_rewind;      /* pixels to step back before the last group */
223     int         i_scale_count;                       /* scale modulo counter */
224     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
225     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
226     int         i_uval, i_vval;                           /* U and V samples */
227     int         i_red, i_green, i_blue;          /* U and V modified samples */
228     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
229     uint16_t *  p_ybase;                     /* Y dependent conversion table */
230
231     /* Conversion buffer pointer */
232     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
233     uint16_t *  p_buffer;
234
235     /* Offset array pointer */
236     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
237     int *       p_offset;
238
        /* Bytes of padding at the end of each luma / chroma source line */
239     const int i_source_margin = p_src->p[0].i_pitch
240                                  - p_src->p[0].i_visible_pitch;
241     const int i_source_margin_c = p_src->p[1].i_pitch
242                                  - p_src->p[1].i_visible_pitch;
243
244     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
245
        /* i_rewind is how many pixels the render width falls short of the next
         * multiple of 8 (0 when it already is a multiple of 8) */
246     if( p_vout->render.i_width & 7 )
247     {
248         i_rewind = 8 - ( p_vout->render.i_width & 7 );
249     }
250     else
251     {
252         i_rewind = 0;
253     }
254
255     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
256      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
257      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
        /* SetOffset (defined later in this file) receives the render and output
         * sizes and fills b_hscale, i_vscale and the offset array */
258     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
259                p_vout->output.i_width, p_vout->output.i_height,
260                &b_hscale, &i_vscale, p_offset_start );
261
262     /*
263      * Perform conversion
264      */
265     i_scale_count = ( i_vscale == 1 ) ?
266                     p_vout->output.i_height : p_vout->render.i_height;
267     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
268     {
269         p_pic_start = p_pic;
            /* With horizontal scaling the line is first converted into the
             * intermediate buffer, otherwise straight into the picture */
270         p_buffer = b_hscale ? p_buffer_start : p_pic;
271
            /* Whole groups of 8 pixels: four YUV/Y macro pairs per iteration */
272         for ( i_x = p_vout->render.i_width / 8; i_x--; )
273         {
274             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
275             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
276             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
277             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
278         }
279
280         /* Here we do some unaligned reads and duplicate conversions, but
281          * at least we have all the pixels */
282         if( i_rewind )
283         {
                /* Step back so that one more full group of 8 ends exactly at
                 * the last pixel of the line (chroma steps back half as far) */
284             p_y -= i_rewind;
285             p_u -= i_rewind >> 1;
286             p_v -= i_rewind >> 1;
287             p_buffer -= i_rewind;
288
289             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
290             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
291             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
292             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
293         }
294         SCALE_WIDTH;
295         SCALE_HEIGHT( 420, 2 );
296
            /* Skip the luma line padding after every line; the chroma padding
             * is skipped only after odd lines */
297         p_y += i_source_margin;
298         if( i_y % 2 )
299         {
300             p_u += i_source_margin_c;
301             p_v += i_source_margin_c;
302         }
303     }
304 }
305
306 #else // ! defined (MODULE_NAME_IS_i420_rgb)
307
308 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
309                                             picture_t *p_dest )
310 {
311     /* We got this one from the old arguments */
312     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
313     uint8_t  *p_y   = p_src->Y_PIXELS;
314     uint8_t  *p_u   = p_src->U_PIXELS;
315     uint8_t  *p_v   = p_src->V_PIXELS;
316
317     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
318     unsigned int i_vscale;                          /* vertical scaling type */
319     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
320
321     int         i_right_margin;
322     int         i_rewind;
323     int         i_scale_count;                       /* scale modulo counter */
324     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
325     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
326
327     /* Conversion buffer pointer */
328     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
329     uint16_t *  p_buffer;
330
331     /* Offset array pointer */
332     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
333     int *       p_offset;
334
335     const int i_source_margin = p_src->p[0].i_pitch
336                                  - p_src->p[0].i_visible_pitch;
337     const int i_source_margin_c = p_src->p[1].i_pitch
338                                  - p_src->p[1].i_visible_pitch;
339
340     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
341
342     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
343      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
344      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
345     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
346                p_vout->output.i_width, p_vout->output.i_height,
347                &b_hscale, &i_vscale, p_offset_start );
348
349
350     /*
351      * Perform conversion
352      */
353     i_scale_count = ( i_vscale == 1 ) ?
354                     p_vout->output.i_height : p_vout->render.i_height;
355
356 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
357
358     if( p_vout->render.i_width & 15 )
359     {
360         i_rewind = 16 - ( p_vout->render.i_width & 15 );
361     }
362     else
363     {
364         i_rewind = 0;
365     }
366
367     /*
368     ** SSE2 128 bits fetch/store instructions are faster 
369     ** if memory access is 16 bytes aligned
370     */
371
372     p_buffer = b_hscale ? p_buffer_start : p_pic;
373     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
374                     p_dest->p->i_pitch|
375                     ((int)p_y)|
376                     ((int)p_buffer))) )
377     {
378         /* use faster SSE2 aligned fetch and store */
379         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
380         {
381             p_pic_start = p_pic;
382
383             for ( i_x = p_vout->render.i_width/16; i_x--; )
384             {
385                 SSE2_CALL (
386                     SSE2_INIT_16_ALIGNED
387                     SSE2_YUV_MUL
388                     SSE2_YUV_ADD
389                     SSE2_UNPACK_15_ALIGNED
390                 );
391                 p_y += 16;
392                 p_u += 8;
393                 p_v += 8;
394                 p_buffer += 16;
395             }
396             /* Here we do some unaligned reads and duplicate conversions, but
397              * at least we have all the pixels */
398             if( i_rewind )
399             {
400                 p_y -= i_rewind;
401                 p_u -= i_rewind >> 1;
402                 p_v -= i_rewind >> 1;
403                 p_buffer -= i_rewind;
404
405                 SSE2_CALL (
406                     SSE2_INIT_16_UNALIGNED
407                     SSE2_YUV_MUL
408                     SSE2_YUV_ADD
409                     SSE2_UNPACK_15_UNALIGNED
410                 );
411                 p_y += 16;
412                 p_u += 8;
413                 p_v += 8;
414             }
415             SCALE_WIDTH;
416             SCALE_HEIGHT( 420, 2 );
417
418             p_y += i_source_margin;
419             if( i_y % 2 )
420             {
421                 p_u += i_source_margin_c;
422                 p_v += i_source_margin_c;
423             }
424             p_buffer = b_hscale ? p_buffer_start : p_pic;
425         }
426     }
427     else
428     {
429         /* use slower SSE2 unaligned fetch and store */
430         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
431         {
432             p_pic_start = p_pic;
433             p_buffer = b_hscale ? p_buffer_start : p_pic;
434
435             for ( i_x = p_vout->render.i_width/16; i_x--; )
436             {
437                 SSE2_CALL (
438                     SSE2_INIT_16_UNALIGNED
439                     SSE2_YUV_MUL
440                     SSE2_YUV_ADD
441                     SSE2_UNPACK_15_UNALIGNED
442                 );
443                 p_y += 16;
444                 p_u += 8;
445                 p_v += 8;
446                 p_buffer += 16;
447             }
448             /* Here we do some unaligned reads and duplicate conversions, but
449              * at least we have all the pixels */
450             if( i_rewind )
451             {
452                 p_y -= i_rewind;
453                 p_u -= i_rewind >> 1;
454                 p_v -= i_rewind >> 1;
455                 p_buffer -= i_rewind;
456
457                 SSE2_CALL (
458                     SSE2_INIT_16_UNALIGNED
459                     SSE2_YUV_MUL
460                     SSE2_YUV_ADD
461                     SSE2_UNPACK_15_UNALIGNED
462                 );
463                 p_y += 16;
464                 p_u += 8;
465                 p_v += 8;
466             }
467             SCALE_WIDTH;
468             SCALE_HEIGHT( 420, 2 );
469
470             p_y += i_source_margin;
471             if( i_y % 2 )
472             {
473                 p_u += i_source_margin_c;
474                 p_v += i_source_margin_c;
475             }
476             p_buffer = b_hscale ? p_buffer_start : p_pic;
477         }
478     }
479
480     /* make sure all SSE2 stores are visible thereafter */
481     SSE2_END;
482
483 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
484
485     if( p_vout->render.i_width & 7 )
486     {
487         i_rewind = 8 - ( p_vout->render.i_width & 7 );
488     }
489     else
490     {
491         i_rewind = 0;
492     }
493
494     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
495     {
496         p_pic_start = p_pic;
497         p_buffer = b_hscale ? p_buffer_start : p_pic;
498
499         for ( i_x = p_vout->render.i_width / 8; i_x--; )
500         {
501             MMX_CALL (
502                 MMX_INIT_16
503                 MMX_YUV_MUL
504                 MMX_YUV_ADD
505                 MMX_UNPACK_15
506             );
507             p_y += 8;
508             p_u += 4;
509             p_v += 4;
510             p_buffer += 8;
511         }
512
513         /* Here we do some unaligned reads and duplicate conversions, but
514          * at least we have all the pixels */
515         if( i_rewind )
516         {
517             p_y -= i_rewind;
518             p_u -= i_rewind >> 1;
519             p_v -= i_rewind >> 1;
520             p_buffer -= i_rewind;
521
522             MMX_CALL (
523                 MMX_INIT_16
524                 MMX_YUV_MUL
525                 MMX_YUV_ADD
526                 MMX_UNPACK_15
527             );
528             p_y += 8;
529             p_u += 4;
530             p_v += 4;
531             p_buffer += 8;
532         }
533         SCALE_WIDTH;
534         SCALE_HEIGHT( 420, 2 );
535
536         p_y += i_source_margin;
537         if( i_y % 2 )
538         {
539             p_u += i_source_margin_c;
540             p_v += i_source_margin_c;
541         }
542     }
543     /* re-enable FPU registers */
544     MMX_END;
545
546 #endif
547 }
548
549 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
550                                             picture_t *p_dest )
551 {
552     /* We got this one from the old arguments */
553     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
554     uint8_t  *p_y   = p_src->Y_PIXELS;
555     uint8_t  *p_u   = p_src->U_PIXELS;
556     uint8_t  *p_v   = p_src->V_PIXELS;
557
558     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
559     unsigned int i_vscale;                          /* vertical scaling type */
560     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
561
562     int         i_right_margin;
563     int         i_rewind;
564     int         i_scale_count;                       /* scale modulo counter */
565     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
566     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
567
568     /* Conversion buffer pointer */
569     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
570     uint16_t *  p_buffer;
571
572     /* Offset array pointer */
573     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
574     int *       p_offset;
575
576     const int i_source_margin = p_src->p[0].i_pitch
577                                  - p_src->p[0].i_visible_pitch;
578     const int i_source_margin_c = p_src->p[1].i_pitch
579                                  - p_src->p[1].i_visible_pitch;
580
581     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
582
583     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
584      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
585      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
586     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
587                p_vout->output.i_width, p_vout->output.i_height,
588                &b_hscale, &i_vscale, p_offset_start );
589
590
591     /*
592      * Perform conversion
593      */
594     i_scale_count = ( i_vscale == 1 ) ?
595                     p_vout->output.i_height : p_vout->render.i_height;
596
597 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
598
599     if( p_vout->render.i_width & 15 )
600     {
601         i_rewind = 16 - ( p_vout->render.i_width & 15 );
602     }
603     else
604     {
605         i_rewind = 0;
606     }
607
608     /*
609     ** SSE2 128 bits fetch/store instructions are faster 
610     ** if memory access is 16 bytes aligned
611     */
612
613     p_buffer = b_hscale ? p_buffer_start : p_pic;
614     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
615                     p_dest->p->i_pitch|
616                     ((int)p_y)|
617                     ((int)p_buffer))) )
618     {
619         /* use faster SSE2 aligned fetch and store */
620         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
621         {
622             p_pic_start = p_pic;
623
624             for ( i_x = p_vout->render.i_width/16; i_x--; )
625             {
626                 SSE2_CALL (
627                     SSE2_INIT_16_ALIGNED
628                     SSE2_YUV_MUL
629                     SSE2_YUV_ADD
630                     SSE2_UNPACK_16_ALIGNED
631                 );
632                 p_y += 16;
633                 p_u += 8;
634                 p_v += 8;
635                 p_buffer += 16;
636             }
637             /* Here we do some unaligned reads and duplicate conversions, but
638              * at least we have all the pixels */
639             if( i_rewind )
640             {
641                 p_y -= i_rewind;
642                 p_u -= i_rewind >> 1;
643                 p_v -= i_rewind >> 1;
644                 p_buffer -= i_rewind;
645
646                 SSE2_CALL (
647                     SSE2_INIT_16_UNALIGNED
648                     SSE2_YUV_MUL
649                     SSE2_YUV_ADD
650                     SSE2_UNPACK_16_UNALIGNED
651                 );
652                 p_y += 16;
653                 p_u += 8;
654                 p_v += 8;
655             }
656             SCALE_WIDTH;
657             SCALE_HEIGHT( 420, 2 );
658
659             p_y += i_source_margin;
660             if( i_y % 2 )
661             {
662                 p_u += i_source_margin_c;
663                 p_v += i_source_margin_c;
664             }
665             p_buffer = b_hscale ? p_buffer_start : p_pic;
666         }
667     }
668     else
669     {
670         /* use slower SSE2 unaligned fetch and store */
671         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
672         {
673             p_pic_start = p_pic;
674             p_buffer = b_hscale ? p_buffer_start : p_pic;
675
676             for ( i_x = p_vout->render.i_width/16; i_x--; )
677             {
678                 SSE2_CALL(
679                     SSE2_INIT_16_UNALIGNED
680                     SSE2_YUV_MUL
681                     SSE2_YUV_ADD
682                     SSE2_UNPACK_16_UNALIGNED
683                 );
684                 p_y += 16;
685                 p_u += 8;
686                 p_v += 8;
687                 p_buffer += 16;
688             }
689             /* Here we do some unaligned reads and duplicate conversions, but
690              * at least we have all the pixels */
691             if( i_rewind )
692             {
693                 p_y -= i_rewind;
694                 p_u -= i_rewind >> 1;
695                 p_v -= i_rewind >> 1;
696                 p_buffer -= i_rewind;
697
698                 SSE2_CALL(
699                     SSE2_INIT_16_UNALIGNED
700                     SSE2_YUV_MUL
701                     SSE2_YUV_ADD
702                     SSE2_UNPACK_16_UNALIGNED
703                 );
704                 p_y += 16;
705                 p_u += 8;
706                 p_v += 8;
707             }
708             SCALE_WIDTH;
709             SCALE_HEIGHT( 420, 2 );
710
711             p_y += i_source_margin;
712             if( i_y % 2 )
713             {
714                 p_u += i_source_margin_c;
715                 p_v += i_source_margin_c;
716             }
717             p_buffer = b_hscale ? p_buffer_start : p_pic;
718         }
719     }
720
721     /* make sure all SSE2 stores are visible thereafter */
722     SSE2_END;
723
724 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
725
726     if( p_vout->render.i_width & 7 )
727     {
728         i_rewind = 8 - ( p_vout->render.i_width & 7 );
729     }
730     else
731     {
732         i_rewind = 0;
733     }
734
735     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
736     {
737         p_pic_start = p_pic;
738         p_buffer = b_hscale ? p_buffer_start : p_pic;
739
740         for ( i_x = p_vout->render.i_width / 8; i_x--; )
741         {
742             MMX_CALL (
743                 MMX_INIT_16
744                 MMX_YUV_MUL
745                 MMX_YUV_ADD
746                 MMX_UNPACK_16
747             );
748             p_y += 8;
749             p_u += 4;
750             p_v += 4;
751             p_buffer += 8;
752         }
753
754         /* Here we do some unaligned reads and duplicate conversions, but
755          * at least we have all the pixels */
756         if( i_rewind )
757         {
758             p_y -= i_rewind;
759             p_u -= i_rewind >> 1;
760             p_v -= i_rewind >> 1;
761             p_buffer -= i_rewind;
762
763             MMX_CALL (
764                 MMX_INIT_16
765                 MMX_YUV_MUL
766                 MMX_YUV_ADD
767                 MMX_UNPACK_16
768             );
769             p_y += 8;
770             p_u += 4;
771             p_v += 4;
772             p_buffer += 8;
773         }
774         SCALE_WIDTH;
775         SCALE_HEIGHT( 420, 2 );
776
777         p_y += i_source_margin;
778         if( i_y % 2 )
779         {
780             p_u += i_source_margin_c;
781             p_v += i_source_margin_c;
782         }
783     }
784     /* re-enable FPU registers */
785     MMX_END;
786
787 #endif
788 }
789
790 #endif
791
792 /*****************************************************************************
793  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
794  *****************************************************************************
795  * Horizontal alignment needed:
796  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
797  *  - output: 1 pixel (4 bytes), margins allowed
798  * Vertical alignment needed:
799  *  - input: 2 lines (2 Y lines, 1 U/V line)
800  *  - output: 1 line
801  *****************************************************************************/
802
803 #if defined (MODULE_NAME_IS_i420_rgb)
804
/*
 * I420_RGB32: convert the I420 picture p_src into the 32 bpp RGB picture
 * p_dest (plain C path, compiled only when MODULE_NAME_IS_i420_rgb is
 * defined).
 *
 *  - p_vout: provides the render and output dimensions and the chroma
 *    private data (p_rgb32 table, conversion buffer, offset array).
 *  - p_src:  source picture; its Y, U and V planes are read.
 *  - p_dest: destination picture; its first plane is written as uint32_t.
 *
 * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
 * SCALE_HEIGHT are macros defined outside this file. Several locals below
 * are never referenced directly in this body and are presumably used by
 * those macros by name -- confirm before renaming or removing any.
 */
805 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
806                                             picture_t *p_dest )
807 {
808     /* Destination pixels (viewed as 32-bit words) and the source planes */
809     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
810     uint8_t  *p_y   = p_src->Y_PIXELS;
811     uint8_t  *p_u   = p_src->U_PIXELS;
812     uint8_t  *p_v   = p_src->V_PIXELS;
813
814     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
815     unsigned int i_vscale;                          /* vertical scaling type */
816     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
817
818     int         i_right_margin;   /* destination pitch minus visible pitch */
819     int         i_rewind;      /* pixels to step back before the last group */
820     int         i_scale_count;                       /* scale modulo counter */
821     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
822     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
823     int         i_uval, i_vval;                           /* U and V samples */
824     int         i_red, i_green, i_blue;          /* U and V modified samples */
825     uint32_t *  p_yuv = p_vout->chroma.p_sys->p_rgb32;
826     uint32_t *  p_ybase;                     /* Y dependent conversion table */
827
828     /* Conversion buffer pointer */
829     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
830     uint32_t *  p_buffer;
831
832     /* Offset array pointer */
833     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
834     int *       p_offset;
835
        /* Bytes of padding at the end of each luma / chroma source line */
836     const int i_source_margin = p_src->p[0].i_pitch
837                                  - p_src->p[0].i_visible_pitch;
838     const int i_source_margin_c = p_src->p[1].i_pitch
839                                  - p_src->p[1].i_visible_pitch;
840
841     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
842
        /* i_rewind is how many pixels the render width falls short of the next
         * multiple of 8 (0 when it already is a multiple of 8) */
843     if( p_vout->render.i_width & 7 )
844     {
845         i_rewind = 8 - ( p_vout->render.i_width & 7 );
846     }
847     else
848     {
849         i_rewind = 0;
850     }
851
852     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
853      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
854      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
        /* SetOffset (defined later in this file) receives the render and output
         * sizes and fills b_hscale, i_vscale and the offset array */
855     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
856                p_vout->output.i_width, p_vout->output.i_height,
857                &b_hscale, &i_vscale, p_offset_start );
858
859     /*
860      * Perform conversion
861      */
862     i_scale_count = ( i_vscale == 1 ) ?
863                     p_vout->output.i_height : p_vout->render.i_height;
864     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
865     {
866         p_pic_start = p_pic;
            /* With horizontal scaling the line is first converted into the
             * intermediate buffer, otherwise straight into the picture */
867         p_buffer = b_hscale ? p_buffer_start : p_pic;
868
            /* Whole groups of 8 pixels: four YUV/Y macro pairs per iteration */
869         for ( i_x = p_vout->render.i_width / 8; i_x--; )
870         {
871             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
872             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
873             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
874             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
875         }
876
877         /* Here we do some unaligned reads and duplicate conversions, but
878          * at least we have all the pixels */
879         if( i_rewind )
880         {
                /* Step back so that one more full group of 8 ends exactly at
                 * the last pixel of the line (chroma steps back half as far) */
881             p_y -= i_rewind;
882             p_u -= i_rewind >> 1;
883             p_v -= i_rewind >> 1;
884             p_buffer -= i_rewind;
885             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
886             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
887             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
888             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
889         }
890         SCALE_WIDTH;
891         SCALE_HEIGHT( 420, 4 );
892
            /* Skip the luma line padding after every line; the chroma padding
             * is skipped only after odd lines */
893         p_y += i_source_margin;
894         if( i_y % 2 )
895         {
896             p_u += i_source_margin_c;
897             p_v += i_source_margin_c;
898         }
899     }
900 }
901
902 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
903
904 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
905                                             picture_t *p_dest )
906 {
907     /* We got this one from the old arguments */
908     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
909     uint8_t  *p_y   = p_src->Y_PIXELS;
910     uint8_t  *p_u   = p_src->U_PIXELS;
911     uint8_t  *p_v   = p_src->V_PIXELS;
912
913     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
914     unsigned int i_vscale;                          /* vertical scaling type */
915     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
916
917     int         i_right_margin;
918     int         i_rewind;
919     int         i_scale_count;                       /* scale modulo counter */
920     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
921     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
922     /* Conversion buffer pointer */
923     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
924     uint32_t *  p_buffer;
925
926     /* Offset array pointer */
927     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
928     int *       p_offset;
929
930     const int i_source_margin = p_src->p[0].i_pitch
931                                  - p_src->p[0].i_visible_pitch;
932     const int i_source_margin_c = p_src->p[1].i_pitch
933                                  - p_src->p[1].i_visible_pitch;
934
935     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
936
937     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
938      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
939      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
940     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
941                p_vout->output.i_width, p_vout->output.i_height,
942                &b_hscale, &i_vscale, p_offset_start );
943
944     /*
945      * Perform conversion
946      */
947     i_scale_count = ( i_vscale == 1 ) ?
948                     p_vout->output.i_height : p_vout->render.i_height;
949
950 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
951
952     if( p_vout->render.i_width & 15 )
953     {
954         i_rewind = 16 - ( p_vout->render.i_width & 15 );
955     }
956     else
957     {
958         i_rewind = 0;
959     }
960
961     /*
962     ** SSE2 128 bits fetch/store instructions are faster 
963     ** if memory access is 16 bytes aligned
964     */
965
966     p_buffer = b_hscale ? p_buffer_start : p_pic;
967     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
968                     p_dest->p->i_pitch|
969                     ((int)p_y)|
970                     ((int)p_buffer))) )
971     {
972         /* use faster SSE2 aligned fetch and store */
973         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
974         {
975             p_pic_start = p_pic;
976
977             for ( i_x = p_vout->render.i_width / 16; i_x--; )
978             {
979                 SSE2_CALL (
980                     SSE2_INIT_32_ALIGNED
981                     SSE2_YUV_MUL
982                     SSE2_YUV_ADD
983                     SSE2_UNPACK_32_ARGB_ALIGNED
984                 );
985                 p_y += 16;
986                 p_u += 8;
987                 p_v += 8;
988                 p_buffer += 16;
989             }
990
991             /* Here we do some unaligned reads and duplicate conversions, but
992              * at least we have all the pixels */
993             if( i_rewind )
994             {
995                 p_y -= i_rewind;
996                 p_u -= i_rewind >> 1;
997                 p_v -= i_rewind >> 1;
998                 p_buffer -= i_rewind;
999                 SSE2_CALL (
1000                     SSE2_INIT_32_UNALIGNED
1001                     SSE2_YUV_MUL
1002                     SSE2_YUV_ADD
1003                     SSE2_UNPACK_32_ARGB_UNALIGNED
1004                 );
1005                 p_y += 16;
1006                 p_u += 4;
1007                 p_v += 4;
1008             }
1009             SCALE_WIDTH;
1010             SCALE_HEIGHT( 420, 4 );
1011
1012             p_y += i_source_margin;
1013             if( i_y % 2 )
1014             {
1015                 p_u += i_source_margin_c;
1016                 p_v += i_source_margin_c;
1017             }
1018             p_buffer = b_hscale ? p_buffer_start : p_pic;
1019         }
1020     }
1021     else
1022     {
1023         /* use slower SSE2 unaligned fetch and store */
1024         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1025         {
1026             p_pic_start = p_pic;
1027             p_buffer = b_hscale ? p_buffer_start : p_pic;
1028
1029             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1030             {
1031                 SSE2_CALL (
1032                     SSE2_INIT_32_UNALIGNED
1033                     SSE2_YUV_MUL
1034                     SSE2_YUV_ADD
1035                     SSE2_UNPACK_32_ARGB_UNALIGNED
1036                 );
1037                 p_y += 16;
1038                 p_u += 8;
1039                 p_v += 8;
1040                 p_buffer += 16;
1041             }
1042
1043             /* Here we do some unaligned reads and duplicate conversions, but
1044              * at least we have all the pixels */
1045             if( i_rewind )
1046             {
1047                 p_y -= i_rewind;
1048                 p_u -= i_rewind >> 1;
1049                 p_v -= i_rewind >> 1;
1050                 p_buffer -= i_rewind;
1051                 SSE2_CALL (
1052                     SSE2_INIT_32_UNALIGNED
1053                     SSE2_YUV_MUL
1054                     SSE2_YUV_ADD
1055                     SSE2_UNPACK_32_ARGB_UNALIGNED
1056                 );
1057                 p_y += 16;
1058                 p_u += 8;
1059                 p_v += 8;
1060             }
1061             SCALE_WIDTH;
1062             SCALE_HEIGHT( 420, 4 );
1063
1064             p_y += i_source_margin;
1065             if( i_y % 2 )
1066             {
1067                 p_u += i_source_margin_c;
1068                 p_v += i_source_margin_c;
1069             }
1070             p_buffer = b_hscale ? p_buffer_start : p_pic;
1071         }
1072     }
1073
1074     /* make sure all SSE2 stores are visible thereafter */
1075     SSE2_END;
1076
1077 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1078
1079     if( p_vout->render.i_width & 7 )
1080     {
1081         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1082     }
1083     else
1084     {
1085         i_rewind = 0;
1086     }
1087
1088     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1089     {
1090         p_pic_start = p_pic;
1091         p_buffer = b_hscale ? p_buffer_start : p_pic;
1092
1093         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1094         {
1095             MMX_CALL (
1096                 MMX_INIT_32
1097                 MMX_YUV_MUL
1098                 MMX_YUV_ADD
1099                 MMX_UNPACK_32_ARGB
1100             );
1101             p_y += 8;
1102             p_u += 4;
1103             p_v += 4;
1104             p_buffer += 8;
1105         }
1106
1107         /* Here we do some unaligned reads and duplicate conversions, but
1108          * at least we have all the pixels */
1109         if( i_rewind )
1110         {
1111             p_y -= i_rewind;
1112             p_u -= i_rewind >> 1;
1113             p_v -= i_rewind >> 1;
1114             p_buffer -= i_rewind;
1115             MMX_CALL (
1116                 MMX_INIT_32
1117                 MMX_YUV_MUL
1118                 MMX_YUV_ADD
1119                 MMX_UNPACK_32_ARGB
1120             );
1121             p_y += 8;
1122             p_u += 4;
1123             p_v += 4;
1124             p_buffer += 8;
1125         }
1126         SCALE_WIDTH;
1127         SCALE_HEIGHT( 420, 4 );
1128
1129         p_y += i_source_margin;
1130         if( i_y % 2 )
1131         {
1132             p_u += i_source_margin_c;
1133             p_v += i_source_margin_c;
1134         }
1135     }
1136
1137     /* re-enable FPU registers */
1138     MMX_END;
1139
1140 #endif
1141 }
1142
1143 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1144                                             picture_t *p_dest )
1145 {
1146     /* We got this one from the old arguments */
1147     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1148     uint8_t  *p_y   = p_src->Y_PIXELS;
1149     uint8_t  *p_u   = p_src->U_PIXELS;
1150     uint8_t  *p_v   = p_src->V_PIXELS;
1151
1152     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1153     unsigned int i_vscale;                          /* vertical scaling type */
1154     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1155
1156     int         i_right_margin;
1157     int         i_rewind;
1158     int         i_scale_count;                       /* scale modulo counter */
1159     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1160     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1161     /* Conversion buffer pointer */
1162     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1163     uint32_t *  p_buffer;
1164
1165     /* Offset array pointer */
1166     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1167     int *       p_offset;
1168
1169     const int i_source_margin = p_src->p[0].i_pitch
1170                                  - p_src->p[0].i_visible_pitch;
1171     const int i_source_margin_c = p_src->p[1].i_pitch
1172                                  - p_src->p[1].i_visible_pitch;
1173
1174     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1175
1176     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1177      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1178      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1179     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1180                p_vout->output.i_width, p_vout->output.i_height,
1181                &b_hscale, &i_vscale, p_offset_start );
1182
1183     /*
1184      * Perform conversion
1185      */
1186     i_scale_count = ( i_vscale == 1 ) ?
1187                     p_vout->output.i_height : p_vout->render.i_height;
1188
1189 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1190
1191     if( p_vout->render.i_width & 15 )
1192     {
1193         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1194     }
1195     else
1196     {
1197         i_rewind = 0;
1198     }
1199
1200     /*
1201     ** SSE2 128 bits fetch/store instructions are faster 
1202     ** if memory access is 16 bytes aligned
1203     */
1204
1205     p_buffer = b_hscale ? p_buffer_start : p_pic;
1206     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1207                     p_dest->p->i_pitch|
1208                     ((int)p_y)|
1209                     ((int)p_buffer))) )
1210     {
1211         /* use faster SSE2 aligned fetch and store */
1212         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1213         {
1214             p_pic_start = p_pic;
1215
1216             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1217             {
1218                 SSE2_CALL (
1219                     SSE2_INIT_32_ALIGNED
1220                     SSE2_YUV_MUL
1221                     SSE2_YUV_ADD
1222                     SSE2_UNPACK_32_BGRA_ALIGNED
1223                 );
1224                 p_y += 16;
1225                 p_u += 8;
1226                 p_v += 8;
1227                 p_buffer += 16;
1228             }
1229
1230             /* Here we do some unaligned reads and duplicate conversions, but
1231              * at least we have all the pixels */
1232             if( i_rewind )
1233             {
1234                 p_y -= i_rewind;
1235                 p_u -= i_rewind >> 1;
1236                 p_v -= i_rewind >> 1;
1237                 p_buffer -= i_rewind;
1238                 SSE2_CALL (
1239                     SSE2_INIT_32_UNALIGNED
1240                     SSE2_YUV_MUL
1241                     SSE2_YUV_ADD
1242                     SSE2_UNPACK_32_BGRA_UNALIGNED
1243                 );
1244                 p_y += 16;
1245                 p_u += 4;
1246                 p_v += 4;
1247             }
1248             SCALE_WIDTH;
1249             SCALE_HEIGHT( 420, 4 );
1250
1251             p_y += i_source_margin;
1252             if( i_y % 2 )
1253             {
1254                 p_u += i_source_margin_c;
1255                 p_v += i_source_margin_c;
1256             }
1257             p_buffer = b_hscale ? p_buffer_start : p_pic;
1258         }
1259     }
1260     else
1261     {
1262         /* use slower SSE2 unaligned fetch and store */
1263         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1264         {
1265             p_pic_start = p_pic;
1266             p_buffer = b_hscale ? p_buffer_start : p_pic;
1267
1268             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1269             {
1270                 SSE2_CALL (
1271                     SSE2_INIT_32_UNALIGNED
1272                     SSE2_YUV_MUL
1273                     SSE2_YUV_ADD
1274                     SSE2_UNPACK_32_BGRA_UNALIGNED
1275                 );
1276                 p_y += 16;
1277                 p_u += 8;
1278                 p_v += 8;
1279                 p_buffer += 16;
1280             }
1281
1282             /* Here we do some unaligned reads and duplicate conversions, but
1283              * at least we have all the pixels */
1284             if( i_rewind )
1285             {
1286                 p_y -= i_rewind;
1287                 p_u -= i_rewind >> 1;
1288                 p_v -= i_rewind >> 1;
1289                 p_buffer -= i_rewind;
1290                 SSE2_CALL (
1291                     SSE2_INIT_32_UNALIGNED
1292                     SSE2_YUV_MUL
1293                     SSE2_YUV_ADD
1294                     SSE2_UNPACK_32_BGRA_UNALIGNED
1295                 );
1296                 p_y += 16;
1297                 p_u += 8;
1298                 p_v += 8;
1299             }
1300             SCALE_WIDTH;
1301             SCALE_HEIGHT( 420, 4 );
1302
1303             p_y += i_source_margin;
1304             if( i_y % 2 )
1305             {
1306                 p_u += i_source_margin_c;
1307                 p_v += i_source_margin_c;
1308             }
1309             p_buffer = b_hscale ? p_buffer_start : p_pic;
1310         }
1311     }
1312
1313 #else
1314
1315     if( p_vout->render.i_width & 7 )
1316     {
1317         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1318     }
1319     else
1320     {
1321         i_rewind = 0;
1322     }
1323
1324     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1325     {
1326         p_pic_start = p_pic;
1327         p_buffer = b_hscale ? p_buffer_start : p_pic;
1328
1329         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1330         {
1331             MMX_CALL (
1332                 MMX_INIT_32
1333                 MMX_YUV_MUL
1334                 MMX_YUV_ADD
1335                 MMX_UNPACK_32_BGRA
1336             );
1337             p_y += 8;
1338             p_u += 4;
1339             p_v += 4;
1340             p_buffer += 8;
1341         }
1342
1343         /* Here we do some unaligned reads and duplicate conversions, but
1344          * at least we have all the pixels */
1345         if( i_rewind )
1346         {
1347             p_y -= i_rewind;
1348             p_u -= i_rewind >> 1;
1349             p_v -= i_rewind >> 1;
1350             p_buffer -= i_rewind;
1351             MMX_CALL (
1352                 MMX_INIT_32
1353                 MMX_YUV_MUL
1354                 MMX_YUV_ADD
1355                 MMX_UNPACK_32_BGRA
1356             );
1357             p_y += 8;
1358             p_u += 4;
1359             p_v += 4;
1360             p_buffer += 8;
1361         }
1362         SCALE_WIDTH;
1363         SCALE_HEIGHT( 420, 4 );
1364
1365         p_y += i_source_margin;
1366         if( i_y % 2 )
1367         {
1368             p_u += i_source_margin_c;
1369             p_v += i_source_margin_c;
1370         }
1371     }
1372
1373     /* re-enable FPU registers */
1374     MMX_END;
1375
1376 #endif
1377 }
1378
1379 void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
1380                                             picture_t *p_dest )
1381 {
1382     /* We got this one from the old arguments */
1383     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1384     uint8_t  *p_y   = p_src->Y_PIXELS;
1385     uint8_t  *p_u   = p_src->U_PIXELS;
1386     uint8_t  *p_v   = p_src->V_PIXELS;
1387
1388     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1389     unsigned int i_vscale;                          /* vertical scaling type */
1390     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1391
1392     int         i_right_margin;
1393     int         i_rewind;
1394     int         i_scale_count;                       /* scale modulo counter */
1395     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1396     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1397     /* Conversion buffer pointer */
1398     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1399     uint32_t *  p_buffer;
1400
1401     /* Offset array pointer */
1402     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1403     int *       p_offset;
1404
1405     const int i_source_margin = p_src->p[0].i_pitch
1406                                  - p_src->p[0].i_visible_pitch;
1407     const int i_source_margin_c = p_src->p[1].i_pitch
1408                                  - p_src->p[1].i_visible_pitch;
1409
1410     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1411
1412     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1413      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1414      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1415     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1416                p_vout->output.i_width, p_vout->output.i_height,
1417                &b_hscale, &i_vscale, p_offset_start );
1418
1419     /*
1420      * Perform conversion
1421      */
1422     i_scale_count = ( i_vscale == 1 ) ?
1423                     p_vout->output.i_height : p_vout->render.i_height;
1424
1425 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1426
1427     if( p_vout->render.i_width & 15 )
1428     {
1429         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1430     }
1431     else
1432     {
1433         i_rewind = 0;
1434     }
1435
1436     /*
1437     ** SSE2 128 bits fetch/store instructions are faster 
1438     ** if memory access is 16 bytes aligned
1439     */
1440
1441     p_buffer = b_hscale ? p_buffer_start : p_pic;
1442     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1443                     p_dest->p->i_pitch|
1444                     ((int)p_y)|
1445                     ((int)p_buffer))) )
1446     {
1447         /* use faster SSE2 aligned fetch and store */
1448         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1449         {
1450             p_pic_start = p_pic;
1451
1452             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1453             {
1454                 SSE2_CALL (
1455                     SSE2_INIT_32_ALIGNED
1456                     SSE2_YUV_MUL
1457                     SSE2_YUV_ADD
1458                     SSE2_UNPACK_32_ABGR_ALIGNED
1459                 );
1460                 p_y += 16;
1461                 p_u += 8;
1462                 p_v += 8;
1463                 p_buffer += 16;
1464             }
1465
1466             /* Here we do some unaligned reads and duplicate conversions, but
1467              * at least we have all the pixels */
1468             if( i_rewind )
1469             {
1470                 p_y -= i_rewind;
1471                 p_u -= i_rewind >> 1;
1472                 p_v -= i_rewind >> 1;
1473                 p_buffer -= i_rewind;
1474                 SSE2_CALL (
1475                     SSE2_INIT_32_UNALIGNED
1476                     SSE2_YUV_MUL
1477                     SSE2_YUV_ADD
1478                     SSE2_UNPACK_32_ABGR_UNALIGNED
1479                 );
1480                 p_y += 16;
1481                 p_u += 4;
1482                 p_v += 4;
1483             }
1484             SCALE_WIDTH;
1485             SCALE_HEIGHT( 420, 4 );
1486
1487             p_y += i_source_margin;
1488             if( i_y % 2 )
1489             {
1490                 p_u += i_source_margin_c;
1491                 p_v += i_source_margin_c;
1492             }
1493             p_buffer = b_hscale ? p_buffer_start : p_pic;
1494         }
1495     }
1496     else
1497     {
1498         /* use slower SSE2 unaligned fetch and store */
1499         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1500         {
1501             p_pic_start = p_pic;
1502             p_buffer = b_hscale ? p_buffer_start : p_pic;
1503
1504             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1505             {
1506                 SSE2_CALL (
1507                     SSE2_INIT_32_UNALIGNED
1508                     SSE2_YUV_MUL
1509                     SSE2_YUV_ADD
1510                     SSE2_UNPACK_32_ABGR_UNALIGNED
1511                 );
1512                 p_y += 16;
1513                 p_u += 8;
1514                 p_v += 8;
1515                 p_buffer += 16;
1516             }
1517
1518             /* Here we do some unaligned reads and duplicate conversions, but
1519              * at least we have all the pixels */
1520             if( i_rewind )
1521             {
1522                 p_y -= i_rewind;
1523                 p_u -= i_rewind >> 1;
1524                 p_v -= i_rewind >> 1;
1525                 p_buffer -= i_rewind;
1526                 SSE2_CALL (
1527                     SSE2_INIT_32_UNALIGNED
1528                     SSE2_YUV_MUL
1529                     SSE2_YUV_ADD
1530                     SSE2_UNPACK_32_ABGR_UNALIGNED
1531                 );
1532                 p_y += 16;
1533                 p_u += 8;
1534                 p_v += 8;
1535             }
1536             SCALE_WIDTH;
1537             SCALE_HEIGHT( 420, 4 );
1538
1539             p_y += i_source_margin;
1540             if( i_y % 2 )
1541             {
1542                 p_u += i_source_margin_c;
1543                 p_v += i_source_margin_c;
1544             }
1545             p_buffer = b_hscale ? p_buffer_start : p_pic;
1546         }
1547     }
1548
1549 #else
1550
1551     if( p_vout->render.i_width & 7 )
1552     {
1553         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1554     }
1555     else
1556     {
1557         i_rewind = 0;
1558     }
1559
1560     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1561     {
1562         p_pic_start = p_pic;
1563         p_buffer = b_hscale ? p_buffer_start : p_pic;
1564
1565         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1566         {
1567             MMX_CALL (
1568                 MMX_INIT_32
1569                 MMX_YUV_MUL
1570                 MMX_YUV_ADD
1571                 MMX_UNPACK_32_ABGR
1572             );
1573             p_y += 8;
1574             p_u += 4;
1575             p_v += 4;
1576             p_buffer += 8;
1577         }
1578
1579         /* Here we do some unaligned reads and duplicate conversions, but
1580          * at least we have all the pixels */
1581         if( i_rewind )
1582         {
1583             p_y -= i_rewind;
1584             p_u -= i_rewind >> 1;
1585             p_v -= i_rewind >> 1;
1586             p_buffer -= i_rewind;
1587             MMX_CALL (
1588                 MMX_INIT_32
1589                 MMX_YUV_MUL
1590                 MMX_YUV_ADD
1591                 MMX_UNPACK_32_ABGR
1592             );
1593             p_y += 8;
1594             p_u += 4;
1595             p_v += 4;
1596             p_buffer += 8;
1597         }
1598         SCALE_WIDTH;
1599         SCALE_HEIGHT( 420, 4 );
1600
1601         p_y += i_source_margin;
1602         if( i_y % 2 )
1603         {
1604             p_u += i_source_margin_c;
1605             p_v += i_source_margin_c;
1606         }
1607     }
1608
1609     /* re-enable FPU registers */
1610     MMX_END;
1611
1612 #endif
1613 }
1614
1615 #endif
1616
1617 /* Following functions are local */
1618
1619 /*****************************************************************************
1620  * SetOffset: build offset array for conversion functions
1621  *****************************************************************************
1622  * This function will build an offset array used in later conversion functions.
1623  * It will also set horizontal and vertical scaling indicators.
1624  *****************************************************************************/
1625 static void SetOffset( int i_width, int i_height, int i_pic_width,
1626                        int i_pic_height, vlc_bool_t *pb_hscale,
1627                        unsigned int *pi_vscale, int *p_offset )
1628 {
1629     int i_x;                                    /* x position in destination */
1630     int i_scale_count;                                     /* modulo counter */
1631
1632     /*
1633      * Prepare horizontal offset array
1634      */
1635     if( i_pic_width - i_width == 0 )
1636     {
1637         /* No horizontal scaling: YUV conversion is done directly to picture */
1638         *pb_hscale = 0;
1639     }
1640     else if( i_pic_width - i_width > 0 )
1641     {
1642         /* Prepare scaling array for horizontal extension */
1643         *pb_hscale = 1;
1644         i_scale_count = i_pic_width;
1645         for( i_x = i_width; i_x--; )
1646         {
1647             while( (i_scale_count -= i_width) > 0 )
1648             {
1649                 *p_offset++ = 0;
1650             }
1651             *p_offset++ = 1;
1652             i_scale_count += i_pic_width;
1653         }
1654     }
1655     else /* if( i_pic_width - i_width < 0 ) */
1656     {
1657         /* Prepare scaling array for horizontal reduction */
1658         *pb_hscale = 1;
1659         i_scale_count = i_width;
1660         for( i_x = i_pic_width; i_x--; )
1661         {
1662             *p_offset = 1;
1663             while( (i_scale_count -= i_pic_width) > 0 )
1664             {
1665                 *p_offset += 1;
1666             }
1667             p_offset++;
1668             i_scale_count += i_width;
1669         }
1670     }
1671
1672     /*
1673      * Set vertical scaling indicator
1674      */
1675     if( i_pic_height - i_height == 0 )
1676     {
1677         *pi_vscale = 0;
1678     }
1679     else if( i_pic_height - i_height > 0 )
1680     {
1681         *pi_vscale = 1;
1682     }
1683     else /* if( i_pic_height - i_height < 0 ) */
1684     {
1685         *pi_vscale = -1;
1686     }
1687 }
1688