git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
video_chroma: a few SSE2 fixes
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  * 
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28 #include <string.h>                                            /* strerror() */
29 #include <stdlib.h>                                      /* malloc(), free() */
30
31 #include <vlc/vlc.h>
32 #include <vlc_vout.h>
33
34 #include "i420_rgb.h"
35 #if defined (MODULE_NAME_IS_i420_rgb)
36 #   include "i420_rgb_c.h"
37 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
38 #   if defined(HAVE_MMX_INTRINSICS)
39 #       include <mmintrin.h>
40 #   endif
41 #   include "i420_rgb_mmx.h"
42 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
43 #   if defined(HAVE_SSE2_INTRINSICS)
44 #       include <emmintrin.h>
45 #   endif
46 #   include "i420_rgb_mmx.h"
47 #endif
48
49 static void SetOffset( int, int, int, int, vlc_bool_t *,
50                        unsigned int *, int * );
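/* As its call sites below suggest, SetOffset() compares the render and output
 * sizes, selects the horizontal (b_hscale) and vertical (i_vscale) scaling
 * modes, and fills the offset table walked by the SCALE_WIDTH macro; being
 * static, it is defined further down in this file. */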
51
52 #if defined (MODULE_NAME_IS_i420_rgb)
53 /*****************************************************************************
54  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp with dithering
55  *****************************************************************************
56  * Horizontal alignment needed:
57  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
58  *  - output: 1 pixel (2 bytes), margins allowed
59  * Vertical alignment needed:
60  *  - input: 2 lines (2 Y lines, 1 U/V line)
61  *  - output: 1 line
62  *****************************************************************************/
63 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
64                                                       picture_t *p_dest )
65 {
66     /* We got this one from the old arguments */
67     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
68     uint8_t  *p_y   = p_src->Y_PIXELS;
69     uint8_t  *p_u   = p_src->U_PIXELS;
70     uint8_t  *p_v   = p_src->V_PIXELS;
71
72     vlc_bool_t   b_hscale;                        /* horizontal scaling type */
73     unsigned int i_vscale;                          /* vertical scaling type */
74     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
75     unsigned int i_real_y;                                          /* y % 4 */
76
77     int         i_right_margin;
78     int         i_rewind;
79     int         i_scale_count;                       /* scale modulo counter */
80     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
81     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
82     int         i_uval, i_vval;                           /* U and V samples */
83     int         i_red, i_green, i_blue;          /* U and V modified samples */
84     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
85     uint16_t *  p_ybase;                     /* Y dependent conversion table */
86
87     /* Conversion buffer pointer */
88     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
89     uint16_t *  p_buffer;
90
91     /* Offset array pointer */
92     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
93     int *       p_offset;
94
95     const int i_source_margin = p_src->p[0].i_pitch
96                                  - p_src->p[0].i_visible_pitch;
97     const int i_source_margin_c = p_src->p[1].i_pitch
98                                  - p_src->p[1].i_visible_pitch;
99
100     /* The dithering matrices */
101     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
102     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
103     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
104     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
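    /* These are the rows of the standard 4x4 Bayer ordered-dither matrix
     * (thresholds 0..15); the loop below shifts each threshold left by
     * (SHIFT - 4 + i_rrshift) so that it sits at the fixed-point scale used
     * by CONVERT_YUV_PIXEL_DITHER for the output's red depth. */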
105
106     for(i_x = 0; i_x < 4; i_x++)
107     {
108         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
109         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
110         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
111         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
112     }
113
114     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
115
116     if( p_vout->render.i_width & 7 )
117     {
118         i_rewind = 8 - ( p_vout->render.i_width & 7 );
119     }
120     else
121     {
122         i_rewind = 0;
123     }
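    /* i_rewind is how far the pointers must step back so that the final
     * 8-pixel batch still ends exactly on the last column when the width is
     * not a multiple of 8. */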
124
125     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
126      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
127      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
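    /* For instance (illustrative numbers only): a 720x576 source rendered on
     * a 768x576 output with the same aspect ratio (r2/r1 = 1) and x1' = 768
     * gives y1' = 768 * 576/768 * 1 = 576. */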
128     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
129                p_vout->output.i_width, p_vout->output.i_height,
130                &b_hscale, &i_vscale, p_offset_start );
131
132     /*
133      * Perform conversion
134      */
135     i_scale_count = ( i_vscale == 1 ) ?
136                     p_vout->output.i_height : p_vout->render.i_height;
137     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
138     {
139         i_real_y = i_y & 0x3;
140         p_pic_start = p_pic;
141         p_buffer = b_hscale ? p_buffer_start : p_pic;
142
143         for ( i_x = p_vout->render.i_width / 8; i_x--; )
144         {
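            /* Chroma is horizontally subsampled, so a full YUV fetch is only
             * done for every second pixel; the CONVERT_Y_PIXEL_DITHER steps
             * reuse the chroma-derived values from the preceding pixel. */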
145             int *p_dither = dither10;
146             CONVERT_YUV_PIXEL_DITHER(2);
147             p_dither = dither11;
148             CONVERT_Y_PIXEL_DITHER(2);
149             p_dither = dither12;
150             CONVERT_YUV_PIXEL_DITHER(2);
151             p_dither = dither13;
152             CONVERT_Y_PIXEL_DITHER(2);
153             p_dither = dither10;
154             CONVERT_YUV_PIXEL_DITHER(2);
155             p_dither = dither11;
156             CONVERT_Y_PIXEL_DITHER(2);
157             p_dither = dither12;
158             CONVERT_YUV_PIXEL_DITHER(2);
159             p_dither = dither13;
160             CONVERT_Y_PIXEL_DITHER(2);
161         }
162
163         /* Here we do some unaligned reads and duplicate conversions, but
164          * at least we have all the pixels */
165         if( i_rewind )
166         {
167             int *p_dither = dither10;
168             p_y -= i_rewind;
169             p_u -= i_rewind >> 1;
170             p_v -= i_rewind >> 1;
171             p_buffer -= i_rewind;
172             CONVERT_YUV_PIXEL_DITHER(2);
173             p_dither = dither11;
174             CONVERT_Y_PIXEL_DITHER(2);
175             p_dither = dither12;
176             CONVERT_YUV_PIXEL_DITHER(2);
177             p_dither = dither13;
178             CONVERT_Y_PIXEL_DITHER(2);
179             p_dither = dither10;
180             CONVERT_YUV_PIXEL_DITHER(2);
181             p_dither = dither11;
182             CONVERT_Y_PIXEL_DITHER(2);
183             p_dither = dither12;
184             CONVERT_YUV_PIXEL_DITHER(2);
185             p_dither = dither13;
186             CONVERT_Y_PIXEL_DITHER(2);
187         }
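        /* SCALE_WIDTH stretches the converted line from the intermediate
         * buffer into the picture when horizontal scaling is enabled, and
         * SCALE_HEIGHT duplicates or drops lines according to i_scale_count
         * (both are macros from the i420_rgb_*.h headers). */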
188         SCALE_WIDTH;
189         SCALE_HEIGHT( 420, 2 );
190
191         p_y += i_source_margin;
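        /* in 4:2:0 the chroma planes have one line per two luma lines, so
         * their padding is only skipped on every other pass */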
192         if( i_y % 2 )
193         {
194             p_u += i_source_margin_c;
195             p_v += i_source_margin_c;
196         }
197     }
198 }
199 #endif
200
201 /*****************************************************************************
202  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
203  *****************************************************************************
204  * Horizontal alignment needed:
205  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
206  *  - output: 1 pixel (2 bytes), margins allowed
207  * Vertical alignment needed:
208  *  - input: 2 lines (2 Y lines, 1 U/V line)
209  *  - output: 1 line
210  *****************************************************************************/
211
212 #if defined (MODULE_NAME_IS_i420_rgb)
213
214 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
215                                             picture_t *p_dest )
216 {
217     /* We got this one from the old arguments */
218     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
219     uint8_t  *p_y   = p_src->Y_PIXELS;
220     uint8_t  *p_u   = p_src->U_PIXELS;
221     uint8_t  *p_v   = p_src->V_PIXELS;
222
223     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
224     unsigned int i_vscale;                          /* vertical scaling type */
225     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
226
227     int         i_right_margin;
228     int         i_rewind;
229     int         i_scale_count;                       /* scale modulo counter */
230     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
231     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
232     int         i_uval, i_vval;                           /* U and V samples */
233     int         i_red, i_green, i_blue;          /* U and V modified samples */
234     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
235     uint16_t *  p_ybase;                     /* Y dependent conversion table */
236
237     /* Conversion buffer pointer */
238     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
239     uint16_t *  p_buffer;
240
241     /* Offset array pointer */
242     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
243     int *       p_offset;
244
245     const int i_source_margin = p_src->p[0].i_pitch
246                                  - p_src->p[0].i_visible_pitch;
247     const int i_source_margin_c = p_src->p[1].i_pitch
248                                  - p_src->p[1].i_visible_pitch;
249
250     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
251
252     if( p_vout->render.i_width & 7 )
253     {
254         i_rewind = 8 - ( p_vout->render.i_width & 7 );
255     }
256     else
257     {
258         i_rewind = 0;
259     }
260
261     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
262      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
263      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
264     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
265                p_vout->output.i_width, p_vout->output.i_height,
266                &b_hscale, &i_vscale, p_offset_start );
267
268     /*
269      * Perform conversion
270      */
271     i_scale_count = ( i_vscale == 1 ) ?
272                     p_vout->output.i_height : p_vout->render.i_height;
273     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
274     {
275         p_pic_start = p_pic;
276         p_buffer = b_hscale ? p_buffer_start : p_pic;
277
278         for ( i_x = p_vout->render.i_width / 8; i_x--; )
279         {
280             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
281             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
282             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
283             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
284         }
285
286         /* Here we do some unaligned reads and duplicate conversions, but
287          * at least we have all the pixels */
288         if( i_rewind )
289         {
290             p_y -= i_rewind;
291             p_u -= i_rewind >> 1;
292             p_v -= i_rewind >> 1;
293             p_buffer -= i_rewind;
294
295             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
296             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
297             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
298             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
299         }
300         SCALE_WIDTH;
301         SCALE_HEIGHT( 420, 2 );
302
303         p_y += i_source_margin;
304         if( i_y % 2 )
305         {
306             p_u += i_source_margin_c;
307             p_v += i_source_margin_c;
308         }
309     }
310 }
311
312 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
313
314 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
315                                             picture_t *p_dest )
316 {
317     /* We got this one from the old arguments */
318     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
319     uint8_t  *p_y   = p_src->Y_PIXELS;
320     uint8_t  *p_u   = p_src->U_PIXELS;
321     uint8_t  *p_v   = p_src->V_PIXELS;
322
323     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
324     unsigned int i_vscale;                          /* vertical scaling type */
325     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
326
327     int         i_right_margin;
328     int         i_rewind;
329     int         i_scale_count;                       /* scale modulo counter */
330     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
331     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
332
333     /* Conversion buffer pointer */
334     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
335     uint16_t *  p_buffer;
336
337     /* Offset array pointer */
338     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
339     int *       p_offset;
340
341     const int i_source_margin = p_src->p[0].i_pitch
342                                  - p_src->p[0].i_visible_pitch;
343     const int i_source_margin_c = p_src->p[1].i_pitch
344                                  - p_src->p[1].i_visible_pitch;
345
346     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
347
348     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
349      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
350      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
351     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
352                p_vout->output.i_width, p_vout->output.i_height,
353                &b_hscale, &i_vscale, p_offset_start );
354
355
356     /*
357      * Perform conversion
358      */
359     i_scale_count = ( i_vscale == 1 ) ?
360                     p_vout->output.i_height : p_vout->render.i_height;
361
362 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
363
364     if( p_vout->render.i_width & 15 )
365     {
366         i_rewind = 16 - ( p_vout->render.i_width & 15 );
367     }
368     else
369     {
370         i_rewind = 0;
371     }
372
373     /*
374     ** SSE2 128-bit fetch/store instructions are faster
375     ** when the memory access is 16-byte aligned
376     */
377
378     p_buffer = b_hscale ? p_buffer_start : p_pic;
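    /* ORing both pitches and both start addresses lets a single test of the
     * low four bits decide, once per picture, whether every 16-byte access in
     * the loops below will be aligned. */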
379     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
380                     p_dest->p->i_pitch|
381                     ((uintptr_t)p_y)|
382                     ((uintptr_t)p_buffer))) )
383     {
384         /* use faster SSE2 aligned fetch and store */
385         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
386         {
387             p_pic_start = p_pic;
388
389             for ( i_x = p_vout->render.i_width/16; i_x--; )
390             {
391 #if defined (CAN_COMPILE_SSE2)
392                 __asm__( ".p2align 3"
393                          SSE2_INIT_16_ALIGNED
394                          SSE2_YUV_MUL
395                          SSE2_YUV_ADD
396                          SSE2_UNPACK_15_ALIGNED
397                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
398 #else
399                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
400                 SSE2_INTRINSICS_INIT_16_ALIGNED
401                 SSE2_INTRINSICS_YUV_MUL
402                 SSE2_INTRINSICS_YUV_ADD
403                 SSE2_INTRINSICS_UNPACK_15_ALIGNED
404 #endif
405                 p_y += 16;
406                 p_u += 8;
407                 p_v += 8;
408                 p_buffer += 16;
409             }
410             /* Here we do some unaligned reads and duplicate conversions, but
411              * at least we have all the pixels */
412             if( i_rewind )
413             {
414                 p_y -= i_rewind;
415                 p_u -= i_rewind >> 1;
416                 p_v -= i_rewind >> 1;
417                 p_buffer -= i_rewind;
418
419 #if defined (CAN_COMPILE_SSE2)
420                 __asm__( ".p2align 3"
421                          SSE2_INIT_16_UNALIGNED
422                          SSE2_YUV_MUL
423                          SSE2_YUV_ADD
424                          SSE2_UNPACK_15_UNALIGNED
425                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
426 #else
427                 {
428                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
429
430                     SSE2_INTRINSICS_INIT_16_UNALIGNED
431                     SSE2_INTRINSICS_YUV_MUL
432                     SSE2_INTRINSICS_YUV_ADD
433                     SSE2_INTRINSICS_UNPACK_15_UNALIGNED
434                 }
435 #endif
436                 p_y += 16;
437                 p_u += 8;
438                 p_v += 8;
439             }
440             SCALE_WIDTH;
441             SCALE_HEIGHT( 420, 2 );
442
443             p_y += i_source_margin;
444             if( i_y % 2 )
445             {
446                 p_u += i_source_margin_c;
447                 p_v += i_source_margin_c;
448             }
449             p_buffer = b_hscale ? p_buffer_start : p_pic;
450         }
451     }
452     else
453     {
454         /* use slower SSE2 unaligned fetch and store */
455         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
456         {
457             p_pic_start = p_pic;
458             p_buffer = b_hscale ? p_buffer_start : p_pic;
459
460             for ( i_x = p_vout->render.i_width/16; i_x--; )
461             {
462 #if defined (CAN_COMPILE_SSE2)
463                 __asm__( ".p2align 3"
464                          SSE2_INIT_16_UNALIGNED
465                          SSE2_YUV_MUL
466                          SSE2_YUV_ADD
467                          SSE2_UNPACK_15_UNALIGNED
468                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
469 #else
470                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
471                 SSE2_INTRINSICS_INIT_16_UNALIGNED
472                 SSE2_INTRINSICS_YUV_MUL
473                 SSE2_INTRINSICS_YUV_ADD
474                 SSE2_INTRINSICS_UNPACK_15_UNALIGNED
475 #endif
476                 p_y += 16;
477                 p_u += 8;
478                 p_v += 8;
479                 p_buffer += 16;
480             }
481             /* Here we do some unaligned reads and duplicate conversions, but
482              * at least we have all the pixels */
483             if( i_rewind )
484             {
485                 p_y -= i_rewind;
486                 p_u -= i_rewind >> 1;
487                 p_v -= i_rewind >> 1;
488                 p_buffer -= i_rewind;
489
490 #if defined (CAN_COMPILE_SSE2)
491                 __asm__( ".p2align 3"
492                          SSE2_INIT_16_UNALIGNED
493                          SSE2_YUV_MUL
494                          SSE2_YUV_ADD
495                          SSE2_UNPACK_15_UNALIGNED
496                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
497 #else
498                 {
499                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
500
501                     SSE2_INTRINSICS_INIT_16_UNALIGNED
502                     SSE2_INTRINSICS_YUV_MUL
503                     SSE2_INTRINSICS_YUV_ADD
504                     SSE2_INTRINSICS_UNPACK_15_UNALIGNED
505                 }
506 #endif
507                 p_y += 16;
508                 p_u += 8;
509                 p_v += 8;
510             }
511             SCALE_WIDTH;
512             SCALE_HEIGHT( 420, 2 );
513
514             p_y += i_source_margin;
515             if( i_y % 2 )
516             {
517                 p_u += i_source_margin_c;
518                 p_v += i_source_margin_c;
519             }
520             p_buffer = b_hscale ? p_buffer_start : p_pic;
521         }
522     }
523
524     /* make sure all SSE2 stores are visible thereafter */
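    /* (presumably needed because the _ALIGNED unpack variants use
     * non-temporal stores, which are weakly ordered with respect to
     * ordinary writes) */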
525 #if defined (CAN_COMPILE_SSE2)
526     __asm__ __volatile__ ( "sfence" ::: "memory" );
527 #else
528     _mm_sfence();
529 #endif
530
531 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
532
533     if( p_vout->render.i_width & 7 )
534     {
535         i_rewind = 8 - ( p_vout->render.i_width & 7 );
536     }
537     else
538     {
539         i_rewind = 0;
540     }
541
542     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
543     {
544         p_pic_start = p_pic;
545         p_buffer = b_hscale ? p_buffer_start : p_pic;
546
547         for ( i_x = p_vout->render.i_width / 8; i_x--; )
548         {
549 #if defined (CAN_COMPILE_MMX)
550             __asm__( ".p2align 3"
551                      MMX_INIT_16
552                      MMX_YUV_MUL
553                      MMX_YUV_ADD
554                      MMX_UNPACK_15
555                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
556 #else
557             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
558             uint64_t tmp64;
559             MMX_INTRINSICS_INIT_16
560             MMX_INTRINSICS_YUV_MUL
561             MMX_INTRINSICS_YUV_ADD
562             MMX_INTRINSICS_UNPACK_15
563 #endif
564
565             p_y += 8;
566             p_u += 4;
567             p_v += 4;
568             p_buffer += 8;
569         }
570
571         /* Here we do some unaligned reads and duplicate conversions, but
572          * at least we have all the pixels */
573         if( i_rewind )
574         {
575             p_y -= i_rewind;
576             p_u -= i_rewind >> 1;
577             p_v -= i_rewind >> 1;
578             p_buffer -= i_rewind;
579
580 #if defined (CAN_COMPILE_MMX)
581             __asm__( ".p2align 3"
582                      MMX_INIT_16
583                      MMX_YUV_MUL
584                      MMX_YUV_ADD
585                      MMX_UNPACK_15
586                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
587 #else
588             {
589                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
590                 uint64_t tmp64;
591
592                 MMX_INTRINSICS_INIT_16
593                 MMX_INTRINSICS_YUV_MUL
594                 MMX_INTRINSICS_YUV_ADD
595                 MMX_INTRINSICS_UNPACK_15
596             }
597 #endif
598             p_y += 8;
599             p_u += 4;
600             p_v += 4;
601             p_buffer += 8;
602         }
603         SCALE_WIDTH;
604         SCALE_HEIGHT( 420, 2 );
605
606         p_y += i_source_margin;
607         if( i_y % 2 )
608         {
609             p_u += i_source_margin_c;
610             p_v += i_source_margin_c;
611         }
612     }
613     /* re-enable FPU registers */
614 #if defined (CAN_COMPILE_MMX)
615     __asm__ __volatile__ ( "emms" );
616 #else
617     _mm_empty();
618 #endif
619
620 #endif
621 }
622
623 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
624                                             picture_t *p_dest )
625 {
626     /* We got this one from the old arguments */
627     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
628     uint8_t  *p_y   = p_src->Y_PIXELS;
629     uint8_t  *p_u   = p_src->U_PIXELS;
630     uint8_t  *p_v   = p_src->V_PIXELS;
631
632     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
633     unsigned int i_vscale;                          /* vertical scaling type */
634     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
635
636     int         i_right_margin;
637     int         i_rewind;
638     int         i_scale_count;                       /* scale modulo counter */
639     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
640     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
641
642     /* Conversion buffer pointer */
643     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
644     uint16_t *  p_buffer;
645
646     /* Offset array pointer */
647     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
648     int *       p_offset;
649
650     const int i_source_margin = p_src->p[0].i_pitch
651                                  - p_src->p[0].i_visible_pitch;
652     const int i_source_margin_c = p_src->p[1].i_pitch
653                                  - p_src->p[1].i_visible_pitch;
654
655     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
656
657     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
658      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
659      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
660     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
661                p_vout->output.i_width, p_vout->output.i_height,
662                &b_hscale, &i_vscale, p_offset_start );
663
664
665     /*
666      * Perform conversion
667      */
668     i_scale_count = ( i_vscale == 1 ) ?
669                     p_vout->output.i_height : p_vout->render.i_height;
670
671 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
672
673     if( p_vout->render.i_width & 15 )
674     {
675         i_rewind = 16 - ( p_vout->render.i_width & 15 );
676     }
677     else
678     {
679         i_rewind = 0;
680     }
681
682     /*
683     ** SSE2 128-bit fetch/store instructions are faster
684     ** when the memory access is 16-byte aligned
685     */
686
687     p_buffer = b_hscale ? p_buffer_start : p_pic;
688     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
689                     p_dest->p->i_pitch|
690                     ((uintptr_t)p_y)|
691                     ((uintptr_t)p_buffer))) )
692     {
693         /* use faster SSE2 aligned fetch and store */
694         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
695         {
696             p_pic_start = p_pic;
697
698             for ( i_x = p_vout->render.i_width/16; i_x--; )
699             {
700 #if defined (CAN_COMPILE_SSE2)
701                 __asm__( ".p2align 3"
702                          SSE2_INIT_16_ALIGNED
703                          SSE2_YUV_MUL
704                          SSE2_YUV_ADD
705                          SSE2_UNPACK_16_ALIGNED
706                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
707 #else
708                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
709                 SSE2_INTRINSICS_INIT_16_ALIGNED
710                 SSE2_INTRINSICS_YUV_MUL
711                 SSE2_INTRINSICS_YUV_ADD
712                 SSE2_INTRINSICS_UNPACK_16_ALIGNED
713 #endif
714                 p_y += 16;
715                 p_u += 8;
716                 p_v += 8;
717                 p_buffer += 16;
718             }
719             /* Here we do some unaligned reads and duplicate conversions, but
720              * at least we have all the pixels */
721             if( i_rewind )
722             {
723                 p_y -= i_rewind;
724                 p_u -= i_rewind >> 1;
725                 p_v -= i_rewind >> 1;
726                 p_buffer -= i_rewind;
727
728 #if defined (CAN_COMPILE_SSE2)
729                 __asm__( ".p2align 3"
730                          SSE2_INIT_16_UNALIGNED
731                          SSE2_YUV_MUL
732                          SSE2_YUV_ADD
733                          SSE2_UNPACK_16_UNALIGNED
734                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
735 #else
736                 {
737                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
738
739                     SSE2_INTRINSICS_INIT_16_UNALIGNED
740                     SSE2_INTRINSICS_YUV_MUL
741                     SSE2_INTRINSICS_YUV_ADD
742                     SSE2_INTRINSICS_UNPACK_16_UNALIGNED
743                 }
744 #endif
745                 p_y += 16;
746                 p_u += 8;
747                 p_v += 8;
748             }
749             SCALE_WIDTH;
750             SCALE_HEIGHT( 420, 2 );
751
752             p_y += i_source_margin;
753             if( i_y % 2 )
754             {
755                 p_u += i_source_margin_c;
756                 p_v += i_source_margin_c;
757             }
758             p_buffer = b_hscale ? p_buffer_start : p_pic;
759         }
760     }
761     else
762     {
763         /* use slower SSE2 unaligned fetch and store */
764         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
765         {
766             p_pic_start = p_pic;
767             p_buffer = b_hscale ? p_buffer_start : p_pic;
768
769             for ( i_x = p_vout->render.i_width/16; i_x--; )
770             {
771 #if defined (CAN_COMPILE_SSE2)
772                 __asm__( ".p2align 3"
773                          SSE2_INIT_16_UNALIGNED
774                          SSE2_YUV_MUL
775                          SSE2_YUV_ADD
776                          SSE2_UNPACK_16_UNALIGNED
777                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
778 #else
779                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
780                 SSE2_INTRINSICS_INIT_16_UNALIGNED
781                 SSE2_INTRINSICS_YUV_MUL
782                 SSE2_INTRINSICS_YUV_ADD
783                 SSE2_INTRINSICS_UNPACK_16_UNALIGNED
784 #endif
785                 p_y += 16;
786                 p_u += 8;
787                 p_v += 8;
788                 p_buffer += 16;
789             }
790             /* Here we do some unaligned reads and duplicate conversions, but
791              * at least we have all the pixels */
792             if( i_rewind )
793             {
794                 p_y -= i_rewind;
795                 p_u -= i_rewind >> 1;
796                 p_v -= i_rewind >> 1;
797                 p_buffer -= i_rewind;
798
799 #if defined (CAN_COMPILE_SSE2)
800                 __asm__( ".p2align 3"
801                          SSE2_INIT_16_UNALIGNED
802                          SSE2_YUV_MUL
803                          SSE2_YUV_ADD
804                          SSE2_UNPACK_16_UNALIGNED
805                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
806 #else
807                 {
808                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
809
810                     SSE2_INTRINSICS_INIT_16_UNALIGNED
811                     SSE2_INTRINSICS_YUV_MUL
812                     SSE2_INTRINSICS_YUV_ADD
813                     SSE2_INTRINSICS_UNPACK_16_UNALIGNED
814                 }
815 #endif
816                 p_y += 16;
817                 p_u += 8;
818                 p_v += 8;
819             }
820             SCALE_WIDTH;
821             SCALE_HEIGHT( 420, 2 );
822
823             p_y += i_source_margin;
824             if( i_y % 2 )
825             {
826                 p_u += i_source_margin_c;
827                 p_v += i_source_margin_c;
828             }
829             p_buffer = b_hscale ? p_buffer_start : p_pic;
830         }
831     }
832
833     /* make sure all SSE2 stores are visible thereafter */
834 #if defined (CAN_COMPILE_SSE2)
835     __asm__ __volatile__ ( "sfence" ::: "memory" );
836 #else
837     _mm_sfence();
838 #endif
839
840 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
841
842     if( p_vout->render.i_width & 7 )
843     {
844         i_rewind = 8 - ( p_vout->render.i_width & 7 );
845     }
846     else
847     {
848         i_rewind = 0;
849     }
850
851     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
852     {
853         p_pic_start = p_pic;
854         p_buffer = b_hscale ? p_buffer_start : p_pic;
855
856         for ( i_x = p_vout->render.i_width / 8; i_x--; )
857         {
858 #if defined (CAN_COMPILE_MMX)
859             __asm__( ".p2align 3"
860                      MMX_INIT_16
861                      MMX_YUV_MUL
862                      MMX_YUV_ADD
863                      MMX_UNPACK_16
864                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
865 #else
866             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
867             uint64_t tmp64;
868             MMX_INTRINSICS_INIT_16
869             MMX_INTRINSICS_YUV_MUL
870             MMX_INTRINSICS_YUV_ADD
871             MMX_INTRINSICS_UNPACK_16
872 #endif
873
874             p_y += 8;
875             p_u += 4;
876             p_v += 4;
877             p_buffer += 8;
878         }
879
880         /* Here we do some unaligned reads and duplicate conversions, but
881          * at least we have all the pixels */
882         if( i_rewind )
883         {
884             p_y -= i_rewind;
885             p_u -= i_rewind >> 1;
886             p_v -= i_rewind >> 1;
887             p_buffer -= i_rewind;
888
889 #if defined (CAN_COMPILE_MMX)
890             __asm__( ".p2align 3"
891                      MMX_INIT_16
892                      MMX_YUV_MUL
893                      MMX_YUV_ADD
894                      MMX_UNPACK_16
895                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
896 #else
897             {
898                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
899                 uint64_t tmp64;
900
901                 MMX_INTRINSICS_INIT_16
902                 MMX_INTRINSICS_YUV_MUL
903                 MMX_INTRINSICS_YUV_ADD
904                 MMX_INTRINSICS_UNPACK_16
905             }
906 #endif
907             p_y += 8;
908             p_u += 4;
909             p_v += 4;
910             p_buffer += 8;
911         }
912         SCALE_WIDTH;
913         SCALE_HEIGHT( 420, 2 );
914
915         p_y += i_source_margin;
916         if( i_y % 2 )
917         {
918             p_u += i_source_margin_c;
919             p_v += i_source_margin_c;
920         }
921     }
922     /* re-enable FPU registers */
923 #if defined (CAN_COMPILE_MMX)
924     __asm__ __volatile__ ( "emms" );
925 #else
926     _mm_empty();
927 #endif
928
929 #endif
930 }
931
932 #endif
933
934 /*****************************************************************************
935  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
936  *****************************************************************************
937  * Horizontal alignment needed:
938  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
939  *  - output: 1 pixel (4 bytes), margins allowed
940  * Vertical alignment needed:
941  *  - input: 2 lines (2 Y lines, 1 U/V line)
942  *  - output: 1 line
943  *****************************************************************************/
944
945 #if defined (MODULE_NAME_IS_i420_rgb)
946
947 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
948                                             picture_t *p_dest )
949 {
950     /* We got this one from the old arguments */
951     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
952     uint8_t  *p_y   = p_src->Y_PIXELS;
953     uint8_t  *p_u   = p_src->U_PIXELS;
954     uint8_t  *p_v   = p_src->V_PIXELS;
955
956     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
957     unsigned int i_vscale;                          /* vertical scaling type */
958     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
959
960     int         i_right_margin;
961     int         i_rewind;
962     int         i_scale_count;                       /* scale modulo counter */
963     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
964     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
965     int         i_uval, i_vval;                           /* U and V samples */
966     int         i_red, i_green, i_blue;          /* U and V modified samples */
967     uint32_t *  p_yuv = p_vout->chroma.p_sys->p_rgb32;
968     uint32_t *  p_ybase;                     /* Y dependent conversion table */
969
970     /* Conversion buffer pointer */
971     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
972     uint32_t *  p_buffer;
973
974     /* Offset array pointer */
975     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
976     int *       p_offset;
977
978     const int i_source_margin = p_src->p[0].i_pitch
979                                  - p_src->p[0].i_visible_pitch;
980     const int i_source_margin_c = p_src->p[1].i_pitch
981                                  - p_src->p[1].i_visible_pitch;
982
983     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
984
985     if( p_vout->render.i_width & 7 )
986     {
987         i_rewind = 8 - ( p_vout->render.i_width & 7 );
988     }
989     else
990     {
991         i_rewind = 0;
992     }
993
994     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
995      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
996      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
997     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
998                p_vout->output.i_width, p_vout->output.i_height,
999                &b_hscale, &i_vscale, p_offset_start );
1000
1001     /*
1002      * Perform conversion
1003      */
1004     i_scale_count = ( i_vscale == 1 ) ?
1005                     p_vout->output.i_height : p_vout->render.i_height;
1006     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1007     {
1008         p_pic_start = p_pic;
1009         p_buffer = b_hscale ? p_buffer_start : p_pic;
1010
1011         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1012         {
1013             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1014             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1015             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1016             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1017         }
1018
1019         /* Here we do some unaligned reads and duplicate conversions, but
1020          * at least we have all the pixels */
1021         if( i_rewind )
1022         {
1023             p_y -= i_rewind;
1024             p_u -= i_rewind >> 1;
1025             p_v -= i_rewind >> 1;
1026             p_buffer -= i_rewind;
1027             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1028             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1029             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1030             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1031         }
1032         SCALE_WIDTH;
1033         SCALE_HEIGHT( 420, 4 );
1034
1035         p_y += i_source_margin;
1036         if( i_y % 2 )
1037         {
1038             p_u += i_source_margin_c;
1039             p_v += i_source_margin_c;
1040         }
1041     }
1042 }
1043
1044 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
1045
1046 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
1047                                             picture_t *p_dest )
1048 {
1049     /* We got this one from the old arguments */
1050     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1051     uint8_t  *p_y   = p_src->Y_PIXELS;
1052     uint8_t  *p_u   = p_src->U_PIXELS;
1053     uint8_t  *p_v   = p_src->V_PIXELS;
1054
1055     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1056     unsigned int i_vscale;                          /* vertical scaling type */
1057     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1058
1059     int         i_right_margin;
1060     int         i_rewind;
1061     int         i_scale_count;                       /* scale modulo counter */
1062     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1063     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1064     /* Conversion buffer pointer */
1065     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1066     uint32_t *  p_buffer;
1067
1068     /* Offset array pointer */
1069     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1070     int *       p_offset;
1071
1072     const int i_source_margin = p_src->p[0].i_pitch
1073                                  - p_src->p[0].i_visible_pitch;
1074     const int i_source_margin_c = p_src->p[1].i_pitch
1075                                  - p_src->p[1].i_visible_pitch;
1076
1077     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1078
1079     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1080      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1081      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1082     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1083                p_vout->output.i_width, p_vout->output.i_height,
1084                &b_hscale, &i_vscale, p_offset_start );
1085
1086     /*
1087      * Perform conversion
1088      */
1089     i_scale_count = ( i_vscale == 1 ) ?
1090                     p_vout->output.i_height : p_vout->render.i_height;
1091
1092 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1093
1094     if( p_vout->render.i_width & 15 )
1095     {
1096         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1097     }
1098     else
1099     {
1100         i_rewind = 0;
1101     }
1102
1103     /*
1104     ** SSE2 128-bit fetch/store instructions are faster
1105     ** when the memory access is 16-byte aligned
1106     */
1107
1108     p_buffer = b_hscale ? p_buffer_start : p_pic;
1109     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1110                     p_dest->p->i_pitch|
1111                     ((uintptr_t)p_y)|
1112                     ((uintptr_t)p_buffer))) )
1113     {
1114         /* use faster SSE2 aligned fetch and store */
1115         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1116         {
1117             p_pic_start = p_pic;
1118
1119             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1120             {
1121 #if defined (CAN_COMPILE_SSE2)
1122                 /* use inline SSE2 assembly */
1123                 __asm__( ".p2align 3"
1124                          SSE2_INIT_32_ALIGNED
1125                          SSE2_YUV_MUL
1126                          SSE2_YUV_ADD
1127                          SSE2_UNPACK_32_ARGB_ALIGNED
1128                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1129 #else
1130                 /* otherwise use SSE2 C intrinsics wrappers */
1131                 __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1132
1133                 SSE2_INTRINSICS_INIT_32_ALIGNED
1134                 SSE2_INTRINSICS_YUV_MUL
1135                 SSE2_INTRINSICS_YUV_ADD
1136                 SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
1137 #endif
1138                 p_y += 16;
1139                 p_u += 8;
1140                 p_v += 8;
1141                 p_buffer += 16;
1142             }
1143
1144             /* Here we do some unaligned reads and duplicate conversions, but
1145              * at least we have all the pixels */
1146             if( i_rewind )
1147             {
1148                 p_y -= i_rewind;
1149                 p_u -= i_rewind >> 1;
1150                 p_v -= i_rewind >> 1;
1151                 p_buffer -= i_rewind;
1152 #if defined (CAN_COMPILE_SSE2)
1153                 /* use inline SSE2 assembly */
1154                 __asm__( ".p2align 3"
1155                          SSE2_INIT_32_UNALIGNED
1156                          SSE2_YUV_MUL
1157                          SSE2_YUV_ADD
1158                          SSE2_UNPACK_32_ARGB_UNALIGNED
1159                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1160 #else
1161                 /* otherwise use SSE2 intrinsics wrappers */
1162                 {
1163                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1164
1165                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1166                     SSE2_INTRINSICS_YUV_MUL
1167                     SSE2_INTRINSICS_YUV_ADD
1168                     SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1169                 }
1170 #endif
1171                 p_y += 16;
1172                 p_u += 8;
1173                 p_v += 8;
1174             }
1175             SCALE_WIDTH;
1176             SCALE_HEIGHT( 420, 4 );
1177
1178             p_y += i_source_margin;
1179             if( i_y % 2 )
1180             {
1181                 p_u += i_source_margin_c;
1182                 p_v += i_source_margin_c;
1183             }
1184             p_buffer = b_hscale ? p_buffer_start : p_pic;
1185         }
1186     }
1187     else
1188     {
1189         /* use slower SSE2 unaligned fetch and store */
1190         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1191         {
1192             p_pic_start = p_pic;
1193             p_buffer = b_hscale ? p_buffer_start : p_pic;
1194
1195             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1196             {
1197 #if defined (CAN_COMPILE_SSE2)
1198                 /* use inline SSE2 assembly */
1199                 __asm__( ".p2align 3"
1200                          SSE2_INIT_32_UNALIGNED
1201                          SSE2_YUV_MUL
1202                          SSE2_YUV_ADD
1203                          SSE2_UNPACK_32_ARGB_UNALIGNED
1204                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1205 #else
1206                 /* otherwise use SSE2 C intrinsics wrappers */
1207                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1208
1209                 SSE2_INTRINSICS_INIT_32_UNALIGNED
1210                 SSE2_INTRINSICS_YUV_MUL
1211                 SSE2_INTRINSICS_YUV_ADD
1212                 SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1213 #endif
1214                 p_y += 16;
1215                 p_u += 8;
1216                 p_v += 8;
1217                 p_buffer += 16;
1218             }
1219
1220             /* Here we do some unaligned reads and duplicate conversions, but
1221              * at least we have all the pixels */
1222             if( i_rewind )
1223             {
1224                 p_y -= i_rewind;
1225                 p_u -= i_rewind >> 1;
1226                 p_v -= i_rewind >> 1;
1227                 p_buffer -= i_rewind;
1228 #if defined (CAN_COMPILE_SSE2)
1229                 /* use inline SSE2 assembly */
1230                 __asm__( ".p2align 3"
1231                          SSE2_INIT_32_UNALIGNED
1232                          SSE2_YUV_MUL
1233                          SSE2_YUV_ADD
1234                          SSE2_UNPACK_32_ARGB_UNALIGNED
1235                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1236 #else
1237                 /* otherwise use SSE2 intrinsics wrappers */
1238                 {
1239                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1240
1241                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1242                     SSE2_INTRINSICS_YUV_MUL
1243                     SSE2_INTRINSICS_YUV_ADD
1244                     SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1245                 }
1246 #endif
1247                 p_y += 16;
1248                 p_u += 8;
1249                 p_v += 8;
1250             }
1251             SCALE_WIDTH;
1252             SCALE_HEIGHT( 420, 4 );
1253
1254             p_y += i_source_margin;
1255             if( i_y % 2 )
1256             {
1257                 p_u += i_source_margin_c;
1258                 p_v += i_source_margin_c;
1259             }
1260             p_buffer = b_hscale ? p_buffer_start : p_pic;
1261         }
1262     }
1263
1264     /* make sure all SSE2 stores are visible thereafter */
1265 #if defined (CAN_COMPILE_SSE2)
1266     __asm__ __volatile__ ( "sfence" ::: "memory" );
1267 #else
1268     _mm_sfence();
1269 #endif
1270
1271 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1272
1273     if( p_vout->render.i_width & 7 )
1274     {
1275         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1276     }
1277     else
1278     {
1279         i_rewind = 0;
1280     }
1281
1282     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1283     {
1284         p_pic_start = p_pic;
1285         p_buffer = b_hscale ? p_buffer_start : p_pic;
1286
1287         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1288         {
1289 #if defined (CAN_COMPILE_MMX)
1290             /* use inline MMX assembly */
1291             __asm__( MMX_INIT_32
1292                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1293
1294             __asm__( ".p2align 3"
1295                      MMX_YUV_MUL
1296                      MMX_YUV_ADD
1297                      MMX_UNPACK_32_ARGB
1298                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1299 #else
1300             /* otherwise use MMX C intrinsics wrappers */
1301             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1302             uint64_t tmp64;
1303
1304             MMX_INTRINSICS_INIT_32
1305             MMX_INTRINSICS_YUV_MUL
1306             MMX_INTRINSICS_YUV_ADD
1307             MMX_INTRINSICS_UNPACK_32_ARGB
1308 #endif
1309             p_y += 8;
1310             p_u += 4;
1311             p_v += 4;
1312             p_buffer += 8;
1313         }
1314
1315         /* Here we do some unaligned reads and duplicate conversions, but
1316          * at least we have all the pixels */
1317         if( i_rewind )
1318         {
1319             p_y -= i_rewind;
1320             p_u -= i_rewind >> 1;
1321             p_v -= i_rewind >> 1;
1322             p_buffer -= i_rewind;
1323 #if defined (CAN_COMPILE_MMX)
1324             /* use inline MMX assembly */
1325             __asm__( ".p2align 3"
1326                      MMX_INIT_32
1327                      MMX_YUV_MUL
1328                      MMX_YUV_ADD
1329                      MMX_UNPACK_32_ARGB
1330                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1331 #else
1332             /* otherwise use MMX intrinsics wrappers */
1333             {
1334                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1335                 uint64_t tmp64;
1336
1337                 MMX_INTRINSICS_INIT_32
1338                 MMX_INTRINSICS_YUV_MUL
1339                 MMX_INTRINSICS_YUV_ADD
1340                 MMX_INTRINSICS_UNPACK_32_ARGB
1341             }
1342 #endif
1343             p_y += 8;
1344             p_u += 4;
1345             p_v += 4;
1346             p_buffer += 8;
1347         }
1348         SCALE_WIDTH;
1349         SCALE_HEIGHT( 420, 4 );
1350
1351         p_y += i_source_margin;
1352         if( i_y % 2 )
1353         {
1354             p_u += i_source_margin_c;
1355             p_v += i_source_margin_c;
1356         }
1357     }
1358     /* re-enable FPU registers */
1359 #if defined (CAN_COMPILE_MMX)
1360     __asm__ __volatile__ ( "emms" );
1361 #else
1362     _mm_empty();
1363 #endif
1364
1365 #endif
1366 }
1367
1368 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1369                                             picture_t *p_dest )
1370 {
1371     /* We got this one from the old arguments */
1372     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1373     uint8_t  *p_y   = p_src->Y_PIXELS;
1374     uint8_t  *p_u   = p_src->U_PIXELS;
1375     uint8_t  *p_v   = p_src->V_PIXELS;
1376
1377     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1378     unsigned int i_vscale;                          /* vertical scaling type */
1379     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1380
1381     int         i_right_margin;
1382     int         i_rewind;
1383     int         i_scale_count;                       /* scale modulo counter */
1384     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1385     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1386     /* Conversion buffer pointer */
1387     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1388     uint32_t *  p_buffer;
1389
1390     /* Offset array pointer */
1391     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1392     int *       p_offset;
1393
1394     const int i_source_margin = p_src->p[0].i_pitch
1395                                  - p_src->p[0].i_visible_pitch;
1396     const int i_source_margin_c = p_src->p[1].i_pitch
1397                                  - p_src->p[1].i_visible_pitch;
1398
1399     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1400
1401     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1402      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1403      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1404     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1405                p_vout->output.i_width, p_vout->output.i_height,
1406                &b_hscale, &i_vscale, p_offset_start );
1407
1408     /*
1409      * Perform conversion
1410      */
1411     i_scale_count = ( i_vscale == 1 ) ?
1412                     p_vout->output.i_height : p_vout->render.i_height;
1413
1414 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1415
1416     if( p_vout->render.i_width & 15 )
1417     {
1418         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1419     }
1420     else
1421     {
1422         i_rewind = 0;
1423     }
1424
1425     /*
1426     ** SSE2 128-bit fetch/store instructions are faster
1427     ** when the memory access is 16-byte aligned
1428     */
1429
1430     p_buffer = b_hscale ? p_buffer_start : p_pic;
1431     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1432                     p_dest->p->i_pitch|
1433                     ((uintptr_t)p_y)|
1434                     ((uintptr_t)p_buffer))) )
1435     {
1436         /* use faster SSE2 aligned fetch and store */
1437         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1438         {
1439             p_pic_start = p_pic;
1440
1441             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1442             {
1443 #if defined (CAN_COMPILE_SSE2)
1444                 /* use inline SSE2 assembly */
1445                 __asm__( ".p2align 3"
1446                          SSE2_INIT_32_ALIGNED
1447                          SSE2_YUV_MUL
1448                          SSE2_YUV_ADD
1449                          SSE2_UNPACK_32_BGRA_ALIGNED
1450                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1451 #else
1452                 /* otherwise use SSE2 C intrinsics wrappers */
1453                 __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1454
1455                 SSE2_INTRINSICS_INIT_32_ALIGNED
1456                 SSE2_INTRINSICS_YUV_MUL
1457                 SSE2_INTRINSICS_YUV_ADD
1458                 SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
1459 #endif
1460                 p_y += 16;
1461                 p_u += 8;
1462                 p_v += 8;
1463                 p_buffer += 16;
1464             }
1465
1466             /* Here we do some unaligned reads and duplicate conversions, but
1467              * at least we have all the pixels */
1468             if( i_rewind )
1469             {
1470                 p_y -= i_rewind;
1471                 p_u -= i_rewind >> 1;
1472                 p_v -= i_rewind >> 1;
1473                 p_buffer -= i_rewind;
1474 #if defined (CAN_COMPILE_SSE2)
1475                 /* use inline SSE2 assembly */
1476                 __asm__( ".p2align 3"
1477                          SSE2_INIT_32_UNALIGNED
1478                          SSE2_YUV_MUL
1479                          SSE2_YUV_ADD
1480                          SSE2_UNPACK_32_BGRA_UNALIGNED
1481                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1482 #else
1483                 /* otherwise use SSE2 intrinsics wrappers */
1484                 {
1485                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1486
1487                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1488                     SSE2_INTRINSICS_YUV_MUL
1489                     SSE2_INTRINSICS_YUV_ADD
1490                     SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1491                 }
1492 #endif
1493                 p_y += 16;
1494                 p_u += 8;
1495                 p_v += 8;
1496             }
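            /* SCALE_WIDTH handles horizontal scaling of the converted line
             * (via the offset table when b_hscale is set) and SCALE_HEIGHT
             * handles vertical scaling and stepping to the next output line;
             * the "4" is the number of bytes per output pixel */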
1497             SCALE_WIDTH;
1498             SCALE_HEIGHT( 420, 4 );
1499
1500             p_y += i_source_margin;
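            /* 4:2:0 chroma is subsampled vertically, so the U and V margins
             * are only added every second luma line */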
1501             if( i_y % 2 )
1502             {
1503                 p_u += i_source_margin_c;
1504                 p_v += i_source_margin_c;
1505             }
1506             p_buffer = b_hscale ? p_buffer_start : p_pic;
1507         }
1508     }
1509     else
1510     {
1511         /* use slower SSE2 unaligned fetch and store */
1512         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1513         {
1514             p_pic_start = p_pic;
1515             p_buffer = b_hscale ? p_buffer_start : p_pic;
1516
1517             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1518             {
1519 #if defined (CAN_COMPILE_SSE2)
1520                 /* use inline SSE2 assembly */
1521                 __asm__( ".p2align 3"
1522                          SSE2_INIT_32_UNALIGNED
1523                          SSE2_YUV_MUL
1524                          SSE2_YUV_ADD
1525                          SSE2_UNPACK_32_BGRA_UNALIGNED
1526                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1527 #else
1528                 /* otherwise use SSE2 C intrinsics wrappers */
1529                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1530
1531                 SSE2_INTRINSICS_INIT_32_UNALIGNED
1532                 SSE2_INTRINSICS_YUV_MUL
1533                 SSE2_INTRINSICS_YUV_ADD
1534                 SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1535 #endif
1536                 p_y += 16;
1537                 p_u += 8;
1538                 p_v += 8;
1539                 p_buffer += 16;
1540             }
1541
1542             /* Here we do some unaligned reads and duplicate conversions, but
1543              * at least we have all the pixels */
1544             if( i_rewind )
1545             {
1546                 p_y -= i_rewind;
1547                 p_u -= i_rewind >> 1;
1548                 p_v -= i_rewind >> 1;
1549                 p_buffer -= i_rewind;
1550 #if defined (CAN_COMPILE_SSE2)
1551                 /* use inline SSE2 assembly */
1552                 __asm__( ".p2align 3"
1553                          SSE2_INIT_32_UNALIGNED
1554                          SSE2_YUV_MUL
1555                          SSE2_YUV_ADD
1556                          SSE2_UNPACK_32_BGRA_UNALIGNED
1557                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1558 #else
1559                 /* otherwise use SSE2 intrinsics wrappers */
1560                 {
1561                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1562
1563                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1564                     SSE2_INTRINSICS_YUV_MUL
1565                     SSE2_INTRINSICS_YUV_ADD
1566                     SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1567                 }
1568 #endif
1569                 p_y += 16;
1570                 p_u += 8;
1571                 p_v += 8;
1572             }
1573             SCALE_WIDTH;
1574             SCALE_HEIGHT( 420, 4 );
1575
1576             p_y += i_source_margin;
1577             if( i_y % 2 )
1578             {
1579                 p_u += i_source_margin_c;
1580                 p_v += i_source_margin_c;
1581             }
1582             p_buffer = b_hscale ? p_buffer_start : p_pic;
1583         }
1584     }
1585
1586 #else
1587
1588     if( p_vout->render.i_width & 7 )
1589     {
1590         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1591     }
1592     else
1593     {
1594         i_rewind = 0;
1595     }
1596
1597     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1598     {
1599         p_pic_start = p_pic;
1600         p_buffer = b_hscale ? p_buffer_start : p_pic;
1601
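        /* Each iteration consumes 8 Y samples and 4 U/V samples and
         * produces 8 packed 32-bit pixels */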
1602         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1603         {
1604 #if defined (CAN_COMPILE_MMX)
1605             /* use inline MMX assembly */
1606             __asm__( MMX_INIT_32
1607                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1608
1609             __asm__( ".p2align 3"
1610                      MMX_YUV_MUL
1611                      MMX_YUV_ADD
1612                      MMX_UNPACK_32_BGRA
1613                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1614 #else
1615             /* otherwise use MMX C intrinsics wrappers */
1616             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1617             uint64_t tmp64;
1618
1619             MMX_INTRINSICS_INIT_32
1620             MMX_INTRINSICS_YUV_MUL
1621             MMX_INTRINSICS_YUV_ADD
1622             MMX_INTRINSICS_UNPACK_32_BGRA
1623 #endif
1624             p_y += 8;
1625             p_u += 4;
1626             p_v += 4;
1627             p_buffer += 8;
1628         }
1629
1630         /* Here we do some unaligned reads and duplicate conversions, but
1631          * at least we have all the pixels */
1632         if( i_rewind )
1633         {
1634             p_y -= i_rewind;
1635             p_u -= i_rewind >> 1;
1636             p_v -= i_rewind >> 1;
1637             p_buffer -= i_rewind;
1638 #if defined (CAN_COMPILE_MMX)
1639             /* use inline MMX assembly */
1640             __asm__( ".p2align 3"
1641                      MMX_INIT_32
1642                      MMX_YUV_MUL
1643                      MMX_YUV_ADD
1644                      MMX_UNPACK_32_BGRA
1645                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1646 #else
1647             /* otherwise use MMX intrinsics wrappers */
1648             {
1649                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1650                 uint64_t tmp64;
1651
1652                 MMX_INTRINSICS_INIT_32
1653                 MMX_INTRINSICS_YUV_MUL
1654                 MMX_INTRINSICS_YUV_ADD
1655                 MMX_INTRINSICS_UNPACK_32_BGRA
1656             }
1657 #endif
1658             p_y += 8;
1659             p_u += 4;
1660             p_v += 4;
1661             p_buffer += 8;
1662         }
1663         SCALE_WIDTH;
1664         SCALE_HEIGHT( 420, 4 );
1665
1666         p_y += i_source_margin;
1667         if( i_y % 2 )
1668         {
1669             p_u += i_source_margin_c;
1670             p_v += i_source_margin_c;
1671         }
1672     }
1673     /* empty the MMX state to re-enable use of the FPU registers */
1674 #if defined (CAN_COMPILE_MMX)
1675     __asm__ __volatile__ ( "emms" );
1676 #else
1677     _mm_empty();
1678 #endif
1679
1680 #endif
1681 }
1682
1683 #endif
1684
1685 /* Following functions are local */
1686
1687 /*****************************************************************************
1688  * SetOffset: build offset array for conversion functions
1689  *****************************************************************************
1690  * This function will build an offset array used in later conversion functions.
1691  * It will also set horizontal and vertical scaling indicators.
1692  *****************************************************************************/
1693 static void SetOffset( int i_width, int i_height, int i_pic_width,
1694                        int i_pic_height, vlc_bool_t *pb_hscale,
1695                        unsigned int *pi_vscale, int *p_offset )
1696 {
1697     int i_x;                                    /* x position in destination */
1698     int i_scale_count;                                     /* modulo counter */
1699
1700     /*
1701      * Prepare horizontal offset array
1702      */
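    /* Each entry of the offset array tells the conversion loop how far to
     * advance in the source for one destination pixel: 0 re-uses the current
     * source pixel (extension), 1 or more skips source pixels (reduction) */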
1703     if( i_pic_width - i_width == 0 )
1704     {
1705         /* No horizontal scaling: YUV conversion is done directly to picture */
1706         *pb_hscale = 0;
1707     }
1708     else if( i_pic_width - i_width > 0 )
1709     {
1710         /* Prepare scaling array for horizontal extension */
1711         *pb_hscale = 1;
1712         i_scale_count = i_pic_width;
1713         for( i_x = i_width; i_x--; )
1714         {
1715             while( (i_scale_count -= i_width) > 0 )
1716             {
1717                 *p_offset++ = 0;
1718             }
1719             *p_offset++ = 1;
1720             i_scale_count += i_pic_width;
1721         }
1722     }
1723     else /* if( i_pic_width - i_width < 0 ) */
1724     {
1725         /* Prepare scaling array for horizontal reduction */
1726         *pb_hscale = 1;
1727         i_scale_count = i_width;
1728         for( i_x = i_pic_width; i_x--; )
1729         {
1730             *p_offset = 1;
1731             while( (i_scale_count -= i_pic_width) > 0 )
1732             {
1733                 *p_offset += 1;
1734             }
1735             p_offset++;
1736             i_scale_count += i_width;
1737         }
1738     }
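    /* As an illustrative sketch only (the real work is done by the
     * SCALE_WIDTH macro, which also deals with the pixel format), a consumer
     * of this table would walk it roughly like:
     *     for( i = 0; i < i_pic_width; i++ )
     *     {
     *         *p_dst++ = *p_src;        // emit one destination pixel
     *         p_src   += *p_offset++;   // advance 0..n source pixels
     *     }
     */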
1739
1740     /*
1741      * Set vertical scaling indicator
1742      */
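    /* 0 means no vertical scaling, 1 means expansion; reduction is flagged
     * with -1, which wraps in the unsigned *pi_vscale */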
1743     if( i_pic_height - i_height == 0 )
1744     {
1745         *pi_vscale = 0;
1746     }
1747     else if( i_pic_height - i_height > 0 )
1748     {
1749         *pi_vscale = 1;
1750     }
1751     else /* if( i_pic_height - i_height < 0 ) */
1752     {
1753         *pi_vscale = -1;
1754     }
1755 }
1756