]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
- video_chroma: added support for IA-32/64 SSE2 acceleration (128 bit vector integer...
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
#include <stdint.h>                                  /* uint8_t, uintptr_t */
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                            /* strerror() */

#include <vlc/vlc.h>
#include <vlc_vout.h>
33
34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
35 #   include <altivec.h>
36 #endif
37
38 #include "i420_yuy2.h"
39
/* Comma-separated list of the source chromas, used in the module
 * description strings below. */
40 #define SRC_FOURCC  "I420,IYUV,YV12"
41
/* Comma-separated list of destination chromas for the description string.
 * The list depends on which variant of the module is being built: only the
 * plain C build lists Y211, and the AltiVec build lists neither IUYV nor
 * cyuv. */
42 #if defined (MODULE_NAME_IS_i420_yuy2)
43 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
50 #endif
51
52 /*****************************************************************************
53  * Local and extern prototypes.
54  *****************************************************************************/
/* Module entry point: selects one of the converters below */
55 static int  Activate ( vlc_object_t * );
56
/* Converters; all share the signature ( vout, source picture, destination
 * picture ). The set that is compiled mirrors the DEST_FOURCC list of the
 * current build variant. */
57 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
/* Not available in the AltiVec build */
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
63 #endif
/* Only available in the plain C build */
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
66 #endif
67
68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
69 /* Initialize MMX-specific constants */
/* i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes and zero in the upper 32 bits. Neither is
 * referenced by name in this file -- presumably they are operands of the
 * MMX macros from i420_yuy2.h (TODO confirm). */
70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
71 static const uint64_t i_80w   = 0x0000000080808080ULL;
72 #endif
73
74 /*****************************************************************************
75  * Module descriptor.
76  *****************************************************************************/
/* One descriptor per build variant. Every variant registers the "chroma"
 * capability; the second argument of set_capability() is 80 for the plain C
 * build, 100 for MMX and AltiVec, and 120 for SSE2. The accelerated variants
 * additionally declare the CPU feature they require. */
77 vlc_module_begin();
78 #if defined (MODULE_NAME_IS_i420_yuy2)
79     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
80     set_capability( "chroma", 80 );
81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
82     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83     set_capability( "chroma", 100 );
84     add_requirement( MMX );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
86     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87     set_capability( "chroma", 120 );
88     add_requirement( SSE2 );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
90     set_description(
91             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
92     set_capability( "chroma", 100 );
93     add_requirement( ALTIVEC );
94 #endif
/* Activate is the only callback; the second (NULL) slot is left empty */
95     set_callbacks( Activate, NULL );
96 vlc_module_end();
97
98 /*****************************************************************************
99  * Activate: allocate a chroma function
100  *****************************************************************************
101  * This function allocates and initializes a chroma function
102  *****************************************************************************/
103 static int Activate( vlc_object_t *p_this )
104 {
105     vout_thread_t *p_vout = (vout_thread_t *)p_this;
106
107     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
108     {
109         return -1;
110     }
111
112     switch( p_vout->render.i_chroma )
113     {
114         case VLC_FOURCC('Y','V','1','2'):
115         case VLC_FOURCC('I','4','2','0'):
116         case VLC_FOURCC('I','Y','U','V'):
117             switch( p_vout->output.i_chroma )
118             {
119                 case VLC_FOURCC('Y','U','Y','2'):
120                 case VLC_FOURCC('Y','U','N','V'):
121                     p_vout->chroma.pf_convert = I420_YUY2;
122                     break;
123
124                 case VLC_FOURCC('Y','V','Y','U'):
125                     p_vout->chroma.pf_convert = I420_YVYU;
126                     break;
127
128                 case VLC_FOURCC('U','Y','V','Y'):
129                 case VLC_FOURCC('U','Y','N','V'):
130                 case VLC_FOURCC('Y','4','2','2'):
131                     p_vout->chroma.pf_convert = I420_UYVY;
132                     break;
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134                 case VLC_FOURCC('I','U','Y','V'):
135                     p_vout->chroma.pf_convert = I420_IUYV;
136                     break;
137
138                 case VLC_FOURCC('c','y','u','v'):
139                     p_vout->chroma.pf_convert = I420_cyuv;
140                     break;
141 #endif
142
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144                 case VLC_FOURCC('Y','2','1','1'):
145                     p_vout->chroma.pf_convert = I420_Y211;
146                     break;
147 #endif
148
149                 default:
150                     return -1;
151             }
152             break;
153
154         default:
155             return -1;
156     }
157
158     return 0;
159 }
160
161 /* Following functions are local */
162
163 /*****************************************************************************
164  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
165  *****************************************************************************/
166 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
167                                               picture_t *p_dest )
168 {
169     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
170     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
171     uint8_t *p_u = p_source->U_PIXELS;
172     uint8_t *p_v = p_source->V_PIXELS;
173
174     int i_x, i_y;
175
176 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
177 #define VEC_NEXT_LINES( ) \
178     p_line1  = p_line2; \
179     p_line2 += p_dest->p->i_pitch; \
180     p_y1     = p_y2; \
181     p_y2    += p_source->p[Y_PLANE].i_pitch;
182
183 #define VEC_LOAD_UV( ) \
184     u_vec = vec_ld( 0, p_u ); p_u += 16; \
185     v_vec = vec_ld( 0, p_v ); p_v += 16;
186
187 #define VEC_MERGE( a ) \
188     uv_vec = a( u_vec, v_vec ); \
189     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
190     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
191     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
192     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
193     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
194     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
195
196     vector unsigned char u_vec;
197     vector unsigned char v_vec;
198     vector unsigned char uv_vec;
199     vector unsigned char y_vec;
200
201     if( !( ( p_vout->render.i_width % 32 ) |
202            ( p_vout->render.i_height % 2 ) ) )
203     {
204         /* Width is a multiple of 32, we take 2 lines at a time */
205         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
206         {
207             VEC_NEXT_LINES( );
208             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
209             {
210                 VEC_LOAD_UV( );
211                 VEC_MERGE( vec_mergeh );
212                 VEC_MERGE( vec_mergel );
213             }
214         }
215     }
216     else if( !( ( p_vout->render.i_width % 16 ) |
217                 ( p_vout->render.i_height % 4 ) ) )
218     {
219         /* Width is only a multiple of 16, we take 4 lines at a time */
220         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
221         {
222             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
223             VEC_NEXT_LINES( );
224             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
225             {
226                 VEC_LOAD_UV( );
227                 VEC_MERGE( vec_mergeh );
228                 VEC_MERGE( vec_mergel );
229             }
230
231             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
232             VEC_LOAD_UV( );
233             VEC_MERGE( vec_mergeh );
234
235             /* Line 3 and 4, pixels 0 to 16 */
236             VEC_NEXT_LINES( );
237             VEC_MERGE( vec_mergel );
238
239             /* Line 3 and 4, pixels 16 to ( width ) */
240             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
241             {
242                 VEC_LOAD_UV( );
243                 VEC_MERGE( vec_mergeh );
244                 VEC_MERGE( vec_mergel );
245             }
246         }
247     }
248     else
249     {
250         /* Crap, use the C version */
251 #undef VEC_NEXT_LINES
252 #undef VEC_LOAD_UV
253 #undef VEC_MERGE
254 #endif
255
256     const int i_source_margin = p_source->p[0].i_pitch
257                                  - p_source->p[0].i_visible_pitch;
258     const int i_source_margin_c = p_source->p[1].i_pitch
259                                  - p_source->p[1].i_visible_pitch;
260     const int i_dest_margin = p_dest->p->i_pitch
261                                - p_dest->p->i_visible_pitch;
262
263 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
264     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
265     {
266         p_line1 = p_line2;
267         p_line2 += p_dest->p->i_pitch;
268
269         p_y1 = p_y2;
270         p_y2 += p_source->p[Y_PLANE].i_pitch;
271
272 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
273         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
274         {
275             C_YUV420_YUYV( );
276             C_YUV420_YUYV( );
277             C_YUV420_YUYV( );
278             C_YUV420_YUYV( );
279         }
280 #else
281         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
282         {
283             MMX_CALL( MMX_YUV420_YUYV );
284         }
285 #endif
286         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
287         {
288             C_YUV420_YUYV( );
289         }
290
291         p_y1 += i_source_margin;
292         p_y2 += i_source_margin;
293         p_u += i_source_margin_c;
294         p_v += i_source_margin_c;
295         p_line1 += i_dest_margin;
296         p_line2 += i_dest_margin;
297     }
298
299 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
300     __asm__ __volatile__("emms" :: );
301 #endif
302
303 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
304     }
305 #endif
306
307 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
308     /*
309     ** SSE2 128 bytes fetch/store instructions are faster 
310     ** if memory access is 16 bytes aligned
311     */
312     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
313         ((int)p_line2|(int)p_y2))) )
314     {
315         /* use faster SSE2 aligned fetch and store */
316         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
317         {
318             p_line1 = p_line2;
319             p_line2 += p_dest->p->i_pitch;
320
321             p_y1 = p_y2;
322             p_y2 += p_source->p[Y_PLANE].i_pitch;
323
324             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
325             {
326                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
327             }
328             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
329             {
330                 C_YUV420_YUYV( );
331             }
332
333             p_y1 += i_source_margin;
334             p_y2 += i_source_margin;
335             p_u += i_source_margin_c;
336             p_v += i_source_margin_c;
337             p_line1 += i_dest_margin;
338             p_line2 += i_dest_margin;
339         }
340     }
341     else
342     {
343         /* use slower SSE2 unaligned fetch and store */
344         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
345         {
346             p_line1 = p_line2;
347             p_line2 += p_dest->p->i_pitch;
348
349             p_y1 = p_y2;
350             p_y2 += p_source->p[Y_PLANE].i_pitch;
351
352             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
353             {
354                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
355             }
356             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
357             {
358                 C_YUV420_YUYV( );
359             }
360
361             p_y1 += i_source_margin;
362             p_y2 += i_source_margin;
363             p_u += i_source_margin_c;
364             p_v += i_source_margin_c;
365             p_line1 += i_dest_margin;
366             p_line2 += i_dest_margin;
367         }
368     }
369 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
370 }
371
372 /*****************************************************************************
373  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
374  *****************************************************************************/
375 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
376                                               picture_t *p_dest )
377 {
378     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
379     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
380     uint8_t *p_u = p_source->U_PIXELS;
381     uint8_t *p_v = p_source->V_PIXELS;
382
383     int i_x, i_y;
384
385 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
386 #define VEC_NEXT_LINES( ) \
387     p_line1  = p_line2; \
388     p_line2 += p_dest->p->i_pitch; \
389     p_y1     = p_y2; \
390     p_y2    += p_source->p[Y_PLANE].i_pitch;
391
392 #define VEC_LOAD_UV( ) \
393     u_vec = vec_ld( 0, p_u ); p_u += 16; \
394     v_vec = vec_ld( 0, p_v ); p_v += 16;
395
396 #define VEC_MERGE( a ) \
397     vu_vec = a( v_vec, u_vec ); \
398     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
399     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
400     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
401     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
402     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
403     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
404
405     vector unsigned char u_vec;
406     vector unsigned char v_vec;
407     vector unsigned char vu_vec;
408     vector unsigned char y_vec;
409
410     if( !( ( p_vout->render.i_width % 32 ) |
411            ( p_vout->render.i_height % 2 ) ) )
412     {
413         /* Width is a multiple of 32, we take 2 lines at a time */
414         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
415         {
416             VEC_NEXT_LINES( );
417             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
418             {
419                 VEC_LOAD_UV( );
420                 VEC_MERGE( vec_mergeh );
421                 VEC_MERGE( vec_mergel );
422             }
423         }
424     }
425     else if( !( ( p_vout->render.i_width % 16 ) |
426                 ( p_vout->render.i_height % 4 ) ) )
427     {
428         /* Width is only a multiple of 16, we take 4 lines at a time */
429         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
430         {
431             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
432             VEC_NEXT_LINES( );
433             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
434             {
435                 VEC_LOAD_UV( );
436                 VEC_MERGE( vec_mergeh );
437                 VEC_MERGE( vec_mergel );
438             }
439
440             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
441             VEC_LOAD_UV( );
442             VEC_MERGE( vec_mergeh );
443
444             /* Line 3 and 4, pixels 0 to 16 */
445             VEC_NEXT_LINES( );
446             VEC_MERGE( vec_mergel );
447
448             /* Line 3 and 4, pixels 16 to ( width ) */
449             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
450             {
451                 VEC_LOAD_UV( );
452                 VEC_MERGE( vec_mergeh );
453                 VEC_MERGE( vec_mergel );
454             }
455         }
456     }
457     else
458     {
459         /* Crap, use the C version */
460 #undef VEC_NEXT_LINES
461 #undef VEC_LOAD_UV
462 #undef VEC_MERGE
463 #endif
464
465     const int i_source_margin = p_source->p[0].i_pitch
466                                  - p_source->p[0].i_visible_pitch;
467     const int i_source_margin_c = p_source->p[1].i_pitch
468                                  - p_source->p[1].i_visible_pitch;
469     const int i_dest_margin = p_dest->p->i_pitch
470                                - p_dest->p->i_visible_pitch;
471
472 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
473     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
474     {
475         p_line1 = p_line2;
476         p_line2 += p_dest->p->i_pitch;
477
478         p_y1 = p_y2;
479         p_y2 += p_source->p[Y_PLANE].i_pitch;
480
481         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
482         {
483 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
484             C_YUV420_YVYU( );
485             C_YUV420_YVYU( );
486             C_YUV420_YVYU( );
487             C_YUV420_YVYU( );
488 #else
489             MMX_CALL( MMX_YUV420_YVYU );
490 #endif
491         }
492
493         p_y1 += i_source_margin;
494         p_y2 += i_source_margin;
495         p_u += i_source_margin_c;
496         p_v += i_source_margin_c;
497         p_line1 += i_dest_margin;
498         p_line2 += i_dest_margin;
499     }
500
501 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
502     __asm__ __volatile__("emms" :: );
503 #endif
504
505 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
506     }
507 #endif
508
509 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
510     /*
511     ** SSE2 128 bytes fetch/store instructions are faster 
512     ** if memory access is 16 bytes aligned
513     */
514     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
515         ((int)p_line2|(int)p_y2))) )
516     {
517         /* use faster SSE2 aligned fetch and store */
518         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
519         {
520             p_line1 = p_line2;
521             p_line2 += p_dest->p->i_pitch;
522
523             p_y1 = p_y2;
524             p_y2 += p_source->p[Y_PLANE].i_pitch;
525
526             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
527             {
528                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
529             }
530             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
531             {
532                 C_YUV420_YVYU( );
533             }
534
535             p_y1 += i_source_margin;
536             p_y2 += i_source_margin;
537             p_u += i_source_margin_c;
538             p_v += i_source_margin_c;
539             p_line1 += i_dest_margin;
540             p_line2 += i_dest_margin;
541         }
542     }
543     else
544     {
545         /* use slower SSE2 unaligned fetch and store */
546         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
547         {
548             p_line1 = p_line2;
549             p_line2 += p_dest->p->i_pitch;
550
551             p_y1 = p_y2;
552             p_y2 += p_source->p[Y_PLANE].i_pitch;
553
554             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
555             {
556                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
557             }
558             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
559             {
560                 C_YUV420_YVYU( );
561             }
562
563             p_y1 += i_source_margin;
564             p_y2 += i_source_margin;
565             p_u += i_source_margin_c;
566             p_v += i_source_margin_c;
567             p_line1 += i_dest_margin;
568             p_line2 += i_dest_margin;
569         }
570     }
571 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
572 }
573
574 /*****************************************************************************
575  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
576  *****************************************************************************/
577 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
578                                               picture_t *p_dest )
579 {
580     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
581     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
582     uint8_t *p_u = p_source->U_PIXELS;
583     uint8_t *p_v = p_source->V_PIXELS;
584
585     int i_x, i_y;
586
587 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
588 #define VEC_NEXT_LINES( ) \
589     p_line1  = p_line2; \
590     p_line2 += p_dest->p->i_pitch; \
591     p_y1     = p_y2; \
592     p_y2    += p_source->p[Y_PLANE].i_pitch;
593
594 #define VEC_LOAD_UV( ) \
595     u_vec = vec_ld( 0, p_u ); p_u += 16; \
596     v_vec = vec_ld( 0, p_v ); p_v += 16;
597
598 #define VEC_MERGE( a ) \
599     uv_vec = a( u_vec, v_vec ); \
600     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
601     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
602     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
603     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
604     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
605     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
606
607     vector unsigned char u_vec;
608     vector unsigned char v_vec;
609     vector unsigned char uv_vec;
610     vector unsigned char y_vec;
611
612     if( !( ( p_vout->render.i_width % 32 ) |
613            ( p_vout->render.i_height % 2 ) ) )
614     {
615         /* Width is a multiple of 32, we take 2 lines at a time */
616         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
617         {
618             VEC_NEXT_LINES( );
619             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
620             {
621                 VEC_LOAD_UV( );
622                 VEC_MERGE( vec_mergeh );
623                 VEC_MERGE( vec_mergel );
624             }
625         }
626     }
627     else if( !( ( p_vout->render.i_width % 16 ) |
628                 ( p_vout->render.i_height % 4 ) ) )
629     {
630         /* Width is only a multiple of 16, we take 4 lines at a time */
631         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
632         {
633             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
634             VEC_NEXT_LINES( );
635             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
636             {
637                 VEC_LOAD_UV( );
638                 VEC_MERGE( vec_mergeh );
639                 VEC_MERGE( vec_mergel );
640             }
641
642             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
643             VEC_LOAD_UV( );
644             VEC_MERGE( vec_mergeh );
645
646             /* Line 3 and 4, pixels 0 to 16 */
647             VEC_NEXT_LINES( );
648             VEC_MERGE( vec_mergel );
649
650             /* Line 3 and 4, pixels 16 to ( width ) */
651             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
652             {
653                 VEC_LOAD_UV( );
654                 VEC_MERGE( vec_mergeh );
655                 VEC_MERGE( vec_mergel );
656             }
657         }
658     }
659     else
660     {
661         /* Crap, use the C version */
662 #undef VEC_NEXT_LINES
663 #undef VEC_LOAD_UV
664 #undef VEC_MERGE
665 #endif
666
667     const int i_source_margin = p_source->p[0].i_pitch
668                                  - p_source->p[0].i_visible_pitch;
669     const int i_source_margin_c = p_source->p[1].i_pitch
670                                  - p_source->p[1].i_visible_pitch;
671     const int i_dest_margin = p_dest->p->i_pitch
672                                - p_dest->p->i_visible_pitch;
673
674 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
675     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
676     {
677         p_line1 = p_line2;
678         p_line2 += p_dest->p->i_pitch;
679
680         p_y1 = p_y2;
681         p_y2 += p_source->p[Y_PLANE].i_pitch;
682
683         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
684         {
685 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
686             C_YUV420_UYVY( );
687             C_YUV420_UYVY( );
688             C_YUV420_UYVY( );
689             C_YUV420_UYVY( );
690 #else
691             MMX_CALL( MMX_YUV420_UYVY );
692 #endif
693         }
694         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
695         {
696             C_YUV420_UYVY( );
697         }
698
699         p_y1 += i_source_margin;
700         p_y2 += i_source_margin;
701         p_u += i_source_margin_c;
702         p_v += i_source_margin_c;
703         p_line1 += i_dest_margin;
704         p_line2 += i_dest_margin;
705     }
706
707 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
708     __asm__ __volatile__("emms" :: );
709 #endif
710
711 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
712     }
713 #endif
714
715 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
716     /*
717     ** SSE2 128 bytes fetch/store instructions are faster 
718     ** if memory access is 16 bytes aligned
719     */
720     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
721         ((int)p_line2|(int)p_y2))) )
722     {
723         /* use faster SSE2 aligned fetch and store */
724         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
725         {
726             p_line1 = p_line2;
727             p_line2 += p_dest->p->i_pitch;
728
729             p_y1 = p_y2;
730             p_y2 += p_source->p[Y_PLANE].i_pitch;
731
732             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
733             {
734                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
735             }
736             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
737             {
738                 C_YUV420_UYVY( );
739             }
740
741             p_y1 += i_source_margin;
742             p_y2 += i_source_margin;
743             p_u += i_source_margin_c;
744             p_v += i_source_margin_c;
745             p_line1 += i_dest_margin;
746             p_line2 += i_dest_margin;
747         }
748     }
749     else
750     {
751         /* use slower SSE2 unaligned fetch and store */
752         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
753         {
754             p_line1 = p_line2;
755             p_line2 += p_dest->p->i_pitch;
756
757             p_y1 = p_y2;
758             p_y2 += p_source->p[Y_PLANE].i_pitch;
759
760             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
761             {
762                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
763             }
764             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
765             {
766                 C_YUV420_UYVY( );
767             }
768
769             p_y1 += i_source_margin;
770             p_y2 += i_source_margin;
771             p_u += i_source_margin_c;
772             p_v += i_source_margin_c;
773             p_line1 += i_dest_margin;
774             p_line2 += i_dest_margin;
775         }
776     }
777 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
778 }
779
780 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
781 /*****************************************************************************
782  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
783  *****************************************************************************/
784 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
785                                               picture_t *p_dest )
786 {
787     /* FIXME: TODO ! */
788     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
789 }
790
791 /*****************************************************************************
792  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
793  *****************************************************************************/
794 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
795                                               picture_t *p_dest )
796 {
797     uint8_t *p_line1 = p_dest->p->p_pixels +
798                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
799                        + p_dest->p->i_pitch;
800     uint8_t *p_line2 = p_dest->p->p_pixels +
801                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
802     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
803     uint8_t *p_u = p_source->U_PIXELS;
804     uint8_t *p_v = p_source->V_PIXELS;
805
806     int i_x, i_y;
807
808     const int i_source_margin = p_source->p[0].i_pitch
809                                  - p_source->p[0].i_visible_pitch;
810     const int i_source_margin_c = p_source->p[1].i_pitch
811                                  - p_source->p[1].i_visible_pitch;
812     const int i_dest_margin = p_dest->p->i_pitch
813                                - p_dest->p->i_visible_pitch;
814
815 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
816     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
817     {
818         p_line1 -= 3 * p_dest->p->i_pitch;
819         p_line2 -= 3 * p_dest->p->i_pitch;
820
821         p_y1 = p_y2;
822         p_y2 += p_source->p[Y_PLANE].i_pitch;
823
824         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
825         {
826 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
827             C_YUV420_UYVY( );
828             C_YUV420_UYVY( );
829             C_YUV420_UYVY( );
830             C_YUV420_UYVY( );
831 #else
832             MMX_CALL( MMX_YUV420_UYVY );
833 #endif
834         }
835
836         p_y1 += i_source_margin;
837         p_y2 += i_source_margin;
838         p_u += i_source_margin_c;
839         p_v += i_source_margin_c;
840         p_line1 += i_dest_margin;
841         p_line2 += i_dest_margin;
842     }
843
844 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
845     __asm__ __volatile__("emms" :: );
846 #endif
847
848 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
849     /*
850     ** SSE2 128 bytes fetch/store instructions are faster 
851     ** if memory access is 16 bytes aligned
852     */
853     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
854         ((int)p_line2|(int)p_y2))) )
855     {
856         /* use faster SSE2 aligned fetch and store */
857         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
858         {
859             p_line1 = p_line2;
860             p_line2 += p_dest->p->i_pitch;
861
862             p_y1 = p_y2;
863             p_y2 += p_source->p[Y_PLANE].i_pitch;
864
865             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
866             {
867                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
868             }
869             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
870             {
871                 C_YUV420_UYVY( );
872             }
873
874             p_y1 += i_source_margin;
875             p_y2 += i_source_margin;
876             p_u += i_source_margin_c;
877             p_v += i_source_margin_c;
878             p_line1 += i_dest_margin;
879             p_line2 += i_dest_margin;
880         }
881     }
882     else
883     {
884         /* use slower SSE2 unaligned fetch and store */
885         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
886         {
887             p_line1 = p_line2;
888             p_line2 += p_dest->p->i_pitch;
889
890             p_y1 = p_y2;
891             p_y2 += p_source->p[Y_PLANE].i_pitch;
892
893             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
894             {
895                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
896             }
897             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
898             {
899                 C_YUV420_UYVY( );
900             }
901
902             p_y1 += i_source_margin;
903             p_y2 += i_source_margin;
904             p_u += i_source_margin_c;
905             p_v += i_source_margin_c;
906             p_line1 += i_dest_margin;
907             p_line2 += i_dest_margin;
908         }
909     }
910 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
911 }
912 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
913
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Plain C build only. Walks the source two lines at a time and, for every
 * 8 pixels of width, invokes C_YUV420_Y211 twice. That macro is defined
 * outside this file and is expected to read and advance the locals p_line1,
 * p_line2, p_y1, p_y2, p_u and p_v by name.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    /* Line strides of the luma plane and of the packed output */
    const int i_luma_pitch   = p_source->p[Y_PLANE].i_pitch;
    const int i_packed_pitch = p_dest->p->i_pitch;

    /* Padding left after the visible part of each line */
    const int i_luma_skip   = p_source->p[0].i_pitch
                               - p_source->p[0].i_visible_pitch;
    const int i_chroma_skip = p_source->p[1].i_pitch
                               - p_source->p[1].i_visible_pitch;
    const int i_packed_skip = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    int i_x;
    int i_y = p_vout->render.i_height / 2;

    while( i_y-- )
    {
        /* Move on to the next pair of lines */
        p_line1 = p_line2;
        p_line2 += i_packed_pitch;

        p_y1 = p_y2;
        p_y2 += i_luma_pitch;

        i_x = p_vout->render.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Step over the padding at the end of every line */
        p_line1 += i_packed_skip;
        p_line2 += i_packed_skip;
        p_y1 += i_luma_skip;
        p_y2 += i_luma_skip;
        p_u += i_chroma_skip;
        p_v += i_chroma_skip;
    }
}
#endif