]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
chroma conversion: optimize MMX/SSE acceleration even more by leveraging out of...
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
#include <stdint.h>                                           /* uintptr_t */
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                            /* strerror() */

#include <vlc/vlc.h>
#include <vlc_vout.h>

#if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
#   include <altivec.h>
#endif

#include "i420_yuy2.h"
39
/* Source chromas accepted by every flavour of this module. */
#define SRC_FOURCC  "I420,IYUV,YV12"

/*
 * Destination chromas, one list per build flavour. These strings are only
 * pasted into the module description below; the chromas that are really
 * accepted are decided by the switch in Activate().
 */
#if defined (MODULE_NAME_IS_i420_yuy2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx) \
   || defined (MODULE_NAME_IS_i420_yuy2_sse2)
     /* The MMX and SSE2 flavours handle the same set (no Y211) */
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#endif
51
52 /*****************************************************************************
53  * Local and extern prototypes.
54  *****************************************************************************/
55 static int  Activate ( vlc_object_t * );
56
57 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
63 #endif
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
66 #endif
67
#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* 64-bit constants for the MMX flavour.
 * NOTE(review): no code visible in this file reads them; presumably they are
 * referenced by the MMX macros of i420_yuy2.h -- confirm before removing. */
static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; /* 0x00ff in each word */
static const uint64_t i_80w   = 0x0000000080808080ULL; /* 0x80 in 4 low bytes */
#endif
73
/*****************************************************************************
 * Module descriptor.
 *****************************************************************************
 * One descriptor is emitted per build flavour (selected by the
 * MODULE_NAME_IS_* macro). Each flavour registers the "chroma" capability
 * with its own score -- 80 for plain C, 100 for MMX and AltiVec, 120 for
 * SSE2 -- and, for the accelerated flavours, the CPU feature it requires.
 * Activate() is the open callback; there is no close callback.
 *****************************************************************************/
vlc_module_begin();
#if defined (MODULE_NAME_IS_i420_yuy2)
    /* Portable C implementation */
    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 80 );
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* MMX implementation */
    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( MMX );
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
    /* SSE2 implementation */
    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 120 );
    add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* AltiVec implementation */
    set_description(
            _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( ALTIVEC );
#endif
    set_callbacks( Activate, NULL );
vlc_module_end();
97
98 /*****************************************************************************
99  * Activate: allocate a chroma function
100  *****************************************************************************
101  * This function allocates and initializes a chroma function
102  *****************************************************************************/
103 static int Activate( vlc_object_t *p_this )
104 {
105     vout_thread_t *p_vout = (vout_thread_t *)p_this;
106
107     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
108     {
109         return -1;
110     }
111
112     switch( p_vout->render.i_chroma )
113     {
114         case VLC_FOURCC('Y','V','1','2'):
115         case VLC_FOURCC('I','4','2','0'):
116         case VLC_FOURCC('I','Y','U','V'):
117             switch( p_vout->output.i_chroma )
118             {
119                 case VLC_FOURCC('Y','U','Y','2'):
120                 case VLC_FOURCC('Y','U','N','V'):
121                     p_vout->chroma.pf_convert = I420_YUY2;
122                     break;
123
124                 case VLC_FOURCC('Y','V','Y','U'):
125                     p_vout->chroma.pf_convert = I420_YVYU;
126                     break;
127
128                 case VLC_FOURCC('U','Y','V','Y'):
129                 case VLC_FOURCC('U','Y','N','V'):
130                 case VLC_FOURCC('Y','4','2','2'):
131                     p_vout->chroma.pf_convert = I420_UYVY;
132                     break;
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134                 case VLC_FOURCC('I','U','Y','V'):
135                     p_vout->chroma.pf_convert = I420_IUYV;
136                     break;
137
138                 case VLC_FOURCC('c','y','u','v'):
139                     p_vout->chroma.pf_convert = I420_cyuv;
140                     break;
141 #endif
142
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144                 case VLC_FOURCC('Y','2','1','1'):
145                     p_vout->chroma.pf_convert = I420_Y211;
146                     break;
147 #endif
148
149                 default:
150                     return -1;
151             }
152             break;
153
154         default:
155             return -1;
156     }
157
158     return 0;
159 }
160
#if 0
/*
 * Debug helper (compiled out): returns the x86 time-stamp counter.
 *
 * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX. The two
 * halves are read through separate "=a"/"=d" outputs because the "=A"
 * constraint only means the EDX:EAX pair on 32-bit x86; on x86-64 it names a
 * single register and the result would be wrong.
 */
static inline unsigned long long read_cycles(void)
{
    unsigned int i_lo, i_hi;

    __asm__ __volatile__("rdtsc" : "=a" (i_lo), "=d" (i_hi));

    return ( (unsigned long long)i_hi << 32 ) | i_lo;
}
#endif
170
171 /* Following functions are local */
172 /*****************************************************************************
173  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
174  *****************************************************************************/
175 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
176                                               picture_t *p_dest )
177 {
178     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
179     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
180     uint8_t *p_u = p_source->U_PIXELS;
181     uint8_t *p_v = p_source->V_PIXELS;
182
183     int i_x, i_y;
184
185 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
186 #define VEC_NEXT_LINES( ) \
187     p_line1  = p_line2; \
188     p_line2 += p_dest->p->i_pitch; \
189     p_y1     = p_y2; \
190     p_y2    += p_source->p[Y_PLANE].i_pitch;
191
192 #define VEC_LOAD_UV( ) \
193     u_vec = vec_ld( 0, p_u ); p_u += 16; \
194     v_vec = vec_ld( 0, p_v ); p_v += 16;
195
196 #define VEC_MERGE( a ) \
197     uv_vec = a( u_vec, v_vec ); \
198     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
199     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
200     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
201     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
202     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
203     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
204
205     vector unsigned char u_vec;
206     vector unsigned char v_vec;
207     vector unsigned char uv_vec;
208     vector unsigned char y_vec;
209
210     if( !( ( p_vout->render.i_width % 32 ) |
211            ( p_vout->render.i_height % 2 ) ) )
212     {
213         /* Width is a multiple of 32, we take 2 lines at a time */
214         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
215         {
216             VEC_NEXT_LINES( );
217             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
218             {
219                 VEC_LOAD_UV( );
220                 VEC_MERGE( vec_mergeh );
221                 VEC_MERGE( vec_mergel );
222             }
223         }
224     }
225     else if( !( ( p_vout->render.i_width % 16 ) |
226                 ( p_vout->render.i_height % 4 ) ) )
227     {
228         /* Width is only a multiple of 16, we take 4 lines at a time */
229         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
230         {
231             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
232             VEC_NEXT_LINES( );
233             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
234             {
235                 VEC_LOAD_UV( );
236                 VEC_MERGE( vec_mergeh );
237                 VEC_MERGE( vec_mergel );
238             }
239
240             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
241             VEC_LOAD_UV( );
242             VEC_MERGE( vec_mergeh );
243
244             /* Line 3 and 4, pixels 0 to 16 */
245             VEC_NEXT_LINES( );
246             VEC_MERGE( vec_mergel );
247
248             /* Line 3 and 4, pixels 16 to ( width ) */
249             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
250             {
251                 VEC_LOAD_UV( );
252                 VEC_MERGE( vec_mergeh );
253                 VEC_MERGE( vec_mergel );
254             }
255         }
256     }
257     else
258     {
259         /* Crap, use the C version */
260 #undef VEC_NEXT_LINES
261 #undef VEC_LOAD_UV
262 #undef VEC_MERGE
263 #endif
264
265     const int i_source_margin = p_source->p[0].i_pitch
266                                  - p_source->p[0].i_visible_pitch;
267     const int i_source_margin_c = p_source->p[1].i_pitch
268                                  - p_source->p[1].i_visible_pitch;
269     const int i_dest_margin = p_dest->p->i_pitch
270                                - p_dest->p->i_visible_pitch;
271
272 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
273     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
274     {
275         p_line1 = p_line2;
276         p_line2 += p_dest->p->i_pitch;
277
278         p_y1 = p_y2;
279         p_y2 += p_source->p[Y_PLANE].i_pitch;
280
281 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
282         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
283         {
284             C_YUV420_YUYV( );
285             C_YUV420_YUYV( );
286             C_YUV420_YUYV( );
287             C_YUV420_YUYV( );
288         }
289 #else
290         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
291         {
292             MMX_CALL( MMX_YUV420_YUYV );
293         }
294 #endif
295         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
296         {
297             C_YUV420_YUYV( );
298         }
299
300         p_y1 += i_source_margin;
301         p_y2 += i_source_margin;
302         p_u += i_source_margin_c;
303         p_v += i_source_margin_c;
304         p_line1 += i_dest_margin;
305         p_line2 += i_dest_margin;
306     }
307
308 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
309     __asm__ __volatile__("emms" :: );
310 #endif
311
312 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
313     }
314 #endif
315
316 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
317     /*
318     ** SSE2 128 bits fetch/store instructions are faster 
319     ** if memory access is 16 bytes aligned
320     */
321
322     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
323         ((int)p_line2|(int)p_y2))) )
324     {
325         /* use faster SSE2 aligned fetch and store */
326         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
327         {
328             p_line1 = p_line2;
329             p_line2 += p_dest->p->i_pitch;
330
331             p_y1 = p_y2;
332             p_y2 += p_source->p[Y_PLANE].i_pitch;
333
334             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
335             {
336                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
337             }
338             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
339             {
340                 C_YUV420_YUYV( );
341             }
342
343             p_y1 += i_source_margin;
344             p_y2 += i_source_margin;
345             p_u += i_source_margin_c;
346             p_v += i_source_margin_c;
347             p_line1 += i_dest_margin;
348             p_line2 += i_dest_margin;
349         }
350     }
351     else
352     {
353         /* use slower SSE2 unaligned fetch and store */
354         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
355         {
356             p_line1 = p_line2;
357             p_line2 += p_dest->p->i_pitch;
358
359             p_y1 = p_y2;
360             p_y2 += p_source->p[Y_PLANE].i_pitch;
361
362             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
363             {
364                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
365             }
366             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
367             {
368                 C_YUV420_YUYV( );
369             }
370
371             p_y1 += i_source_margin;
372             p_y2 += i_source_margin;
373             p_u += i_source_margin_c;
374             p_v += i_source_margin_c;
375             p_line1 += i_dest_margin;
376             p_line2 += i_dest_margin;
377         }
378     }
379
380 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
381 }
382
383 /*****************************************************************************
384  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
385  *****************************************************************************/
386 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
387                                               picture_t *p_dest )
388 {
389     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
390     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
391     uint8_t *p_u = p_source->U_PIXELS;
392     uint8_t *p_v = p_source->V_PIXELS;
393
394     int i_x, i_y;
395
396 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
397 #define VEC_NEXT_LINES( ) \
398     p_line1  = p_line2; \
399     p_line2 += p_dest->p->i_pitch; \
400     p_y1     = p_y2; \
401     p_y2    += p_source->p[Y_PLANE].i_pitch;
402
403 #define VEC_LOAD_UV( ) \
404     u_vec = vec_ld( 0, p_u ); p_u += 16; \
405     v_vec = vec_ld( 0, p_v ); p_v += 16;
406
407 #define VEC_MERGE( a ) \
408     vu_vec = a( v_vec, u_vec ); \
409     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
410     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
411     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
412     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
413     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
414     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
415
416     vector unsigned char u_vec;
417     vector unsigned char v_vec;
418     vector unsigned char vu_vec;
419     vector unsigned char y_vec;
420
421     if( !( ( p_vout->render.i_width % 32 ) |
422            ( p_vout->render.i_height % 2 ) ) )
423     {
424         /* Width is a multiple of 32, we take 2 lines at a time */
425         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
426         {
427             VEC_NEXT_LINES( );
428             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
429             {
430                 VEC_LOAD_UV( );
431                 VEC_MERGE( vec_mergeh );
432                 VEC_MERGE( vec_mergel );
433             }
434         }
435     }
436     else if( !( ( p_vout->render.i_width % 16 ) |
437                 ( p_vout->render.i_height % 4 ) ) )
438     {
439         /* Width is only a multiple of 16, we take 4 lines at a time */
440         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
441         {
442             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
443             VEC_NEXT_LINES( );
444             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
445             {
446                 VEC_LOAD_UV( );
447                 VEC_MERGE( vec_mergeh );
448                 VEC_MERGE( vec_mergel );
449             }
450
451             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
452             VEC_LOAD_UV( );
453             VEC_MERGE( vec_mergeh );
454
455             /* Line 3 and 4, pixels 0 to 16 */
456             VEC_NEXT_LINES( );
457             VEC_MERGE( vec_mergel );
458
459             /* Line 3 and 4, pixels 16 to ( width ) */
460             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
461             {
462                 VEC_LOAD_UV( );
463                 VEC_MERGE( vec_mergeh );
464                 VEC_MERGE( vec_mergel );
465             }
466         }
467     }
468     else
469     {
470         /* Crap, use the C version */
471 #undef VEC_NEXT_LINES
472 #undef VEC_LOAD_UV
473 #undef VEC_MERGE
474 #endif
475
476     const int i_source_margin = p_source->p[0].i_pitch
477                                  - p_source->p[0].i_visible_pitch;
478     const int i_source_margin_c = p_source->p[1].i_pitch
479                                  - p_source->p[1].i_visible_pitch;
480     const int i_dest_margin = p_dest->p->i_pitch
481                                - p_dest->p->i_visible_pitch;
482
483 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
484     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
485     {
486         p_line1 = p_line2;
487         p_line2 += p_dest->p->i_pitch;
488
489         p_y1 = p_y2;
490         p_y2 += p_source->p[Y_PLANE].i_pitch;
491
492         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
493         {
494 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
495             C_YUV420_YVYU( );
496             C_YUV420_YVYU( );
497             C_YUV420_YVYU( );
498             C_YUV420_YVYU( );
499 #else
500             MMX_CALL( MMX_YUV420_YVYU );
501 #endif
502         }
503         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
504         {
505             C_YUV420_YVYU( );
506         }
507
508         p_y1 += i_source_margin;
509         p_y2 += i_source_margin;
510         p_u += i_source_margin_c;
511         p_v += i_source_margin_c;
512         p_line1 += i_dest_margin;
513         p_line2 += i_dest_margin;
514     }
515
516 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
517     __asm__ __volatile__("emms" :: );
518 #endif
519
520 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
521     }
522 #endif
523
524 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
525     /*
526     ** SSE2 128 bits fetch/store instructions are faster 
527     ** if memory access is 16 bytes aligned
528     */
529     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
530         ((int)p_line2|(int)p_y2))) )
531     {
532         /* use faster SSE2 aligned fetch and store */
533         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
534         {
535             p_line1 = p_line2;
536             p_line2 += p_dest->p->i_pitch;
537
538             p_y1 = p_y2;
539             p_y2 += p_source->p[Y_PLANE].i_pitch;
540
541             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
542             {
543                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
544             }
545             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
546             {
547                 C_YUV420_YVYU( );
548             }
549
550             p_y1 += i_source_margin;
551             p_y2 += i_source_margin;
552             p_u += i_source_margin_c;
553             p_v += i_source_margin_c;
554             p_line1 += i_dest_margin;
555             p_line2 += i_dest_margin;
556         }
557     }
558     else
559     {
560         /* use slower SSE2 unaligned fetch and store */
561         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
562         {
563             p_line1 = p_line2;
564             p_line2 += p_dest->p->i_pitch;
565
566             p_y1 = p_y2;
567             p_y2 += p_source->p[Y_PLANE].i_pitch;
568
569             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
570             {
571                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
572             }
573             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
574             {
575                 C_YUV420_YVYU( );
576             }
577
578             p_y1 += i_source_margin;
579             p_y2 += i_source_margin;
580             p_u += i_source_margin_c;
581             p_v += i_source_margin_c;
582             p_line1 += i_dest_margin;
583             p_line2 += i_dest_margin;
584         }
585     }
586 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
587 }
588
589 /*****************************************************************************
590  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
591  *****************************************************************************/
592 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
593                                               picture_t *p_dest )
594 {
595     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
596     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
597     uint8_t *p_u = p_source->U_PIXELS;
598     uint8_t *p_v = p_source->V_PIXELS;
599
600     int i_x, i_y;
601
602 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
603 #define VEC_NEXT_LINES( ) \
604     p_line1  = p_line2; \
605     p_line2 += p_dest->p->i_pitch; \
606     p_y1     = p_y2; \
607     p_y2    += p_source->p[Y_PLANE].i_pitch;
608
609 #define VEC_LOAD_UV( ) \
610     u_vec = vec_ld( 0, p_u ); p_u += 16; \
611     v_vec = vec_ld( 0, p_v ); p_v += 16;
612
613 #define VEC_MERGE( a ) \
614     uv_vec = a( u_vec, v_vec ); \
615     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
616     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
617     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
618     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
619     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
620     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
621
622     vector unsigned char u_vec;
623     vector unsigned char v_vec;
624     vector unsigned char uv_vec;
625     vector unsigned char y_vec;
626
627     if( !( ( p_vout->render.i_width % 32 ) |
628            ( p_vout->render.i_height % 2 ) ) )
629     {
630         /* Width is a multiple of 32, we take 2 lines at a time */
631         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
632         {
633             VEC_NEXT_LINES( );
634             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
635             {
636                 VEC_LOAD_UV( );
637                 VEC_MERGE( vec_mergeh );
638                 VEC_MERGE( vec_mergel );
639             }
640         }
641     }
642     else if( !( ( p_vout->render.i_width % 16 ) |
643                 ( p_vout->render.i_height % 4 ) ) )
644     {
645         /* Width is only a multiple of 16, we take 4 lines at a time */
646         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
647         {
648             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
649             VEC_NEXT_LINES( );
650             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
651             {
652                 VEC_LOAD_UV( );
653                 VEC_MERGE( vec_mergeh );
654                 VEC_MERGE( vec_mergel );
655             }
656
657             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
658             VEC_LOAD_UV( );
659             VEC_MERGE( vec_mergeh );
660
661             /* Line 3 and 4, pixels 0 to 16 */
662             VEC_NEXT_LINES( );
663             VEC_MERGE( vec_mergel );
664
665             /* Line 3 and 4, pixels 16 to ( width ) */
666             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
667             {
668                 VEC_LOAD_UV( );
669                 VEC_MERGE( vec_mergeh );
670                 VEC_MERGE( vec_mergel );
671             }
672         }
673     }
674     else
675     {
676         /* Crap, use the C version */
677 #undef VEC_NEXT_LINES
678 #undef VEC_LOAD_UV
679 #undef VEC_MERGE
680 #endif
681
682     const int i_source_margin = p_source->p[0].i_pitch
683                                  - p_source->p[0].i_visible_pitch;
684     const int i_source_margin_c = p_source->p[1].i_pitch
685                                  - p_source->p[1].i_visible_pitch;
686     const int i_dest_margin = p_dest->p->i_pitch
687                                - p_dest->p->i_visible_pitch;
688
689 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
690     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
691     {
692         p_line1 = p_line2;
693         p_line2 += p_dest->p->i_pitch;
694
695         p_y1 = p_y2;
696         p_y2 += p_source->p[Y_PLANE].i_pitch;
697
698         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
699         {
700 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
701             C_YUV420_UYVY( );
702             C_YUV420_UYVY( );
703             C_YUV420_UYVY( );
704             C_YUV420_UYVY( );
705 #else
706             MMX_CALL( MMX_YUV420_UYVY );
707 #endif
708         }
709         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
710         {
711             C_YUV420_UYVY( );
712         }
713
714         p_y1 += i_source_margin;
715         p_y2 += i_source_margin;
716         p_u += i_source_margin_c;
717         p_v += i_source_margin_c;
718         p_line1 += i_dest_margin;
719         p_line2 += i_dest_margin;
720     }
721
722 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
723     __asm__ __volatile__("emms" :: );
724 #endif
725
726 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
727     }
728 #endif
729
730 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
731     /*
732     ** SSE2 128 bits fetch/store instructions are faster 
733     ** if memory access is 16 bytes aligned
734     */
735     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
736         ((int)p_line2|(int)p_y2))) )
737     {
738         /* use faster SSE2 aligned fetch and store */
739         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
740         {
741             p_line1 = p_line2;
742             p_line2 += p_dest->p->i_pitch;
743
744             p_y1 = p_y2;
745             p_y2 += p_source->p[Y_PLANE].i_pitch;
746
747             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
748             {
749                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
750             }
751             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
752             {
753                 C_YUV420_UYVY( );
754             }
755
756             p_y1 += i_source_margin;
757             p_y2 += i_source_margin;
758             p_u += i_source_margin_c;
759             p_v += i_source_margin_c;
760             p_line1 += i_dest_margin;
761             p_line2 += i_dest_margin;
762         }
763     }
764     else
765     {
766         /* use slower SSE2 unaligned fetch and store */
767         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
768         {
769             p_line1 = p_line2;
770             p_line2 += p_dest->p->i_pitch;
771
772             p_y1 = p_y2;
773             p_y2 += p_source->p[Y_PLANE].i_pitch;
774
775             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
776             {
777                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
778             }
779             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
780             {
781                 C_YUV420_UYVY( );
782             }
783
784             p_y1 += i_source_margin;
785             p_y2 += i_source_margin;
786             p_u += i_source_margin_c;
787             p_v += i_source_margin_c;
788             p_line1 += i_dest_margin;
789             p_line2 += i_dest_margin;
790         }
791     }
792 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
793 }
794
795 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
796 /*****************************************************************************
797  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
798  *****************************************************************************/
799 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
800                                               picture_t *p_dest )
801 {
802     /* FIXME: TODO ! */
803     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
804 }
805
806 /*****************************************************************************
807  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
808  *****************************************************************************/
809 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
810                                               picture_t *p_dest )
811 {
812     uint8_t *p_line1 = p_dest->p->p_pixels +
813                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
814                        + p_dest->p->i_pitch;
815     uint8_t *p_line2 = p_dest->p->p_pixels +
816                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
817     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
818     uint8_t *p_u = p_source->U_PIXELS;
819     uint8_t *p_v = p_source->V_PIXELS;
820
821     int i_x, i_y;
822
823     const int i_source_margin = p_source->p[0].i_pitch
824                                  - p_source->p[0].i_visible_pitch;
825     const int i_source_margin_c = p_source->p[1].i_pitch
826                                  - p_source->p[1].i_visible_pitch;
827     const int i_dest_margin = p_dest->p->i_pitch
828                                - p_dest->p->i_visible_pitch;
829
830 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
831     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
832     {
833         p_line1 -= 3 * p_dest->p->i_pitch;
834         p_line2 -= 3 * p_dest->p->i_pitch;
835
836         p_y1 = p_y2;
837         p_y2 += p_source->p[Y_PLANE].i_pitch;
838
839         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
840         {
841 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
842             C_YUV420_UYVY( );
843             C_YUV420_UYVY( );
844             C_YUV420_UYVY( );
845             C_YUV420_UYVY( );
846 #else
847             MMX_CALL( MMX_YUV420_UYVY );
848 #endif
849         }
850         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
851         {
852             C_YUV420_UYVY( );
853         }
854
855         p_y1 += i_source_margin;
856         p_y2 += i_source_margin;
857         p_u += i_source_margin_c;
858         p_v += i_source_margin_c;
859         p_line1 += i_dest_margin;
860         p_line2 += i_dest_margin;
861     }
862
863 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
864     __asm__ __volatile__("emms" :: );
865 #endif
866
867 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
868     /*
869     ** SSE2 128 bits fetch/store instructions are faster 
870     ** if memory access is 16 bytes aligned
871     */
872     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
873         ((int)p_line2|(int)p_y2))) )
874     {
875         /* use faster SSE2 aligned fetch and store */
876         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
877         {
878             p_line1 = p_line2;
879             p_line2 += p_dest->p->i_pitch;
880
881             p_y1 = p_y2;
882             p_y2 += p_source->p[Y_PLANE].i_pitch;
883
884             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
885             {
886                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
887             }
888             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
889             {
890                 C_YUV420_UYVY( );
891             }
892
893             p_y1 += i_source_margin;
894             p_y2 += i_source_margin;
895             p_u += i_source_margin_c;
896             p_v += i_source_margin_c;
897             p_line1 += i_dest_margin;
898             p_line2 += i_dest_margin;
899         }
900     }
901     else
902     {
903         /* use slower SSE2 unaligned fetch and store */
904         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
905         {
906             p_line1 = p_line2;
907             p_line2 += p_dest->p->i_pitch;
908
909             p_y1 = p_y2;
910             p_y2 += p_source->p[Y_PLANE].i_pitch;
911
912             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
913             {
914                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
915             }
916             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
917             {
918                 C_YUV420_UYVY( );
919             }
920
921             p_y1 += i_source_margin;
922             p_y2 += i_source_margin;
923             p_u += i_source_margin_c;
924             p_v += i_source_margin_c;
925             p_line1 += i_dest_margin;
926             p_line2 += i_dest_margin;
927         }
928     }
929 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
930 }
931 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
932
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Plain C flavour only. p_vout supplies the render width and height,
 * p_source the three planes and p_dest the packed plane that is written.
 * Two luma lines are handled per iteration; the C_YUV420_Y211 macro is
 * invoked twice for every block of 8 pixels.
 *
 * NOTE(review): unlike the other converters there is no loop for the pixels
 * left over when the width is not a multiple of 8 -- confirm whether that is
 * intended.
 *
 * C_YUV420_Y211 is not defined in this file (presumably it comes from
 * i420_yuy2.h). It takes no arguments, so it must work on the local pointers
 * p_line1, p_line2, p_y1, p_y2, p_u and p_v by name; keep these names.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* Bytes to skip at the end of each line to reach the next one */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    i_y = p_vout->render.i_height / 2;
    while( i_y-- )
    {
        /* Select the next pair of output lines and of luma lines */
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* Blocks of 8 pixels, two macro invocations each */
        i_x = p_vout->render.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Skip the padding at the end of every line */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }
}
#endif