]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Removes trailing spaces. Removes tabs.
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #include <vlc/vlc.h>
30 #include <vlc_vout.h>
31
32 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
33 #   include <altivec.h>
34 #endif
35
36 #include "i420_yuy2.h"
37
38 #define SRC_FOURCC  "I420,IYUV,YV12"
39
40 #if defined (MODULE_NAME_IS_i420_yuy2)
41 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
42 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
43 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
48 #endif
49
/*****************************************************************************
 * Local and extern prototypes.
 *****************************************************************************
 * Activate is the module entry point registered in the module descriptor.
 * The I420_* functions are the conversion routines Activate may install in
 * p_vout->chroma.pf_convert; which of them exist depends on the module
 * variant being compiled (see the MODULE_NAME_IS_* tests below).
 *****************************************************************************/
static int  Activate ( vlc_object_t * );

static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
/* The AltiVec variant provides neither the IUYV nor the cyuv conversion */
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
#endif
/* Y211 output is only available in the plain C variant */
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
#endif
65
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
/* Initialize MMX-specific constants */
/* NOTE(review): neither constant is referenced directly in this file;
 * presumably the MMX conversion macros from i420_yuy2.h use them as memory
 * operands -- confirm before renaming or removing them. */
static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
static const uint64_t i_80w   = 0x0000000080808080ULL;
#endif
71
/*****************************************************************************
 * Module descriptor.
 *****************************************************************************
 * One descriptor is emitted per build variant. Every variant registers the
 * "chroma" capability and the Activate callback (no close callback); they
 * differ by description string, capability score and required CPU feature.
 * NOTE(review): the scores (C: 80, MMX: 100, SSE2: 120, AltiVec: 100)
 * presumably make the core prefer the accelerated variants -- confirm
 * against the module loader.
 *****************************************************************************/
vlc_module_begin();
#if defined (MODULE_NAME_IS_i420_yuy2)
    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 80 );
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( MMX );
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 120 );
    add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
    set_description(
            _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( ALTIVEC );
#endif
    set_callbacks( Activate, NULL );
vlc_module_end();
95
96 /*****************************************************************************
97  * Activate: allocate a chroma function
98  *****************************************************************************
99  * This function allocates and initializes a chroma function
100  *****************************************************************************/
101 static int Activate( vlc_object_t *p_this )
102 {
103     vout_thread_t *p_vout = (vout_thread_t *)p_this;
104
105     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
106     {
107         return -1;
108     }
109
110     switch( p_vout->render.i_chroma )
111     {
112         case VLC_FOURCC('Y','V','1','2'):
113         case VLC_FOURCC('I','4','2','0'):
114         case VLC_FOURCC('I','Y','U','V'):
115             switch( p_vout->output.i_chroma )
116             {
117                 case VLC_FOURCC('Y','U','Y','2'):
118                 case VLC_FOURCC('Y','U','N','V'):
119                     p_vout->chroma.pf_convert = I420_YUY2;
120                     break;
121
122                 case VLC_FOURCC('Y','V','Y','U'):
123                     p_vout->chroma.pf_convert = I420_YVYU;
124                     break;
125
126                 case VLC_FOURCC('U','Y','V','Y'):
127                 case VLC_FOURCC('U','Y','N','V'):
128                 case VLC_FOURCC('Y','4','2','2'):
129                     p_vout->chroma.pf_convert = I420_UYVY;
130                     break;
131 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
132                 case VLC_FOURCC('I','U','Y','V'):
133                     p_vout->chroma.pf_convert = I420_IUYV;
134                     break;
135
136                 case VLC_FOURCC('c','y','u','v'):
137                     p_vout->chroma.pf_convert = I420_cyuv;
138                     break;
139 #endif
140
141 #if defined (MODULE_NAME_IS_i420_yuy2)
142                 case VLC_FOURCC('Y','2','1','1'):
143                     p_vout->chroma.pf_convert = I420_Y211;
144                     break;
145 #endif
146
147                 default:
148                     return -1;
149             }
150             break;
151
152         default:
153             return -1;
154     }
155
156     return 0;
157 }
158
#if 0
/* Disabled (compiled out) helper returning the value produced by the x86
 * "rdtsc" instruction; nothing in this file calls it.
 * NOTE(review): the "=A" constraint presumably only maps to the edx:eax
 * register pair on 32-bit x86 -- confirm before re-enabling on x86-64. */
static inline unsigned long long read_cycles(void)
{
    unsigned long long v;
    __asm__ __volatile__("rdtsc" : "=A" (v): );

    return v;
}
#endif
168
169 /* Following functions are local */
/*****************************************************************************
 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *****************************************************************************
 * Converts p_source into the first plane of p_dest, two picture lines per
 * outer iteration: p_y1/p_y2 walk a pair of luma lines, p_line1/p_line2 the
 * matching pair of packed lines, and p_u/p_v are advanced once per pair.
 * The picture size is taken from p_vout->render.
 * The inner loops depend on the module variant being compiled (plain C,
 * MMX, SSE2 or AltiVec). The C_*, MMX_* and SSE2_* conversion macros are not
 * defined in this file (presumably they come from i420_yuy2.h) and operate
 * directly on the local pointers declared below.
 *****************************************************************************/
static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Move on to the next pair of lines: the previous second line becomes the
 * first one, for both the packed output and the luma input */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 U and 16 V samples and advance both chroma pointers */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* Interleave one half of the loaded U/V samples (selected by a, either
 * vec_mergeh or vec_mergel) with 16 luma samples of each of the two lines,
 * storing 32 bytes per line in Y,U,Y,V order */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* NOTE(review): neither vector path below adds any pitch margin, and
     * VEC_NEXT_LINES restarts from wherever the previous stores ended; this
     * looks like it relies on pitch == visible pitch (and presumably on
     * 16-byte aligned planes for vec_ld/vec_st) -- confirm. */
    if( !( ( p_vout->render.i_width % 32 ) |
           ( p_vout->render.i_height % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else if( !( ( p_vout->render.i_width % 16 ) |
                ( p_vout->render.i_height % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            /* No new chroma load here: the second half of the U/V vectors
             * loaded just above is consumed for the next pair of lines */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* Crap, use the C version */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane, the chroma planes and the output */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C / MMX loop, also the fallback of the AltiVec variant */
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
        for( i_x = p_vout->render.i_width / 8; i_x-- ; )
        {
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
        }
#else
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
        {
            MMX_CALL( MMX_YUV420_YUYV );
        }
#endif
        /* Remaining pixels of the line, one C macro call per 2 pixels */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV420_YUYV( );
        }

        /* Skip the non-visible end of every line just processed */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* closes the "use the C version" else branch opened above */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    /* Aligned path only when both pitches and both start addresses are
     * multiples of 16 (chroma pointers are not part of this test) */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
            }
            /* Remaining pixels of the line, handled by the C macro */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YUYV( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
            }
            /* Remaining pixels of the line, handled by the C macro */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YUYV( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
383
/*****************************************************************************
 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *****************************************************************************
 * Converts p_source into the first plane of p_dest, two picture lines per
 * outer iteration: p_y1/p_y2 walk a pair of luma lines, p_line1/p_line2 the
 * matching pair of packed lines, and p_u/p_v are advanced once per pair.
 * The picture size is taken from p_vout->render.
 * The inner loops depend on the module variant being compiled (plain C,
 * MMX, SSE2 or AltiVec). The C_*, MMX_* and SSE2_* conversion macros are not
 * defined in this file (presumably they come from i420_yuy2.h) and operate
 * directly on the local pointers declared below.
 *****************************************************************************/
static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Move on to the next pair of lines: the previous second line becomes the
 * first one, for both the packed output and the luma input */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 U and 16 V samples and advance both chroma pointers */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* Interleave one half of the loaded V/U samples (selected by a, either
 * vec_mergeh or vec_mergel; V comes first here) with 16 luma samples of
 * each of the two lines, storing 32 bytes per line in Y,V,Y,U order */
#define VEC_MERGE( a ) \
    vu_vec = a( v_vec, u_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char vu_vec;
    vector unsigned char y_vec;

    /* NOTE(review): neither vector path below adds any pitch margin, and
     * VEC_NEXT_LINES restarts from wherever the previous stores ended; this
     * looks like it relies on pitch == visible pitch (and presumably on
     * 16-byte aligned planes for vec_ld/vec_st) -- confirm. */
    if( !( ( p_vout->render.i_width % 32 ) |
           ( p_vout->render.i_height % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else if( !( ( p_vout->render.i_width % 16 ) |
                ( p_vout->render.i_height % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            /* No new chroma load here: the second half of the U/V vectors
             * loaded just above is consumed for the next pair of lines */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* Crap, use the C version */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane, the chroma planes and the output */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C / MMX loop, also the fallback of the AltiVec variant */
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
        {
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
#else
            MMX_CALL( MMX_YUV420_YVYU );
#endif
        }
        /* Remaining pixels of the line, one C macro call per 2 pixels */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV420_YVYU( );
        }

        /* Skip the non-visible end of every line just processed */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* closes the "use the C version" else branch opened above */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* Aligned path only when both pitches and both start addresses are
     * multiples of 16 (chroma pointers are not part of this test) */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
            }
            /* Remaining pixels of the line, handled by the C macro */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YVYU( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
            }
            /* Remaining pixels of the line, handled by the C macro */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YVYU( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
592
/*****************************************************************************
 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *****************************************************************************
 * Converts p_source into the first plane of p_dest, two picture lines per
 * outer iteration: p_y1/p_y2 walk a pair of luma lines, p_line1/p_line2 the
 * matching pair of packed lines, and p_u/p_v are advanced once per pair.
 * The picture size is taken from p_vout->render.
 * The inner loops depend on the module variant being compiled (plain C,
 * MMX, SSE2 or AltiVec). The C_*, MMX_* and SSE2_* conversion macros are not
 * defined in this file (presumably they come from i420_yuy2.h) and operate
 * directly on the local pointers declared below.
 *****************************************************************************/
static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Move on to the next pair of lines: the previous second line becomes the
 * first one, for both the packed output and the luma input */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 U and 16 V samples and advance both chroma pointers */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* Interleave one half of the loaded U/V samples (selected by a, either
 * vec_mergeh or vec_mergel) with 16 luma samples of each of the two lines;
 * chroma is the first merge operand here, so 32 bytes per line are stored
 * in U,Y,V,Y order */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* NOTE(review): neither vector path below adds any pitch margin, and
     * VEC_NEXT_LINES restarts from wherever the previous stores ended; this
     * looks like it relies on pitch == visible pitch (and presumably on
     * 16-byte aligned planes for vec_ld/vec_st) -- confirm. */
    if( !( ( p_vout->render.i_width % 32 ) |
           ( p_vout->render.i_height % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else if( !( ( p_vout->render.i_width % 16 ) |
                ( p_vout->render.i_height % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            /* No new chroma load here: the second half of the U/V vectors
             * loaded just above is consumed for the next pair of lines */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* Crap, use the C version */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane, the chroma planes and the output */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C / MMX loop, also the fallback of the AltiVec variant */
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
        {
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
#else
            MMX_CALL( MMX_YUV420_UYVY );
#endif
        }
        /* Remaining pixels of the line, one C macro call per 2 pixels */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
        {
            C_YUV420_UYVY( );
        }

        /* Skip the non-visible end of every line just processed */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* closes the "use the C version" else branch opened above */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* Aligned path only when both pitches and both start addresses are
     * multiples of 16 (chroma pointers are not part of this test) */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
            }
            /* Remaining pixels of the line, handled by the C macro */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_UYVY( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
            }
            /* Remaining pixels of the line, handled by the C macro */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_UYVY( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
801
802 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
803 /*****************************************************************************
804  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
805  *****************************************************************************/
806 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
807                                               picture_t *p_dest )
808 {
809     /* FIXME: TODO ! */
810     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
811 }
812
813 /*****************************************************************************
814  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
815  *****************************************************************************/
816 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
817                                               picture_t *p_dest )
818 {
819     uint8_t *p_line1 = p_dest->p->p_pixels +
820                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
821                        + p_dest->p->i_pitch;
822     uint8_t *p_line2 = p_dest->p->p_pixels +
823                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
824     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
825     uint8_t *p_u = p_source->U_PIXELS;
826     uint8_t *p_v = p_source->V_PIXELS;
827
828     int i_x, i_y;
829
830     const int i_source_margin = p_source->p[0].i_pitch
831                                  - p_source->p[0].i_visible_pitch;
832     const int i_source_margin_c = p_source->p[1].i_pitch
833                                  - p_source->p[1].i_visible_pitch;
834     const int i_dest_margin = p_dest->p->i_pitch
835                                - p_dest->p->i_visible_pitch;
836
837 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
838     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
839     {
840         p_line1 -= 3 * p_dest->p->i_pitch;
841         p_line2 -= 3 * p_dest->p->i_pitch;
842
843         p_y1 = p_y2;
844         p_y2 += p_source->p[Y_PLANE].i_pitch;
845
846         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
847         {
848 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
849             C_YUV420_UYVY( );
850             C_YUV420_UYVY( );
851             C_YUV420_UYVY( );
852             C_YUV420_UYVY( );
853 #else
854             MMX_CALL( MMX_YUV420_UYVY );
855 #endif
856         }
857         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
858         {
859             C_YUV420_UYVY( );
860         }
861
862         p_y1 += i_source_margin;
863         p_y2 += i_source_margin;
864         p_u += i_source_margin_c;
865         p_v += i_source_margin_c;
866         p_line1 += i_dest_margin;
867         p_line2 += i_dest_margin;
868     }
869
870 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
871     /* re-enable FPU registers */
872     MMX_END;
873 #endif
874
875 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
876     /*
877     ** SSE2 128 bits fetch/store instructions are faster
878     ** if memory access is 16 bytes aligned
879     */
880     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
881         ((intptr_t)p_line2|(intptr_t)p_y2))) )
882     {
883         /* use faster SSE2 aligned fetch and store */
884         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
885         {
886             p_line1 = p_line2;
887             p_line2 += p_dest->p->i_pitch;
888
889             p_y1 = p_y2;
890             p_y2 += p_source->p[Y_PLANE].i_pitch;
891
892             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
893             {
894                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
895             }
896             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
897             {
898                 C_YUV420_UYVY( );
899             }
900
901             p_y1 += i_source_margin;
902             p_y2 += i_source_margin;
903             p_u += i_source_margin_c;
904             p_v += i_source_margin_c;
905             p_line1 += i_dest_margin;
906             p_line2 += i_dest_margin;
907         }
908     }
909     else
910     {
911         /* use slower SSE2 unaligned fetch and store */
912         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
913         {
914             p_line1 = p_line2;
915             p_line2 += p_dest->p->i_pitch;
916
917             p_y1 = p_y2;
918             p_y2 += p_source->p[Y_PLANE].i_pitch;
919
920             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
921             {
922                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
923             }
924             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
925             {
926                 C_YUV420_UYVY( );
927             }
928
929             p_y1 += i_source_margin;
930             p_y2 += i_source_margin;
931             p_u += i_source_margin_c;
932             p_v += i_source_margin_c;
933             p_line1 += i_dest_margin;
934             p_line2 += i_dest_margin;
935         }
936     }
937     /* make sure all SSE2 stores are visible thereafter */
938     SSE2_END;
939 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
940 }
941 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
942
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Plain C variant only. Walks the source two luma lines at a time and
 * writes the matching two lines of the first plane of p_dest; the picture
 * size is taken from p_vout->render. The C_YUV420_Y211 macro is not defined
 * in this file (presumably it comes from i420_yuy2.h) and works directly on
 * the local pointers below.
 * NOTE(review): only width / 8 groups are converted per line and there is
 * no loop for the remainder -- confirm this is intended for widths that are
 * not a multiple of 8.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for the luma plane, the chroma planes and the output */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* One iteration per pair of picture lines */
    i_y = p_vout->render.i_height / 2;
    while( i_y-- )
    {
        /* The previous second line becomes the first line of this pair */
        p_line1 = p_line2;
        p_y1 = p_y2;
        p_line2 += p_dest->p->i_pitch;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* Two macro invocations per group of 8 pixels */
        i_x = p_vout->render.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Skip the non-visible end of every line just processed */
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
    }
}
#endif