git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Include vlc_plugin.h as needed
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc/vlc.h>
34 #include <vlc_plugin.h>
35 #include <vlc_vout.h>
36
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #   include <altivec.h>
39 #endif
40
41 #include "i420_yuy2.h"
42
43 #define SRC_FOURCC  "I420,IYUV,YV12"
44
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
53 #endif
54
55 /*****************************************************************************
56  * Local and extern prototypes.
57  *****************************************************************************/
58 static int  Activate ( vlc_object_t * );
59
60 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
61 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
63 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
64 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
65 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
66 #endif
67 #if defined (MODULE_NAME_IS_i420_yuy2)
68 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
69 #endif
70
71 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
72 /* Initialize MMX-specific constants */
73 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
74 static const uint64_t i_80w   = 0x0000000080808080ULL;
75 #endif
76
/*****************************************************************************
 * Module descriptor.
 *****************************************************************************
 * Exactly one MODULE_NAME_IS_* symbol is expected to be defined by the build;
 * it selects the description, the "chroma" capability score and the CPU
 * requirement declared for this variant of the module.
 *****************************************************************************/
vlc_module_begin();

#ifdef MODULE_NAME_IS_i420_yuy2
    /* Plain C variant: lowest score, no CPU requirement */
    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 80 );

#elif defined(MODULE_NAME_IS_i420_yuy2_mmx)
    /* MMX variant */
    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( MMX );

#elif defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* SSE2 variant: highest score */
    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 120 );
    add_requirement( SSE2 );

#elif defined(MODULE_NAME_IS_i420_yuy2_altivec)
    /* AltiVec variant */
    set_description(
            _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( ALTIVEC );

#endif

    /* Only an activation callback is registered */
    set_callbacks( Activate, NULL );

vlc_module_end();
100
101 /*****************************************************************************
102  * Activate: allocate a chroma function
103  *****************************************************************************
104  * This function allocates and initializes a chroma function
105  *****************************************************************************/
106 static int Activate( vlc_object_t *p_this )
107 {
108     vout_thread_t *p_vout = (vout_thread_t *)p_this;
109
110     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
111     {
112         return -1;
113     }
114
115     switch( p_vout->render.i_chroma )
116     {
117         case VLC_FOURCC('Y','V','1','2'):
118         case VLC_FOURCC('I','4','2','0'):
119         case VLC_FOURCC('I','Y','U','V'):
120             switch( p_vout->output.i_chroma )
121             {
122                 case VLC_FOURCC('Y','U','Y','2'):
123                 case VLC_FOURCC('Y','U','N','V'):
124                     p_vout->chroma.pf_convert = I420_YUY2;
125                     break;
126
127                 case VLC_FOURCC('Y','V','Y','U'):
128                     p_vout->chroma.pf_convert = I420_YVYU;
129                     break;
130
131                 case VLC_FOURCC('U','Y','V','Y'):
132                 case VLC_FOURCC('U','Y','N','V'):
133                 case VLC_FOURCC('Y','4','2','2'):
134                     p_vout->chroma.pf_convert = I420_UYVY;
135                     break;
136 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
137                 case VLC_FOURCC('I','U','Y','V'):
138                     p_vout->chroma.pf_convert = I420_IUYV;
139                     break;
140
141                 case VLC_FOURCC('c','y','u','v'):
142                     p_vout->chroma.pf_convert = I420_cyuv;
143                     break;
144 #endif
145
146 #if defined (MODULE_NAME_IS_i420_yuy2)
147                 case VLC_FOURCC('Y','2','1','1'):
148                     p_vout->chroma.pf_convert = I420_Y211;
149                     break;
150 #endif
151
152                 default:
153                     return -1;
154             }
155             break;
156
157         default:
158             return -1;
159     }
160
161     return 0;
162 }
163
#if 0
/* Debugging helper (compiled out): returns the x86 time-stamp counter.
 *
 * RDTSC delivers the counter in EDX:EAX. The two halves are read through
 * separate "=a"/"=d" outputs and recombined, because the "=A" constraint only
 * denotes the EDX:EAX pair on 32-bit x86; on x86-64 it would bind the 64-bit
 * value to a single register and return a wrong count. */
static inline unsigned long long read_cycles(void)
{
    unsigned int i_lo, i_hi;
    __asm__ __volatile__("rdtsc" : "=a" (i_lo), "=d" (i_hi) );

    return ( (unsigned long long)i_hi << 32 ) | i_lo;
}
#endif
173
174 /* Following functions are local */
175 /*****************************************************************************
176  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
177  *****************************************************************************/
178 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
179                                               picture_t *p_dest )
180 {
181     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
182     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
183     uint8_t *p_u = p_source->U_PIXELS;
184     uint8_t *p_v = p_source->V_PIXELS;
185
186     int i_x, i_y;
187
188 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
189 #define VEC_NEXT_LINES( ) \
190     p_line1  = p_line2; \
191     p_line2 += p_dest->p->i_pitch; \
192     p_y1     = p_y2; \
193     p_y2    += p_source->p[Y_PLANE].i_pitch;
194
195 #define VEC_LOAD_UV( ) \
196     u_vec = vec_ld( 0, p_u ); p_u += 16; \
197     v_vec = vec_ld( 0, p_v ); p_v += 16;
198
199 #define VEC_MERGE( a ) \
200     uv_vec = a( u_vec, v_vec ); \
201     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
202     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
203     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
204     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
205     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
206     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
207
208     vector unsigned char u_vec;
209     vector unsigned char v_vec;
210     vector unsigned char uv_vec;
211     vector unsigned char y_vec;
212
213     if( !( ( p_vout->render.i_width % 32 ) |
214            ( p_vout->render.i_height % 2 ) ) )
215     {
216         /* Width is a multiple of 32, we take 2 lines at a time */
217         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
218         {
219             VEC_NEXT_LINES( );
220             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
221             {
222                 VEC_LOAD_UV( );
223                 VEC_MERGE( vec_mergeh );
224                 VEC_MERGE( vec_mergel );
225             }
226         }
227     }
228     else if( !( ( p_vout->render.i_width % 16 ) |
229                 ( p_vout->render.i_height % 4 ) ) )
230     {
231         /* Width is only a multiple of 16, we take 4 lines at a time */
232         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
233         {
234             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
235             VEC_NEXT_LINES( );
236             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
237             {
238                 VEC_LOAD_UV( );
239                 VEC_MERGE( vec_mergeh );
240                 VEC_MERGE( vec_mergel );
241             }
242
243             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
244             VEC_LOAD_UV( );
245             VEC_MERGE( vec_mergeh );
246
247             /* Line 3 and 4, pixels 0 to 16 */
248             VEC_NEXT_LINES( );
249             VEC_MERGE( vec_mergel );
250
251             /* Line 3 and 4, pixels 16 to ( width ) */
252             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
253             {
254                 VEC_LOAD_UV( );
255                 VEC_MERGE( vec_mergeh );
256                 VEC_MERGE( vec_mergel );
257             }
258         }
259     }
260     else
261     {
262         /* Crap, use the C version */
263 #undef VEC_NEXT_LINES
264 #undef VEC_LOAD_UV
265 #undef VEC_MERGE
266 #endif
267
268     const int i_source_margin = p_source->p[0].i_pitch
269                                  - p_source->p[0].i_visible_pitch;
270     const int i_source_margin_c = p_source->p[1].i_pitch
271                                  - p_source->p[1].i_visible_pitch;
272     const int i_dest_margin = p_dest->p->i_pitch
273                                - p_dest->p->i_visible_pitch;
274
275 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
276     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
277     {
278         p_line1 = p_line2;
279         p_line2 += p_dest->p->i_pitch;
280
281         p_y1 = p_y2;
282         p_y2 += p_source->p[Y_PLANE].i_pitch;
283
284 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
285         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
286         {
287             C_YUV420_YUYV( );
288             C_YUV420_YUYV( );
289             C_YUV420_YUYV( );
290             C_YUV420_YUYV( );
291         }
292 #else
293         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
294         {
295             MMX_CALL( MMX_YUV420_YUYV );
296         }
297 #endif
298         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
299         {
300             C_YUV420_YUYV( );
301         }
302
303         p_y1 += i_source_margin;
304         p_y2 += i_source_margin;
305         p_u += i_source_margin_c;
306         p_v += i_source_margin_c;
307         p_line1 += i_dest_margin;
308         p_line2 += i_dest_margin;
309     }
310
311 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
312     /* re-enable FPU registers */
313     MMX_END;
314 #endif
315
316 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
317     }
318 #endif
319
320 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
321     /*
322     ** SSE2 128 bits fetch/store instructions are faster
323     ** if memory access is 16 bytes aligned
324     */
325
326     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
327         ((intptr_t)p_line2|(intptr_t)p_y2))) )
328     {
329         /* use faster SSE2 aligned fetch and store */
330         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
331         {
332             p_line1 = p_line2;
333             p_line2 += p_dest->p->i_pitch;
334
335             p_y1 = p_y2;
336             p_y2 += p_source->p[Y_PLANE].i_pitch;
337
338             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
339             {
340                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
341             }
342             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
343             {
344                 C_YUV420_YUYV( );
345             }
346
347             p_y1 += i_source_margin;
348             p_y2 += i_source_margin;
349             p_u += i_source_margin_c;
350             p_v += i_source_margin_c;
351             p_line1 += i_dest_margin;
352             p_line2 += i_dest_margin;
353         }
354     }
355     else
356     {
357         /* use slower SSE2 unaligned fetch and store */
358         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
359         {
360             p_line1 = p_line2;
361             p_line2 += p_dest->p->i_pitch;
362
363             p_y1 = p_y2;
364             p_y2 += p_source->p[Y_PLANE].i_pitch;
365
366             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
367             {
368                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
369             }
370             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
371             {
372                 C_YUV420_YUYV( );
373             }
374
375             p_y1 += i_source_margin;
376             p_y2 += i_source_margin;
377             p_u += i_source_margin_c;
378             p_v += i_source_margin_c;
379             p_line1 += i_dest_margin;
380             p_line2 += i_dest_margin;
381         }
382     }
383     /* make sure all SSE2 stores are visible thereafter */
384     SSE2_END;
385
386 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
387 }
388
389 /*****************************************************************************
390  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
391  *****************************************************************************/
392 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
393                                               picture_t *p_dest )
394 {
395     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
396     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
397     uint8_t *p_u = p_source->U_PIXELS;
398     uint8_t *p_v = p_source->V_PIXELS;
399
400     int i_x, i_y;
401
402 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
403 #define VEC_NEXT_LINES( ) \
404     p_line1  = p_line2; \
405     p_line2 += p_dest->p->i_pitch; \
406     p_y1     = p_y2; \
407     p_y2    += p_source->p[Y_PLANE].i_pitch;
408
409 #define VEC_LOAD_UV( ) \
410     u_vec = vec_ld( 0, p_u ); p_u += 16; \
411     v_vec = vec_ld( 0, p_v ); p_v += 16;
412
413 #define VEC_MERGE( a ) \
414     vu_vec = a( v_vec, u_vec ); \
415     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
416     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
417     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
418     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
419     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
420     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
421
422     vector unsigned char u_vec;
423     vector unsigned char v_vec;
424     vector unsigned char vu_vec;
425     vector unsigned char y_vec;
426
427     if( !( ( p_vout->render.i_width % 32 ) |
428            ( p_vout->render.i_height % 2 ) ) )
429     {
430         /* Width is a multiple of 32, we take 2 lines at a time */
431         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
432         {
433             VEC_NEXT_LINES( );
434             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
435             {
436                 VEC_LOAD_UV( );
437                 VEC_MERGE( vec_mergeh );
438                 VEC_MERGE( vec_mergel );
439             }
440         }
441     }
442     else if( !( ( p_vout->render.i_width % 16 ) |
443                 ( p_vout->render.i_height % 4 ) ) )
444     {
445         /* Width is only a multiple of 16, we take 4 lines at a time */
446         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
447         {
448             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
449             VEC_NEXT_LINES( );
450             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
451             {
452                 VEC_LOAD_UV( );
453                 VEC_MERGE( vec_mergeh );
454                 VEC_MERGE( vec_mergel );
455             }
456
457             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
458             VEC_LOAD_UV( );
459             VEC_MERGE( vec_mergeh );
460
461             /* Line 3 and 4, pixels 0 to 16 */
462             VEC_NEXT_LINES( );
463             VEC_MERGE( vec_mergel );
464
465             /* Line 3 and 4, pixels 16 to ( width ) */
466             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
467             {
468                 VEC_LOAD_UV( );
469                 VEC_MERGE( vec_mergeh );
470                 VEC_MERGE( vec_mergel );
471             }
472         }
473     }
474     else
475     {
476         /* Crap, use the C version */
477 #undef VEC_NEXT_LINES
478 #undef VEC_LOAD_UV
479 #undef VEC_MERGE
480 #endif
481
482     const int i_source_margin = p_source->p[0].i_pitch
483                                  - p_source->p[0].i_visible_pitch;
484     const int i_source_margin_c = p_source->p[1].i_pitch
485                                  - p_source->p[1].i_visible_pitch;
486     const int i_dest_margin = p_dest->p->i_pitch
487                                - p_dest->p->i_visible_pitch;
488
489 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
490     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
491     {
492         p_line1 = p_line2;
493         p_line2 += p_dest->p->i_pitch;
494
495         p_y1 = p_y2;
496         p_y2 += p_source->p[Y_PLANE].i_pitch;
497
498         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
499         {
500 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
501             C_YUV420_YVYU( );
502             C_YUV420_YVYU( );
503             C_YUV420_YVYU( );
504             C_YUV420_YVYU( );
505 #else
506             MMX_CALL( MMX_YUV420_YVYU );
507 #endif
508         }
509         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
510         {
511             C_YUV420_YVYU( );
512         }
513
514         p_y1 += i_source_margin;
515         p_y2 += i_source_margin;
516         p_u += i_source_margin_c;
517         p_v += i_source_margin_c;
518         p_line1 += i_dest_margin;
519         p_line2 += i_dest_margin;
520     }
521
522 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
523     /* re-enable FPU registers */
524     MMX_END;
525 #endif
526
527 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
528     }
529 #endif
530
531 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
532     /*
533     ** SSE2 128 bits fetch/store instructions are faster
534     ** if memory access is 16 bytes aligned
535     */
536     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
537         ((intptr_t)p_line2|(intptr_t)p_y2))) )
538     {
539         /* use faster SSE2 aligned fetch and store */
540         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
541         {
542             p_line1 = p_line2;
543             p_line2 += p_dest->p->i_pitch;
544
545             p_y1 = p_y2;
546             p_y2 += p_source->p[Y_PLANE].i_pitch;
547
548             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
549             {
550                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
551             }
552             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
553             {
554                 C_YUV420_YVYU( );
555             }
556
557             p_y1 += i_source_margin;
558             p_y2 += i_source_margin;
559             p_u += i_source_margin_c;
560             p_v += i_source_margin_c;
561             p_line1 += i_dest_margin;
562             p_line2 += i_dest_margin;
563         }
564     }
565     else
566     {
567         /* use slower SSE2 unaligned fetch and store */
568         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
569         {
570             p_line1 = p_line2;
571             p_line2 += p_dest->p->i_pitch;
572
573             p_y1 = p_y2;
574             p_y2 += p_source->p[Y_PLANE].i_pitch;
575
576             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
577             {
578                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
579             }
580             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
581             {
582                 C_YUV420_YVYU( );
583             }
584
585             p_y1 += i_source_margin;
586             p_y2 += i_source_margin;
587             p_u += i_source_margin_c;
588             p_v += i_source_margin_c;
589             p_line1 += i_dest_margin;
590             p_line2 += i_dest_margin;
591         }
592     }
593     /* make sure all SSE2 stores are visible thereafter */
594     SSE2_END;
595 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
596 }
597
598 /*****************************************************************************
599  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
600  *****************************************************************************/
601 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
602                                               picture_t *p_dest )
603 {
604     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
605     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
606     uint8_t *p_u = p_source->U_PIXELS;
607     uint8_t *p_v = p_source->V_PIXELS;
608
609     int i_x, i_y;
610
611 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
612 #define VEC_NEXT_LINES( ) \
613     p_line1  = p_line2; \
614     p_line2 += p_dest->p->i_pitch; \
615     p_y1     = p_y2; \
616     p_y2    += p_source->p[Y_PLANE].i_pitch;
617
618 #define VEC_LOAD_UV( ) \
619     u_vec = vec_ld( 0, p_u ); p_u += 16; \
620     v_vec = vec_ld( 0, p_v ); p_v += 16;
621
622 #define VEC_MERGE( a ) \
623     uv_vec = a( u_vec, v_vec ); \
624     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
625     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
626     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
627     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
628     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
629     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
630
631     vector unsigned char u_vec;
632     vector unsigned char v_vec;
633     vector unsigned char uv_vec;
634     vector unsigned char y_vec;
635
636     if( !( ( p_vout->render.i_width % 32 ) |
637            ( p_vout->render.i_height % 2 ) ) )
638     {
639         /* Width is a multiple of 32, we take 2 lines at a time */
640         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
641         {
642             VEC_NEXT_LINES( );
643             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
644             {
645                 VEC_LOAD_UV( );
646                 VEC_MERGE( vec_mergeh );
647                 VEC_MERGE( vec_mergel );
648             }
649         }
650     }
651     else if( !( ( p_vout->render.i_width % 16 ) |
652                 ( p_vout->render.i_height % 4 ) ) )
653     {
654         /* Width is only a multiple of 16, we take 4 lines at a time */
655         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
656         {
657             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
658             VEC_NEXT_LINES( );
659             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
660             {
661                 VEC_LOAD_UV( );
662                 VEC_MERGE( vec_mergeh );
663                 VEC_MERGE( vec_mergel );
664             }
665
666             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
667             VEC_LOAD_UV( );
668             VEC_MERGE( vec_mergeh );
669
670             /* Line 3 and 4, pixels 0 to 16 */
671             VEC_NEXT_LINES( );
672             VEC_MERGE( vec_mergel );
673
674             /* Line 3 and 4, pixels 16 to ( width ) */
675             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
676             {
677                 VEC_LOAD_UV( );
678                 VEC_MERGE( vec_mergeh );
679                 VEC_MERGE( vec_mergel );
680             }
681         }
682     }
683     else
684     {
685         /* Crap, use the C version */
686 #undef VEC_NEXT_LINES
687 #undef VEC_LOAD_UV
688 #undef VEC_MERGE
689 #endif
690
691     const int i_source_margin = p_source->p[0].i_pitch
692                                  - p_source->p[0].i_visible_pitch;
693     const int i_source_margin_c = p_source->p[1].i_pitch
694                                  - p_source->p[1].i_visible_pitch;
695     const int i_dest_margin = p_dest->p->i_pitch
696                                - p_dest->p->i_visible_pitch;
697
698 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
699     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
700     {
701         p_line1 = p_line2;
702         p_line2 += p_dest->p->i_pitch;
703
704         p_y1 = p_y2;
705         p_y2 += p_source->p[Y_PLANE].i_pitch;
706
707         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
708         {
709 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
710             C_YUV420_UYVY( );
711             C_YUV420_UYVY( );
712             C_YUV420_UYVY( );
713             C_YUV420_UYVY( );
714 #else
715             MMX_CALL( MMX_YUV420_UYVY );
716 #endif
717         }
718         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
719         {
720             C_YUV420_UYVY( );
721         }
722
723         p_y1 += i_source_margin;
724         p_y2 += i_source_margin;
725         p_u += i_source_margin_c;
726         p_v += i_source_margin_c;
727         p_line1 += i_dest_margin;
728         p_line2 += i_dest_margin;
729     }
730
731 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
732     /* re-enable FPU registers */
733     MMX_END;
734 #endif
735
736 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
737     }
738 #endif
739
740 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
741     /*
742     ** SSE2 128 bits fetch/store instructions are faster
743     ** if memory access is 16 bytes aligned
744     */
745     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
746         ((intptr_t)p_line2|(intptr_t)p_y2))) )
747     {
748         /* use faster SSE2 aligned fetch and store */
749         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
750         {
751             p_line1 = p_line2;
752             p_line2 += p_dest->p->i_pitch;
753
754             p_y1 = p_y2;
755             p_y2 += p_source->p[Y_PLANE].i_pitch;
756
757             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
758             {
759                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
760             }
761             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
762             {
763                 C_YUV420_UYVY( );
764             }
765
766             p_y1 += i_source_margin;
767             p_y2 += i_source_margin;
768             p_u += i_source_margin_c;
769             p_v += i_source_margin_c;
770             p_line1 += i_dest_margin;
771             p_line2 += i_dest_margin;
772         }
773     }
774     else
775     {
776         /* use slower SSE2 unaligned fetch and store */
777         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
778         {
779             p_line1 = p_line2;
780             p_line2 += p_dest->p->i_pitch;
781
782             p_y1 = p_y2;
783             p_y2 += p_source->p[Y_PLANE].i_pitch;
784
785             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
786             {
787                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
788             }
789             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
790             {
791                 C_YUV420_UYVY( );
792             }
793
794             p_y1 += i_source_margin;
795             p_y2 += i_source_margin;
796             p_u += i_source_margin_c;
797             p_v += i_source_margin_c;
798             p_line1 += i_dest_margin;
799             p_line2 += i_dest_margin;
800         }
801     }
802     /* make sure all SSE2 stores are visible thereafter */
803     SSE2_END;
804 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
805 }
806
807 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
808 /*****************************************************************************
809  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
810  *****************************************************************************/
811 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
812                                               picture_t *p_dest )
813 {
814     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
815     /* FIXME: TODO ! */
816     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
817 }
818
819 /*****************************************************************************
820  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
821  *****************************************************************************/
822 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
823                                               picture_t *p_dest )
824 {
825     uint8_t *p_line1 = p_dest->p->p_pixels +
826                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
827                        + p_dest->p->i_pitch;
828     uint8_t *p_line2 = p_dest->p->p_pixels +
829                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
830     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
831     uint8_t *p_u = p_source->U_PIXELS;
832     uint8_t *p_v = p_source->V_PIXELS;
833
834     int i_x, i_y;
835
836     const int i_source_margin = p_source->p[0].i_pitch
837                                  - p_source->p[0].i_visible_pitch;
838     const int i_source_margin_c = p_source->p[1].i_pitch
839                                  - p_source->p[1].i_visible_pitch;
840     const int i_dest_margin = p_dest->p->i_pitch
841                                - p_dest->p->i_visible_pitch;
842
843 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
844     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
845     {
846         p_line1 -= 3 * p_dest->p->i_pitch;
847         p_line2 -= 3 * p_dest->p->i_pitch;
848
849         p_y1 = p_y2;
850         p_y2 += p_source->p[Y_PLANE].i_pitch;
851
852         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
853         {
854 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
855             C_YUV420_UYVY( );
856             C_YUV420_UYVY( );
857             C_YUV420_UYVY( );
858             C_YUV420_UYVY( );
859 #else
860             MMX_CALL( MMX_YUV420_UYVY );
861 #endif
862         }
863         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
864         {
865             C_YUV420_UYVY( );
866         }
867
868         p_y1 += i_source_margin;
869         p_y2 += i_source_margin;
870         p_u += i_source_margin_c;
871         p_v += i_source_margin_c;
872         p_line1 += i_dest_margin;
873         p_line2 += i_dest_margin;
874     }
875
876 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
877     /* re-enable FPU registers */
878     MMX_END;
879 #endif
880
881 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
882     /*
883     ** SSE2 128 bits fetch/store instructions are faster
884     ** if memory access is 16 bytes aligned
885     */
886     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
887         ((intptr_t)p_line2|(intptr_t)p_y2))) )
888     {
889         /* use faster SSE2 aligned fetch and store */
890         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
891         {
892             p_line1 = p_line2;
893             p_line2 += p_dest->p->i_pitch;
894
895             p_y1 = p_y2;
896             p_y2 += p_source->p[Y_PLANE].i_pitch;
897
898             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
899             {
900                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
901             }
902             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
903             {
904                 C_YUV420_UYVY( );
905             }
906
907             p_y1 += i_source_margin;
908             p_y2 += i_source_margin;
909             p_u += i_source_margin_c;
910             p_v += i_source_margin_c;
911             p_line1 += i_dest_margin;
912             p_line2 += i_dest_margin;
913         }
914     }
915     else
916     {
917         /* use slower SSE2 unaligned fetch and store */
918         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
919         {
920             p_line1 = p_line2;
921             p_line2 += p_dest->p->i_pitch;
922
923             p_y1 = p_y2;
924             p_y2 += p_source->p[Y_PLANE].i_pitch;
925
926             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
927             {
928                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
929             }
930             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
931             {
932                 C_YUV420_UYVY( );
933             }
934
935             p_y1 += i_source_margin;
936             p_y2 += i_source_margin;
937             p_u += i_source_margin_c;
938             p_v += i_source_margin_c;
939             p_line1 += i_dest_margin;
940             p_line2 += i_dest_margin;
941         }
942     }
943     /* make sure all SSE2 stores are visible thereafter */
944     SSE2_END;
945 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
946 }
947 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
948
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Only built in the plain C variant. Each pass of the outer loop handles two
 * source luma lines (p_y1, p_y2) and two destination lines (p_line1,
 * p_line2); the per-pixel work is done by the C_YUV420_Y211 macro (assumed
 * to come from i420_yuy2.h), which uses the local pointers by name.
 * There is no loop for the pixels left over when the width is not a
 * multiple of 8.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_line1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_y1;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* Bytes between the end of the visible part of a line and the next line */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    i_y = p_vout->render.i_height / 2;
    while( i_y-- )
    {
        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        /* 8 source pixels per iteration */
        i_x = p_vout->render.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Skip the non-visible end of each line */
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }
}
#endif