]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Remove stdlib.h
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
#include <stdint.h>                                             /* uintptr_t */
#include <string.h>                                            /* strerror() */

#include <vlc/vlc.h>
#include <vlc_vout.h>
32
33 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
34 #   include <altivec.h>
35 #endif
36
37 #include "i420_yuy2.h"
38
/* Input chromas accepted by every build of this module (see Activate). */
#define SRC_FOURCC  "I420,IYUV,YV12"

/*
 * Output chromas, used only to build the human-readable module description.
 * The list depends on which variant of the module is being compiled:
 *  - the plain C build is the only one that lists Y211,
 *  - the AltiVec build lists neither IUYV nor cyuv.
 */
#if defined (MODULE_NAME_IS_i420_yuy2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#endif
50
51 /*****************************************************************************
52  * Local and extern prototypes.
53  *****************************************************************************/
/* Module activation callback, registered through set_callbacks() below. */
static int  Activate ( vlc_object_t * );

/* Converters available in every build; Activate() installs one of them in
 * p_vout->chroma.pf_convert. */
static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
/* Converters that the AltiVec build does not provide. */
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
#endif
/* Converter that only the plain C build provides. */
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
#endif
66
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
/* Initialize MMX-specific constants */
/* NOTE(review): neither constant is referenced in this file; presumably the
 * MMX macros in i420_yuy2.h use them - confirm before removing. */
/* 0x00ff in each of the four 16-bit words */
static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* 0x80 in each of the four low bytes, upper four bytes zero */
static const uint64_t i_80w   = 0x0000000080808080ULL;
#endif
72
73 /*****************************************************************************
74  * Module descriptor.
75  *****************************************************************************/
/*
 * One descriptor per build variant. Each variant registers the "chroma"
 * capability with its own number (80 for plain C, 100 for MMX and AltiVec,
 * 120 for SSE2) and, for the SIMD variants, the matching CPU requirement.
 * NOTE(review): the number is presumably the module priority used when
 * several chroma modules compete - confirm against the module loader.
 */
vlc_module_begin();
#if defined (MODULE_NAME_IS_i420_yuy2)
    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 80 );
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( MMX );
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 120 );
    add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
    set_description(
            _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( ALTIVEC );
#endif
    /* Activate is the only callback supplied; the second slot is NULL. */
    set_callbacks( Activate, NULL );
vlc_module_end();
96
97 /*****************************************************************************
98  * Activate: allocate a chroma function
99  *****************************************************************************
100  * This function allocates and initializes a chroma function
101  *****************************************************************************/
102 static int Activate( vlc_object_t *p_this )
103 {
104     vout_thread_t *p_vout = (vout_thread_t *)p_this;
105
106     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
107     {
108         return -1;
109     }
110
111     switch( p_vout->render.i_chroma )
112     {
113         case VLC_FOURCC('Y','V','1','2'):
114         case VLC_FOURCC('I','4','2','0'):
115         case VLC_FOURCC('I','Y','U','V'):
116             switch( p_vout->output.i_chroma )
117             {
118                 case VLC_FOURCC('Y','U','Y','2'):
119                 case VLC_FOURCC('Y','U','N','V'):
120                     p_vout->chroma.pf_convert = I420_YUY2;
121                     break;
122
123                 case VLC_FOURCC('Y','V','Y','U'):
124                     p_vout->chroma.pf_convert = I420_YVYU;
125                     break;
126
127                 case VLC_FOURCC('U','Y','V','Y'):
128                 case VLC_FOURCC('U','Y','N','V'):
129                 case VLC_FOURCC('Y','4','2','2'):
130                     p_vout->chroma.pf_convert = I420_UYVY;
131                     break;
132 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
133                 case VLC_FOURCC('I','U','Y','V'):
134                     p_vout->chroma.pf_convert = I420_IUYV;
135                     break;
136
137                 case VLC_FOURCC('c','y','u','v'):
138                     p_vout->chroma.pf_convert = I420_cyuv;
139                     break;
140 #endif
141
142 #if defined (MODULE_NAME_IS_i420_yuy2)
143                 case VLC_FOURCC('Y','2','1','1'):
144                     p_vout->chroma.pf_convert = I420_Y211;
145                     break;
146 #endif
147
148                 default:
149                     return -1;
150             }
151             break;
152
153         default:
154             return -1;
155     }
156
157     return 0;
158 }
159
#if 0
/*
 * Compiled out by the surrounding "#if 0".
 * Executes the x86 "rdtsc" instruction and returns the value it leaves in
 * the register operand bound to v.
 */
static inline unsigned long long read_cycles(void)
{
    unsigned long long v;
    /* NOTE(review): the "=A" constraint is documented by GCC as the edx:eax
     * pair only on 32-bit x86; confirm behaviour before enabling this on
     * x86-64. */
    __asm__ __volatile__("rdtsc" : "=A" (v): );

    return v;
}
#endif
169
170 /* Following functions are local */
171 /*****************************************************************************
172  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
173  *****************************************************************************/
174 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
175                                               picture_t *p_dest )
176 {
177     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
178     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
179     uint8_t *p_u = p_source->U_PIXELS;
180     uint8_t *p_v = p_source->V_PIXELS;
181
182     int i_x, i_y;
183
184 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
185 #define VEC_NEXT_LINES( ) \
186     p_line1  = p_line2; \
187     p_line2 += p_dest->p->i_pitch; \
188     p_y1     = p_y2; \
189     p_y2    += p_source->p[Y_PLANE].i_pitch;
190
191 #define VEC_LOAD_UV( ) \
192     u_vec = vec_ld( 0, p_u ); p_u += 16; \
193     v_vec = vec_ld( 0, p_v ); p_v += 16;
194
195 #define VEC_MERGE( a ) \
196     uv_vec = a( u_vec, v_vec ); \
197     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
198     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
199     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
200     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
201     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
202     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
203
204     vector unsigned char u_vec;
205     vector unsigned char v_vec;
206     vector unsigned char uv_vec;
207     vector unsigned char y_vec;
208
209     if( !( ( p_vout->render.i_width % 32 ) |
210            ( p_vout->render.i_height % 2 ) ) )
211     {
212         /* Width is a multiple of 32, we take 2 lines at a time */
213         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
214         {
215             VEC_NEXT_LINES( );
216             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
217             {
218                 VEC_LOAD_UV( );
219                 VEC_MERGE( vec_mergeh );
220                 VEC_MERGE( vec_mergel );
221             }
222         }
223     }
224     else if( !( ( p_vout->render.i_width % 16 ) |
225                 ( p_vout->render.i_height % 4 ) ) )
226     {
227         /* Width is only a multiple of 16, we take 4 lines at a time */
228         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
229         {
230             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
231             VEC_NEXT_LINES( );
232             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
233             {
234                 VEC_LOAD_UV( );
235                 VEC_MERGE( vec_mergeh );
236                 VEC_MERGE( vec_mergel );
237             }
238
239             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
240             VEC_LOAD_UV( );
241             VEC_MERGE( vec_mergeh );
242
243             /* Line 3 and 4, pixels 0 to 16 */
244             VEC_NEXT_LINES( );
245             VEC_MERGE( vec_mergel );
246
247             /* Line 3 and 4, pixels 16 to ( width ) */
248             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
249             {
250                 VEC_LOAD_UV( );
251                 VEC_MERGE( vec_mergeh );
252                 VEC_MERGE( vec_mergel );
253             }
254         }
255     }
256     else
257     {
258         /* Crap, use the C version */
259 #undef VEC_NEXT_LINES
260 #undef VEC_LOAD_UV
261 #undef VEC_MERGE
262 #endif
263
264     const int i_source_margin = p_source->p[0].i_pitch
265                                  - p_source->p[0].i_visible_pitch;
266     const int i_source_margin_c = p_source->p[1].i_pitch
267                                  - p_source->p[1].i_visible_pitch;
268     const int i_dest_margin = p_dest->p->i_pitch
269                                - p_dest->p->i_visible_pitch;
270
271 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
272     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
273     {
274         p_line1 = p_line2;
275         p_line2 += p_dest->p->i_pitch;
276
277         p_y1 = p_y2;
278         p_y2 += p_source->p[Y_PLANE].i_pitch;
279
280 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
281         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
282         {
283             C_YUV420_YUYV( );
284             C_YUV420_YUYV( );
285             C_YUV420_YUYV( );
286             C_YUV420_YUYV( );
287         }
288 #else
289         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
290         {
291             MMX_CALL( MMX_YUV420_YUYV );
292         }
293 #endif
294         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
295         {
296             C_YUV420_YUYV( );
297         }
298
299         p_y1 += i_source_margin;
300         p_y2 += i_source_margin;
301         p_u += i_source_margin_c;
302         p_v += i_source_margin_c;
303         p_line1 += i_dest_margin;
304         p_line2 += i_dest_margin;
305     }
306
307 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
308     /* re-enable FPU registers */
309     MMX_END;
310 #endif
311
312 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
313     }
314 #endif
315
316 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
317     /*
318     ** SSE2 128 bits fetch/store instructions are faster 
319     ** if memory access is 16 bytes aligned
320     */
321
322     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
323         ((int)p_line2|(int)p_y2))) )
324     {
325         /* use faster SSE2 aligned fetch and store */
326         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
327         {
328             p_line1 = p_line2;
329             p_line2 += p_dest->p->i_pitch;
330
331             p_y1 = p_y2;
332             p_y2 += p_source->p[Y_PLANE].i_pitch;
333
334             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
335             {
336                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
337             }
338             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
339             {
340                 C_YUV420_YUYV( );
341             }
342
343             p_y1 += i_source_margin;
344             p_y2 += i_source_margin;
345             p_u += i_source_margin_c;
346             p_v += i_source_margin_c;
347             p_line1 += i_dest_margin;
348             p_line2 += i_dest_margin;
349         }
350     }
351     else
352     {
353         /* use slower SSE2 unaligned fetch and store */
354         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
355         {
356             p_line1 = p_line2;
357             p_line2 += p_dest->p->i_pitch;
358
359             p_y1 = p_y2;
360             p_y2 += p_source->p[Y_PLANE].i_pitch;
361
362             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
363             {
364                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
365             }
366             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
367             {
368                 C_YUV420_YUYV( );
369             }
370
371             p_y1 += i_source_margin;
372             p_y2 += i_source_margin;
373             p_u += i_source_margin_c;
374             p_v += i_source_margin_c;
375             p_line1 += i_dest_margin;
376             p_line2 += i_dest_margin;
377         }
378     }
379     /* make sure all SSE2 stores are visible thereafter */
380     SSE2_END;
381
382 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
383 }
384
385 /*****************************************************************************
386  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
387  *****************************************************************************/
388 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
389                                               picture_t *p_dest )
390 {
391     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
392     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
393     uint8_t *p_u = p_source->U_PIXELS;
394     uint8_t *p_v = p_source->V_PIXELS;
395
396     int i_x, i_y;
397
398 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
399 #define VEC_NEXT_LINES( ) \
400     p_line1  = p_line2; \
401     p_line2 += p_dest->p->i_pitch; \
402     p_y1     = p_y2; \
403     p_y2    += p_source->p[Y_PLANE].i_pitch;
404
405 #define VEC_LOAD_UV( ) \
406     u_vec = vec_ld( 0, p_u ); p_u += 16; \
407     v_vec = vec_ld( 0, p_v ); p_v += 16;
408
409 #define VEC_MERGE( a ) \
410     vu_vec = a( v_vec, u_vec ); \
411     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
412     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
413     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
414     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
415     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
416     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
417
418     vector unsigned char u_vec;
419     vector unsigned char v_vec;
420     vector unsigned char vu_vec;
421     vector unsigned char y_vec;
422
423     if( !( ( p_vout->render.i_width % 32 ) |
424            ( p_vout->render.i_height % 2 ) ) )
425     {
426         /* Width is a multiple of 32, we take 2 lines at a time */
427         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
428         {
429             VEC_NEXT_LINES( );
430             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
431             {
432                 VEC_LOAD_UV( );
433                 VEC_MERGE( vec_mergeh );
434                 VEC_MERGE( vec_mergel );
435             }
436         }
437     }
438     else if( !( ( p_vout->render.i_width % 16 ) |
439                 ( p_vout->render.i_height % 4 ) ) )
440     {
441         /* Width is only a multiple of 16, we take 4 lines at a time */
442         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
443         {
444             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
445             VEC_NEXT_LINES( );
446             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
447             {
448                 VEC_LOAD_UV( );
449                 VEC_MERGE( vec_mergeh );
450                 VEC_MERGE( vec_mergel );
451             }
452
453             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
454             VEC_LOAD_UV( );
455             VEC_MERGE( vec_mergeh );
456
457             /* Line 3 and 4, pixels 0 to 16 */
458             VEC_NEXT_LINES( );
459             VEC_MERGE( vec_mergel );
460
461             /* Line 3 and 4, pixels 16 to ( width ) */
462             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
463             {
464                 VEC_LOAD_UV( );
465                 VEC_MERGE( vec_mergeh );
466                 VEC_MERGE( vec_mergel );
467             }
468         }
469     }
470     else
471     {
472         /* Crap, use the C version */
473 #undef VEC_NEXT_LINES
474 #undef VEC_LOAD_UV
475 #undef VEC_MERGE
476 #endif
477
478     const int i_source_margin = p_source->p[0].i_pitch
479                                  - p_source->p[0].i_visible_pitch;
480     const int i_source_margin_c = p_source->p[1].i_pitch
481                                  - p_source->p[1].i_visible_pitch;
482     const int i_dest_margin = p_dest->p->i_pitch
483                                - p_dest->p->i_visible_pitch;
484
485 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
486     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
487     {
488         p_line1 = p_line2;
489         p_line2 += p_dest->p->i_pitch;
490
491         p_y1 = p_y2;
492         p_y2 += p_source->p[Y_PLANE].i_pitch;
493
494         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
495         {
496 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
497             C_YUV420_YVYU( );
498             C_YUV420_YVYU( );
499             C_YUV420_YVYU( );
500             C_YUV420_YVYU( );
501 #else
502             MMX_CALL( MMX_YUV420_YVYU );
503 #endif
504         }
505         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
506         {
507             C_YUV420_YVYU( );
508         }
509
510         p_y1 += i_source_margin;
511         p_y2 += i_source_margin;
512         p_u += i_source_margin_c;
513         p_v += i_source_margin_c;
514         p_line1 += i_dest_margin;
515         p_line2 += i_dest_margin;
516     }
517
518 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
519     /* re-enable FPU registers */
520     MMX_END;
521 #endif
522
523 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
524     }
525 #endif
526
527 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
528     /*
529     ** SSE2 128 bits fetch/store instructions are faster 
530     ** if memory access is 16 bytes aligned
531     */
532     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
533         ((int)p_line2|(int)p_y2))) )
534     {
535         /* use faster SSE2 aligned fetch and store */
536         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
537         {
538             p_line1 = p_line2;
539             p_line2 += p_dest->p->i_pitch;
540
541             p_y1 = p_y2;
542             p_y2 += p_source->p[Y_PLANE].i_pitch;
543
544             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
545             {
546                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
547             }
548             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
549             {
550                 C_YUV420_YVYU( );
551             }
552
553             p_y1 += i_source_margin;
554             p_y2 += i_source_margin;
555             p_u += i_source_margin_c;
556             p_v += i_source_margin_c;
557             p_line1 += i_dest_margin;
558             p_line2 += i_dest_margin;
559         }
560     }
561     else
562     {
563         /* use slower SSE2 unaligned fetch and store */
564         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
565         {
566             p_line1 = p_line2;
567             p_line2 += p_dest->p->i_pitch;
568
569             p_y1 = p_y2;
570             p_y2 += p_source->p[Y_PLANE].i_pitch;
571
572             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
573             {
574                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
575             }
576             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
577             {
578                 C_YUV420_YVYU( );
579             }
580
581             p_y1 += i_source_margin;
582             p_y2 += i_source_margin;
583             p_u += i_source_margin_c;
584             p_v += i_source_margin_c;
585             p_line1 += i_dest_margin;
586             p_line2 += i_dest_margin;
587         }
588     }
589     /* make sure all SSE2 stores are visible thereafter */
590     SSE2_END;
591 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
592 }
593
594 /*****************************************************************************
595  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
596  *****************************************************************************/
597 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
598                                               picture_t *p_dest )
599 {
600     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
601     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
602     uint8_t *p_u = p_source->U_PIXELS;
603     uint8_t *p_v = p_source->V_PIXELS;
604
605     int i_x, i_y;
606
607 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
608 #define VEC_NEXT_LINES( ) \
609     p_line1  = p_line2; \
610     p_line2 += p_dest->p->i_pitch; \
611     p_y1     = p_y2; \
612     p_y2    += p_source->p[Y_PLANE].i_pitch;
613
614 #define VEC_LOAD_UV( ) \
615     u_vec = vec_ld( 0, p_u ); p_u += 16; \
616     v_vec = vec_ld( 0, p_v ); p_v += 16;
617
618 #define VEC_MERGE( a ) \
619     uv_vec = a( u_vec, v_vec ); \
620     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
621     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
622     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
623     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
624     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
625     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
626
627     vector unsigned char u_vec;
628     vector unsigned char v_vec;
629     vector unsigned char uv_vec;
630     vector unsigned char y_vec;
631
632     if( !( ( p_vout->render.i_width % 32 ) |
633            ( p_vout->render.i_height % 2 ) ) )
634     {
635         /* Width is a multiple of 32, we take 2 lines at a time */
636         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
637         {
638             VEC_NEXT_LINES( );
639             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
640             {
641                 VEC_LOAD_UV( );
642                 VEC_MERGE( vec_mergeh );
643                 VEC_MERGE( vec_mergel );
644             }
645         }
646     }
647     else if( !( ( p_vout->render.i_width % 16 ) |
648                 ( p_vout->render.i_height % 4 ) ) )
649     {
650         /* Width is only a multiple of 16, we take 4 lines at a time */
651         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
652         {
653             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
654             VEC_NEXT_LINES( );
655             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
656             {
657                 VEC_LOAD_UV( );
658                 VEC_MERGE( vec_mergeh );
659                 VEC_MERGE( vec_mergel );
660             }
661
662             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
663             VEC_LOAD_UV( );
664             VEC_MERGE( vec_mergeh );
665
666             /* Line 3 and 4, pixels 0 to 16 */
667             VEC_NEXT_LINES( );
668             VEC_MERGE( vec_mergel );
669
670             /* Line 3 and 4, pixels 16 to ( width ) */
671             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
672             {
673                 VEC_LOAD_UV( );
674                 VEC_MERGE( vec_mergeh );
675                 VEC_MERGE( vec_mergel );
676             }
677         }
678     }
679     else
680     {
681         /* Crap, use the C version */
682 #undef VEC_NEXT_LINES
683 #undef VEC_LOAD_UV
684 #undef VEC_MERGE
685 #endif
686
687     const int i_source_margin = p_source->p[0].i_pitch
688                                  - p_source->p[0].i_visible_pitch;
689     const int i_source_margin_c = p_source->p[1].i_pitch
690                                  - p_source->p[1].i_visible_pitch;
691     const int i_dest_margin = p_dest->p->i_pitch
692                                - p_dest->p->i_visible_pitch;
693
694 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
695     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
696     {
697         p_line1 = p_line2;
698         p_line2 += p_dest->p->i_pitch;
699
700         p_y1 = p_y2;
701         p_y2 += p_source->p[Y_PLANE].i_pitch;
702
703         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
704         {
705 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
706             C_YUV420_UYVY( );
707             C_YUV420_UYVY( );
708             C_YUV420_UYVY( );
709             C_YUV420_UYVY( );
710 #else
711             MMX_CALL( MMX_YUV420_UYVY );
712 #endif
713         }
714         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
715         {
716             C_YUV420_UYVY( );
717         }
718
719         p_y1 += i_source_margin;
720         p_y2 += i_source_margin;
721         p_u += i_source_margin_c;
722         p_v += i_source_margin_c;
723         p_line1 += i_dest_margin;
724         p_line2 += i_dest_margin;
725     }
726
727 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
728     /* re-enable FPU registers */
729     MMX_END;
730 #endif
731
732 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
733     }
734 #endif
735
736 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
737     /*
738     ** SSE2 128 bits fetch/store instructions are faster 
739     ** if memory access is 16 bytes aligned
740     */
741     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
742         ((int)p_line2|(int)p_y2))) )
743     {
744         /* use faster SSE2 aligned fetch and store */
745         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
746         {
747             p_line1 = p_line2;
748             p_line2 += p_dest->p->i_pitch;
749
750             p_y1 = p_y2;
751             p_y2 += p_source->p[Y_PLANE].i_pitch;
752
753             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
754             {
755                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
756             }
757             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
758             {
759                 C_YUV420_UYVY( );
760             }
761
762             p_y1 += i_source_margin;
763             p_y2 += i_source_margin;
764             p_u += i_source_margin_c;
765             p_v += i_source_margin_c;
766             p_line1 += i_dest_margin;
767             p_line2 += i_dest_margin;
768         }
769     }
770     else
771     {
772         /* use slower SSE2 unaligned fetch and store */
773         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
774         {
775             p_line1 = p_line2;
776             p_line2 += p_dest->p->i_pitch;
777
778             p_y1 = p_y2;
779             p_y2 += p_source->p[Y_PLANE].i_pitch;
780
781             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
782             {
783                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
784             }
785             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
786             {
787                 C_YUV420_UYVY( );
788             }
789
790             p_y1 += i_source_margin;
791             p_y2 += i_source_margin;
792             p_u += i_source_margin_c;
793             p_v += i_source_margin_c;
794             p_line1 += i_dest_margin;
795             p_line2 += i_dest_margin;
796         }
797     }
798     /* make sure all SSE2 stores are visible thereafter */
799     SSE2_END;
800 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
801 }
802
803 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
804 /*****************************************************************************
805  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
806  *****************************************************************************/
807 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
808                                               picture_t *p_dest )
809 {
810     /* FIXME: TODO ! */
811     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
812 }
813
814 /*****************************************************************************
815  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
816  *****************************************************************************/
817 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
818                                               picture_t *p_dest )
819 {
820     uint8_t *p_line1 = p_dest->p->p_pixels +
821                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
822                        + p_dest->p->i_pitch;
823     uint8_t *p_line2 = p_dest->p->p_pixels +
824                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
825     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
826     uint8_t *p_u = p_source->U_PIXELS;
827     uint8_t *p_v = p_source->V_PIXELS;
828
829     int i_x, i_y;
830
831     const int i_source_margin = p_source->p[0].i_pitch
832                                  - p_source->p[0].i_visible_pitch;
833     const int i_source_margin_c = p_source->p[1].i_pitch
834                                  - p_source->p[1].i_visible_pitch;
835     const int i_dest_margin = p_dest->p->i_pitch
836                                - p_dest->p->i_visible_pitch;
837
838 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
839     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
840     {
841         p_line1 -= 3 * p_dest->p->i_pitch;
842         p_line2 -= 3 * p_dest->p->i_pitch;
843
844         p_y1 = p_y2;
845         p_y2 += p_source->p[Y_PLANE].i_pitch;
846
847         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
848         {
849 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
850             C_YUV420_UYVY( );
851             C_YUV420_UYVY( );
852             C_YUV420_UYVY( );
853             C_YUV420_UYVY( );
854 #else
855             MMX_CALL( MMX_YUV420_UYVY );
856 #endif
857         }
858         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
859         {
860             C_YUV420_UYVY( );
861         }
862
863         p_y1 += i_source_margin;
864         p_y2 += i_source_margin;
865         p_u += i_source_margin_c;
866         p_v += i_source_margin_c;
867         p_line1 += i_dest_margin;
868         p_line2 += i_dest_margin;
869     }
870
871 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
872     /* re-enable FPU registers */
873     MMX_END;
874 #endif
875
876 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
877     /*
878     ** SSE2 128 bits fetch/store instructions are faster 
879     ** if memory access is 16 bytes aligned
880     */
881     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
882         ((int)p_line2|(int)p_y2))) )
883     {
884         /* use faster SSE2 aligned fetch and store */
885         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
886         {
887             p_line1 = p_line2;
888             p_line2 += p_dest->p->i_pitch;
889
890             p_y1 = p_y2;
891             p_y2 += p_source->p[Y_PLANE].i_pitch;
892
893             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
894             {
895                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
896             }
897             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
898             {
899                 C_YUV420_UYVY( );
900             }
901
902             p_y1 += i_source_margin;
903             p_y2 += i_source_margin;
904             p_u += i_source_margin_c;
905             p_v += i_source_margin_c;
906             p_line1 += i_dest_margin;
907             p_line2 += i_dest_margin;
908         }
909     }
910     else
911     {
912         /* use slower SSE2 unaligned fetch and store */
913         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
914         {
915             p_line1 = p_line2;
916             p_line2 += p_dest->p->i_pitch;
917
918             p_y1 = p_y2;
919             p_y2 += p_source->p[Y_PLANE].i_pitch;
920
921             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
922             {
923                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
924             }
925             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
926             {
927                 C_YUV420_UYVY( );
928             }
929
930             p_y1 += i_source_margin;
931             p_y2 += i_source_margin;
932             p_u += i_source_margin_c;
933             p_v += i_source_margin_c;
934             p_line1 += i_dest_margin;
935             p_line2 += i_dest_margin;
936         }
937     }
938     /* make sure all SSE2 stores are visible thereafter */
939     SSE2_END;
940 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
941 }
942 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
943
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Only built for the plain C variant of the module. Two source lines are
 * consumed and two output lines produced per pass of the outer loop; each
 * pass of the inner loop invokes C_YUV420_Y211 (from i420_yuy2.h) twice and
 * covers 8 pixels of width. Widths that are not a multiple of 8 get no
 * extra handling for the remainder.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
                                              picture_t *p_dest )
{
    /* These six names are kept as they are: the conversion macro is expected
     * to refer to them directly. */
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_line1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_y1;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    /* Padding bytes between the end of a visible line and the next line */
    const int i_luma_skip   = p_source->p[0].i_pitch
                               - p_source->p[0].i_visible_pitch;
    const int i_chroma_skip = p_source->p[1].i_pitch
                               - p_source->p[1].i_visible_pitch;
    const int i_out_skip    = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    int i_y = p_vout->render.i_height / 2;

    while( i_y-- )
    {
        int i_x = p_vout->render.i_width / 8;

        /* Move on to the next pair of output and luma lines */
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Skip the padding on every plane */
        p_line1 += i_out_skip;
        p_line2 += i_out_skip;
        p_y1 += i_luma_skip;
        p_y2 += i_luma_skip;
        p_u += i_chroma_skip;
        p_v += i_chroma_skip;
    }
}
#endif