]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
video chromas: finalize SSE2 improvements
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
#include <string.h>                                            /* strerror() */
#include <stdlib.h>                                      /* malloc(), free() */
#include <stdint.h>                                             /* uintptr_t */
30
31 #include <vlc/vlc.h>
32 #include <vlc_vout.h>
33
34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
35 #   include <altivec.h>
36 #endif
37
38 #include "i420_yuy2.h"
39
/* Source chroma list; only used to build the module description string. */
40 #define SRC_FOURCC  "I420,IYUV,YV12"
41
/* Destination chroma list for the description string. It depends on which
 * variant of the module is being compiled: Y211 is listed for the plain C
 * build only, and the AltiVec build lists neither IUYV nor cyuv. */
42 #if defined (MODULE_NAME_IS_i420_yuy2)
43 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
50 #endif
51
52 /*****************************************************************************
53  * Local and extern prototypes.
54  *****************************************************************************/
/* Module activation callback: selects one of the converters below. */
55 static int  Activate ( vlc_object_t * );
56
/* Converters available in every build variant. */
57 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
/* Converters compiled out of the AltiVec build. */
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
63 #endif
/* Converter that only exists in the plain C build. */
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
66 #endif
67
68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
69 /* 64-bit constants for the MMX build. Nothing in this file names them directly; presumably the MMX macros from i420_yuy2.h do (confirm). */
70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
71 static const uint64_t i_80w   = 0x0000000080808080ULL;
72 #endif
73
74 /*****************************************************************************
75  * Module descriptor.
76  *****************************************************************************/
/* One descriptor per build variant: each registers the "chroma" capability
 * with its own score (80 plain C, 100 MMX, 120 SSE2, 100 AltiVec) and, for
 * the SIMD variants, the CPU feature it requires. Presumably a higher score
 * makes the module loader prefer that variant (confirm against the core). */
77 vlc_module_begin();
78 #if defined (MODULE_NAME_IS_i420_yuy2)
79     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
80     set_capability( "chroma", 80 );
81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
82     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83     set_capability( "chroma", 100 );
84     add_requirement( MMX );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
86     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87     set_capability( "chroma", 120 );
88     add_requirement( SSE2 );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
90     set_description(
91             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
92     set_capability( "chroma", 100 );
93     add_requirement( ALTIVEC );
94 #endif
/* Activate is the only callback; no deactivation callback is registered. */
95     set_callbacks( Activate, NULL );
96 vlc_module_end();
97
98 /*****************************************************************************
99  * Activate: allocate a chroma function
100  *****************************************************************************
101  * This function allocates and initializes a chroma function
102  *****************************************************************************/
103 static int Activate( vlc_object_t *p_this )
104 {
105     vout_thread_t *p_vout = (vout_thread_t *)p_this;
106
107     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
108     {
109         return -1;
110     }
111
112     switch( p_vout->render.i_chroma )
113     {
114         case VLC_FOURCC('Y','V','1','2'):
115         case VLC_FOURCC('I','4','2','0'):
116         case VLC_FOURCC('I','Y','U','V'):
117             switch( p_vout->output.i_chroma )
118             {
119                 case VLC_FOURCC('Y','U','Y','2'):
120                 case VLC_FOURCC('Y','U','N','V'):
121                     p_vout->chroma.pf_convert = I420_YUY2;
122                     break;
123
124                 case VLC_FOURCC('Y','V','Y','U'):
125                     p_vout->chroma.pf_convert = I420_YVYU;
126                     break;
127
128                 case VLC_FOURCC('U','Y','V','Y'):
129                 case VLC_FOURCC('U','Y','N','V'):
130                 case VLC_FOURCC('Y','4','2','2'):
131                     p_vout->chroma.pf_convert = I420_UYVY;
132                     break;
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134                 case VLC_FOURCC('I','U','Y','V'):
135                     p_vout->chroma.pf_convert = I420_IUYV;
136                     break;
137
138                 case VLC_FOURCC('c','y','u','v'):
139                     p_vout->chroma.pf_convert = I420_cyuv;
140                     break;
141 #endif
142
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144                 case VLC_FOURCC('Y','2','1','1'):
145                     p_vout->chroma.pf_convert = I420_Y211;
146                     break;
147 #endif
148
149                 default:
150                     return -1;
151             }
152             break;
153
154         default:
155             return -1;
156     }
157
158     return 0;
159 }
160
161 #if 0
/* Profiling helper: returns the current value of the x86 time-stamp counter. */
static inline unsigned long long read_cycles(void)
{
    unsigned int i_lo, i_hi;

    /* RDTSC leaves the counter in EDX:EAX. The halves are fetched through
     * two explicit outputs instead of "=A": on x86-64 that constraint names
     * a single 64-bit register (rax or rdx), not the edx:eax pair, so the
     * upper half of the counter would be lost. */
    __asm__ __volatile__( "rdtsc" : "=a" (i_lo), "=d" (i_hi) );

    return ( (unsigned long long)i_hi << 32 ) | i_lo;
}
169 #endif
170
171 /* Following functions are local */
172 /*****************************************************************************
173  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
174  *****************************************************************************/
175 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
176                                               picture_t *p_dest )
177 {
178     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
179     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
180     uint8_t *p_u = p_source->U_PIXELS;
181     uint8_t *p_v = p_source->V_PIXELS;
182
183     int i_x, i_y;
184
185 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
186 #define VEC_NEXT_LINES( ) \
187     p_line1  = p_line2; \
188     p_line2 += p_dest->p->i_pitch; \
189     p_y1     = p_y2; \
190     p_y2    += p_source->p[Y_PLANE].i_pitch;
191
192 #define VEC_LOAD_UV( ) \
193     u_vec = vec_ld( 0, p_u ); p_u += 16; \
194     v_vec = vec_ld( 0, p_v ); p_v += 16;
195
196 #define VEC_MERGE( a ) \
197     uv_vec = a( u_vec, v_vec ); \
198     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
199     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
200     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
201     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
202     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
203     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
204
205     vector unsigned char u_vec;
206     vector unsigned char v_vec;
207     vector unsigned char uv_vec;
208     vector unsigned char y_vec;
209
210     if( !( ( p_vout->render.i_width % 32 ) |
211            ( p_vout->render.i_height % 2 ) ) )
212     {
213         /* Width is a multiple of 32, we take 2 lines at a time */
214         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
215         {
216             VEC_NEXT_LINES( );
217             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
218             {
219                 VEC_LOAD_UV( );
220                 VEC_MERGE( vec_mergeh );
221                 VEC_MERGE( vec_mergel );
222             }
223         }
224     }
225     else if( !( ( p_vout->render.i_width % 16 ) |
226                 ( p_vout->render.i_height % 4 ) ) )
227     {
228         /* Width is only a multiple of 16, we take 4 lines at a time */
229         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
230         {
231             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
232             VEC_NEXT_LINES( );
233             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
234             {
235                 VEC_LOAD_UV( );
236                 VEC_MERGE( vec_mergeh );
237                 VEC_MERGE( vec_mergel );
238             }
239
240             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
241             VEC_LOAD_UV( );
242             VEC_MERGE( vec_mergeh );
243
244             /* Line 3 and 4, pixels 0 to 16 */
245             VEC_NEXT_LINES( );
246             VEC_MERGE( vec_mergel );
247
248             /* Line 3 and 4, pixels 16 to ( width ) */
249             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
250             {
251                 VEC_LOAD_UV( );
252                 VEC_MERGE( vec_mergeh );
253                 VEC_MERGE( vec_mergel );
254             }
255         }
256     }
257     else
258     {
259         /* Crap, use the C version */
260 #undef VEC_NEXT_LINES
261 #undef VEC_LOAD_UV
262 #undef VEC_MERGE
263 #endif
264
265     const int i_source_margin = p_source->p[0].i_pitch
266                                  - p_source->p[0].i_visible_pitch;
267     const int i_source_margin_c = p_source->p[1].i_pitch
268                                  - p_source->p[1].i_visible_pitch;
269     const int i_dest_margin = p_dest->p->i_pitch
270                                - p_dest->p->i_visible_pitch;
271
272 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
273     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
274     {
275         p_line1 = p_line2;
276         p_line2 += p_dest->p->i_pitch;
277
278         p_y1 = p_y2;
279         p_y2 += p_source->p[Y_PLANE].i_pitch;
280
281 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
282         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
283         {
284             C_YUV420_YUYV( );
285             C_YUV420_YUYV( );
286             C_YUV420_YUYV( );
287             C_YUV420_YUYV( );
288         }
289 #else
290         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
291         {
292             MMX_CALL( MMX_YUV420_YUYV );
293         }
294 #endif
295         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
296         {
297             C_YUV420_YUYV( );
298         }
299
300         p_y1 += i_source_margin;
301         p_y2 += i_source_margin;
302         p_u += i_source_margin_c;
303         p_v += i_source_margin_c;
304         p_line1 += i_dest_margin;
305         p_line2 += i_dest_margin;
306     }
307
308 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
309     /* re-enable FPU registers */
310     MMX_END;
311 #endif
312
313 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
314     }
315 #endif
316
317 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
318     /*
319     ** SSE2 128 bits fetch/store instructions are faster 
320     ** if memory access is 16 bytes aligned
321     */
322
323     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
324         ((int)p_line2|(int)p_y2))) )
325     {
326         /* use faster SSE2 aligned fetch and store */
327         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
328         {
329             p_line1 = p_line2;
330             p_line2 += p_dest->p->i_pitch;
331
332             p_y1 = p_y2;
333             p_y2 += p_source->p[Y_PLANE].i_pitch;
334
335             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
336             {
337                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
338             }
339             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
340             {
341                 C_YUV420_YUYV( );
342             }
343
344             p_y1 += i_source_margin;
345             p_y2 += i_source_margin;
346             p_u += i_source_margin_c;
347             p_v += i_source_margin_c;
348             p_line1 += i_dest_margin;
349             p_line2 += i_dest_margin;
350         }
351     }
352     else
353     {
354         /* use slower SSE2 unaligned fetch and store */
355         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
356         {
357             p_line1 = p_line2;
358             p_line2 += p_dest->p->i_pitch;
359
360             p_y1 = p_y2;
361             p_y2 += p_source->p[Y_PLANE].i_pitch;
362
363             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
364             {
365                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
366             }
367             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
368             {
369                 C_YUV420_YUYV( );
370             }
371
372             p_y1 += i_source_margin;
373             p_y2 += i_source_margin;
374             p_u += i_source_margin_c;
375             p_v += i_source_margin_c;
376             p_line1 += i_dest_margin;
377             p_line2 += i_dest_margin;
378         }
379     }
380     /* make sure all SSE2 stores are visible thereafter */
381     SSE2_END;
382
383 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
384 }
385
386 /*****************************************************************************
387  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
388  *****************************************************************************/
389 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
390                                               picture_t *p_dest )
391 {
392     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
393     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
394     uint8_t *p_u = p_source->U_PIXELS;
395     uint8_t *p_v = p_source->V_PIXELS;
396
397     int i_x, i_y;
398
399 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
400 #define VEC_NEXT_LINES( ) \
401     p_line1  = p_line2; \
402     p_line2 += p_dest->p->i_pitch; \
403     p_y1     = p_y2; \
404     p_y2    += p_source->p[Y_PLANE].i_pitch;
405
406 #define VEC_LOAD_UV( ) \
407     u_vec = vec_ld( 0, p_u ); p_u += 16; \
408     v_vec = vec_ld( 0, p_v ); p_v += 16;
409
410 #define VEC_MERGE( a ) \
411     vu_vec = a( v_vec, u_vec ); \
412     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
413     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
414     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
415     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
416     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
417     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
418
419     vector unsigned char u_vec;
420     vector unsigned char v_vec;
421     vector unsigned char vu_vec;
422     vector unsigned char y_vec;
423
424     if( !( ( p_vout->render.i_width % 32 ) |
425            ( p_vout->render.i_height % 2 ) ) )
426     {
427         /* Width is a multiple of 32, we take 2 lines at a time */
428         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
429         {
430             VEC_NEXT_LINES( );
431             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
432             {
433                 VEC_LOAD_UV( );
434                 VEC_MERGE( vec_mergeh );
435                 VEC_MERGE( vec_mergel );
436             }
437         }
438     }
439     else if( !( ( p_vout->render.i_width % 16 ) |
440                 ( p_vout->render.i_height % 4 ) ) )
441     {
442         /* Width is only a multiple of 16, we take 4 lines at a time */
443         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
444         {
445             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
446             VEC_NEXT_LINES( );
447             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
448             {
449                 VEC_LOAD_UV( );
450                 VEC_MERGE( vec_mergeh );
451                 VEC_MERGE( vec_mergel );
452             }
453
454             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
455             VEC_LOAD_UV( );
456             VEC_MERGE( vec_mergeh );
457
458             /* Line 3 and 4, pixels 0 to 16 */
459             VEC_NEXT_LINES( );
460             VEC_MERGE( vec_mergel );
461
462             /* Line 3 and 4, pixels 16 to ( width ) */
463             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
464             {
465                 VEC_LOAD_UV( );
466                 VEC_MERGE( vec_mergeh );
467                 VEC_MERGE( vec_mergel );
468             }
469         }
470     }
471     else
472     {
473         /* Crap, use the C version */
474 #undef VEC_NEXT_LINES
475 #undef VEC_LOAD_UV
476 #undef VEC_MERGE
477 #endif
478
479     const int i_source_margin = p_source->p[0].i_pitch
480                                  - p_source->p[0].i_visible_pitch;
481     const int i_source_margin_c = p_source->p[1].i_pitch
482                                  - p_source->p[1].i_visible_pitch;
483     const int i_dest_margin = p_dest->p->i_pitch
484                                - p_dest->p->i_visible_pitch;
485
486 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
487     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
488     {
489         p_line1 = p_line2;
490         p_line2 += p_dest->p->i_pitch;
491
492         p_y1 = p_y2;
493         p_y2 += p_source->p[Y_PLANE].i_pitch;
494
495         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
496         {
497 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
498             C_YUV420_YVYU( );
499             C_YUV420_YVYU( );
500             C_YUV420_YVYU( );
501             C_YUV420_YVYU( );
502 #else
503             MMX_CALL( MMX_YUV420_YVYU );
504 #endif
505         }
506         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
507         {
508             C_YUV420_YVYU( );
509         }
510
511         p_y1 += i_source_margin;
512         p_y2 += i_source_margin;
513         p_u += i_source_margin_c;
514         p_v += i_source_margin_c;
515         p_line1 += i_dest_margin;
516         p_line2 += i_dest_margin;
517     }
518
519 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
520     /* re-enable FPU registers */
521     MMX_END;
522 #endif
523
524 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
525     }
526 #endif
527
528 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
529     /*
530     ** SSE2 128 bits fetch/store instructions are faster 
531     ** if memory access is 16 bytes aligned
532     */
533     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
534         ((int)p_line2|(int)p_y2))) )
535     {
536         /* use faster SSE2 aligned fetch and store */
537         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
538         {
539             p_line1 = p_line2;
540             p_line2 += p_dest->p->i_pitch;
541
542             p_y1 = p_y2;
543             p_y2 += p_source->p[Y_PLANE].i_pitch;
544
545             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
546             {
547                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
548             }
549             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
550             {
551                 C_YUV420_YVYU( );
552             }
553
554             p_y1 += i_source_margin;
555             p_y2 += i_source_margin;
556             p_u += i_source_margin_c;
557             p_v += i_source_margin_c;
558             p_line1 += i_dest_margin;
559             p_line2 += i_dest_margin;
560         }
561     }
562     else
563     {
564         /* use slower SSE2 unaligned fetch and store */
565         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
566         {
567             p_line1 = p_line2;
568             p_line2 += p_dest->p->i_pitch;
569
570             p_y1 = p_y2;
571             p_y2 += p_source->p[Y_PLANE].i_pitch;
572
573             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
574             {
575                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
576             }
577             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
578             {
579                 C_YUV420_YVYU( );
580             }
581
582             p_y1 += i_source_margin;
583             p_y2 += i_source_margin;
584             p_u += i_source_margin_c;
585             p_v += i_source_margin_c;
586             p_line1 += i_dest_margin;
587             p_line2 += i_dest_margin;
588         }
589     }
590     /* make sure all SSE2 stores are visible thereafter */
591     SSE2_END;
592 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
593 }
594
595 /*****************************************************************************
596  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
597  *****************************************************************************/
598 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
599                                               picture_t *p_dest )
600 {
601     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
602     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
603     uint8_t *p_u = p_source->U_PIXELS;
604     uint8_t *p_v = p_source->V_PIXELS;
605
606     int i_x, i_y;
607
608 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
609 #define VEC_NEXT_LINES( ) \
610     p_line1  = p_line2; \
611     p_line2 += p_dest->p->i_pitch; \
612     p_y1     = p_y2; \
613     p_y2    += p_source->p[Y_PLANE].i_pitch;
614
615 #define VEC_LOAD_UV( ) \
616     u_vec = vec_ld( 0, p_u ); p_u += 16; \
617     v_vec = vec_ld( 0, p_v ); p_v += 16;
618
619 #define VEC_MERGE( a ) \
620     uv_vec = a( u_vec, v_vec ); \
621     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
622     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
623     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
624     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
625     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
626     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
627
628     vector unsigned char u_vec;
629     vector unsigned char v_vec;
630     vector unsigned char uv_vec;
631     vector unsigned char y_vec;
632
633     if( !( ( p_vout->render.i_width % 32 ) |
634            ( p_vout->render.i_height % 2 ) ) )
635     {
636         /* Width is a multiple of 32, we take 2 lines at a time */
637         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
638         {
639             VEC_NEXT_LINES( );
640             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
641             {
642                 VEC_LOAD_UV( );
643                 VEC_MERGE( vec_mergeh );
644                 VEC_MERGE( vec_mergel );
645             }
646         }
647     }
648     else if( !( ( p_vout->render.i_width % 16 ) |
649                 ( p_vout->render.i_height % 4 ) ) )
650     {
651         /* Width is only a multiple of 16, we take 4 lines at a time */
652         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
653         {
654             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
655             VEC_NEXT_LINES( );
656             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
657             {
658                 VEC_LOAD_UV( );
659                 VEC_MERGE( vec_mergeh );
660                 VEC_MERGE( vec_mergel );
661             }
662
663             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
664             VEC_LOAD_UV( );
665             VEC_MERGE( vec_mergeh );
666
667             /* Line 3 and 4, pixels 0 to 16 */
668             VEC_NEXT_LINES( );
669             VEC_MERGE( vec_mergel );
670
671             /* Line 3 and 4, pixels 16 to ( width ) */
672             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
673             {
674                 VEC_LOAD_UV( );
675                 VEC_MERGE( vec_mergeh );
676                 VEC_MERGE( vec_mergel );
677             }
678         }
679     }
680     else
681     {
682         /* Crap, use the C version */
683 #undef VEC_NEXT_LINES
684 #undef VEC_LOAD_UV
685 #undef VEC_MERGE
686 #endif
687
688     const int i_source_margin = p_source->p[0].i_pitch
689                                  - p_source->p[0].i_visible_pitch;
690     const int i_source_margin_c = p_source->p[1].i_pitch
691                                  - p_source->p[1].i_visible_pitch;
692     const int i_dest_margin = p_dest->p->i_pitch
693                                - p_dest->p->i_visible_pitch;
694
695 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
696     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
697     {
698         p_line1 = p_line2;
699         p_line2 += p_dest->p->i_pitch;
700
701         p_y1 = p_y2;
702         p_y2 += p_source->p[Y_PLANE].i_pitch;
703
704         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
705         {
706 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
707             C_YUV420_UYVY( );
708             C_YUV420_UYVY( );
709             C_YUV420_UYVY( );
710             C_YUV420_UYVY( );
711 #else
712             MMX_CALL( MMX_YUV420_UYVY );
713 #endif
714         }
715         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
716         {
717             C_YUV420_UYVY( );
718         }
719
720         p_y1 += i_source_margin;
721         p_y2 += i_source_margin;
722         p_u += i_source_margin_c;
723         p_v += i_source_margin_c;
724         p_line1 += i_dest_margin;
725         p_line2 += i_dest_margin;
726     }
727
728 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
729     /* re-enable FPU registers */
730     MMX_END;
731 #endif
732
733 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
734     }
735 #endif
736
737 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
738     /*
739     ** SSE2 128 bits fetch/store instructions are faster 
740     ** if memory access is 16 bytes aligned
741     */
742     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
743         ((int)p_line2|(int)p_y2))) )
744     {
745         /* use faster SSE2 aligned fetch and store */
746         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
747         {
748             p_line1 = p_line2;
749             p_line2 += p_dest->p->i_pitch;
750
751             p_y1 = p_y2;
752             p_y2 += p_source->p[Y_PLANE].i_pitch;
753
754             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
755             {
756                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
757             }
758             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
759             {
760                 C_YUV420_UYVY( );
761             }
762
763             p_y1 += i_source_margin;
764             p_y2 += i_source_margin;
765             p_u += i_source_margin_c;
766             p_v += i_source_margin_c;
767             p_line1 += i_dest_margin;
768             p_line2 += i_dest_margin;
769         }
770     }
771     else
772     {
773         /* use slower SSE2 unaligned fetch and store */
774         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
775         {
776             p_line1 = p_line2;
777             p_line2 += p_dest->p->i_pitch;
778
779             p_y1 = p_y2;
780             p_y2 += p_source->p[Y_PLANE].i_pitch;
781
782             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
783             {
784                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
785             }
786             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
787             {
788                 C_YUV420_UYVY( );
789             }
790
791             p_y1 += i_source_margin;
792             p_y2 += i_source_margin;
793             p_u += i_source_margin_c;
794             p_v += i_source_margin_c;
795             p_line1 += i_dest_margin;
796             p_line2 += i_dest_margin;
797         }
798     }
799     /* make sure all SSE2 stores are visible thereafter */
800     SSE2_END;
801 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
802 }
803
804 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
805 /*****************************************************************************
806  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
807  *****************************************************************************/
808 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
809                                               picture_t *p_dest )
810 {
811     /* FIXME: TODO ! */
812     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
813 }
814
815 /*****************************************************************************
816  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
817  *****************************************************************************/
818 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
819                                               picture_t *p_dest )
820 {
821     uint8_t *p_line1 = p_dest->p->p_pixels +
822                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
823                        + p_dest->p->i_pitch;
824     uint8_t *p_line2 = p_dest->p->p_pixels +
825                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
826     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
827     uint8_t *p_u = p_source->U_PIXELS;
828     uint8_t *p_v = p_source->V_PIXELS;
829
830     int i_x, i_y;
831
832     const int i_source_margin = p_source->p[0].i_pitch
833                                  - p_source->p[0].i_visible_pitch;
834     const int i_source_margin_c = p_source->p[1].i_pitch
835                                  - p_source->p[1].i_visible_pitch;
836     const int i_dest_margin = p_dest->p->i_pitch
837                                - p_dest->p->i_visible_pitch;
838
839 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
840     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
841     {
842         p_line1 -= 3 * p_dest->p->i_pitch;
843         p_line2 -= 3 * p_dest->p->i_pitch;
844
845         p_y1 = p_y2;
846         p_y2 += p_source->p[Y_PLANE].i_pitch;
847
848         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
849         {
850 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
851             C_YUV420_UYVY( );
852             C_YUV420_UYVY( );
853             C_YUV420_UYVY( );
854             C_YUV420_UYVY( );
855 #else
856             MMX_CALL( MMX_YUV420_UYVY );
857 #endif
858         }
859         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
860         {
861             C_YUV420_UYVY( );
862         }
863
864         p_y1 += i_source_margin;
865         p_y2 += i_source_margin;
866         p_u += i_source_margin_c;
867         p_v += i_source_margin_c;
868         p_line1 += i_dest_margin;
869         p_line2 += i_dest_margin;
870     }
871
872 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
873     /* re-enable FPU registers */
874     MMX_END;
875 #endif
876
877 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
878     /*
879     ** SSE2 128 bits fetch/store instructions are faster 
880     ** if memory access is 16 bytes aligned
881     */
882     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
883         ((int)p_line2|(int)p_y2))) )
884     {
885         /* use faster SSE2 aligned fetch and store */
886         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
887         {
888             p_line1 = p_line2;
889             p_line2 += p_dest->p->i_pitch;
890
891             p_y1 = p_y2;
892             p_y2 += p_source->p[Y_PLANE].i_pitch;
893
894             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
895             {
896                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
897             }
898             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
899             {
900                 C_YUV420_UYVY( );
901             }
902
903             p_y1 += i_source_margin;
904             p_y2 += i_source_margin;
905             p_u += i_source_margin_c;
906             p_v += i_source_margin_c;
907             p_line1 += i_dest_margin;
908             p_line2 += i_dest_margin;
909         }
910     }
911     else
912     {
913         /* use slower SSE2 unaligned fetch and store */
914         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
915         {
916             p_line1 = p_line2;
917             p_line2 += p_dest->p->i_pitch;
918
919             p_y1 = p_y2;
920             p_y2 += p_source->p[Y_PLANE].i_pitch;
921
922             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
923             {
924                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
925             }
926             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
927             {
928                 C_YUV420_UYVY( );
929             }
930
931             p_y1 += i_source_margin;
932             p_y2 += i_source_margin;
933             p_u += i_source_margin_c;
934             p_v += i_source_margin_c;
935             p_line1 += i_dest_margin;
936             p_line2 += i_dest_margin;
937         }
938     }
939     /* make sure all SSE2 stores are visible thereafter */
940     SSE2_END;
941 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
942 }
943 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
944
945 /*****************************************************************************
946  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
947  *****************************************************************************/
948 #if defined (MODULE_NAME_IS_i420_yuy2)
949 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
950                                               picture_t *p_dest )
951 {
952     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
953     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
954     uint8_t *p_u = p_source->U_PIXELS;
955     uint8_t *p_v = p_source->V_PIXELS;
956
957     int i_x, i_y;
958
959     const int i_source_margin = p_source->p[0].i_pitch
960                                  - p_source->p[0].i_visible_pitch;
961     const int i_source_margin_c = p_source->p[1].i_pitch
962                                  - p_source->p[1].i_visible_pitch;
963     const int i_dest_margin = p_dest->p->i_pitch
964                                - p_dest->p->i_visible_pitch;
965
966     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
967     {
968         p_line1 = p_line2;
969         p_line2 += p_dest->p->i_pitch;
970
971         p_y1 = p_y2;
972         p_y2 += p_source->p[Y_PLANE].i_pitch;
973
974         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
975         {
976             C_YUV420_Y211( );
977             C_YUV420_Y211( );
978         }
979
980         p_y1 += i_source_margin;
981         p_y2 += i_source_margin;
982         p_u += i_source_margin_c;
983         p_v += i_source_margin_c;
984         p_line1 += i_dest_margin;
985         p_line2 += i_dest_margin;
986     }
987 }
988 #endif