]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Revert "let gcc choose how to reference memory addresses in i420_rgx mmx asm"
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc/vlc.h>
34 #include <vlc_vout.h>
35
36 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
37 #   include <altivec.h>
38 #endif
39
40 #include "i420_yuy2.h"
41
/* Source chromas handled by Activate() in every build of this file. */
42 #define SRC_FOURCC  "I420,IYUV,YV12"
43
/* Destination chromas, per build flavour.  These strings are only used in
 * the module description below.  They mirror the cases compiled into
 * Activate(): the plain C build additionally lists Y211, and the AltiVec
 * build omits IUYV and cyuv. */
44 #if defined (MODULE_NAME_IS_i420_yuy2)
45 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
52 #endif
53
54 /*****************************************************************************
55  * Local and extern prototypes.
56  *****************************************************************************/
57 static int  Activate ( vlc_object_t * );
58
/* Converters installed by Activate() into p_vout->chroma.pf_convert.  The
 * prototypes are guarded exactly like the definitions further down. */
59 static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
60 static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
61 static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
62 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
63 static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
64 static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
65 #endif
66 #if defined (MODULE_NAME_IS_i420_yuy2)
67 static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
68 #endif
69
70 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
71 /* Initialize MMX-specific constants */
/* NOTE(review): neither constant is referenced by name in this file;
 * presumably the MMX macros in i420_yuy2.h use them - confirm. */
72 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
73 static const uint64_t i_80w   = 0x0000000080808080ULL;
74 #endif
75
76 /*****************************************************************************
77  * Module descriptor.
 *****************************************************************************
 * One descriptor per build flavour, selected by the MODULE_NAME_IS_* macro.
 * Each flavour registers the "chroma" capability with its own score
 * (C: 80, MMX: 100, SSE2: 120, AltiVec: 100) and, for the SIMD flavours,
 * the matching CPU requirement.  All flavours share Activate() as their
 * open callback and have no close callback (NULL).
 * NOTE(review): presumably a higher score makes the core prefer that
 * flavour - confirm against the module loader.
78  *****************************************************************************/
79 vlc_module_begin();
80 #if defined (MODULE_NAME_IS_i420_yuy2)
81     set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
82     set_capability( "chroma", 80 );
83 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
84     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
85     set_capability( "chroma", 100 );
86     add_requirement( MMX );
87 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
88     set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
89     set_capability( "chroma", 120 );
90     add_requirement( SSE2 );
91 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
92     set_description(
93             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
94     set_capability( "chroma", 100 );
95     add_requirement( ALTIVEC );
96 #endif
97     set_callbacks( Activate, NULL );
98 vlc_module_end();
99
100 /*****************************************************************************
101  * Activate: allocate a chroma function
102  *****************************************************************************
103  * This function allocates and initializes a chroma function
104  *****************************************************************************/
105 static int Activate( vlc_object_t *p_this )
106 {
107     vout_thread_t *p_vout = (vout_thread_t *)p_this;
108
109     if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
110     {
111         return -1;
112     }
113
114     switch( p_vout->render.i_chroma )
115     {
116         case VLC_FOURCC('Y','V','1','2'):
117         case VLC_FOURCC('I','4','2','0'):
118         case VLC_FOURCC('I','Y','U','V'):
119             switch( p_vout->output.i_chroma )
120             {
121                 case VLC_FOURCC('Y','U','Y','2'):
122                 case VLC_FOURCC('Y','U','N','V'):
123                     p_vout->chroma.pf_convert = I420_YUY2;
124                     break;
125
126                 case VLC_FOURCC('Y','V','Y','U'):
127                     p_vout->chroma.pf_convert = I420_YVYU;
128                     break;
129
130                 case VLC_FOURCC('U','Y','V','Y'):
131                 case VLC_FOURCC('U','Y','N','V'):
132                 case VLC_FOURCC('Y','4','2','2'):
133                     p_vout->chroma.pf_convert = I420_UYVY;
134                     break;
135 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
136                 case VLC_FOURCC('I','U','Y','V'):
137                     p_vout->chroma.pf_convert = I420_IUYV;
138                     break;
139
140                 case VLC_FOURCC('c','y','u','v'):
141                     p_vout->chroma.pf_convert = I420_cyuv;
142                     break;
143 #endif
144
145 #if defined (MODULE_NAME_IS_i420_yuy2)
146                 case VLC_FOURCC('Y','2','1','1'):
147                     p_vout->chroma.pf_convert = I420_Y211;
148                     break;
149 #endif
150
151                 default:
152                     return -1;
153             }
154             break;
155
156         default:
157             return -1;
158     }
159
160     return 0;
161 }
162
/* Disabled (#if 0) profiling helper: returns the value produced by the x86
 * "rdtsc" instruction.
 * NOTE(review): the "=A" output constraint is the GCC way of naming the
 * edx:eax pair on 32-bit x86; it is not expected to capture both halves on
 * x86-64 - confirm before re-enabling this code. */
163 #if 0
164 static inline unsigned long long read_cycles(void)
165 {
166     unsigned long long v;
167     __asm__ __volatile__("rdtsc" : "=A" (v): );
168
169     return v;
170 }
171 #endif
172
173 /* Following functions are local */
174 /*****************************************************************************
175  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *****************************************************************************
 * p_vout   : only render.i_width / render.i_height are read (loop bounds)
 * p_source : input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest   : output picture, written through p_dest->p
 *
 * Every pass of the outer loops sets up two destination line pointers
 * (p_line1, p_line2) and two luma line pointers (p_y1, p_y2); p_u and p_v
 * are shared by both lines of the pass.  The per-pixel work is done by the
 * C_YUV420_YUYV, MMX_* and SSE2_* macros from i420_yuy2.h, which are not
 * visible here; they presumably read and advance these locals by name -
 * confirm in the header before renaming anything.
 *
 * The code path is chosen at build time by the MODULE_NAME_IS_* macros:
 * AltiVec (with a C fallback), plain C, MMX or SSE2.
176  *****************************************************************************/
177 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
178                                               picture_t *p_dest )
179 {
180     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
181     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
182     uint8_t *p_u = p_source->U_PIXELS;
183     uint8_t *p_v = p_source->V_PIXELS;
184
185     int i_x, i_y;
186
187 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step to the next pair of lines: line 1 takes over where line 2 was,
     * line 2 moves one pitch further (destination and luma alike). */
188 #define VEC_NEXT_LINES( ) \
189     p_line1  = p_line2; \
190     p_line2 += p_dest->p->i_pitch; \
191     p_y1     = p_y2; \
192     p_y2    += p_source->p[Y_PLANE].i_pitch;
193
    /* Load 16 bytes of U and 16 bytes of V and advance both pointers. */
194 #define VEC_LOAD_UV( ) \
195     u_vec = vec_ld( 0, p_u ); p_u += 16; \
196     v_vec = vec_ld( 0, p_v ); p_v += 16;
197
    /* Interleave one half of U/V (a = vec_mergeh or vec_mergel), then merge
     * it behind 16 luma bytes of each line and store 32 bytes per line. */
198 #define VEC_MERGE( a ) \
199     uv_vec = a( u_vec, v_vec ); \
200     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
201     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
202     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
203     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
204     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
205     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
206
207     vector unsigned char u_vec;
208     vector unsigned char v_vec;
209     vector unsigned char uv_vec;
210     vector unsigned char y_vec;
211
    /* NOTE(review): the vector loops below never add the pitch margins that
     * the scalar loop applies, so they appear to assume that pitch equals
     * visible pitch for both pictures - confirm. */
212     if( !( ( p_vout->render.i_width % 32 ) |
213            ( p_vout->render.i_height % 2 ) ) )
214     {
215         /* Width is a multiple of 32, we take 2 lines at a time */
216         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
217         {
218             VEC_NEXT_LINES( );
219             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
220             {
221                 VEC_LOAD_UV( );
222                 VEC_MERGE( vec_mergeh );
223                 VEC_MERGE( vec_mergel );
224             }
225         }
226     }
227     else if( !( ( p_vout->render.i_width % 16 ) |
228                 ( p_vout->render.i_height % 4 ) ) )
229     {
230         /* Width is only a multiple of 16, we take 4 lines at a time */
231         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
232         {
233             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
234             VEC_NEXT_LINES( );
235             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
236             {
237                 VEC_LOAD_UV( );
238                 VEC_MERGE( vec_mergeh );
239                 VEC_MERGE( vec_mergel );
240             }
241
242             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
243             VEC_LOAD_UV( );
244             VEC_MERGE( vec_mergeh );
245
246             /* Line 3 and 4, pixels 0 to 16 */
            /* The low half of the U/V vectors loaded just above is reused
             * for the first 16 pixels of the next pair of lines. */
247             VEC_NEXT_LINES( );
248             VEC_MERGE( vec_mergel );
249
250             /* Line 3 and 4, pixels 16 to ( width ) */
251             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
252             {
253                 VEC_LOAD_UV( );
254                 VEC_MERGE( vec_mergeh );
255                 VEC_MERGE( vec_mergel );
256             }
257         }
258     }
259     else
260     {
        /* This brace is closed after the scalar loop, under the second
         * MODULE_NAME_IS_i420_yuy2_altivec guard further down. */
261         /* Crap, use the C version */
262 #undef VEC_NEXT_LINES
263 #undef VEC_LOAD_UV
264 #undef VEC_MERGE
265 #endif
266
    /* Per-line padding (pitch minus visible pitch) of the luma plane, of
     * plane 1 (used for both p_u and p_v) and of the destination. */
267     const int i_source_margin = p_source->p[0].i_pitch
268                                  - p_source->p[0].i_visible_pitch;
269     const int i_source_margin_c = p_source->p[1].i_pitch
270                                  - p_source->p[1].i_visible_pitch;
271     const int i_dest_margin = p_dest->p->i_pitch
272                                - p_dest->p->i_visible_pitch;
273
274 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* C, MMX and AltiVec-fallback path: two lines per pass */
275     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
276     {
277         p_line1 = p_line2;
278         p_line2 += p_dest->p->i_pitch;
279
280         p_y1 = p_y2;
281         p_y2 += p_source->p[Y_PLANE].i_pitch;
282
283 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
        /* Groups of 8 pixels: four C macro invocations per group */
284         for( i_x = p_vout->render.i_width / 8; i_x-- ; )
285         {
286             C_YUV420_YUYV( );
287             C_YUV420_YUYV( );
288             C_YUV420_YUYV( );
289             C_YUV420_YUYV( );
290         }
291 #else
        /* Groups of 8 pixels: one MMX invocation per group */
292         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
293         {
294             MMX_CALL( MMX_YUV420_YUYV );
295         }
296 #endif
        /* Leftover pixels: one C macro invocation per remaining pair */
297         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
298         {
299             C_YUV420_YUYV( );
300         }
301
        /* Skip the padding at the end of each line */
302         p_y1 += i_source_margin;
303         p_y2 += i_source_margin;
304         p_u += i_source_margin_c;
305         p_v += i_source_margin_c;
306         p_line1 += i_dest_margin;
307         p_line2 += i_dest_margin;
308     }
309
310 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
311     /* re-enable FPU registers */
312     MMX_END;
313 #endif
314
315 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the "else" opened in the AltiVec section above */
316     }
317 #endif
318
319 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
320     /*
321     ** SSE2 128 bits fetch/store instructions are faster
322     ** if memory access is 16 bytes aligned
323     */
324
    /* Both pitches and both start addresses are OR-ed together: if the low
     * four bits of the result are clear, all of them are multiples of 16. */
325     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
326         ((intptr_t)p_line2|(intptr_t)p_y2))) )
327     {
328         /* use faster SSE2 aligned fetch and store */
329         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
330         {
331             p_line1 = p_line2;
332             p_line2 += p_dest->p->i_pitch;
333
334             p_y1 = p_y2;
335             p_y2 += p_source->p[Y_PLANE].i_pitch;
336
            /* Groups of 16 pixels with SSE2, then the leftover pairs in C */
337             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
338             {
339                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
340             }
341             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
342             {
343                 C_YUV420_YUYV( );
344             }
345
346             p_y1 += i_source_margin;
347             p_y2 += i_source_margin;
348             p_u += i_source_margin_c;
349             p_v += i_source_margin_c;
350             p_line1 += i_dest_margin;
351             p_line2 += i_dest_margin;
352         }
353     }
354     else
355     {
356         /* use slower SSE2 unaligned fetch and store */
357         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
358         {
359             p_line1 = p_line2;
360             p_line2 += p_dest->p->i_pitch;
361
362             p_y1 = p_y2;
363             p_y2 += p_source->p[Y_PLANE].i_pitch;
364
365             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
366             {
367                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
368             }
369             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
370             {
371                 C_YUV420_YUYV( );
372             }
373
374             p_y1 += i_source_margin;
375             p_y2 += i_source_margin;
376             p_u += i_source_margin_c;
377             p_v += i_source_margin_c;
378             p_line1 += i_dest_margin;
379             p_line2 += i_dest_margin;
380         }
381     }
382     /* make sure all SSE2 stores are visible thereafter */
383     SSE2_END;
384
385 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
386 }
387
388 /*****************************************************************************
389  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *****************************************************************************
 * p_vout   : only render.i_width / render.i_height are read (loop bounds)
 * p_source : input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest   : output picture, written through p_dest->p
 *
 * Same structure as I420_YUY2; the visible difference is that the AltiVec
 * merge puts V before U and that the *_YVYU macros are used.  Those macros
 * come from i420_yuy2.h and are not visible here; they presumably read and
 * advance the p_* locals by name - confirm in the header.
390  *****************************************************************************/
391 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
392                                               picture_t *p_dest )
393 {
394     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
395     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
396     uint8_t *p_u = p_source->U_PIXELS;
397     uint8_t *p_v = p_source->V_PIXELS;
398
399     int i_x, i_y;
400
401 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step to the next pair of destination and luma lines */
402 #define VEC_NEXT_LINES( ) \
403     p_line1  = p_line2; \
404     p_line2 += p_dest->p->i_pitch; \
405     p_y1     = p_y2; \
406     p_y2    += p_source->p[Y_PLANE].i_pitch;
407
    /* Load 16 bytes of U and 16 bytes of V and advance both pointers */
408 #define VEC_LOAD_UV( ) \
409     u_vec = vec_ld( 0, p_u ); p_u += 16; \
410     v_vec = vec_ld( 0, p_v ); p_v += 16;
411
    /* Interleave one half of V/U (V first), then merge it behind 16 luma
     * bytes of each line and store 32 bytes per line. */
412 #define VEC_MERGE( a ) \
413     vu_vec = a( v_vec, u_vec ); \
414     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
415     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
416     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
417     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
418     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
419     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
420
421     vector unsigned char u_vec;
422     vector unsigned char v_vec;
423     vector unsigned char vu_vec;
424     vector unsigned char y_vec;
425
    /* NOTE(review): the vector loops below never add the pitch margins that
     * the scalar loop applies, so they appear to assume that pitch equals
     * visible pitch for both pictures - confirm. */
426     if( !( ( p_vout->render.i_width % 32 ) |
427            ( p_vout->render.i_height % 2 ) ) )
428     {
429         /* Width is a multiple of 32, we take 2 lines at a time */
430         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
431         {
432             VEC_NEXT_LINES( );
433             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
434             {
435                 VEC_LOAD_UV( );
436                 VEC_MERGE( vec_mergeh );
437                 VEC_MERGE( vec_mergel );
438             }
439         }
440     }
441     else if( !( ( p_vout->render.i_width % 16 ) |
442                 ( p_vout->render.i_height % 4 ) ) )
443     {
444         /* Width is only a multiple of 16, we take 4 lines at a time */
445         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
446         {
447             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
448             VEC_NEXT_LINES( );
449             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
450             {
451                 VEC_LOAD_UV( );
452                 VEC_MERGE( vec_mergeh );
453                 VEC_MERGE( vec_mergel );
454             }
455
456             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
457             VEC_LOAD_UV( );
458             VEC_MERGE( vec_mergeh );
459
460             /* Line 3 and 4, pixels 0 to 16 */
            /* Reuses the low half of the U/V vectors loaded just above */
461             VEC_NEXT_LINES( );
462             VEC_MERGE( vec_mergel );
463
464             /* Line 3 and 4, pixels 16 to ( width ) */
465             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
466             {
467                 VEC_LOAD_UV( );
468                 VEC_MERGE( vec_mergeh );
469                 VEC_MERGE( vec_mergel );
470             }
471         }
472     }
473     else
474     {
        /* This brace is closed after the scalar loop, under the second
         * MODULE_NAME_IS_i420_yuy2_altivec guard further down. */
475         /* Crap, use the C version */
476 #undef VEC_NEXT_LINES
477 #undef VEC_LOAD_UV
478 #undef VEC_MERGE
479 #endif
480
    /* Per-line padding (pitch minus visible pitch) of the luma plane, of
     * plane 1 (used for both p_u and p_v) and of the destination. */
481     const int i_source_margin = p_source->p[0].i_pitch
482                                  - p_source->p[0].i_visible_pitch;
483     const int i_source_margin_c = p_source->p[1].i_pitch
484                                  - p_source->p[1].i_visible_pitch;
485     const int i_dest_margin = p_dest->p->i_pitch
486                                - p_dest->p->i_visible_pitch;
487
488 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* C, MMX and AltiVec-fallback path: two lines per pass */
489     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
490     {
491         p_line1 = p_line2;
492         p_line2 += p_dest->p->i_pitch;
493
494         p_y1 = p_y2;
495         p_y2 += p_source->p[Y_PLANE].i_pitch;
496
        /* Groups of 8 pixels: four C macro invocations or one MMX one */
497         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
498         {
499 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
500             C_YUV420_YVYU( );
501             C_YUV420_YVYU( );
502             C_YUV420_YVYU( );
503             C_YUV420_YVYU( );
504 #else
505             MMX_CALL( MMX_YUV420_YVYU );
506 #endif
507         }
        /* Leftover pixels: one C macro invocation per remaining pair */
508         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
509         {
510             C_YUV420_YVYU( );
511         }
512
        /* Skip the padding at the end of each line */
513         p_y1 += i_source_margin;
514         p_y2 += i_source_margin;
515         p_u += i_source_margin_c;
516         p_v += i_source_margin_c;
517         p_line1 += i_dest_margin;
518         p_line2 += i_dest_margin;
519     }
520
521 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
522     /* re-enable FPU registers */
523     MMX_END;
524 #endif
525
526 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the "else" opened in the AltiVec section above */
527     }
528 #endif
529
530 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
531     /*
532     ** SSE2 128 bits fetch/store instructions are faster
533     ** if memory access is 16 bytes aligned
534     */
    /* Both pitches and both start addresses are OR-ed together: if the low
     * four bits of the result are clear, all of them are multiples of 16. */
535     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
536         ((intptr_t)p_line2|(intptr_t)p_y2))) )
537     {
538         /* use faster SSE2 aligned fetch and store */
539         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
540         {
541             p_line1 = p_line2;
542             p_line2 += p_dest->p->i_pitch;
543
544             p_y1 = p_y2;
545             p_y2 += p_source->p[Y_PLANE].i_pitch;
546
            /* Groups of 16 pixels with SSE2, then the leftover pairs in C */
547             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
548             {
549                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
550             }
551             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
552             {
553                 C_YUV420_YVYU( );
554             }
555
556             p_y1 += i_source_margin;
557             p_y2 += i_source_margin;
558             p_u += i_source_margin_c;
559             p_v += i_source_margin_c;
560             p_line1 += i_dest_margin;
561             p_line2 += i_dest_margin;
562         }
563     }
564     else
565     {
566         /* use slower SSE2 unaligned fetch and store */
567         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
568         {
569             p_line1 = p_line2;
570             p_line2 += p_dest->p->i_pitch;
571
572             p_y1 = p_y2;
573             p_y2 += p_source->p[Y_PLANE].i_pitch;
574
575             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
576             {
577                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
578             }
579             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
580             {
581                 C_YUV420_YVYU( );
582             }
583
584             p_y1 += i_source_margin;
585             p_y2 += i_source_margin;
586             p_u += i_source_margin_c;
587             p_v += i_source_margin_c;
588             p_line1 += i_dest_margin;
589             p_line2 += i_dest_margin;
590         }
591     }
592     /* make sure all SSE2 stores are visible thereafter */
593     SSE2_END;
594 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
595 }
596
597 /*****************************************************************************
598  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *****************************************************************************
 * p_vout   : only render.i_width / render.i_height are read (loop bounds)
 * p_source : input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest   : output picture, written through p_dest->p
 *
 * Same structure as I420_YUY2; the visible difference is that the AltiVec
 * merge puts the chroma bytes before the luma bytes and that the *_UYVY
 * macros are used.  Those macros come from i420_yuy2.h and are not visible
 * here; they presumably read and advance the p_* locals by name - confirm
 * in the header.
599  *****************************************************************************/
600 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
601                                               picture_t *p_dest )
602 {
603     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
604     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
605     uint8_t *p_u = p_source->U_PIXELS;
606     uint8_t *p_v = p_source->V_PIXELS;
607
608     int i_x, i_y;
609
610 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step to the next pair of destination and luma lines */
611 #define VEC_NEXT_LINES( ) \
612     p_line1  = p_line2; \
613     p_line2 += p_dest->p->i_pitch; \
614     p_y1     = p_y2; \
615     p_y2    += p_source->p[Y_PLANE].i_pitch;
616
    /* Load 16 bytes of U and 16 bytes of V and advance both pointers */
617 #define VEC_LOAD_UV( ) \
618     u_vec = vec_ld( 0, p_u ); p_u += 16; \
619     v_vec = vec_ld( 0, p_v ); p_v += 16;
620
    /* Interleave one half of U/V, then merge it in front of 16 luma bytes
     * of each line (chroma first) and store 32 bytes per line. */
621 #define VEC_MERGE( a ) \
622     uv_vec = a( u_vec, v_vec ); \
623     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
624     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
625     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
626     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
627     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
628     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
629
630     vector unsigned char u_vec;
631     vector unsigned char v_vec;
632     vector unsigned char uv_vec;
633     vector unsigned char y_vec;
634
    /* NOTE(review): the vector loops below never add the pitch margins that
     * the scalar loop applies, so they appear to assume that pitch equals
     * visible pitch for both pictures - confirm. */
635     if( !( ( p_vout->render.i_width % 32 ) |
636            ( p_vout->render.i_height % 2 ) ) )
637     {
638         /* Width is a multiple of 32, we take 2 lines at a time */
639         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
640         {
641             VEC_NEXT_LINES( );
642             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
643             {
644                 VEC_LOAD_UV( );
645                 VEC_MERGE( vec_mergeh );
646                 VEC_MERGE( vec_mergel );
647             }
648         }
649     }
650     else if( !( ( p_vout->render.i_width % 16 ) |
651                 ( p_vout->render.i_height % 4 ) ) )
652     {
653         /* Width is only a multiple of 16, we take 4 lines at a time */
654         for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
655         {
656             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
657             VEC_NEXT_LINES( );
658             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
659             {
660                 VEC_LOAD_UV( );
661                 VEC_MERGE( vec_mergeh );
662                 VEC_MERGE( vec_mergel );
663             }
664
665             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
666             VEC_LOAD_UV( );
667             VEC_MERGE( vec_mergeh );
668
669             /* Line 3 and 4, pixels 0 to 16 */
            /* Reuses the low half of the U/V vectors loaded just above */
670             VEC_NEXT_LINES( );
671             VEC_MERGE( vec_mergel );
672
673             /* Line 3 and 4, pixels 16 to ( width ) */
674             for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
675             {
676                 VEC_LOAD_UV( );
677                 VEC_MERGE( vec_mergeh );
678                 VEC_MERGE( vec_mergel );
679             }
680         }
681     }
682     else
683     {
        /* This brace is closed after the scalar loop, under the second
         * MODULE_NAME_IS_i420_yuy2_altivec guard further down. */
684         /* Crap, use the C version */
685 #undef VEC_NEXT_LINES
686 #undef VEC_LOAD_UV
687 #undef VEC_MERGE
688 #endif
689
    /* Per-line padding (pitch minus visible pitch) of the luma plane, of
     * plane 1 (used for both p_u and p_v) and of the destination. */
690     const int i_source_margin = p_source->p[0].i_pitch
691                                  - p_source->p[0].i_visible_pitch;
692     const int i_source_margin_c = p_source->p[1].i_pitch
693                                  - p_source->p[1].i_visible_pitch;
694     const int i_dest_margin = p_dest->p->i_pitch
695                                - p_dest->p->i_visible_pitch;
696
697 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* C, MMX and AltiVec-fallback path: two lines per pass */
698     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
699     {
700         p_line1 = p_line2;
701         p_line2 += p_dest->p->i_pitch;
702
703         p_y1 = p_y2;
704         p_y2 += p_source->p[Y_PLANE].i_pitch;
705
        /* Groups of 8 pixels: four C macro invocations or one MMX one */
706         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
707         {
708 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
709             C_YUV420_UYVY( );
710             C_YUV420_UYVY( );
711             C_YUV420_UYVY( );
712             C_YUV420_UYVY( );
713 #else
714             MMX_CALL( MMX_YUV420_UYVY );
715 #endif
716         }
        /* Leftover pixels: one C macro invocation per remaining pair */
717         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
718         {
719             C_YUV420_UYVY( );
720         }
721
        /* Skip the padding at the end of each line */
722         p_y1 += i_source_margin;
723         p_y2 += i_source_margin;
724         p_u += i_source_margin_c;
725         p_v += i_source_margin_c;
726         p_line1 += i_dest_margin;
727         p_line2 += i_dest_margin;
728     }
729
730 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
731     /* re-enable FPU registers */
732     MMX_END;
733 #endif
734
735 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the "else" opened in the AltiVec section above */
736     }
737 #endif
738
739 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
740     /*
741     ** SSE2 128 bits fetch/store instructions are faster
742     ** if memory access is 16 bytes aligned
743     */
    /* Both pitches and both start addresses are OR-ed together: if the low
     * four bits of the result are clear, all of them are multiples of 16. */
744     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
745         ((intptr_t)p_line2|(intptr_t)p_y2))) )
746     {
747         /* use faster SSE2 aligned fetch and store */
748         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
749         {
750             p_line1 = p_line2;
751             p_line2 += p_dest->p->i_pitch;
752
753             p_y1 = p_y2;
754             p_y2 += p_source->p[Y_PLANE].i_pitch;
755
            /* Groups of 16 pixels with SSE2, then the leftover pairs in C */
756             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
757             {
758                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
759             }
760             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
761             {
762                 C_YUV420_UYVY( );
763             }
764
765             p_y1 += i_source_margin;
766             p_y2 += i_source_margin;
767             p_u += i_source_margin_c;
768             p_v += i_source_margin_c;
769             p_line1 += i_dest_margin;
770             p_line2 += i_dest_margin;
771         }
772     }
773     else
774     {
775         /* use slower SSE2 unaligned fetch and store */
776         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
777         {
778             p_line1 = p_line2;
779             p_line2 += p_dest->p->i_pitch;
780
781             p_y1 = p_y2;
782             p_y2 += p_source->p[Y_PLANE].i_pitch;
783
784             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
785             {
786                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
787             }
788             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
789             {
790                 C_YUV420_UYVY( );
791             }
792
793             p_y1 += i_source_margin;
794             p_y2 += i_source_margin;
795             p_u += i_source_margin_c;
796             p_v += i_source_margin_c;
797             p_line1 += i_dest_margin;
798             p_line2 += i_dest_margin;
799         }
800     }
801     /* make sure all SSE2 stores are visible thereafter */
802     SSE2_END;
803 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
804 }
805
806 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
807 /*****************************************************************************
808  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
809  *****************************************************************************/
810 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
811                                               picture_t *p_dest )
812 {
813     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
814     /* FIXME: TODO ! */
815     msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
816 }
817
818 /*****************************************************************************
819  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
820  *****************************************************************************/
821 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
822                                               picture_t *p_dest )
823 {
824     uint8_t *p_line1 = p_dest->p->p_pixels +
825                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
826                        + p_dest->p->i_pitch;
827     uint8_t *p_line2 = p_dest->p->p_pixels +
828                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
829     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
830     uint8_t *p_u = p_source->U_PIXELS;
831     uint8_t *p_v = p_source->V_PIXELS;
832
833     int i_x, i_y;
834
835     const int i_source_margin = p_source->p[0].i_pitch
836                                  - p_source->p[0].i_visible_pitch;
837     const int i_source_margin_c = p_source->p[1].i_pitch
838                                  - p_source->p[1].i_visible_pitch;
839     const int i_dest_margin = p_dest->p->i_pitch
840                                - p_dest->p->i_visible_pitch;
841
842 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
843     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
844     {
845         p_line1 -= 3 * p_dest->p->i_pitch;
846         p_line2 -= 3 * p_dest->p->i_pitch;
847
848         p_y1 = p_y2;
849         p_y2 += p_source->p[Y_PLANE].i_pitch;
850
851         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
852         {
853 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
854             C_YUV420_UYVY( );
855             C_YUV420_UYVY( );
856             C_YUV420_UYVY( );
857             C_YUV420_UYVY( );
858 #else
859             MMX_CALL( MMX_YUV420_UYVY );
860 #endif
861         }
862         for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
863         {
864             C_YUV420_UYVY( );
865         }
866
867         p_y1 += i_source_margin;
868         p_y2 += i_source_margin;
869         p_u += i_source_margin_c;
870         p_v += i_source_margin_c;
871         p_line1 += i_dest_margin;
872         p_line2 += i_dest_margin;
873     }
874
875 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
876     /* re-enable FPU registers */
877     MMX_END;
878 #endif
879
880 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
881     /*
882     ** SSE2 128 bits fetch/store instructions are faster
883     ** if memory access is 16 bytes aligned
884     */
885     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
886         ((intptr_t)p_line2|(intptr_t)p_y2))) )
887     {
888         /* use faster SSE2 aligned fetch and store */
889         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
890         {
891             p_line1 = p_line2;
892             p_line2 += p_dest->p->i_pitch;
893
894             p_y1 = p_y2;
895             p_y2 += p_source->p[Y_PLANE].i_pitch;
896
897             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
898             {
899                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
900             }
901             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
902             {
903                 C_YUV420_UYVY( );
904             }
905
906             p_y1 += i_source_margin;
907             p_y2 += i_source_margin;
908             p_u += i_source_margin_c;
909             p_v += i_source_margin_c;
910             p_line1 += i_dest_margin;
911             p_line2 += i_dest_margin;
912         }
913     }
914     else
915     {
916         /* use slower SSE2 unaligned fetch and store */
917         for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
918         {
919             p_line1 = p_line2;
920             p_line2 += p_dest->p->i_pitch;
921
922             p_y1 = p_y2;
923             p_y2 += p_source->p[Y_PLANE].i_pitch;
924
925             for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
926             {
927                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
928             }
929             for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
930             {
931                 C_YUV420_UYVY( );
932             }
933
934             p_y1 += i_source_margin;
935             p_y2 += i_source_margin;
936             p_u += i_source_margin_c;
937             p_v += i_source_margin_c;
938             p_line1 += i_dest_margin;
939             p_line2 += i_dest_margin;
940         }
941     }
942     /* make sure all SSE2 stores are visible thereafter */
943     SSE2_END;
944 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
945 }
946 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
947
948 /*****************************************************************************
949  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Only compiled into the plain C build (MODULE_NAME_IS_i420_yuy2).
 *
 * p_vout   : only render.i_width / render.i_height are read (loop bounds)
 * p_source : input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest   : output picture, written through p_dest->p
 *
 * The per-pixel work is done by C_YUV420_Y211 from i420_yuy2.h, which is
 * not visible here; it presumably reads and advances the p_* locals by
 * name - confirm in the header.
950  *****************************************************************************/
951 #if defined (MODULE_NAME_IS_i420_yuy2)
952 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
953                                               picture_t *p_dest )
954 {
955     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
956     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
957     uint8_t *p_u = p_source->U_PIXELS;
958     uint8_t *p_v = p_source->V_PIXELS;
959
960     int i_x, i_y;
961
    /* Per-line padding (pitch minus visible pitch) of the luma plane, of
     * plane 1 (used for both p_u and p_v) and of the destination. */
962     const int i_source_margin = p_source->p[0].i_pitch
963                                  - p_source->p[0].i_visible_pitch;
964     const int i_source_margin_c = p_source->p[1].i_pitch
965                                  - p_source->p[1].i_visible_pitch;
966     const int i_dest_margin = p_dest->p->i_pitch
967                                - p_dest->p->i_visible_pitch;
968
    /* Two lines per pass */
969     for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
970     {
971         p_line1 = p_line2;
972         p_line2 += p_dest->p->i_pitch;
973
974         p_y1 = p_y2;
975         p_y2 += p_source->p[Y_PLANE].i_pitch;
976
        /* Groups of 8 pixels: two macro invocations per group.
         * NOTE(review): unlike the other converters there is no tail loop,
         * so when the width is not a multiple of 8 the leftover pixels are
         * not converted and the margin fix-ups below are presumably applied
         * to pointers that stopped short of the line end - confirm. */
977         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
978         {
979             C_YUV420_Y211( );
980             C_YUV420_Y211( );
981         }
982
        /* Skip the padding at the end of each line */
983         p_y1 += i_source_margin;
984         p_y2 += i_source_margin;
985         p_u += i_source_margin_c;
986         p_v += i_source_margin_c;
987         p_line1 += i_dest_margin;
988         p_line2 += i_dest_margin;
989     }
990 }
991 #endif