1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
41 #include "i420_yuy2.h"
43 #define SRC_FOURCC "I420,IYUV,YV12"
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
55 /*****************************************************************************
56 * Local and extern prototypes.
57 *****************************************************************************/
/* Module activation callback: validates the chroma pair and installs one of
 * the converters below into p_vout->chroma.pf_convert. */
58 static int Activate ( vlc_object_t * );
/* Planar I420/IYUV/YV12 to packed 4:2:2 converters, one per output FOURCC. */
60 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
61 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
63 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* IUYV (interleaved) and cyuv (upside-down UYVY) have no AltiVec variant. */
64 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
65 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
67 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Y211 (packed, chroma and luma further subsampled) exists only in plain C. */
68 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
71 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
72 /* Initialize MMX-specific constants */
/* i_00ffw: 0x00ff mask in each 16-bit lane; i_80w: 0x80 byte pattern in the
 * low dword.  Both are operands for the MMX conversion kernels. */
73 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
74 static const uint64_t i_80w = 0x0000000080808080ULL;
77 /*****************************************************************************
79 *****************************************************************************/
/* Module descriptor: each SIMD build of this file registers the same "chroma"
 * capability at a different priority, so the core picks the fastest variant
 * available: SSE2 (120) > MMX / AltiVec (100) > plain C (80). */
81 #if defined (MODULE_NAME_IS_i420_yuy2)
82 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83 set_capability( "chroma", 80 );
84 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
85 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
86 set_capability( "chroma", 100 );
87 add_requirement( MMX );
88 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
89 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
90 set_capability( "chroma", 120 );
91 add_requirement( SSE2 );
92 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
94 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
95 set_capability( "chroma", 100 );
96 add_requirement( ALTIVEC );
/* No deactivation needed: Activate only sets a function pointer. */
98 set_callbacks( Activate, NULL );
101 /*****************************************************************************
102 * Activate: allocate a chroma function
103 *****************************************************************************
104 * This function allocates and initializes a chroma function
105 *****************************************************************************/
106 static int Activate( vlc_object_t *p_this )
108 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* 4:2:0 carries one chroma sample per 2x2 luma block, and the converters
 * below process two lines / two pixels at a time, so odd dimensions are
 * rejected up front. */
110 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
/* Outer switch: accepted source chromas are the I420 family (YV12 shares
 * the layout with U/V planes swapped).  Inner switch: pick the converter
 * matching the requested output FOURCC. */
115 switch( p_vout->render.i_chroma )
117 case VLC_FOURCC('Y','V','1','2'):
118 case VLC_FOURCC('I','4','2','0'):
119 case VLC_FOURCC('I','Y','U','V'):
120 switch( p_vout->output.i_chroma )
122 case VLC_FOURCC('Y','U','Y','2'):
123 case VLC_FOURCC('Y','U','N','V'):
124 p_vout->chroma.pf_convert = I420_YUY2;
127 case VLC_FOURCC('Y','V','Y','U'):
128 p_vout->chroma.pf_convert = I420_YVYU;
131 case VLC_FOURCC('U','Y','V','Y'):
132 case VLC_FOURCC('U','Y','N','V'):
133 case VLC_FOURCC('Y','4','2','2'):
134 p_vout->chroma.pf_convert = I420_UYVY;
136 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
137 case VLC_FOURCC('I','U','Y','V'):
138 p_vout->chroma.pf_convert = I420_IUYV;
141 case VLC_FOURCC('c','y','u','v'):
142 p_vout->chroma.pf_convert = I420_cyuv;
146 #if defined (MODULE_NAME_IS_i420_yuy2)
147 case VLC_FOURCC('Y','2','1','1'):
148 p_vout->chroma.pf_convert = I420_Y211;
/* Profiling helper: read the x86 time-stamp counter via RDTSC.
 * NOTE(review): the "=A" constraint denotes the EDX:EAX register pair only on
 * 32-bit x86; on x86-64 it does not combine the two registers — confirm this
 * helper is compiled for 32-bit targets only. */
165 static inline unsigned long long read_cycles(void)
167 unsigned long long v;
168 __asm__ __volatile__("rdtsc" : "=A" (v): );
174 /* Following functions are local */
175 /*****************************************************************************
176 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
177 *****************************************************************************/
/* Emit Y0 U0 Y1 V0 quadruplets.  Since 4:2:0 chroma is vertically subsampled,
 * two output lines are produced per pass, both sharing the same U/V row. */
178 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
/* Line pairs: p_line1/p_line2 walk the destination, p_y1/p_y2 the luma
 * plane; p_u/p_v walk the half-height chroma planes once per line pair. */
181 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
182 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
183 uint8_t *p_u = p_source->U_PIXELS;
184 uint8_t *p_v = p_source->V_PIXELS;
188 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of lines. */
189 #define VEC_NEXT_LINES( ) \
191 p_line2 += p_dest->p->i_pitch; \
193 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load 16 U and 16 V bytes; one such load covers 32 output pixels. */
195 #define VEC_LOAD_UV( ) \
196 u_vec = vec_ld( 0, p_u ); p_u += 16; \
197 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Interleave U with V ('a' is vec_mergeh or vec_mergel, selecting which half
 * of the chroma vectors to use), then merge luma-first with both lines of Y
 * to store 32 bytes of Y U Y V per line. */
199 #define VEC_MERGE( a ) \
200 uv_vec = a( u_vec, v_vec ); \
201 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
202 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
203 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
204 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
205 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
206 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
208 vector unsigned char u_vec;
209 vector unsigned char v_vec;
210 vector unsigned char uv_vec;
211 vector unsigned char y_vec;
/* AltiVec fast paths require vector-friendly dimensions; anything else
 * falls through to the scalar C implementation below. */
213 if( !( ( p_vout->render.i_width % 32 ) |
214 ( p_vout->render.i_height % 2 ) ) )
216 /* Width is a multiple of 32, we take 2 lines at a time */
217 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
220 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
223 VEC_MERGE( vec_mergeh );
224 VEC_MERGE( vec_mergel );
228 else if( !( ( p_vout->render.i_width % 16 ) |
229 ( p_vout->render.i_height % 4 ) ) )
231 /* Width is only a multiple of 16, we take 4 lines at a time */
232 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
234 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
236 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
239 VEC_MERGE( vec_mergeh );
240 VEC_MERGE( vec_mergel );
243 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
245 VEC_MERGE( vec_mergeh );
247 /* Line 3 and 4, pixels 0 to 16 */
249 VEC_MERGE( vec_mergel );
251 /* Line 3 and 4, pixels 16 to ( width ) */
252 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
255 VEC_MERGE( vec_mergeh );
256 VEC_MERGE( vec_mergel );
262 /* Crap, use the C version */
263 #undef VEC_NEXT_LINES
/* Padding between the visible pixels and the allocated pitch, added at the
 * end of every line to keep the pointers aligned with the buffers. */
268 const int i_source_margin = p_source->p[0].i_pitch
269 - p_source->p[0].i_visible_pitch;
270 const int i_source_margin_c = p_source->p[1].i_pitch
271 - p_source->p[1].i_visible_pitch;
272 const int i_dest_margin = p_dest->p->i_pitch
273 - p_dest->p->i_visible_pitch;
/* C / MMX path: two lines per iteration, 8 pixels per inner step. */
275 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
276 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
279 p_line2 += p_dest->p->i_pitch;
282 p_y2 += p_source->p[Y_PLANE].i_pitch;
284 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
285 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
293 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
295 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover (width % 8) pixels are converted two at a time in C. */
298 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
303 p_y1 += i_source_margin;
304 p_y2 += i_source_margin;
305 p_u += i_source_margin_c;
306 p_v += i_source_margin_c;
307 p_line1 += i_dest_margin;
308 p_line2 += i_dest_margin;
311 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
312 /* re-enable FPU registers */
316 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
320 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
322 ** SSE2 128 bits fetch/store instructions are faster
323 ** if memory access is 16 bytes aligned
/* Aligned variant is usable only when both pitches and both base pointers
 * are 16-byte aligned; otherwise fall back to unaligned loads/stores. */
326 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
327 ((intptr_t)p_line2|(intptr_t)p_y2))) )
329 /* use faster SSE2 aligned fetch and store */
330 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
333 p_line2 += p_dest->p->i_pitch;
336 p_y2 += p_source->p[Y_PLANE].i_pitch;
338 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
340 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
342 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
347 p_y1 += i_source_margin;
348 p_y2 += i_source_margin;
349 p_u += i_source_margin_c;
350 p_v += i_source_margin_c;
351 p_line1 += i_dest_margin;
352 p_line2 += i_dest_margin;
357 /* use slower SSE2 unaligned fetch and store */
358 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
361 p_line2 += p_dest->p->i_pitch;
364 p_y2 += p_source->p[Y_PLANE].i_pitch;
366 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
368 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
370 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
375 p_y1 += i_source_margin;
376 p_y2 += i_source_margin;
377 p_u += i_source_margin_c;
378 p_v += i_source_margin_c;
379 p_line1 += i_dest_margin;
380 p_line2 += i_dest_margin;
383 /* make sure all SSE2 stores are visible thereafter */
386 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
389 /*****************************************************************************
390 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
391 *****************************************************************************/
/* Identical structure to I420_YUY2, but chroma bytes are emitted V before U,
 * producing Y0 V0 Y1 U0 quadruplets. */
392 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
395 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
396 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
397 uint8_t *p_u = p_source->U_PIXELS;
398 uint8_t *p_v = p_source->V_PIXELS;
402 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of lines. */
403 #define VEC_NEXT_LINES( ) \
405 p_line2 += p_dest->p->i_pitch; \
407 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load 16 U and 16 V bytes; one such load covers 32 output pixels. */
409 #define VEC_LOAD_UV( ) \
410 u_vec = vec_ld( 0, p_u ); p_u += 16; \
411 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Note the swapped operands versus I420_YUY2: V is merged ahead of U. */
413 #define VEC_MERGE( a ) \
414 vu_vec = a( v_vec, u_vec ); \
415 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
416 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
417 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
418 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
419 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
420 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
422 vector unsigned char u_vec;
423 vector unsigned char v_vec;
424 vector unsigned char vu_vec;
425 vector unsigned char y_vec;
/* AltiVec fast paths need vector-friendly dimensions, else use the C code. */
427 if( !( ( p_vout->render.i_width % 32 ) |
428 ( p_vout->render.i_height % 2 ) ) )
430 /* Width is a multiple of 32, we take 2 lines at a time */
431 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
434 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
437 VEC_MERGE( vec_mergeh );
438 VEC_MERGE( vec_mergel );
442 else if( !( ( p_vout->render.i_width % 16 ) |
443 ( p_vout->render.i_height % 4 ) ) )
445 /* Width is only a multiple of 16, we take 4 lines at a time */
446 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
448 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
450 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
453 VEC_MERGE( vec_mergeh );
454 VEC_MERGE( vec_mergel );
457 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
459 VEC_MERGE( vec_mergeh );
461 /* Line 3 and 4, pixels 0 to 16 */
463 VEC_MERGE( vec_mergel );
465 /* Line 3 and 4, pixels 16 to ( width ) */
466 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
469 VEC_MERGE( vec_mergeh );
470 VEC_MERGE( vec_mergel );
476 /* Crap, use the C version */
477 #undef VEC_NEXT_LINES
/* Per-line padding between visible pixels and the allocated pitch. */
482 const int i_source_margin = p_source->p[0].i_pitch
483 - p_source->p[0].i_visible_pitch;
484 const int i_source_margin_c = p_source->p[1].i_pitch
485 - p_source->p[1].i_visible_pitch;
486 const int i_dest_margin = p_dest->p->i_pitch
487 - p_dest->p->i_visible_pitch;
/* C / MMX path: two lines per iteration, 8 pixels per inner step. */
489 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
490 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
493 p_line2 += p_dest->p->i_pitch;
496 p_y2 += p_source->p[Y_PLANE].i_pitch;
498 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
500 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
506 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover (width % 8) pixels are converted two at a time in C. */
509 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
514 p_y1 += i_source_margin;
515 p_y2 += i_source_margin;
516 p_u += i_source_margin_c;
517 p_v += i_source_margin_c;
518 p_line1 += i_dest_margin;
519 p_line2 += i_dest_margin;
522 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
523 /* re-enable FPU registers */
527 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
531 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
533 ** SSE2 128 bits fetch/store instructions are faster
534 ** if memory access is 16 bytes aligned
/* Aligned variant only when pitches and base pointers are 16-byte aligned. */
536 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
537 ((intptr_t)p_line2|(intptr_t)p_y2))) )
539 /* use faster SSE2 aligned fetch and store */
540 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
543 p_line2 += p_dest->p->i_pitch;
546 p_y2 += p_source->p[Y_PLANE].i_pitch;
548 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
550 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
552 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
557 p_y1 += i_source_margin;
558 p_y2 += i_source_margin;
559 p_u += i_source_margin_c;
560 p_v += i_source_margin_c;
561 p_line1 += i_dest_margin;
562 p_line2 += i_dest_margin;
567 /* use slower SSE2 unaligned fetch and store */
568 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
571 p_line2 += p_dest->p->i_pitch;
574 p_y2 += p_source->p[Y_PLANE].i_pitch;
576 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
578 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
580 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
585 p_y1 += i_source_margin;
586 p_y2 += i_source_margin;
587 p_u += i_source_margin_c;
588 p_v += i_source_margin_c;
589 p_line1 += i_dest_margin;
590 p_line2 += i_dest_margin;
593 /* make sure all SSE2 stores are visible thereafter */
595 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
598 /*****************************************************************************
599 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
600 *****************************************************************************/
/* Same structure again, but chroma leads luma: U0 Y0 V0 Y1 quadruplets
 * (uv_vec is the FIRST operand of the merge below, unlike I420_YUY2). */
601 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
604 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
605 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
606 uint8_t *p_u = p_source->U_PIXELS;
607 uint8_t *p_v = p_source->V_PIXELS;
611 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of lines. */
612 #define VEC_NEXT_LINES( ) \
614 p_line2 += p_dest->p->i_pitch; \
616 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load 16 U and 16 V bytes; one such load covers 32 output pixels. */
618 #define VEC_LOAD_UV( ) \
619 u_vec = vec_ld( 0, p_u ); p_u += 16; \
620 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Chroma-first merge: interleaved UV bytes precede each luma byte. */
622 #define VEC_MERGE( a ) \
623 uv_vec = a( u_vec, v_vec ); \
624 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
625 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
626 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
627 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
628 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
629 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
631 vector unsigned char u_vec;
632 vector unsigned char v_vec;
633 vector unsigned char uv_vec;
634 vector unsigned char y_vec;
/* AltiVec fast paths need vector-friendly dimensions, else use the C code. */
636 if( !( ( p_vout->render.i_width % 32 ) |
637 ( p_vout->render.i_height % 2 ) ) )
639 /* Width is a multiple of 32, we take 2 lines at a time */
640 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
643 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
646 VEC_MERGE( vec_mergeh );
647 VEC_MERGE( vec_mergel );
651 else if( !( ( p_vout->render.i_width % 16 ) |
652 ( p_vout->render.i_height % 4 ) ) )
654 /* Width is only a multiple of 16, we take 4 lines at a time */
655 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
657 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
659 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
662 VEC_MERGE( vec_mergeh );
663 VEC_MERGE( vec_mergel );
666 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
668 VEC_MERGE( vec_mergeh );
670 /* Line 3 and 4, pixels 0 to 16 */
672 VEC_MERGE( vec_mergel );
674 /* Line 3 and 4, pixels 16 to ( width ) */
675 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
678 VEC_MERGE( vec_mergeh );
679 VEC_MERGE( vec_mergel );
685 /* Crap, use the C version */
686 #undef VEC_NEXT_LINES
/* Per-line padding between visible pixels and the allocated pitch. */
691 const int i_source_margin = p_source->p[0].i_pitch
692 - p_source->p[0].i_visible_pitch;
693 const int i_source_margin_c = p_source->p[1].i_pitch
694 - p_source->p[1].i_visible_pitch;
695 const int i_dest_margin = p_dest->p->i_pitch
696 - p_dest->p->i_visible_pitch;
/* C / MMX path: two lines per iteration, 8 pixels per inner step. */
698 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
699 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
702 p_line2 += p_dest->p->i_pitch;
705 p_y2 += p_source->p[Y_PLANE].i_pitch;
707 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
709 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
715 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover (width % 8) pixels are converted two at a time in C. */
718 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
723 p_y1 += i_source_margin;
724 p_y2 += i_source_margin;
725 p_u += i_source_margin_c;
726 p_v += i_source_margin_c;
727 p_line1 += i_dest_margin;
728 p_line2 += i_dest_margin;
731 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
732 /* re-enable FPU registers */
736 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
740 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
742 ** SSE2 128 bits fetch/store instructions are faster
743 ** if memory access is 16 bytes aligned
/* Aligned variant only when pitches and base pointers are 16-byte aligned. */
745 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
746 ((intptr_t)p_line2|(intptr_t)p_y2))) )
748 /* use faster SSE2 aligned fetch and store */
749 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
752 p_line2 += p_dest->p->i_pitch;
755 p_y2 += p_source->p[Y_PLANE].i_pitch;
757 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
759 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
761 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
766 p_y1 += i_source_margin;
767 p_y2 += i_source_margin;
768 p_u += i_source_margin_c;
769 p_v += i_source_margin_c;
770 p_line1 += i_dest_margin;
771 p_line2 += i_dest_margin;
776 /* use slower SSE2 unaligned fetch and store */
777 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
780 p_line2 += p_dest->p->i_pitch;
783 p_y2 += p_source->p[Y_PLANE].i_pitch;
785 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
787 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
789 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
794 p_y1 += i_source_margin;
795 p_y2 += i_source_margin;
796 p_u += i_source_margin_c;
797 p_v += i_source_margin_c;
798 p_line1 += i_dest_margin;
799 p_line2 += i_dest_margin;
802 /* make sure all SSE2 stores are visible thereafter */
804 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
807 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
808 /*****************************************************************************
809 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
810 *****************************************************************************/
/* Stub: this conversion was never implemented.  It only logs an error; the
 * source and destination pictures are intentionally unused. */
811 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
814 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
816 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
819 /*****************************************************************************
820 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
821 *****************************************************************************/
/* Like I420_UYVY but vertically flipped: destination line pointers start at
 * the BOTTOM of the picture and walk upward while the source is read
 * top-down. */
822 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
/* p_line1 starts one pitch past the last visible line, p_line2 at the last
 * visible line; each pass rewinds them by 3 pitches (see below), netting one
 * line-pair upward per iteration once the forward writes are accounted for. */
825 uint8_t *p_line1 = p_dest->p->p_pixels +
826 p_dest->p->i_visible_lines * p_dest->p->i_pitch
827 + p_dest->p->i_pitch;
828 uint8_t *p_line2 = p_dest->p->p_pixels +
829 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
830 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
831 uint8_t *p_u = p_source->U_PIXELS;
832 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding between visible pixels and the allocated pitch. */
836 const int i_source_margin = p_source->p[0].i_pitch
837 - p_source->p[0].i_visible_pitch;
838 const int i_source_margin_c = p_source->p[1].i_pitch
839 - p_source->p[1].i_visible_pitch;
840 const int i_dest_margin = p_dest->p->i_pitch
841 - p_dest->p->i_visible_pitch;
/* C / MMX path: source advances downward, destination rewinds upward. */
843 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
844 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
846 p_line1 -= 3 * p_dest->p->i_pitch;
847 p_line2 -= 3 * p_dest->p->i_pitch;
850 p_y2 += p_source->p[Y_PLANE].i_pitch;
852 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
854 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Pixel layout is UYVY, so the UYVY kernels are reused as-is. */
860 MMX_CALL( MMX_YUV420_UYVY );
863 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
868 p_y1 += i_source_margin;
869 p_y2 += i_source_margin;
870 p_u += i_source_margin_c;
871 p_v += i_source_margin_c;
872 p_line1 += i_dest_margin;
873 p_line2 += i_dest_margin;
876 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
877 /* re-enable FPU registers */
881 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
883 ** SSE2 128 bits fetch/store instructions are faster
884 ** if memory access is 16 bytes aligned
/* NOTE(review): unlike the C/MMX path above, the visible SSE2 loop advances
 * p_line2 forward (+= pitch) — confirm the elided lines perform the
 * backward stepping required for the vertical flip. */
886 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
887 ((intptr_t)p_line2|(intptr_t)p_y2))) )
889 /* use faster SSE2 aligned fetch and store */
890 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
893 p_line2 += p_dest->p->i_pitch;
896 p_y2 += p_source->p[Y_PLANE].i_pitch;
898 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
900 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
902 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
907 p_y1 += i_source_margin;
908 p_y2 += i_source_margin;
909 p_u += i_source_margin_c;
910 p_v += i_source_margin_c;
911 p_line1 += i_dest_margin;
912 p_line2 += i_dest_margin;
917 /* use slower SSE2 unaligned fetch and store */
918 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
921 p_line2 += p_dest->p->i_pitch;
924 p_y2 += p_source->p[Y_PLANE].i_pitch;
926 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
928 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
930 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
935 p_y1 += i_source_margin;
936 p_y2 += i_source_margin;
937 p_u += i_source_margin_c;
938 p_v += i_source_margin_c;
939 p_line1 += i_dest_margin;
940 p_line2 += i_dest_margin;
943 /* make sure all SSE2 stores are visible thereafter */
945 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
947 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
949 /*****************************************************************************
950 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
951 *****************************************************************************/
/* Plain-C-only converter to the Y211 packed format; two lines per pass as
 * above, with the inner loop consuming 8 luma pixels per step. */
952 #if defined (MODULE_NAME_IS_i420_yuy2)
953 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
956 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
957 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
958 uint8_t *p_u = p_source->U_PIXELS;
959 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding between visible pixels and the allocated pitch. */
963 const int i_source_margin = p_source->p[0].i_pitch
964 - p_source->p[0].i_visible_pitch;
965 const int i_source_margin_c = p_source->p[1].i_pitch
966 - p_source->p[1].i_visible_pitch;
967 const int i_dest_margin = p_dest->p->i_pitch
968 - p_dest->p->i_visible_pitch;
970 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
973 p_line2 += p_dest->p->i_pitch;
976 p_y2 += p_source->p[Y_PLANE].i_pitch;
978 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
/* Skip the pitch padding on every plane before the next line pair. */
984 p_y1 += i_source_margin;
985 p_y2 += i_source_margin;
986 p_u += i_source_margin_c;
987 p_v += i_source_margin_c;
988 p_line1 += i_dest_margin;
989 p_line2 += i_dest_margin;