1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
29 #include <stdlib.h> /* malloc(), free() */
34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #include "i420_yuy2.h"
/* Comma-separated list of the input FOURCCs; pasted into the module
 * description strings together with DEST_FOURCC. */
40 #define SRC_FOURCC "I420,IYUV,YV12"
/* Output FOURCC list, chosen by the MODULE_NAME_IS_* build variant.
 * Only the plain C variant lists Y211; the AltiVec variant lists neither
 * IUYV nor cyuv. */
42 #if defined (MODULE_NAME_IS_i420_yuy2)
43 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
52 /*****************************************************************************
53 * Local and extern prototypes.
54 *****************************************************************************/
/* Module entry point, registered through set_callbacks(). */
55 static int Activate ( vlc_object_t * );
/* Conversion routines; Activate() stores one of them in
 * chroma.pf_convert according to the output chroma. */
57 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
/* The IUYV and cyuv converters are left out of the AltiVec build. */
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
/* The Y211 converter exists only in the plain C build. */
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
/* 64-bit constants for the MMX build.
 * NOTE(review): no use of them is visible in this section; presumably they
 * are referenced by the MMX macros from i420_yuy2.h -- confirm. */
68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
69 /* Initialize MMX-specific constants */
70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
71 static const uint64_t i_80w = 0x0000000080808080ULL;
74 /*****************************************************************************
76 *****************************************************************************/
/* Module descriptor entries. Each build variant sets its own description
 * string and registers the "chroma" capability with a different number
 * (80 for plain C, 100 for MMX and AltiVec, 120 for SSE2); the SIMD
 * variants additionally declare the CPU feature they require.
 * NOTE(review): the number is presumably the module's selection priority
 * -- confirm against the module API. The enclosing begin/end lines of the
 * descriptor are not visible in this section. */
78 #if defined (MODULE_NAME_IS_i420_yuy2)
79 set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
80 set_capability( "chroma", 80 );
81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
82 set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83 set_capability( "chroma", 100 );
84 add_requirement( MMX );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
86 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87 set_capability( "chroma", 120 );
88 add_requirement( SSE2 );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
91 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
92 set_capability( "chroma", 100 );
93 add_requirement( ALTIVEC );
/* Activate is the open callback; no close callback is registered. */
95 set_callbacks( Activate, NULL );
98 /*****************************************************************************
99 * Activate: allocate a chroma function
100 *****************************************************************************
101 * This function allocates and initializes a chroma function
102 *****************************************************************************/
/*
 * Activate: choose the conversion routine for a video output.
 *
 * p_this is cast to a vout_thread_t. Only the YV12, I420 and IYUV render
 * chromas have a case here; for those, the output chroma decides which
 * I420_* routine is stored in p_vout->chroma.pf_convert.
 * NOTE(review): the return statements, the break/default handling and the
 * braces are not visible in this section.
 */
103 static int Activate( vlc_object_t *p_this )
105 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* Odd widths or heights are tested first; the action taken is not visible
 * here (presumably the module refuses to load -- confirm). */
107 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
/* Outer switch: input (render) chroma. */
112 switch( p_vout->render.i_chroma )
114 case VLC_FOURCC('Y','V','1','2'):
115 case VLC_FOURCC('I','4','2','0'):
116 case VLC_FOURCC('I','Y','U','V'):
/* Inner switch: output chroma, grouped by byte layout. */
117 switch( p_vout->output.i_chroma )
119 case VLC_FOURCC('Y','U','Y','2'):
120 case VLC_FOURCC('Y','U','N','V'):
121 p_vout->chroma.pf_convert = I420_YUY2;
124 case VLC_FOURCC('Y','V','Y','U'):
125 p_vout->chroma.pf_convert = I420_YVYU;
128 case VLC_FOURCC('U','Y','V','Y'):
129 case VLC_FOURCC('U','Y','N','V'):
130 case VLC_FOURCC('Y','4','2','2'):
131 p_vout->chroma.pf_convert = I420_UYVY;
/* IUYV and cyuv are not offered by the AltiVec build. */
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134 case VLC_FOURCC('I','U','Y','V'):
135 p_vout->chroma.pf_convert = I420_IUYV;
138 case VLC_FOURCC('c','y','u','v'):
139 p_vout->chroma.pf_convert = I420_cyuv;
/* Y211 is offered by the plain C build only. */
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144 case VLC_FOURCC('Y','2','1','1'):
145 p_vout->chroma.pf_convert = I420_Y211;
161 /* Following functions are local */
163 /*****************************************************************************
164 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
165 *****************************************************************************/
/*
 * I420_YUY2 reads the Y, U and V planes of p_source and writes the packed
 * plane of p_dest, walking the picture two lines at a time (p_y1/p_y2 in,
 * p_line1/p_line2 out). render.i_width and render.i_height of p_vout set
 * the loop counts. Which implementation is compiled (AltiVec, SSE2, or
 * C/MMX) depends on the MODULE_NAME_IS_* macros.
 * NOTE(review): the tail of the parameter list, the i_x/i_y declarations,
 * the braces and the per-pixel C macros are not visible in this section.
 */
166 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
169 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
170 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
171 uint8_t *p_u = p_source->U_PIXELS;
172 uint8_t *p_v = p_source->V_PIXELS;
/* AltiVec build: helper macros for the vector path. */
176 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines (only part of the macro is visible). */
177 #define VEC_NEXT_LINES( ) \
179 p_line2 += p_dest->p->i_pitch; \
181 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load one vector from each chroma plane and advance both pointers by 16. */
183 #define VEC_LOAD_UV( ) \
184 u_vec = vec_ld( 0, p_u ); p_u += 16; \
185 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Combine U and V with the merge operation passed as `a`, then merge the
 * result with a Y vector from each of the two lines (Y as first operand).
 * Each output line receives two stores and advances by 32 bytes. */
187 #define VEC_MERGE( a ) \
188 uv_vec = a( u_vec, v_vec ); \
189 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
190 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
191 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
192 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
193 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
194 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
196 vector unsigned char u_vec;
197 vector unsigned char v_vec;
198 vector unsigned char uv_vec;
199 vector unsigned char y_vec;
/* First vector path: width divisible by 32 and height divisible by 2. */
201 if( !( ( p_vout->render.i_width % 32 ) |
202 ( p_vout->render.i_height % 2 ) ) )
204 /* Width is a multiple of 32, we take 2 lines at a time */
205 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
208 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
211 VEC_MERGE( vec_mergeh );
212 VEC_MERGE( vec_mergel );
/* Second vector path: width divisible by 16 and height divisible by 4.
 * The half-used chroma vector left over at the end of one line pair is
 * finished at the start of the next pair. */
216 else if( !( ( p_vout->render.i_width % 16 ) |
217 ( p_vout->render.i_height % 4 ) ) )
219 /* Width is only a multiple of 16, we take 4 lines at a time */
220 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
222 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
224 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
227 VEC_MERGE( vec_mergeh );
228 VEC_MERGE( vec_mergel );
231 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
233 VEC_MERGE( vec_mergeh );
235 /* Line 3 and 4, pixels 0 to 16 */
237 VEC_MERGE( vec_mergel );
239 /* Line 3 and 4, pixels 16 to ( width ) */
240 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
243 VEC_MERGE( vec_mergeh );
244 VEC_MERGE( vec_mergel );
250 /* Crap, use the C version */
251 #undef VEC_NEXT_LINES
/* Per-line padding (pitch minus visible pitch) for the luma plane, the
 * chroma planes and the destination; added to the pointers after each
 * pair of lines. Plane 1 is used for both chroma pointers. */
256 const int i_source_margin = p_source->p[0].i_pitch
257 - p_source->p[0].i_visible_pitch;
258 const int i_source_margin_c = p_source->p[1].i_pitch
259 - p_source->p[1].i_visible_pitch;
260 const int i_dest_margin = p_dest->p->i_pitch
261 - p_dest->p->i_visible_pitch;
/* Builds other than SSE2: one pass per pair of lines. */
263 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
264 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
267 p_line2 += p_dest->p->i_pitch;
270 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Non-MMX builds: width / 8 iterations (loop body not visible here). */
272 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
273 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
/* MMX build: same trip count, body is the MMX macro. */
281 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
283 MMX_CALL( MMX_YUV420_YUYV );
/* Remainder of the line: ( width % 8 ) / 2 iterations. */
286 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
291 p_y1 += i_source_margin;
292 p_y2 += i_source_margin;
293 p_u += i_source_margin_c;
294 p_v += i_source_margin_c;
295 p_line1 += i_dest_margin;
296 p_line2 += i_dest_margin;
/* The MMX build issues the x86 emms instruction once the loops are done. */
299 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
300 __asm__ __volatile__("emms" :: );
303 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
307 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* NOTE(review): the text below says "128 bytes"; given the 16-byte
 * alignment test that follows, 128 bits is presumably meant. */
309 ** SSE2 128 bytes fetch/store instructions are faster
310 ** if memory access is 16 bytes aligned
/* Aligned path is taken only when both pitches and both start pointers
 * have their low four bits clear.
 * NOTE(review): the pointers are cast to int; only the low bits survive
 * the mask, but a pointer-sized integer type would avoid truncation
 * warnings on 64-bit targets. */
312 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
313 ((int)p_line2|(int)p_y2))) )
315 /* use faster SSE2 aligned fetch and store */
316 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
319 p_line2 += p_dest->p->i_pitch;
322 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 16 iterations of the SSE2 macro, then the remainder. */
324 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
326 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
328 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
333 p_y1 += i_source_margin;
334 p_y2 += i_source_margin;
335 p_u += i_source_margin_c;
336 p_v += i_source_margin_c;
337 p_line1 += i_dest_margin;
338 p_line2 += i_dest_margin;
343 /* use slower SSE2 unaligned fetch and store */
344 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
347 p_line2 += p_dest->p->i_pitch;
350 p_y2 += p_source->p[Y_PLANE].i_pitch;
352 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
354 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
356 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
361 p_y1 += i_source_margin;
362 p_y2 += i_source_margin;
363 p_u += i_source_margin_c;
364 p_v += i_source_margin_c;
365 p_line1 += i_dest_margin;
366 p_line2 += i_dest_margin;
369 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
372 /*****************************************************************************
373 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
374 *****************************************************************************/
/*
 * I420_YVYU reads the Y, U and V planes of p_source and writes the packed
 * plane of p_dest two lines at a time. It mirrors I420_YUY2 except that
 * the chroma operands are swapped (V before U) in the AltiVec merge and
 * the *_YVYU macros are used in the MMX and SSE2 paths.
 * NOTE(review): the tail of the parameter list, the i_x/i_y declarations,
 * the braces and the per-pixel C macros are not visible in this section.
 */
375 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
378 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
379 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
380 uint8_t *p_u = p_source->U_PIXELS;
381 uint8_t *p_v = p_source->V_PIXELS;
/* AltiVec build: helper macros for the vector path. */
385 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines (only part of the macro is visible). */
386 #define VEC_NEXT_LINES( ) \
388 p_line2 += p_dest->p->i_pitch; \
390 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load one vector from each chroma plane and advance both pointers by 16. */
392 #define VEC_LOAD_UV( ) \
393 u_vec = vec_ld( 0, p_u ); p_u += 16; \
394 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Combine V and U (V as first operand) with the merge operation passed as
 * `a`, then merge the result with a Y vector from each of the two lines. */
396 #define VEC_MERGE( a ) \
397 vu_vec = a( v_vec, u_vec ); \
398 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
399 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
400 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
401 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
402 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
403 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
405 vector unsigned char u_vec;
406 vector unsigned char v_vec;
407 vector unsigned char vu_vec;
408 vector unsigned char y_vec;
/* First vector path: width divisible by 32 and height divisible by 2. */
410 if( !( ( p_vout->render.i_width % 32 ) |
411 ( p_vout->render.i_height % 2 ) ) )
413 /* Width is a multiple of 32, we take 2 lines at a time */
414 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
417 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
420 VEC_MERGE( vec_mergeh );
421 VEC_MERGE( vec_mergel );
/* Second vector path: width divisible by 16 and height divisible by 4. */
425 else if( !( ( p_vout->render.i_width % 16 ) |
426 ( p_vout->render.i_height % 4 ) ) )
428 /* Width is only a multiple of 16, we take 4 lines at a time */
429 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
431 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
433 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
436 VEC_MERGE( vec_mergeh );
437 VEC_MERGE( vec_mergel );
440 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
442 VEC_MERGE( vec_mergeh );
444 /* Line 3 and 4, pixels 0 to 16 */
446 VEC_MERGE( vec_mergel );
448 /* Line 3 and 4, pixels 16 to ( width ) */
449 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
452 VEC_MERGE( vec_mergeh );
453 VEC_MERGE( vec_mergel );
459 /* Crap, use the C version */
460 #undef VEC_NEXT_LINES
/* Per-line padding (pitch minus visible pitch) for luma, chroma and
 * destination; added to the pointers after each pair of lines. */
465 const int i_source_margin = p_source->p[0].i_pitch
466 - p_source->p[0].i_visible_pitch;
467 const int i_source_margin_c = p_source->p[1].i_pitch
468 - p_source->p[1].i_visible_pitch;
469 const int i_dest_margin = p_dest->p->i_pitch
470 - p_dest->p->i_visible_pitch;
/* Builds other than SSE2: one pass per pair of lines. */
472 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
473 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
476 p_line2 += p_dest->p->i_pitch;
479 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* NOTE(review): unlike I420_YUY2 and I420_UYVY, no remainder loop for
 * ( width % 8 ) is visible after this loop -- confirm whether widths that
 * are not a multiple of 8 are fully converted. */
481 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
483 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
489 MMX_CALL( MMX_YUV420_YVYU );
493 p_y1 += i_source_margin;
494 p_y2 += i_source_margin;
495 p_u += i_source_margin_c;
496 p_v += i_source_margin_c;
497 p_line1 += i_dest_margin;
498 p_line2 += i_dest_margin;
/* The MMX build issues the x86 emms instruction once the loops are done. */
501 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
502 __asm__ __volatile__("emms" :: );
505 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
509 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* NOTE(review): the text below says "128 bytes"; given the 16-byte
 * alignment test that follows, 128 bits is presumably meant. */
511 ** SSE2 128 bytes fetch/store instructions are faster
512 ** if memory access is 16 bytes aligned
/* Aligned path only when both pitches and both start pointers have their
 * low four bits clear.
 * NOTE(review): pointers are cast to int before masking; a pointer-sized
 * integer type would avoid truncation warnings on 64-bit targets. */
514 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
515 ((int)p_line2|(int)p_y2))) )
517 /* use faster SSE2 aligned fetch and store */
518 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
521 p_line2 += p_dest->p->i_pitch;
524 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 16 iterations of the SSE2 macro, then the remainder. */
526 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
528 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
530 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
535 p_y1 += i_source_margin;
536 p_y2 += i_source_margin;
537 p_u += i_source_margin_c;
538 p_v += i_source_margin_c;
539 p_line1 += i_dest_margin;
540 p_line2 += i_dest_margin;
545 /* use slower SSE2 unaligned fetch and store */
546 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
549 p_line2 += p_dest->p->i_pitch;
552 p_y2 += p_source->p[Y_PLANE].i_pitch;
554 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
556 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
558 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
563 p_y1 += i_source_margin;
564 p_y2 += i_source_margin;
565 p_u += i_source_margin_c;
566 p_v += i_source_margin_c;
567 p_line1 += i_dest_margin;
568 p_line2 += i_dest_margin;
571 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
574 /*****************************************************************************
575 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
576 *****************************************************************************/
/*
 * I420_UYVY reads the Y, U and V planes of p_source and writes the packed
 * plane of p_dest two lines at a time. It mirrors I420_YUY2 except that
 * the chroma vector is the first operand of the final AltiVec merges and
 * the *_UYVY macros are used in the MMX and SSE2 paths.
 * NOTE(review): the tail of the parameter list, the i_x/i_y declarations,
 * the braces and the per-pixel C macros are not visible in this section.
 */
577 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
580 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
581 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
582 uint8_t *p_u = p_source->U_PIXELS;
583 uint8_t *p_v = p_source->V_PIXELS;
/* AltiVec build: helper macros for the vector path. */
587 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines (only part of the macro is visible). */
588 #define VEC_NEXT_LINES( ) \
590 p_line2 += p_dest->p->i_pitch; \
592 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Load one vector from each chroma plane and advance both pointers by 16. */
594 #define VEC_LOAD_UV( ) \
595 u_vec = vec_ld( 0, p_u ); p_u += 16; \
596 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Combine U and V with the merge operation passed as `a`, then merge the
 * result (as first operand) with a Y vector from each of the two lines. */
598 #define VEC_MERGE( a ) \
599 uv_vec = a( u_vec, v_vec ); \
600 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
601 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
602 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
603 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
604 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
605 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
607 vector unsigned char u_vec;
608 vector unsigned char v_vec;
609 vector unsigned char uv_vec;
610 vector unsigned char y_vec;
/* First vector path: width divisible by 32 and height divisible by 2. */
612 if( !( ( p_vout->render.i_width % 32 ) |
613 ( p_vout->render.i_height % 2 ) ) )
615 /* Width is a multiple of 32, we take 2 lines at a time */
616 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
619 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
622 VEC_MERGE( vec_mergeh );
623 VEC_MERGE( vec_mergel );
/* Second vector path: width divisible by 16 and height divisible by 4. */
627 else if( !( ( p_vout->render.i_width % 16 ) |
628 ( p_vout->render.i_height % 4 ) ) )
630 /* Width is only a multiple of 16, we take 4 lines at a time */
631 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
633 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
635 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
638 VEC_MERGE( vec_mergeh );
639 VEC_MERGE( vec_mergel );
642 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
644 VEC_MERGE( vec_mergeh );
646 /* Line 3 and 4, pixels 0 to 16 */
648 VEC_MERGE( vec_mergel );
650 /* Line 3 and 4, pixels 16 to ( width ) */
651 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
654 VEC_MERGE( vec_mergeh );
655 VEC_MERGE( vec_mergel );
661 /* Crap, use the C version */
662 #undef VEC_NEXT_LINES
/* Per-line padding (pitch minus visible pitch) for luma, chroma and
 * destination; added to the pointers after each pair of lines. */
667 const int i_source_margin = p_source->p[0].i_pitch
668 - p_source->p[0].i_visible_pitch;
669 const int i_source_margin_c = p_source->p[1].i_pitch
670 - p_source->p[1].i_visible_pitch;
671 const int i_dest_margin = p_dest->p->i_pitch
672 - p_dest->p->i_visible_pitch;
/* Builds other than SSE2: one pass per pair of lines. */
674 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
675 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
678 p_line2 += p_dest->p->i_pitch;
681 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations; the MMX build uses the MMX macro as loop body. */
683 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
685 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
691 MMX_CALL( MMX_YUV420_UYVY );
/* Remainder of the line: ( width % 8 ) / 2 iterations. */
694 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
699 p_y1 += i_source_margin;
700 p_y2 += i_source_margin;
701 p_u += i_source_margin_c;
702 p_v += i_source_margin_c;
703 p_line1 += i_dest_margin;
704 p_line2 += i_dest_margin;
/* The MMX build issues the x86 emms instruction once the loops are done. */
707 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
708 __asm__ __volatile__("emms" :: );
711 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
715 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* NOTE(review): the text below says "128 bytes"; given the 16-byte
 * alignment test that follows, 128 bits is presumably meant. */
717 ** SSE2 128 bytes fetch/store instructions are faster
718 ** if memory access is 16 bytes aligned
/* Aligned path only when both pitches and both start pointers have their
 * low four bits clear.
 * NOTE(review): pointers are cast to int before masking; a pointer-sized
 * integer type would avoid truncation warnings on 64-bit targets. */
720 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
721 ((int)p_line2|(int)p_y2))) )
723 /* use faster SSE2 aligned fetch and store */
724 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
727 p_line2 += p_dest->p->i_pitch;
730 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 16 iterations of the SSE2 macro, then the remainder. */
732 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
734 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
736 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
741 p_y1 += i_source_margin;
742 p_y2 += i_source_margin;
743 p_u += i_source_margin_c;
744 p_v += i_source_margin_c;
745 p_line1 += i_dest_margin;
746 p_line2 += i_dest_margin;
751 /* use slower SSE2 unaligned fetch and store */
752 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
755 p_line2 += p_dest->p->i_pitch;
758 p_y2 += p_source->p[Y_PLANE].i_pitch;
760 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
762 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
764 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
769 p_y1 += i_source_margin;
770 p_y2 += i_source_margin;
771 p_u += i_source_margin_c;
772 p_v += i_source_margin_c;
773 p_line1 += i_dest_margin;
774 p_line2 += i_dest_margin;
777 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
780 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
781 /*****************************************************************************
782 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
783 *****************************************************************************/
/*
 * I420_IUYV is a placeholder: the only statement visible here logs an
 * error through msg_Err() saying the conversion is unimplemented.
 * NOTE(review): the rest of the parameter list and the braces are not
 * visible in this section.
 */
784 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
788 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
791 /*****************************************************************************
792 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
793 *****************************************************************************/
/*
 * I420_cyuv reads the Y, U and V planes of p_source and writes the packed
 * plane of p_dest using the same *_UYVY macros as I420_UYVY. The
 * destination pointers start beyond the last visible line and, in the
 * C/MMX path, are moved back by three pitches before each pair of lines.
 * NOTE(review): the tail of the parameter list, the i_x/i_y declarations,
 * the braces and the per-pixel C macros are not visible in this section.
 */
794 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
/* p_line1 starts one pitch beyond p_line2; p_line2 starts at
 * p_pixels + i_visible_lines * i_pitch. */
797 uint8_t *p_line1 = p_dest->p->p_pixels +
798 p_dest->p->i_visible_lines * p_dest->p->i_pitch
799 + p_dest->p->i_pitch;
800 uint8_t *p_line2 = p_dest->p->p_pixels +
801 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
802 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
803 uint8_t *p_u = p_source->U_PIXELS;
804 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding (pitch minus visible pitch) for luma, chroma and
 * destination; added to the pointers after each pair of lines. */
808 const int i_source_margin = p_source->p[0].i_pitch
809 - p_source->p[0].i_visible_pitch;
810 const int i_source_margin_c = p_source->p[1].i_pitch
811 - p_source->p[1].i_visible_pitch;
812 const int i_dest_margin = p_dest->p->i_pitch
813 - p_dest->p->i_visible_pitch;
/* Builds other than SSE2: one pass per pair of lines, with the output
 * pointers stepped backwards.
 * NOTE(review): confirm that the first and last destination lines touched
 * by this stepping stay inside the destination plane. */
815 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
816 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
818 p_line1 -= 3 * p_dest->p->i_pitch;
819 p_line2 -= 3 * p_dest->p->i_pitch;
822 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations; no remainder loop is visible in this path. */
824 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
826 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
832 MMX_CALL( MMX_YUV420_UYVY );
836 p_y1 += i_source_margin;
837 p_y2 += i_source_margin;
838 p_u += i_source_margin_c;
839 p_v += i_source_margin_c;
840 p_line1 += i_dest_margin;
841 p_line2 += i_dest_margin;
/* The MMX build issues the x86 emms instruction once the loops are done. */
844 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
845 __asm__ __volatile__("emms" :: );
848 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* NOTE(review): the text below says "128 bytes"; given the 16-byte
 * alignment test that follows, 128 bits is presumably meant. */
850 ** SSE2 128 bytes fetch/store instructions are faster
851 ** if memory access is 16 bytes aligned
/* Aligned path only when both pitches and both start pointers have their
 * low four bits clear.
 * NOTE(review): pointers are cast to int before masking; a pointer-sized
 * integer type would avoid truncation warnings on 64-bit targets. */
853 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
854 ((int)p_line2|(int)p_y2))) )
856 /* use faster SSE2 aligned fetch and store */
857 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* NOTE(review): here p_line2 moves forward by one pitch, whereas the
 * C/MMX path above moves back by three. Since p_line2 was initialised
 * past the last visible line, this branch looks like it writes beyond
 * the destination plane and does not flip the picture -- confirm and
 * align with the C/MMX path. */
860 p_line2 += p_dest->p->i_pitch;
863 p_y2 += p_source->p[Y_PLANE].i_pitch;
865 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
867 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
869 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
874 p_y1 += i_source_margin;
875 p_y2 += i_source_margin;
876 p_u += i_source_margin_c;
877 p_v += i_source_margin_c;
878 p_line1 += i_dest_margin;
879 p_line2 += i_dest_margin;
884 /* use slower SSE2 unaligned fetch and store */
885 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* NOTE(review): same forward stepping as in the aligned branch. */
888 p_line2 += p_dest->p->i_pitch;
891 p_y2 += p_source->p[Y_PLANE].i_pitch;
893 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
895 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
897 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
902 p_y1 += i_source_margin;
903 p_y2 += i_source_margin;
904 p_u += i_source_margin_c;
905 p_v += i_source_margin_c;
906 p_line1 += i_dest_margin;
907 p_line2 += i_dest_margin;
910 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
912 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
914 /*****************************************************************************
915 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
916 *****************************************************************************/
917 #if defined (MODULE_NAME_IS_i420_yuy2)
/*
 * I420_Y211 reads the Y, U and V planes of p_source and writes the packed
 * plane of p_dest two lines at a time. It is compiled for the plain C
 * build only and has no SIMD variants.
 * NOTE(review): the tail of the parameter list, the i_x/i_y declarations,
 * the braces and the per-pixel conversion macro are not visible in this
 * section.
 */
918 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
921 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
922 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
923 uint8_t *p_u = p_source->U_PIXELS;
924 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding (pitch minus visible pitch) for luma, chroma and
 * destination; added to the pointers after each pair of lines. */
928 const int i_source_margin = p_source->p[0].i_pitch
929 - p_source->p[0].i_visible_pitch;
930 const int i_source_margin_c = p_source->p[1].i_pitch
931 - p_source->p[1].i_visible_pitch;
932 const int i_dest_margin = p_dest->p->i_pitch
933 - p_dest->p->i_visible_pitch;
/* One pass per pair of lines. */
935 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
938 p_line2 += p_dest->p->i_pitch;
941 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations; no remainder loop is visible, so widths that are
 * not a multiple of 8 may leave pixels unconverted -- NOTE(review). */
943 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
949 p_y1 += i_source_margin;
950 p_y2 += i_source_margin;
951 p_u += i_source_margin_c;
952 p_v += i_source_margin_c;
953 p_line1 += i_dest_margin;
954 p_line2 += i_dest_margin;