1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
32 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
36 #include "i420_yuy2.h"
38 #define SRC_FOURCC "I420,IYUV,YV12"
/* Destination fourccs advertised by each build variant.  The AltiVec
 * build omits IUYV/cyuv (no vector implementation), and Y211 exists
 * only in the plain C build. */
40 #if defined (MODULE_NAME_IS_i420_yuy2)
41 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
42 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
43 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
50 /*****************************************************************************
51 * Local and extern prototypes.
52 *****************************************************************************/
53 static int Activate ( vlc_object_t * );
/* Chroma conversion entry points: each converts one planar I420/YV12
 * source picture into the named packed destination format.
 * Signature is (vout, source, dest). */
55 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
56 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
57 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
58 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* IUYV (interleaved UYVY) and cyuv (upside-down UYVY) have no AltiVec
 * implementation, hence excluded from that build. */
59 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
60 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
62 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Y211 (packed 2:1:1) is only offered by the plain C variant. */
63 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
66 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
67 /* Initialize MMX-specific constants */
/* NOTE(review): these masks are consumed by the MMX_* macros in
 * i420_yuy2.h (not visible here) — confirm exact usage there.
 * i_00ffw looks like a per-word low-byte mask; i_80w has 0x80 in the
 * low four bytes only. */
68 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
69 static const uint64_t i_80w = 0x0000000080808080ULL;
72 /*****************************************************************************
74  *****************************************************************************/
/* Module descriptor: every variant registers the same "chroma"
 * capability, differentiated by priority so the best available
 * implementation wins at probe time: SSE2 (120) > MMX/AltiVec (100,
 * gated by a CPU requirement) > plain C (80). */
76 #if defined (MODULE_NAME_IS_i420_yuy2)
77 set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
78 set_capability( "chroma", 80 );
79 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
80 set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
81 set_capability( "chroma", 100 );
82 add_requirement( MMX );
83 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
84 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
85 set_capability( "chroma", 120 );
86 add_requirement( SSE2 );
87 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
89 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
90 set_capability( "chroma", 100 );
91 add_requirement( ALTIVEC );
/* No deactivation needed: Activate only assigns a function pointer. */
93 set_callbacks( Activate, NULL );
96 /*****************************************************************************
97 * Activate: allocate a chroma function
98 *****************************************************************************
99 * This function allocates and initializes a chroma function
100 *****************************************************************************/
101 static int Activate( vlc_object_t *p_this )
/* Probe callback: checks that the frame dimensions are even (each
 * output macropixel covers 2 luma samples horizontally and the code
 * processes 2 lines at a time), then wires chroma.pf_convert to the
 * converter matching the requested output fourcc.  Returns an error
 * for odd dimensions or unsupported fourcc pairs (the failure-path
 * lines are not visible in this chunk). */
103 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* Reject odd width/height: 4:2:0 -> 4:2:2 conversion needs both even. */
105 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
110 switch( p_vout->render.i_chroma )
/* All three source fourccs share the same planar layout. */
112 case VLC_FOURCC('Y','V','1','2'):
113 case VLC_FOURCC('I','4','2','0'):
114 case VLC_FOURCC('I','Y','U','V'):
115 switch( p_vout->output.i_chroma )
117 case VLC_FOURCC('Y','U','Y','2'):
118 case VLC_FOURCC('Y','U','N','V'):
119 p_vout->chroma.pf_convert = I420_YUY2;
122 case VLC_FOURCC('Y','V','Y','U'):
123 p_vout->chroma.pf_convert = I420_YVYU;
126 case VLC_FOURCC('U','Y','V','Y'):
127 case VLC_FOURCC('U','Y','N','V'):
128 case VLC_FOURCC('Y','4','2','2'):
129 p_vout->chroma.pf_convert = I420_UYVY;
/* IUYV and cyuv converters are absent from the AltiVec build. */
131 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
132 case VLC_FOURCC('I','U','Y','V'):
133 p_vout->chroma.pf_convert = I420_IUYV;
136 case VLC_FOURCC('c','y','u','v'):
137 p_vout->chroma.pf_convert = I420_cyuv;
/* Y211 is only implemented in the plain C variant. */
141 #if defined (MODULE_NAME_IS_i420_yuy2)
142 case VLC_FOURCC('Y','2','1','1'):
143 p_vout->chroma.pf_convert = I420_Y211;
160 static inline unsigned long long read_cycles(void)
/* Read the x86 time-stamp counter via RDTSC.  The "=A" constraint
 * binds the EDX:EAX pair into v.
 * NOTE(review): "=A" only pairs edx:eax on 32-bit x86; on x86-64 it
 * does not capture the full 64-bit TSC — confirm this helper is
 * compiled for 32-bit targets only (return statement not visible in
 * this chunk). */
162 unsigned long long v;
163 __asm__ __volatile__("rdtsc" : "=A" (v): );
169 /* Following functions are local */
170 /*****************************************************************************
171 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
172 *****************************************************************************/
173 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
/* Convert one planar I420/YV12 picture into packed YUY2 (Y U Y V byte
 * order).  One chroma line covers two luma lines, so two destination
 * lines are produced per outer-loop iteration. */
176 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
177 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
178 uint8_t *p_u = p_source->U_PIXELS;
179 uint8_t *p_v = p_source->V_PIXELS;
183 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec path, 16 bytes per vec_ld/vec_st:
 *  - VEC_NEXT_LINES advances the destination and luma line pointers
 *    by one pitch (pairing lines two at a time);
 *  - VEC_LOAD_UV fetches 16 U and 16 V bytes;
 *  - VEC_MERGE(a) interleaves U with V (a = vec_mergeh for the first
 *    8 chroma pairs, vec_mergel for the last 8), then merges Y with
 *    the UV vector — Y in the even bytes, chroma in the odd bytes,
 *    i.e. YUYV order — storing 32 output bytes on each line. */
184 #define VEC_NEXT_LINES( ) \
186 p_line2 += p_dest->p->i_pitch; \
188 p_y2 += p_source->p[Y_PLANE].i_pitch;
190 #define VEC_LOAD_UV( ) \
191 u_vec = vec_ld( 0, p_u ); p_u += 16; \
192 v_vec = vec_ld( 0, p_v ); p_v += 16;
194 #define VEC_MERGE( a ) \
195 uv_vec = a( u_vec, v_vec ); \
196 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
197 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
198 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
199 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
200 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
201 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
203 vector unsigned char u_vec;
204 vector unsigned char v_vec;
205 vector unsigned char uv_vec;
206 vector unsigned char y_vec;
/* Fast vector paths require strict width/height multiples; anything
 * else falls through to the scalar C code below. */
208 if( !( ( p_vout->render.i_width % 32 ) |
209 ( p_vout->render.i_height % 2 ) ) )
211 /* Width is a multiple of 32, we take 2 lines at a time */
212 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
215 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
218 VEC_MERGE( vec_mergeh );
219 VEC_MERGE( vec_mergel );
223 else if( !( ( p_vout->render.i_width % 16 ) |
224 ( p_vout->render.i_height % 4 ) ) )
226 /* Width is only a multiple of 16, we take 4 lines at a time */
227 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
229 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
231 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
234 VEC_MERGE( vec_mergeh );
235 VEC_MERGE( vec_mergel );
238 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
240 VEC_MERGE( vec_mergeh );
242 /* Line 3 and 4, pixels 0 to 16 */
244 VEC_MERGE( vec_mergel );
246 /* Line 3 and 4, pixels 16 to ( width ) */
247 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
250 VEC_MERGE( vec_mergeh );
251 VEC_MERGE( vec_mergel );
257 /* Crap, use the C version */
258 #undef VEC_NEXT_LINES
/* Margin = pitch - visible pitch: padding bytes skipped at the end of
 * every row of each plane. */
263 const int i_source_margin = p_source->p[0].i_pitch
264 - p_source->p[0].i_visible_pitch;
265 const int i_source_margin_c = p_source->p[1].i_pitch
266 - p_source->p[1].i_visible_pitch;
267 const int i_dest_margin = p_dest->p->i_pitch
268 - p_dest->p->i_visible_pitch;
/* Non-SSE2 path: C or MMX inner loop, 8 pixels per MMX iteration,
 * remainder handled 2 pixels at a time (C macro bodies not visible
 * in this chunk). */
270 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
271 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
274 p_line2 += p_dest->p->i_pitch;
277 p_y2 += p_source->p[Y_PLANE].i_pitch;
279 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
280 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
288 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
290 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover (width % 8) pixels, two per iteration. */
293 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
298 p_y1 += i_source_margin;
299 p_y2 += i_source_margin;
300 p_u += i_source_margin_c;
301 p_v += i_source_margin_c;
302 p_line1 += i_dest_margin;
303 p_line2 += i_dest_margin;
306 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
307 /* re-enable FPU registers */
311 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
315 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
317 ** SSE2 128 bits fetch/store instructions are faster
318 ** if memory access is 16 bytes aligned
/* Use the aligned SSE2 kernel only when both pitches and both start
 * pointers are 16-byte aligned; otherwise the unaligned kernel. */
321 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
322 ((intptr_t)p_line2|(intptr_t)p_y2))) )
324 /* use faster SSE2 aligned fetch and store */
325 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
328 p_line2 += p_dest->p->i_pitch;
331 p_y2 += p_source->p[Y_PLANE].i_pitch;
333 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
335 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
337 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
342 p_y1 += i_source_margin;
343 p_y2 += i_source_margin;
344 p_u += i_source_margin_c;
345 p_v += i_source_margin_c;
346 p_line1 += i_dest_margin;
347 p_line2 += i_dest_margin;
352 /* use slower SSE2 unaligned fetch and store */
353 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
356 p_line2 += p_dest->p->i_pitch;
359 p_y2 += p_source->p[Y_PLANE].i_pitch;
361 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
363 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
365 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
370 p_y1 += i_source_margin;
371 p_y2 += i_source_margin;
372 p_u += i_source_margin_c;
373 p_v += i_source_margin_c;
374 p_line1 += i_dest_margin;
375 p_line2 += i_dest_margin;
378 /* make sure all SSE2 stores are visible thereafter */
381 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
384 /*****************************************************************************
385 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
386 *****************************************************************************/
387 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
/* Convert one planar I420/YV12 picture into packed YVYU (Y V Y U byte
 * order).  Identical structure to I420_YUY2; the only difference is
 * that V is merged before U (vu_vec) so chroma bytes land swapped. */
390 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
391 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
392 uint8_t *p_u = p_source->U_PIXELS;
393 uint8_t *p_v = p_source->V_PIXELS;
397 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec path: see I420_YUY2 for the macro roles; here VEC_MERGE
 * builds vu_vec = a(v_vec, u_vec), putting V before U in the packed
 * output (YVYU instead of YUYV). */
398 #define VEC_NEXT_LINES( ) \
400 p_line2 += p_dest->p->i_pitch; \
402 p_y2 += p_source->p[Y_PLANE].i_pitch;
404 #define VEC_LOAD_UV( ) \
405 u_vec = vec_ld( 0, p_u ); p_u += 16; \
406 v_vec = vec_ld( 0, p_v ); p_v += 16;
408 #define VEC_MERGE( a ) \
409 vu_vec = a( v_vec, u_vec ); \
410 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
411 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
412 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
413 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
414 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
415 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
417 vector unsigned char u_vec;
418 vector unsigned char v_vec;
419 vector unsigned char vu_vec;
420 vector unsigned char y_vec;
/* Vector fast paths require width multiples of 32 (2 lines/pass) or
 * 16 (4 lines/pass); otherwise fall through to scalar code. */
422 if( !( ( p_vout->render.i_width % 32 ) |
423 ( p_vout->render.i_height % 2 ) ) )
425 /* Width is a multiple of 32, we take 2 lines at a time */
426 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
429 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
432 VEC_MERGE( vec_mergeh );
433 VEC_MERGE( vec_mergel );
437 else if( !( ( p_vout->render.i_width % 16 ) |
438 ( p_vout->render.i_height % 4 ) ) )
440 /* Width is only a multiple of 16, we take 4 lines at a time */
441 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
443 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
445 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
448 VEC_MERGE( vec_mergeh );
449 VEC_MERGE( vec_mergel );
452 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
454 VEC_MERGE( vec_mergeh );
456 /* Line 3 and 4, pixels 0 to 16 */
458 VEC_MERGE( vec_mergel );
460 /* Line 3 and 4, pixels 16 to ( width ) */
461 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
464 VEC_MERGE( vec_mergeh );
465 VEC_MERGE( vec_mergel );
471 /* Crap, use the C version */
472 #undef VEC_NEXT_LINES
/* Per-row padding for each plane (pitch minus visible pitch). */
477 const int i_source_margin = p_source->p[0].i_pitch
478 - p_source->p[0].i_visible_pitch;
479 const int i_source_margin_c = p_source->p[1].i_pitch
480 - p_source->p[1].i_visible_pitch;
481 const int i_dest_margin = p_dest->p->i_pitch
482 - p_dest->p->i_visible_pitch;
484 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
485 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
488 p_line2 += p_dest->p->i_pitch;
491 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* 8 pixels per iteration (C macro bodies not visible in this chunk). */
493 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
495 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
501 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover (width % 8) pixels, two per iteration. */
504 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
509 p_y1 += i_source_margin;
510 p_y2 += i_source_margin;
511 p_u += i_source_margin_c;
512 p_v += i_source_margin_c;
513 p_line1 += i_dest_margin;
514 p_line2 += i_dest_margin;
517 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
518 /* re-enable FPU registers */
522 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
526 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
528 ** SSE2 128 bits fetch/store instructions are faster
529 ** if memory access is 16 bytes aligned
/* Aligned kernel only if pitches and start pointers are all 16-byte
 * aligned. */
531 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
532 ((intptr_t)p_line2|(intptr_t)p_y2))) )
534 /* use faster SSE2 aligned fetch and store */
535 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
538 p_line2 += p_dest->p->i_pitch;
541 p_y2 += p_source->p[Y_PLANE].i_pitch;
543 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
545 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
547 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
552 p_y1 += i_source_margin;
553 p_y2 += i_source_margin;
554 p_u += i_source_margin_c;
555 p_v += i_source_margin_c;
556 p_line1 += i_dest_margin;
557 p_line2 += i_dest_margin;
562 /* use slower SSE2 unaligned fetch and store */
563 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
566 p_line2 += p_dest->p->i_pitch;
569 p_y2 += p_source->p[Y_PLANE].i_pitch;
571 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
573 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
575 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
580 p_y1 += i_source_margin;
581 p_y2 += i_source_margin;
582 p_u += i_source_margin_c;
583 p_v += i_source_margin_c;
584 p_line1 += i_dest_margin;
585 p_line2 += i_dest_margin;
588 /* make sure all SSE2 stores are visible thereafter */
590 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
593 /*****************************************************************************
594 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
595 *****************************************************************************/
596 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
/* Convert one planar I420/YV12 picture into packed UYVY (U Y V Y byte
 * order).  Same structure as I420_YUY2, but in VEC_MERGE the UV
 * vector is passed FIRST to vec_mergeh/vec_mergel so chroma occupies
 * the even output bytes and luma the odd ones. */
599 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
600 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
601 uint8_t *p_u = p_source->U_PIXELS;
602 uint8_t *p_v = p_source->V_PIXELS;
606 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec path: see I420_YUY2 for macro roles; argument order in the
 * stores is (uv_vec, y_vec) => chroma-first UYVY layout. */
607 #define VEC_NEXT_LINES( ) \
609 p_line2 += p_dest->p->i_pitch; \
611 p_y2 += p_source->p[Y_PLANE].i_pitch;
613 #define VEC_LOAD_UV( ) \
614 u_vec = vec_ld( 0, p_u ); p_u += 16; \
615 v_vec = vec_ld( 0, p_v ); p_v += 16;
617 #define VEC_MERGE( a ) \
618 uv_vec = a( u_vec, v_vec ); \
619 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
620 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
621 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
622 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
623 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
624 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
626 vector unsigned char u_vec;
627 vector unsigned char v_vec;
628 vector unsigned char uv_vec;
629 vector unsigned char y_vec;
/* Vector fast paths: width % 32 (2 lines/pass) or width % 16
 * (4 lines/pass); otherwise scalar fallback. */
631 if( !( ( p_vout->render.i_width % 32 ) |
632 ( p_vout->render.i_height % 2 ) ) )
634 /* Width is a multiple of 32, we take 2 lines at a time */
635 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
638 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
641 VEC_MERGE( vec_mergeh );
642 VEC_MERGE( vec_mergel );
646 else if( !( ( p_vout->render.i_width % 16 ) |
647 ( p_vout->render.i_height % 4 ) ) )
649 /* Width is only a multiple of 16, we take 4 lines at a time */
650 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
652 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
654 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
657 VEC_MERGE( vec_mergeh );
658 VEC_MERGE( vec_mergel );
661 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
663 VEC_MERGE( vec_mergeh );
665 /* Line 3 and 4, pixels 0 to 16 */
667 VEC_MERGE( vec_mergel );
669 /* Line 3 and 4, pixels 16 to ( width ) */
670 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
673 VEC_MERGE( vec_mergeh );
674 VEC_MERGE( vec_mergel );
680 /* Crap, use the C version */
681 #undef VEC_NEXT_LINES
/* Per-row padding for each plane (pitch minus visible pitch). */
686 const int i_source_margin = p_source->p[0].i_pitch
687 - p_source->p[0].i_visible_pitch;
688 const int i_source_margin_c = p_source->p[1].i_pitch
689 - p_source->p[1].i_visible_pitch;
690 const int i_dest_margin = p_dest->p->i_pitch
691 - p_dest->p->i_visible_pitch;
693 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
694 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
697 p_line2 += p_dest->p->i_pitch;
700 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* 8 pixels per iteration (C macro bodies not visible in this chunk). */
702 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
704 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
710 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover (width % 8) pixels, two per iteration. */
713 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
718 p_y1 += i_source_margin;
719 p_y2 += i_source_margin;
720 p_u += i_source_margin_c;
721 p_v += i_source_margin_c;
722 p_line1 += i_dest_margin;
723 p_line2 += i_dest_margin;
726 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
727 /* re-enable FPU registers */
731 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
735 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
737 ** SSE2 128 bits fetch/store instructions are faster
738 ** if memory access is 16 bytes aligned
/* Aligned kernel only if pitches and start pointers are all 16-byte
 * aligned. */
740 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
741 ((intptr_t)p_line2|(intptr_t)p_y2))) )
743 /* use faster SSE2 aligned fetch and store */
744 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
747 p_line2 += p_dest->p->i_pitch;
750 p_y2 += p_source->p[Y_PLANE].i_pitch;
752 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
754 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
756 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
761 p_y1 += i_source_margin;
762 p_y2 += i_source_margin;
763 p_u += i_source_margin_c;
764 p_v += i_source_margin_c;
765 p_line1 += i_dest_margin;
766 p_line2 += i_dest_margin;
771 /* use slower SSE2 unaligned fetch and store */
772 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
775 p_line2 += p_dest->p->i_pitch;
778 p_y2 += p_source->p[Y_PLANE].i_pitch;
780 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
782 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
784 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
789 p_y1 += i_source_margin;
790 p_y2 += i_source_margin;
791 p_u += i_source_margin_c;
792 p_v += i_source_margin_c;
793 p_line1 += i_dest_margin;
794 p_line2 += i_dest_margin;
797 /* make sure all SSE2 stores are visible thereafter */
799 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
802 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
803 /*****************************************************************************
804 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
805 *****************************************************************************/
806 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
/* Stub: interleaved-UYVY output was never implemented; this only logs
 * an error.  Selecting IUYV output therefore produces no conversion. */
810 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
813 /*****************************************************************************
814 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
815 *****************************************************************************/
816 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
/* Convert one planar I420/YV12 picture into upside-down packed UYVY
 * ("cyuv"): destination pointers start one pitch past/at the bottom
 * of the picture and walk backwards (-= 3 pitches per pair, net -2
 * after the per-pair advance), while the source is read top-down. */
819 uint8_t *p_line1 = p_dest->p->p_pixels +
820 p_dest->p->i_visible_lines * p_dest->p->i_pitch
821 + p_dest->p->i_pitch;
822 uint8_t *p_line2 = p_dest->p->p_pixels +
823 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
824 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
825 uint8_t *p_u = p_source->U_PIXELS;
826 uint8_t *p_v = p_source->V_PIXELS;
/* Per-row padding for each plane (pitch minus visible pitch). */
830 const int i_source_margin = p_source->p[0].i_pitch
831 - p_source->p[0].i_visible_pitch;
832 const int i_source_margin_c = p_source->p[1].i_pitch
833 - p_source->p[1].i_visible_pitch;
834 const int i_dest_margin = p_dest->p->i_pitch
835 - p_dest->p->i_visible_pitch;
837 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
838 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* Step destination two lines UP (3 pitches back, then the stores /
 * margin adds move 1 pitch forward over the pair). */
840 p_line1 -= 3 * p_dest->p->i_pitch;
841 p_line2 -= 3 * p_dest->p->i_pitch;
844 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Pixel packing reuses the UYVY kernels — cyuv is UYVY flipped
 * vertically. */
846 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
848 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
854 MMX_CALL( MMX_YUV420_UYVY );
857 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
862 p_y1 += i_source_margin;
863 p_y2 += i_source_margin;
864 p_u += i_source_margin_c;
865 p_v += i_source_margin_c;
866 p_line1 += i_dest_margin;
867 p_line2 += i_dest_margin;
870 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
871 /* re-enable FPU registers */
875 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
877 ** SSE2 128 bits fetch/store instructions are faster
878 ** if memory access is 16 bytes aligned
880 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
881 ((intptr_t)p_line2|(intptr_t)p_y2))) )
883 /* use faster SSE2 aligned fetch and store */
884 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* NOTE(review): the visible SSE2 loop advances p_line2 FORWARD like
 * plain UYVY; the backwards stepping of the scalar path is not
 * visible here.  Verify against the full file that the SSE2 cyuv
 * path really produces an upside-down picture. */
887 p_line2 += p_dest->p->i_pitch;
890 p_y2 += p_source->p[Y_PLANE].i_pitch;
892 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
894 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
896 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
901 p_y1 += i_source_margin;
902 p_y2 += i_source_margin;
903 p_u += i_source_margin_c;
904 p_v += i_source_margin_c;
905 p_line1 += i_dest_margin;
906 p_line2 += i_dest_margin;
911 /* use slower SSE2 unaligned fetch and store */
912 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
915 p_line2 += p_dest->p->i_pitch;
918 p_y2 += p_source->p[Y_PLANE].i_pitch;
920 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
922 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
924 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
929 p_y1 += i_source_margin;
930 p_y2 += i_source_margin;
931 p_u += i_source_margin_c;
932 p_v += i_source_margin_c;
933 p_line1 += i_dest_margin;
934 p_line2 += i_dest_margin;
937 /* make sure all SSE2 stores are visible thereafter */
939 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
941 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
943 /*****************************************************************************
944 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
945 *****************************************************************************/
946 #if defined (MODULE_NAME_IS_i420_yuy2)
947 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
/* Convert one planar I420/YV12 picture into packed Y211 (2:1:1).
 * Plain C only.  The inner-loop packing statements are not visible in
 * this chunk; presumably luma and chroma are horizontally subsampled
 * to reach 2:1:1 — confirm in the full file. */
950 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
951 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
952 uint8_t *p_u = p_source->U_PIXELS;
953 uint8_t *p_v = p_source->V_PIXELS;
/* Per-row padding for each plane (pitch minus visible pitch). */
957 const int i_source_margin = p_source->p[0].i_pitch
958 - p_source->p[0].i_visible_pitch;
959 const int i_source_margin_c = p_source->p[1].i_pitch
960 - p_source->p[1].i_visible_pitch;
961 const int i_dest_margin = p_dest->p->i_pitch
962 - p_dest->p->i_visible_pitch;
/* Two lines per pass, 8 source pixels per inner iteration. */
964 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
967 p_line2 += p_dest->p->i_pitch;
970 p_y2 += p_source->p[Y_PLANE].i_pitch;
972 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
978 p_y1 += i_source_margin;
979 p_y2 += i_source_margin;
980 p_u += i_source_margin_c;
981 p_v += i_source_margin_c;
982 p_line1 += i_dest_margin;
983 p_line2 += i_dest_margin;