1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
36 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
40 #include "i420_yuy2.h"
42 #define SRC_FOURCC "I420,IYUV,YV12"
44 #if defined (MODULE_NAME_IS_i420_yuy2)
45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
51 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 /*****************************************************************************
55 * Local and extern prototypes.
56 *****************************************************************************/
57 static int Activate ( vlc_object_t * );
59 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
60 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
61 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
62 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
63 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
64 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
66 #if defined (MODULE_NAME_IS_i420_yuy2)
67 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
70 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
71 /* Initialize MMX-specific constants */
72 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
73 static const uint64_t i_80w = 0x0000000080808080ULL;
76 /*****************************************************************************
78 *****************************************************************************/
80 #if defined (MODULE_NAME_IS_i420_yuy2)
81 set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
82 set_capability( "chroma", 80 );
83 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
84 set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
85 set_capability( "chroma", 100 );
86 add_requirement( MMX );
87 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
88 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
89 set_capability( "chroma", 120 );
90 add_requirement( SSE2 );
91 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
93 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
94 set_capability( "chroma", 100 );
95 add_requirement( ALTIVEC );
97 set_callbacks( Activate, NULL );
100 /*****************************************************************************
101 * Activate: allocate a chroma function
102 *****************************************************************************
103 * This function allocates and initializes a chroma function
104 *****************************************************************************/
/* Activate: pick the conversion routine for the negotiated chroma pair.
 * Maps a planar YV12/I420/IYUV source to one of the packed 4:2:2 converters
 * below by storing a function pointer in p_vout->chroma.pf_convert.
 * Odd dimensions are rejected up front: every converter in this file walks
 * the image two pixels across and two lines down at a time.
 * NOTE(review): this chunk has lines stripped (braces, break/return paths
 * are not visible); comments describe only the visible dispatch logic. */
105 static int Activate( vlc_object_t *p_this )
107 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* 4:2:0 subsampling needs even width and height. */
109 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
114 switch( p_vout->render.i_chroma )
/* All three FOURCCs are planar 4:2:0 layouts handled identically here. */
116 case VLC_FOURCC('Y','V','1','2'):
117 case VLC_FOURCC('I','4','2','0'):
118 case VLC_FOURCC('I','Y','U','V'):
119 switch( p_vout->output.i_chroma )
121 case VLC_FOURCC('Y','U','Y','2'):
122 case VLC_FOURCC('Y','U','N','V'):
123 p_vout->chroma.pf_convert = I420_YUY2;
126 case VLC_FOURCC('Y','V','Y','U'):
127 p_vout->chroma.pf_convert = I420_YVYU;
130 case VLC_FOURCC('U','Y','V','Y'):
131 case VLC_FOURCC('U','Y','N','V'):
132 case VLC_FOURCC('Y','4','2','2'):
133 p_vout->chroma.pf_convert = I420_UYVY;
/* IUYV and cyuv have no AltiVec implementation in this file. */
135 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
136 case VLC_FOURCC('I','U','Y','V'):
137 p_vout->chroma.pf_convert = I420_IUYV;
140 case VLC_FOURCC('c','y','u','v'):
141 p_vout->chroma.pf_convert = I420_cyuv;
/* Y211 exists only in the plain C build of this module. */
145 #if defined (MODULE_NAME_IS_i420_yuy2)
146 case VLC_FOURCC('Y','2','1','1'):
147 p_vout->chroma.pf_convert = I420_Y211;
/* read_cycles: read the x86 time-stamp counter via RDTSC (profiling aid).
 * The "=A" constraint binds the edx:eax register pair, which is only
 * correct on 32-bit x86.
 * NOTE(review): on x86-64, "=A" does not concatenate the two 32-bit
 * halves of the TSC — confirm this helper is used from 32-bit builds only. */
164 static inline unsigned long long read_cycles(void)
166 unsigned long long v;
167 __asm__ __volatile__("rdtsc" : "=A" (v): );
173 /* Following functions are local */
174 /*****************************************************************************
175 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
176 *****************************************************************************/
/* I420_YUY2: planar YUV 4:2:0 -> packed YUYV 4:2:2 (byte order Y U Y V).
 * Each pass emits two destination lines (p_line1/p_line2) from two luma
 * lines (p_y1/p_y2) sharing one chroma line (p_u/p_v), as required by
 * 4:2:0 vertical chroma subsampling.  One of four code paths is compiled
 * in: AltiVec, MMX, SSE2, or plain C. */
177 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
180 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
181 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
182 uint8_t *p_u = p_source->U_PIXELS;
183 uint8_t *p_v = p_source->V_PIXELS;
/* ---- AltiVec path: 16-byte vector loads/stores via helper macros. ---- */
187 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers past the two lines just written. */
188 #define VEC_NEXT_LINES( ) \
190 p_line2 += p_dest->p->i_pitch; \
192 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V samples (enough chroma for 32 output pixels). */
194 #define VEC_LOAD_UV( ) \
195 u_vec = vec_ld( 0, p_u ); p_u += 16; \
196 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Interleave U with V, then luma with chroma, for both lines; the
 * argument selects vec_mergeh (low half) or vec_mergel (high half). */
198 #define VEC_MERGE( a ) \
199 uv_vec = a( u_vec, v_vec ); \
200 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
201 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
202 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
203 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
204 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
205 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
207 vector unsigned char u_vec;
208 vector unsigned char v_vec;
209 vector unsigned char uv_vec;
210 vector unsigned char y_vec;
212 if( !( ( p_vout->render.i_width % 32 ) |
213 ( p_vout->render.i_height % 2 ) ) )
215 /* Width is a multiple of 32, we take 2 lines at a time */
216 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
219 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
222 VEC_MERGE( vec_mergeh );
223 VEC_MERGE( vec_mergel );
227 else if( !( ( p_vout->render.i_width % 16 ) |
228 ( p_vout->render.i_height % 4 ) ) )
230 /* Width is only a multiple of 16, we take 4 lines at a time */
231 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
233 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
235 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
238 VEC_MERGE( vec_mergeh );
239 VEC_MERGE( vec_mergel );
242 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
244 VEC_MERGE( vec_mergeh );
246 /* Line 3 and 4, pixels 0 to 16 */
248 VEC_MERGE( vec_mergel );
250 /* Line 3 and 4, pixels 16 to ( width ) */
251 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
254 VEC_MERGE( vec_mergeh );
255 VEC_MERGE( vec_mergel );
/* Width not a multiple of 16: fall back to the scalar loop below. */
261 /* Crap, use the C version */
262 #undef VEC_NEXT_LINES
/* Per-line padding = pitch minus visible pitch, for source luma,
 * source chroma and destination respectively; added after each row. */
267 const int i_source_margin = p_source->p[0].i_pitch
268 - p_source->p[0].i_visible_pitch;
269 const int i_source_margin_c = p_source->p[1].i_pitch
270 - p_source->p[1].i_visible_pitch;
271 const int i_dest_margin = p_dest->p->i_pitch
272 - p_dest->p->i_visible_pitch;
/* ---- C / MMX path: 8 pixels per iteration, leftovers 2 at a time. ---- */
274 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
275 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
278 p_line2 += p_dest->p->i_pitch;
281 p_y2 += p_source->p[Y_PLANE].i_pitch;
283 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
284 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
292 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
294 MMX_CALL( MMX_YUV420_YUYV );
/* Remaining (width % 8) pixels handled two at a time. */
297 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
302 p_y1 += i_source_margin;
303 p_y2 += i_source_margin;
304 p_u += i_source_margin_c;
305 p_v += i_source_margin_c;
306 p_line1 += i_dest_margin;
307 p_line2 += i_dest_margin;
/* MMX clobbers the x87 register file; emms is needed before FPU use. */
310 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
311 /* re-enable FPU registers */
315 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* ---- SSE2 path: 16 pixels per iteration, aligned when possible. ---- */
319 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
321 ** SSE2 128 bits fetch/store instructions are faster
322 ** if memory access is 16 bytes aligned
/* Aligned variant only if both pitches and both base pointers are
 * 16-byte aligned (single combined test via OR of the low bits). */
325 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
326 ((intptr_t)p_line2|(intptr_t)p_y2))) )
328 /* use faster SSE2 aligned fetch and store */
329 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
332 p_line2 += p_dest->p->i_pitch;
335 p_y2 += p_source->p[Y_PLANE].i_pitch;
337 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
339 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
341 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
346 p_y1 += i_source_margin;
347 p_y2 += i_source_margin;
348 p_u += i_source_margin_c;
349 p_v += i_source_margin_c;
350 p_line1 += i_dest_margin;
351 p_line2 += i_dest_margin;
356 /* use slower SSE2 unaligned fetch and store */
357 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
360 p_line2 += p_dest->p->i_pitch;
363 p_y2 += p_source->p[Y_PLANE].i_pitch;
365 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
367 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
369 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
374 p_y1 += i_source_margin;
375 p_y2 += i_source_margin;
376 p_u += i_source_margin_c;
377 p_v += i_source_margin_c;
378 p_line1 += i_dest_margin;
379 p_line2 += i_dest_margin;
382 /* make sure all SSE2 stores are visible thereafter */
385 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
388 /*****************************************************************************
389 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
390 *****************************************************************************/
/* I420_YVYU: planar YUV 4:2:0 -> packed YVYU 4:2:2 (byte order Y V Y U).
 * Identical structure to I420_YUY2 above; the only difference is that the
 * chroma pair is merged V-first (vu_vec = merge(v, u)), which swaps the
 * U and V byte positions in the output. */
391 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
394 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
395 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
396 uint8_t *p_u = p_source->U_PIXELS;
397 uint8_t *p_v = p_source->V_PIXELS;
/* ---- AltiVec path ---- */
401 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
402 #define VEC_NEXT_LINES( ) \
404 p_line2 += p_dest->p->i_pitch; \
406 p_y2 += p_source->p[Y_PLANE].i_pitch;
408 #define VEC_LOAD_UV( ) \
409 u_vec = vec_ld( 0, p_u ); p_u += 16; \
410 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* V merged before U: produces Y V Y U ordering in the stores below. */
412 #define VEC_MERGE( a ) \
413 vu_vec = a( v_vec, u_vec ); \
414 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
415 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
416 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
417 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
418 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
419 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
421 vector unsigned char u_vec;
422 vector unsigned char v_vec;
423 vector unsigned char vu_vec;
424 vector unsigned char y_vec;
426 if( !( ( p_vout->render.i_width % 32 ) |
427 ( p_vout->render.i_height % 2 ) ) )
429 /* Width is a multiple of 32, we take 2 lines at a time */
430 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
433 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
436 VEC_MERGE( vec_mergeh );
437 VEC_MERGE( vec_mergel );
441 else if( !( ( p_vout->render.i_width % 16 ) |
442 ( p_vout->render.i_height % 4 ) ) )
444 /* Width is only a multiple of 16, we take 4 lines at a time */
445 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
447 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
449 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
452 VEC_MERGE( vec_mergeh );
453 VEC_MERGE( vec_mergel );
456 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
458 VEC_MERGE( vec_mergeh );
460 /* Line 3 and 4, pixels 0 to 16 */
462 VEC_MERGE( vec_mergel );
464 /* Line 3 and 4, pixels 16 to ( width ) */
465 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
468 VEC_MERGE( vec_mergeh );
469 VEC_MERGE( vec_mergel );
/* Width not a multiple of 16: fall back to the scalar loop below. */
475 /* Crap, use the C version */
476 #undef VEC_NEXT_LINES
/* Per-line padding (pitch - visible pitch) for luma, chroma, dest. */
481 const int i_source_margin = p_source->p[0].i_pitch
482 - p_source->p[0].i_visible_pitch;
483 const int i_source_margin_c = p_source->p[1].i_pitch
484 - p_source->p[1].i_visible_pitch;
485 const int i_dest_margin = p_dest->p->i_pitch
486 - p_dest->p->i_visible_pitch;
/* ---- C / MMX path: 8 pixels per iteration. ---- */
488 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
489 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
492 p_line2 += p_dest->p->i_pitch;
495 p_y2 += p_source->p[Y_PLANE].i_pitch;
497 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
499 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
505 MMX_CALL( MMX_YUV420_YVYU );
/* Remaining (width % 8) pixels handled two at a time. */
508 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
513 p_y1 += i_source_margin;
514 p_y2 += i_source_margin;
515 p_u += i_source_margin_c;
516 p_v += i_source_margin_c;
517 p_line1 += i_dest_margin;
518 p_line2 += i_dest_margin;
/* MMX clobbers the x87 register file; emms is needed before FPU use. */
521 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
522 /* re-enable FPU registers */
526 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* ---- SSE2 path: 16 pixels per iteration, aligned when possible. ---- */
530 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
532 ** SSE2 128 bits fetch/store instructions are faster
533 ** if memory access is 16 bytes aligned
535 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
536 ((intptr_t)p_line2|(intptr_t)p_y2))) )
538 /* use faster SSE2 aligned fetch and store */
539 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
542 p_line2 += p_dest->p->i_pitch;
545 p_y2 += p_source->p[Y_PLANE].i_pitch;
547 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
549 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
551 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
556 p_y1 += i_source_margin;
557 p_y2 += i_source_margin;
558 p_u += i_source_margin_c;
559 p_v += i_source_margin_c;
560 p_line1 += i_dest_margin;
561 p_line2 += i_dest_margin;
566 /* use slower SSE2 unaligned fetch and store */
567 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
570 p_line2 += p_dest->p->i_pitch;
573 p_y2 += p_source->p[Y_PLANE].i_pitch;
575 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
577 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
579 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
584 p_y1 += i_source_margin;
585 p_y2 += i_source_margin;
586 p_u += i_source_margin_c;
587 p_v += i_source_margin_c;
588 p_line1 += i_dest_margin;
589 p_line2 += i_dest_margin;
592 /* make sure all SSE2 stores are visible thereafter */
594 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
597 /*****************************************************************************
598 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
599 *****************************************************************************/
/* I420_UYVY: planar YUV 4:2:0 -> packed UYVY 4:2:2 (byte order U Y V Y).
 * Same skeleton as I420_YUY2; here the merge order puts the chroma vector
 * first (mergeh(uv_vec, y_vec)), so chroma bytes lead each pixel pair. */
600 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
603 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
604 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
605 uint8_t *p_u = p_source->U_PIXELS;
606 uint8_t *p_v = p_source->V_PIXELS;
/* ---- AltiVec path ---- */
610 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
611 #define VEC_NEXT_LINES( ) \
613 p_line2 += p_dest->p->i_pitch; \
615 p_y2 += p_source->p[Y_PLANE].i_pitch;
617 #define VEC_LOAD_UV( ) \
618 u_vec = vec_ld( 0, p_u ); p_u += 16; \
619 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Chroma vector passed first to the merge: yields U Y V Y ordering. */
621 #define VEC_MERGE( a ) \
622 uv_vec = a( u_vec, v_vec ); \
623 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
624 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
625 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
626 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
627 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
628 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
630 vector unsigned char u_vec;
631 vector unsigned char v_vec;
632 vector unsigned char uv_vec;
633 vector unsigned char y_vec;
635 if( !( ( p_vout->render.i_width % 32 ) |
636 ( p_vout->render.i_height % 2 ) ) )
638 /* Width is a multiple of 32, we take 2 lines at a time */
639 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
642 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
645 VEC_MERGE( vec_mergeh );
646 VEC_MERGE( vec_mergel );
650 else if( !( ( p_vout->render.i_width % 16 ) |
651 ( p_vout->render.i_height % 4 ) ) )
653 /* Width is only a multiple of 16, we take 4 lines at a time */
654 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
656 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
658 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
661 VEC_MERGE( vec_mergeh );
662 VEC_MERGE( vec_mergel );
665 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
667 VEC_MERGE( vec_mergeh );
669 /* Line 3 and 4, pixels 0 to 16 */
671 VEC_MERGE( vec_mergel );
673 /* Line 3 and 4, pixels 16 to ( width ) */
674 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
677 VEC_MERGE( vec_mergeh );
678 VEC_MERGE( vec_mergel );
/* Width not a multiple of 16: fall back to the scalar loop below. */
684 /* Crap, use the C version */
685 #undef VEC_NEXT_LINES
/* Per-line padding (pitch - visible pitch) for luma, chroma, dest. */
690 const int i_source_margin = p_source->p[0].i_pitch
691 - p_source->p[0].i_visible_pitch;
692 const int i_source_margin_c = p_source->p[1].i_pitch
693 - p_source->p[1].i_visible_pitch;
694 const int i_dest_margin = p_dest->p->i_pitch
695 - p_dest->p->i_visible_pitch;
/* ---- C / MMX path: 8 pixels per iteration. ---- */
697 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
698 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
701 p_line2 += p_dest->p->i_pitch;
704 p_y2 += p_source->p[Y_PLANE].i_pitch;
706 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
708 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
714 MMX_CALL( MMX_YUV420_UYVY );
/* Remaining (width % 8) pixels handled two at a time. */
717 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
722 p_y1 += i_source_margin;
723 p_y2 += i_source_margin;
724 p_u += i_source_margin_c;
725 p_v += i_source_margin_c;
726 p_line1 += i_dest_margin;
727 p_line2 += i_dest_margin;
/* MMX clobbers the x87 register file; emms is needed before FPU use. */
730 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
731 /* re-enable FPU registers */
735 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* ---- SSE2 path: 16 pixels per iteration, aligned when possible. ---- */
739 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
741 ** SSE2 128 bits fetch/store instructions are faster
742 ** if memory access is 16 bytes aligned
744 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
745 ((intptr_t)p_line2|(intptr_t)p_y2))) )
747 /* use faster SSE2 aligned fetch and store */
748 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
751 p_line2 += p_dest->p->i_pitch;
754 p_y2 += p_source->p[Y_PLANE].i_pitch;
756 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
758 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
760 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
765 p_y1 += i_source_margin;
766 p_y2 += i_source_margin;
767 p_u += i_source_margin_c;
768 p_v += i_source_margin_c;
769 p_line1 += i_dest_margin;
770 p_line2 += i_dest_margin;
775 /* use slower SSE2 unaligned fetch and store */
776 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
779 p_line2 += p_dest->p->i_pitch;
782 p_y2 += p_source->p[Y_PLANE].i_pitch;
784 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
786 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
788 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
793 p_y1 += i_source_margin;
794 p_y2 += i_source_margin;
795 p_u += i_source_margin_c;
796 p_v += i_source_margin_c;
797 p_line1 += i_dest_margin;
798 p_line2 += i_dest_margin;
801 /* make sure all SSE2 stores are visible thereafter */
803 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
806 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
807 /*****************************************************************************
808 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
809 *****************************************************************************/
/* I420_IUYV: stub — interleaved UYVY output is not implemented.
 * Logs an error and returns without touching the destination picture. */
810 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
813 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
815 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
818 /*****************************************************************************
819 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
820 *****************************************************************************/
/* I420_cyuv: planar YUV 4:2:0 -> upside-down packed UYVY 4:2:2.
 * The destination pointers start at the BOTTOM of the destination buffer
 * (visible_lines * pitch) and the C/MMX loop rewinds them by 3 pitches
 * after writing each pair of lines, so the image is written bottom-up
 * while the source is read top-down. */
821 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
824 uint8_t *p_line1 = p_dest->p->p_pixels +
825 p_dest->p->i_visible_lines * p_dest->p->i_pitch
826 + p_dest->p->i_pitch;
827 uint8_t *p_line2 = p_dest->p->p_pixels +
828 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
829 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
830 uint8_t *p_u = p_source->U_PIXELS;
831 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding (pitch - visible pitch) for luma, chroma, dest. */
835 const int i_source_margin = p_source->p[0].i_pitch
836 - p_source->p[0].i_visible_pitch;
837 const int i_source_margin_c = p_source->p[1].i_pitch
838 - p_source->p[1].i_visible_pitch;
839 const int i_dest_margin = p_dest->p->i_pitch
840 - p_dest->p->i_visible_pitch;
/* ---- C / MMX path: step the destination BACK two lines per pass. ---- */
842 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
843 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* Net -2 lines per iteration: -3 pitches here, +1 while writing. */
845 p_line1 -= 3 * p_dest->p->i_pitch;
846 p_line2 -= 3 * p_dest->p->i_pitch;
849 p_y2 += p_source->p[Y_PLANE].i_pitch;
851 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
853 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Pixel packing is plain UYVY; only the line order is reversed. */
859 MMX_CALL( MMX_YUV420_UYVY );
862 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
867 p_y1 += i_source_margin;
868 p_y2 += i_source_margin;
869 p_u += i_source_margin_c;
870 p_v += i_source_margin_c;
871 p_line1 += i_dest_margin;
872 p_line2 += i_dest_margin;
/* MMX clobbers the x87 register file; emms is needed before FPU use. */
875 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
876 /* re-enable FPU registers */
/* ---- SSE2 path ----
 * NOTE(review): unlike the C/MMX path, the visible SSE2 loop advances
 * p_line1/p_line2 FORWARD by one pitch per pass even though they start
 * at the buffer bottom — it looks like it cannot produce the upside-down
 * layout (and may run past the buffer); lines are stripped from this
 * chunk, so verify against the full file. */
880 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
882 ** SSE2 128 bits fetch/store instructions are faster
883 ** if memory access is 16 bytes aligned
885 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
886 ((intptr_t)p_line2|(intptr_t)p_y2))) )
888 /* use faster SSE2 aligned fetch and store */
889 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
892 p_line2 += p_dest->p->i_pitch;
895 p_y2 += p_source->p[Y_PLANE].i_pitch;
897 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
899 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
901 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
906 p_y1 += i_source_margin;
907 p_y2 += i_source_margin;
908 p_u += i_source_margin_c;
909 p_v += i_source_margin_c;
910 p_line1 += i_dest_margin;
911 p_line2 += i_dest_margin;
916 /* use slower SSE2 unaligned fetch and store */
917 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
920 p_line2 += p_dest->p->i_pitch;
923 p_y2 += p_source->p[Y_PLANE].i_pitch;
925 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
927 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
929 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
934 p_y1 += i_source_margin;
935 p_y2 += i_source_margin;
936 p_u += i_source_margin_c;
937 p_v += i_source_margin_c;
938 p_line1 += i_dest_margin;
939 p_line2 += i_dest_margin;
942 /* make sure all SSE2 stores are visible thereafter */
944 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
946 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
948 /*****************************************************************************
949 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
950 *****************************************************************************/
951 #if defined (MODULE_NAME_IS_i420_yuy2)
952 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
955 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
956 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
957 uint8_t *p_u = p_source->U_PIXELS;
958 uint8_t *p_v = p_source->V_PIXELS;
962 const int i_source_margin = p_source->p[0].i_pitch
963 - p_source->p[0].i_visible_pitch;
964 const int i_source_margin_c = p_source->p[1].i_pitch
965 - p_source->p[1].i_visible_pitch;
966 const int i_dest_margin = p_dest->p->i_pitch
967 - p_dest->p->i_visible_pitch;
969 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
972 p_line2 += p_dest->p->i_pitch;
975 p_y2 += p_source->p[Y_PLANE].i_pitch;
977 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
983 p_y1 += i_source_margin;
984 p_y2 += i_source_margin;
985 p_u += i_source_margin_c;
986 p_v += i_source_margin_c;
987 p_line1 += i_dest_margin;
988 p_line2 += i_dest_margin;