1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
33 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
37 #include "i420_yuy2.h"
39 #define SRC_FOURCC "I420,IYUV,YV12"
41 #if defined (MODULE_NAME_IS_i420_yuy2)
42 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
43 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
44 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
45 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
46 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
48 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
51 /*****************************************************************************
52 * Local and extern prototypes.
53 *****************************************************************************/
/* Module entry point: picks the pf_convert callback for the FourCC pair. */
static int Activate ( vlc_object_t * );

/* Planar YUV 4:2:0 -> packed 4:2:2 converters (one per output FourCC family). */
static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Interleaved (IUYV) and upside-down (cyuv) variants: not built for AltiVec. */
static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
#if defined (MODULE_NAME_IS_i420_yuy2)
/* Y211 (packed YUV 2:1:1) is only offered by the plain C build. */
static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
/* Initialize MMX-specific constants */
/* 0x00ff repeated per 16-bit lane -- byte mask used by the MMX kernels. */
static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* 0x80 bytes in the low dword -- presumably a chroma bias for the MMX
 * kernels in i420_yuy2.h; confirm against the asm that consumes it. */
static const uint64_t i_80w   = 0x0000000080808080ULL;
/*****************************************************************************
 * Module descriptor: the chroma capability priority ranks the SIMD builds
 * (C: 80 < MMX/AltiVec: 100 < SSE2: 120) so the fastest available wins.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 80 );
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( MMX );
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 120 );
    add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
        _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
    set_capability( "chroma", 100 );
    add_requirement( ALTIVEC );
    /* No deactivation callback needed: Activate allocates nothing. */
    set_callbacks( Activate, NULL );
/*****************************************************************************
 * Activate: allocate a chroma function
 *****************************************************************************
 * This function allocates and initializes a chroma function: it matches the
 * render (planar 4:2:0) FourCC against the output (packed 4:2:2) FourCC and
 * installs the corresponding pf_convert callback on the vout.
 *****************************************************************************/
static int Activate( vlc_object_t *p_this )
    vout_thread_t *p_vout = (vout_thread_t *)p_this;

    /* Converters process pixels in 2x2 groups (two lines, two pixels per
     * chroma sample), so odd width or height cannot be handled. */
    if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )

    switch( p_vout->render.i_chroma )
        /* All three source FourCCs are planar YUV 4:2:0 layouts. */
        case VLC_FOURCC('Y','V','1','2'):
        case VLC_FOURCC('I','4','2','0'):
        case VLC_FOURCC('I','Y','U','V'):
            switch( p_vout->output.i_chroma )
                case VLC_FOURCC('Y','U','Y','2'):
                case VLC_FOURCC('Y','U','N','V'):
                    p_vout->chroma.pf_convert = I420_YUY2;

                case VLC_FOURCC('Y','V','Y','U'):
                    p_vout->chroma.pf_convert = I420_YVYU;

                case VLC_FOURCC('U','Y','V','Y'):
                case VLC_FOURCC('U','Y','N','V'):
                case VLC_FOURCC('Y','4','2','2'):
                    p_vout->chroma.pf_convert = I420_UYVY;

#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
                case VLC_FOURCC('I','U','Y','V'):
                    p_vout->chroma.pf_convert = I420_IUYV;

                case VLC_FOURCC('c','y','u','v'):
                    p_vout->chroma.pf_convert = I420_cyuv;

#if defined (MODULE_NAME_IS_i420_yuy2)
                case VLC_FOURCC('Y','2','1','1'):
                    p_vout->chroma.pf_convert = I420_Y211;
/* Read the CPU timestamp counter via rdtsc (profiling helper).
 * NOTE(review): the "=A" constraint binds the 64-bit result to the edx:eax
 * register pair, which is only correct on 32-bit x86; on x86-64 "=A" does
 * not split across rdx:rax -- confirm this is compiled for i386 only. */
static inline unsigned long long read_cycles(void)
    unsigned long long v;
    __asm__ __volatile__("rdtsc" : "=A" (v): );
170 /* Following functions are local */
171 /*****************************************************************************
172 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
173 *****************************************************************************/
static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
    /* Two output lines are produced per iteration: one 4:2:0 chroma row is
     * shared by two luma rows.  p_dest receives packed Y U Y V bytes. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step destination line and source luma pointers to the next row pair. */
#define VEC_NEXT_LINES( ) \
    p_line2 += p_dest->p->i_pitch; \
    p_y2 += p_source->p[Y_PLANE].i_pitch;

    /* Load 16 U and 16 V bytes: chroma for 32 output pixels. */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

    /* 'a' is vec_mergeh or vec_mergel (selects which 8 U/V pairs to use);
     * interleave the UV bytes with 16 Y samples of each line and store
     * 32 bytes of Y-U-Y-V per line. */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* NOTE(review): vec_ld/vec_st ignore the low 4 address bits, so this
     * path assumes 16-byte-aligned planes/pitches -- confirm against the
     * picture allocator. */
    if( !( ( p_vout->render.i_width % 32 ) |
           ( p_vout->render.i_height % 2 ) ) )
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
    else if( !( ( p_vout->render.i_width % 16 ) |
                ( p_vout->render.i_height % 4 ) ) )
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );

    /* Crap, use the C version */
#undef VEC_NEXT_LINES

    /* pitch - visible_pitch = padding bytes to skip at each row's end. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                              - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        p_line2 += p_dest->p->i_pitch;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
        /* Plain C: convert 8 pixels (2 lines x 4) per iteration. */
        for( i_x = p_vout->render.i_width / 8; i_x-- ; )
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
            /* MMX kernel converts 8 pixels per call. */
            MMX_CALL( MMX_YUV420_YUYV );
        /* Finish the (width % 8) leftover pixels two at a time. */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )

        /* Skip row padding on every plane before the next line pair. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* NOTE(review): casting pointers through (int) truncates on 64-bit
     * targets; (uintptr_t) would be correct -- the alignment test may
     * misjudge addresses on LP64/LLP64. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((int)p_line2|(int)p_y2))) )
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            /* 16 pixels per SSE2 call. */
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
            /* Leftover (width % 16) pixels, two at a time. */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;

    /* make sure all SSE2 stores are visible thereafter */
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
385 /*****************************************************************************
386 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
387 *****************************************************************************/
static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
    /* Same structure as I420_YUY2, but chroma bytes are swapped so the
     * packed output is Y V Y U. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step destination line and source luma pointers to the next row pair. */
#define VEC_NEXT_LINES( ) \
    p_line2 += p_dest->p->i_pitch; \
    p_y2 += p_source->p[Y_PLANE].i_pitch;

    /* Load 16 U and 16 V bytes: chroma for 32 output pixels. */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

    /* V is merged before U (YVYU byte order); 'a' picks the high or low
     * 8 chroma pairs; stores 32 bytes per output line. */
#define VEC_MERGE( a ) \
    vu_vec = a( v_vec, u_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char vu_vec;
    vector unsigned char y_vec;

    if( !( ( p_vout->render.i_width % 32 ) |
           ( p_vout->render.i_height % 2 ) ) )
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
    else if( !( ( p_vout->render.i_width % 16 ) |
                ( p_vout->render.i_height % 4 ) ) )
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );

    /* Crap, use the C version */
#undef VEC_NEXT_LINES

    /* pitch - visible_pitch = padding bytes to skip at each row's end. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                              - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        p_line2 += p_dest->p->i_pitch;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels (2 lines x 4) per iteration. */
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            MMX_CALL( MMX_YUV420_YVYU );
        /* Finish the (width % 8) leftover pixels two at a time. */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )

        /* Skip row padding on every plane before the next line pair. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* NOTE(review): (int) pointer casts truncate on 64-bit targets;
     * (uintptr_t) would be the portable alignment test. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((int)p_line2|(int)p_y2))) )
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            /* 16 pixels per SSE2 call. */
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
            /* Leftover (width % 16) pixels, two at a time. */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;

    /* make sure all SSE2 stores are visible thereafter */
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
594 /*****************************************************************************
595 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
596 *****************************************************************************/
static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
    /* Same structure as I420_YUY2, but chroma leads luma in the packed
     * output: U Y V Y byte order. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step destination line and source luma pointers to the next row pair. */
#define VEC_NEXT_LINES( ) \
    p_line2 += p_dest->p->i_pitch; \
    p_y2 += p_source->p[Y_PLANE].i_pitch;

    /* Load 16 U and 16 V bytes: chroma for 32 output pixels. */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

    /* UYVY: the merge puts the chroma vector first so U/V bytes precede
     * each Y sample; stores 32 bytes per output line. */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    if( !( ( p_vout->render.i_width % 32 ) |
           ( p_vout->render.i_height % 2 ) ) )
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
    else if( !( ( p_vout->render.i_width % 16 ) |
                ( p_vout->render.i_height % 4 ) ) )
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );

    /* Crap, use the C version */
#undef VEC_NEXT_LINES

    /* pitch - visible_pitch = padding bytes to skip at each row's end. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                              - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        p_line2 += p_dest->p->i_pitch;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels (2 lines x 4) per iteration. */
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            MMX_CALL( MMX_YUV420_UYVY );
        /* Finish the (width % 8) leftover pixels two at a time. */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )

        /* Skip row padding on every plane before the next line pair. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* NOTE(review): (int) pointer casts truncate on 64-bit targets;
     * (uintptr_t) would be the portable alignment test. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((int)p_line2|(int)p_y2))) )
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            /* 16 pixels per SSE2 call. */
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
            /* Leftover (width % 16) pixels, two at a time. */
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;

    /* make sure all SSE2 stores are visible thereafter */
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/*****************************************************************************
 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 *****************************************************************************/
static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
    /* Stub: interleaved UYVY output is not implemented; just log an error. */
    msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
/*****************************************************************************
 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 *****************************************************************************/
static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
    /* Destination pointers start past the LAST visible line: the picture is
     * written bottom-up to produce the vertically flipped cyuv layout. */
    uint8_t *p_line1 = p_dest->p->p_pixels +
        p_dest->p->i_visible_lines * p_dest->p->i_pitch
        + p_dest->p->i_pitch;
    uint8_t *p_line2 = p_dest->p->p_pixels +
        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    /* pitch - visible_pitch = padding bytes to skip at each row's end. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                              - p_dest->p->i_visible_pitch;

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        /* Move UP two destination rows (net of the per-row advances below). */
        p_line1 -= 3 * p_dest->p->i_pitch;
        p_line2 -= 3 * p_dest->p->i_pitch;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* Pixel packing is UYVY; only the vertical order differs. */
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            MMX_CALL( MMX_YUV420_UYVY );
        /* Finish the (width % 8) leftover pixels two at a time. */
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )

        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* NOTE(review): (int) pointer casts truncate on 64-bit targets;
     * (uintptr_t) would be the portable alignment test. */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((int)p_line2|(int)p_y2))) )
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            /* NOTE(review): this SSE2 path appears to step the destination
             * forward, unlike the bottom-up scalar path above -- verify the
             * upside-down output is still produced here. */
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
            p_line2 += p_dest->p->i_pitch;
            p_y2 += p_source->p[Y_PLANE].i_pitch;
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
                SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;

    /* make sure all SSE2 stores are visible thereafter */
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
942 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
    /* Plain C only.  Two output lines per iteration, as in I420_YUY2. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    /* pitch - visible_pitch = padding bytes to skip at each row's end. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                              - p_dest->p->i_visible_pitch;

    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
        p_line2 += p_dest->p->i_pitch;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels (2 lines x 4) per iteration. */
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )

        /* Skip row padding on every plane before the next line pair. */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;