1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
29 #include <stdlib.h> /* malloc(), free() */
34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #include "i420_yuy2.h"
40 #define SRC_FOURCC "I420,IYUV,YV12"
42 #if defined (MODULE_NAME_IS_i420_yuy2)
43 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
52 /*****************************************************************************
53 * Local and extern prototypes.
54 *****************************************************************************/
55 static int Activate ( vlc_object_t * );
57 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
69 /* Initialize MMX-specific constants */
/* 0x00ff replicated in every 16-bit lane: masks out the high byte of each
 * word (used to isolate packed 8-bit samples in 16-bit lanes). */
70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* 0x80 in each of the four low bytes only; presumably a chroma bias/sign
 * constant consumed by the MMX conversion macros -- verify against the
 * MMX_YUV420_* macro definitions in i420_yuy2.h. */
71 static const uint64_t i_80w = 0x0000000080808080ULL;
74 /*****************************************************************************
76 *****************************************************************************/
78 #if defined (MODULE_NAME_IS_i420_yuy2)
79 set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
80 set_capability( "chroma", 80 );
81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
82 set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83 set_capability( "chroma", 100 );
84 add_requirement( MMX );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
86 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87 set_capability( "chroma", 120 );
88 add_requirement( SSE2 );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
91 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
92 set_capability( "chroma", 100 );
93 add_requirement( ALTIVEC );
95 set_callbacks( Activate, NULL );
98 /*****************************************************************************
99 * Activate: allocate a chroma function
100 *****************************************************************************
101 * This function allocates and initializes a chroma function
102 *****************************************************************************/
103 static int Activate( vlc_object_t *p_this )
105 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* Packed 4:2:2 output stores chroma per horizontal pixel pair, and the
 * 4:2:0 source subsamples chroma vertically, so both dimensions must be
 * even; odd sizes are rejected here. */
107 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
/* Dispatch on the (source, destination) fourcc pair: all three planar
 * 4:2:0 aliases map to the same converters, chosen by output chroma. */
112 switch( p_vout->render.i_chroma )
114 case VLC_FOURCC('Y','V','1','2'):
115 case VLC_FOURCC('I','4','2','0'):
116 case VLC_FOURCC('I','Y','U','V'):
117 switch( p_vout->output.i_chroma )
119 case VLC_FOURCC('Y','U','Y','2'):
120 case VLC_FOURCC('Y','U','N','V'):
121 p_vout->chroma.pf_convert = I420_YUY2;
124 case VLC_FOURCC('Y','V','Y','U'):
125 p_vout->chroma.pf_convert = I420_YVYU;
128 case VLC_FOURCC('U','Y','V','Y'):
129 case VLC_FOURCC('U','Y','N','V'):
130 case VLC_FOURCC('Y','4','2','2'):
131 p_vout->chroma.pf_convert = I420_UYVY;
/* IUYV and cyuv converters are not built for the AltiVec variant
 * (see DEST_FOURCC lists at the top of the file). */
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134 case VLC_FOURCC('I','U','Y','V'):
135 p_vout->chroma.pf_convert = I420_IUYV;
138 case VLC_FOURCC('c','y','u','v'):
139 p_vout->chroma.pf_convert = I420_cyuv;
/* Y211 is only supported by the plain C module. */
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144 case VLC_FOURCC('Y','2','1','1'):
145 p_vout->chroma.pf_convert = I420_Y211;
/* Read the x86 time-stamp counter (rdtsc) as a 64-bit value.
 * NOTE(review): the "=A" constraint binds the EDX:EAX register pair, which
 * is only correct on 32-bit x86; on x86-64 "=A" does NOT capture the high
 * 32 bits returned in RDX -- verify before using in 64-bit builds. */
162 static inline unsigned long long read_cycles(void)
164 unsigned long long v;
165 __asm__ __volatile__("rdtsc" : "=A" (v): );
171 /* Following functions are local */
172 /*****************************************************************************
173 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
174 *****************************************************************************/
175 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
/* Two destination line pointers and two luma line pointers: the converter
 * always processes two source rows per iteration, because one 4:2:0 chroma
 * row is shared by two luma rows. */
178 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
179 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
180 uint8_t *p_u = p_source->U_PIXELS;
181 uint8_t *p_v = p_source->V_PIXELS;
/* --- AltiVec path: 16-byte vector loads/stores via helper macros. --- */
185 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
186 #define VEC_NEXT_LINES( ) \
188 p_line2 += p_dest->p->i_pitch; \
190 p_y2 += p_source->p[Y_PLANE].i_pitch;
192 #define VEC_LOAD_UV( ) \
193 u_vec = vec_ld( 0, p_u ); p_u += 16; \
194 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* VEC_MERGE interleaves 16 luma bytes with 16 pre-merged UV bytes into
 * 32 output bytes (Y first => YUYV ordering) on each of the two lines. */
196 #define VEC_MERGE( a ) \
197 uv_vec = a( u_vec, v_vec ); \
198 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
199 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
200 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
201 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
202 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
203 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
205 vector unsigned char u_vec;
206 vector unsigned char v_vec;
207 vector unsigned char uv_vec;
208 vector unsigned char y_vec;
/* Vector path is only used when the image dimensions divide evenly;
 * otherwise it falls through to the scalar C implementation below. */
210 if( !( ( p_vout->render.i_width % 32 ) |
211 ( p_vout->render.i_height % 2 ) ) )
213 /* Width is a multiple of 32, we take 2 lines at a time */
214 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
217 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
220 VEC_MERGE( vec_mergeh );
221 VEC_MERGE( vec_mergel );
225 else if( !( ( p_vout->render.i_width % 16 ) |
226 ( p_vout->render.i_height % 4 ) ) )
228 /* Width is only a multiple of 16, we take 4 lines at a time */
229 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
231 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
233 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
236 VEC_MERGE( vec_mergeh );
237 VEC_MERGE( vec_mergel );
240 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
242 VEC_MERGE( vec_mergeh );
244 /* Line 3 and 4, pixels 0 to 16 */
246 VEC_MERGE( vec_mergel );
248 /* Line 3 and 4, pixels 16 to ( width ) */
249 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
252 VEC_MERGE( vec_mergeh );
253 VEC_MERGE( vec_mergel );
259 /* Crap, use the C version */
260 #undef VEC_NEXT_LINES
/* Margins = bytes of padding between the visible pixels and the pitch,
 * added to each pointer at end of row to reach the next row. */
265 const int i_source_margin = p_source->p[0].i_pitch
266 - p_source->p[0].i_visible_pitch;
267 const int i_source_margin_c = p_source->p[1].i_pitch
268 - p_source->p[1].i_visible_pitch;
269 const int i_dest_margin = p_dest->p->i_pitch
270 - p_dest->p->i_visible_pitch;
/* --- C / MMX path (everything except the SSE2 build). --- */
272 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
273 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
276 p_line2 += p_dest->p->i_pitch;
279 p_y2 += p_source->p[Y_PLANE].i_pitch;
281 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
282 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
290 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
292 MMX_CALL( MMX_YUV420_YUYV );
/* Scalar tail for widths that are not a multiple of 8 pixels. */
295 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
300 p_y1 += i_source_margin;
301 p_y2 += i_source_margin;
302 p_u += i_source_margin_c;
303 p_v += i_source_margin_c;
304 p_line1 += i_dest_margin;
305 p_line2 += i_dest_margin;
/* Leave MMX state so subsequent FPU code works. */
308 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
309 __asm__ __volatile__("emms" :: );
312 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
316 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
318 ** SSE2 128 bits fetch/store instructions are faster
319 ** if memory access is 16 bytes aligned
/* NOTE(review): casting pointers to (int) for the alignment test truncates
 * on LP64/64-bit targets; (uintptr_t) would be the portable spelling.
 * Worst case here is falling back to the unaligned path, not corruption. */
322 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
323 ((int)p_line2|(int)p_y2))) )
325 /* use faster SSE2 aligned fetch and store */
326 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
329 p_line2 += p_dest->p->i_pitch;
332 p_y2 += p_source->p[Y_PLANE].i_pitch;
334 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
336 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
338 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
343 p_y1 += i_source_margin;
344 p_y2 += i_source_margin;
345 p_u += i_source_margin_c;
346 p_v += i_source_margin_c;
347 p_line1 += i_dest_margin;
348 p_line2 += i_dest_margin;
353 /* use slower SSE2 unaligned fetch and store */
354 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
357 p_line2 += p_dest->p->i_pitch;
360 p_y2 += p_source->p[Y_PLANE].i_pitch;
362 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
364 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
366 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
371 p_y1 += i_source_margin;
372 p_y2 += i_source_margin;
373 p_u += i_source_margin_c;
374 p_v += i_source_margin_c;
375 p_line1 += i_dest_margin;
376 p_line2 += i_dest_margin;
380 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
383 /*****************************************************************************
384 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
385 *****************************************************************************/
386 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
/* Same two-rows-at-a-time structure as I420_YUY2; only the chroma byte
 * order differs: V is merged before U to produce Y V Y U output. */
389 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
390 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
391 uint8_t *p_u = p_source->U_PIXELS;
392 uint8_t *p_v = p_source->V_PIXELS;
396 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
397 #define VEC_NEXT_LINES( ) \
399 p_line2 += p_dest->p->i_pitch; \
401 p_y2 += p_source->p[Y_PLANE].i_pitch;
403 #define VEC_LOAD_UV( ) \
404 u_vec = vec_ld( 0, p_u ); p_u += 16; \
405 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Note the swapped merge arguments ( v_vec, u_vec ) versus I420_YUY2. */
407 #define VEC_MERGE( a ) \
408 vu_vec = a( v_vec, u_vec ); \
409 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
410 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
411 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
412 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
413 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
414 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
416 vector unsigned char u_vec;
417 vector unsigned char v_vec;
418 vector unsigned char vu_vec;
419 vector unsigned char y_vec;
/* Vector path requires evenly-dividing dimensions, else fall back to C. */
421 if( !( ( p_vout->render.i_width % 32 ) |
422 ( p_vout->render.i_height % 2 ) ) )
424 /* Width is a multiple of 32, we take 2 lines at a time */
425 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
428 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
431 VEC_MERGE( vec_mergeh );
432 VEC_MERGE( vec_mergel );
436 else if( !( ( p_vout->render.i_width % 16 ) |
437 ( p_vout->render.i_height % 4 ) ) )
439 /* Width is only a multiple of 16, we take 4 lines at a time */
440 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
442 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
444 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
447 VEC_MERGE( vec_mergeh );
448 VEC_MERGE( vec_mergel );
451 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
453 VEC_MERGE( vec_mergeh );
455 /* Line 3 and 4, pixels 0 to 16 */
457 VEC_MERGE( vec_mergel );
459 /* Line 3 and 4, pixels 16 to ( width ) */
460 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
463 VEC_MERGE( vec_mergeh );
464 VEC_MERGE( vec_mergel );
470 /* Crap, use the C version */
471 #undef VEC_NEXT_LINES
/* Row padding (pitch minus visible pitch) for each plane. */
476 const int i_source_margin = p_source->p[0].i_pitch
477 - p_source->p[0].i_visible_pitch;
478 const int i_source_margin_c = p_source->p[1].i_pitch
479 - p_source->p[1].i_visible_pitch;
480 const int i_dest_margin = p_dest->p->i_pitch
481 - p_dest->p->i_visible_pitch;
/* --- C / MMX path. --- */
483 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
484 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
487 p_line2 += p_dest->p->i_pitch;
490 p_y2 += p_source->p[Y_PLANE].i_pitch;
492 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
494 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
500 MMX_CALL( MMX_YUV420_YVYU );
/* Scalar tail for widths not a multiple of 8 pixels. */
503 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
508 p_y1 += i_source_margin;
509 p_y2 += i_source_margin;
510 p_u += i_source_margin_c;
511 p_v += i_source_margin_c;
512 p_line1 += i_dest_margin;
513 p_line2 += i_dest_margin;
/* Restore FPU state after MMX usage. */
516 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
517 __asm__ __volatile__("emms" :: );
520 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
524 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
526 ** SSE2 128 bits fetch/store instructions are faster
527 ** if memory access is 16 bytes aligned
/* NOTE(review): (int) pointer casts truncate on 64-bit targets; use
 * (uintptr_t) for a portable alignment test. */
529 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
530 ((int)p_line2|(int)p_y2))) )
532 /* use faster SSE2 aligned fetch and store */
533 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
536 p_line2 += p_dest->p->i_pitch;
539 p_y2 += p_source->p[Y_PLANE].i_pitch;
541 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
543 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
545 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
550 p_y1 += i_source_margin;
551 p_y2 += i_source_margin;
552 p_u += i_source_margin_c;
553 p_v += i_source_margin_c;
554 p_line1 += i_dest_margin;
555 p_line2 += i_dest_margin;
560 /* use slower SSE2 unaligned fetch and store */
561 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
564 p_line2 += p_dest->p->i_pitch;
567 p_y2 += p_source->p[Y_PLANE].i_pitch;
569 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
571 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
573 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
578 p_y1 += i_source_margin;
579 p_y2 += i_source_margin;
580 p_u += i_source_margin_c;
581 p_v += i_source_margin_c;
582 p_line1 += i_dest_margin;
583 p_line2 += i_dest_margin;
586 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
589 /*****************************************************************************
590 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
591 *****************************************************************************/
592 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
/* Same structure as I420_YUY2, but chroma comes first in each output pair
 * (U Y V Y): note the merge argument order uv_vec before y_vec below. */
595 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
596 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
597 uint8_t *p_u = p_source->U_PIXELS;
598 uint8_t *p_v = p_source->V_PIXELS;
602 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
603 #define VEC_NEXT_LINES( ) \
605 p_line2 += p_dest->p->i_pitch; \
607 p_y2 += p_source->p[Y_PLANE].i_pitch;
609 #define VEC_LOAD_UV( ) \
610 u_vec = vec_ld( 0, p_u ); p_u += 16; \
611 v_vec = vec_ld( 0, p_v ); p_v += 16;
613 #define VEC_MERGE( a ) \
614 uv_vec = a( u_vec, v_vec ); \
615 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
616 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
617 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
618 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
619 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
620 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
622 vector unsigned char u_vec;
623 vector unsigned char v_vec;
624 vector unsigned char uv_vec;
625 vector unsigned char y_vec;
/* Vector path requires evenly-dividing dimensions, else fall back to C. */
627 if( !( ( p_vout->render.i_width % 32 ) |
628 ( p_vout->render.i_height % 2 ) ) )
630 /* Width is a multiple of 32, we take 2 lines at a time */
631 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
634 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
637 VEC_MERGE( vec_mergeh );
638 VEC_MERGE( vec_mergel );
642 else if( !( ( p_vout->render.i_width % 16 ) |
643 ( p_vout->render.i_height % 4 ) ) )
645 /* Width is only a multiple of 16, we take 4 lines at a time */
646 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
648 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
650 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
653 VEC_MERGE( vec_mergeh );
654 VEC_MERGE( vec_mergel );
657 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
659 VEC_MERGE( vec_mergeh );
661 /* Line 3 and 4, pixels 0 to 16 */
663 VEC_MERGE( vec_mergel );
665 /* Line 3 and 4, pixels 16 to ( width ) */
666 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
669 VEC_MERGE( vec_mergeh );
670 VEC_MERGE( vec_mergel );
676 /* Crap, use the C version */
677 #undef VEC_NEXT_LINES
/* Row padding (pitch minus visible pitch) for each plane. */
682 const int i_source_margin = p_source->p[0].i_pitch
683 - p_source->p[0].i_visible_pitch;
684 const int i_source_margin_c = p_source->p[1].i_pitch
685 - p_source->p[1].i_visible_pitch;
686 const int i_dest_margin = p_dest->p->i_pitch
687 - p_dest->p->i_visible_pitch;
/* --- C / MMX path. --- */
689 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
690 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
693 p_line2 += p_dest->p->i_pitch;
696 p_y2 += p_source->p[Y_PLANE].i_pitch;
698 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
700 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
706 MMX_CALL( MMX_YUV420_UYVY );
/* Scalar tail for widths not a multiple of 8 pixels. */
709 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
714 p_y1 += i_source_margin;
715 p_y2 += i_source_margin;
716 p_u += i_source_margin_c;
717 p_v += i_source_margin_c;
718 p_line1 += i_dest_margin;
719 p_line2 += i_dest_margin;
/* Restore FPU state after MMX usage. */
722 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
723 __asm__ __volatile__("emms" :: );
726 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
730 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
732 ** SSE2 128 bits fetch/store instructions are faster
733 ** if memory access is 16 bytes aligned
/* NOTE(review): (int) pointer casts truncate on 64-bit targets; use
 * (uintptr_t) for a portable alignment test. */
735 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
736 ((int)p_line2|(int)p_y2))) )
738 /* use faster SSE2 aligned fetch and store */
739 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
742 p_line2 += p_dest->p->i_pitch;
745 p_y2 += p_source->p[Y_PLANE].i_pitch;
747 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
749 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
751 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
756 p_y1 += i_source_margin;
757 p_y2 += i_source_margin;
758 p_u += i_source_margin_c;
759 p_v += i_source_margin_c;
760 p_line1 += i_dest_margin;
761 p_line2 += i_dest_margin;
766 /* use slower SSE2 unaligned fetch and store */
767 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
770 p_line2 += p_dest->p->i_pitch;
773 p_y2 += p_source->p[Y_PLANE].i_pitch;
775 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
777 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
779 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
784 p_y1 += i_source_margin;
785 p_y2 += i_source_margin;
786 p_u += i_source_margin_c;
787 p_v += i_source_margin_c;
788 p_line1 += i_dest_margin;
789 p_line2 += i_dest_margin;
792 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
795 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
796 /*****************************************************************************
797 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
798 *****************************************************************************/
/* Unimplemented stub: logs an error instead of converting. Callers still
 * reach it via Activate()'s IUYV case, so selecting IUYV output produces
 * no pixels -- only the error message below. */
799 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
803 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
806 /*****************************************************************************
807 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
808 *****************************************************************************/
809 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
/* cyuv is upside-down UYVY: destination pointers start at the BOTTOM of
 * the picture and the C/MMX loop walks them upward (-= 3 * pitch per
 * two-line iteration, net -2 lines after the usual advances). */
812 uint8_t *p_line1 = p_dest->p->p_pixels +
813 p_dest->p->i_visible_lines * p_dest->p->i_pitch
814 + p_dest->p->i_pitch;
815 uint8_t *p_line2 = p_dest->p->p_pixels +
816 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
817 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
818 uint8_t *p_u = p_source->U_PIXELS;
819 uint8_t *p_v = p_source->V_PIXELS;
/* Row padding (pitch minus visible pitch) for each plane. */
823 const int i_source_margin = p_source->p[0].i_pitch
824 - p_source->p[0].i_visible_pitch;
825 const int i_source_margin_c = p_source->p[1].i_pitch
826 - p_source->p[1].i_visible_pitch;
827 const int i_dest_margin = p_dest->p->i_pitch
828 - p_dest->p->i_visible_pitch;
/* --- C / MMX path: writes bottom-up, reads top-down. --- */
830 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
831 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
833 p_line1 -= 3 * p_dest->p->i_pitch;
834 p_line2 -= 3 * p_dest->p->i_pitch;
837 p_y2 += p_source->p[Y_PLANE].i_pitch;
839 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
841 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Pixel packing itself is identical to UYVY; only the vertical
 * direction of the destination differs. */
847 MMX_CALL( MMX_YUV420_UYVY );
850 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
855 p_y1 += i_source_margin;
856 p_y2 += i_source_margin;
857 p_u += i_source_margin_c;
858 p_v += i_source_margin_c;
859 p_line1 += i_dest_margin;
860 p_line2 += i_dest_margin;
/* Restore FPU state after MMX usage. */
863 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
864 __asm__ __volatile__("emms" :: );
867 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
869 ** SSE2 128 bits fetch/store instructions are faster
870 ** if memory access is 16 bytes aligned
/* NOTE(review): two suspected defects in this SSE2 path:
 * 1) (int) pointer casts truncate on 64-bit targets (use uintptr_t);
 * 2) unlike the C/MMX path above, the destination pointers are advanced
 *    DOWNWARD (+= i_pitch) with no "-= 3 * pitch" step, so the output
 *    does not appear to be vertically flipped as cyuv requires, and the
 *    pointers start past the visible area. Verify against upstream. */
872 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
873 ((int)p_line2|(int)p_y2))) )
875 /* use faster SSE2 aligned fetch and store */
876 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
879 p_line2 += p_dest->p->i_pitch;
882 p_y2 += p_source->p[Y_PLANE].i_pitch;
884 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
886 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
888 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
893 p_y1 += i_source_margin;
894 p_y2 += i_source_margin;
895 p_u += i_source_margin_c;
896 p_v += i_source_margin_c;
897 p_line1 += i_dest_margin;
898 p_line2 += i_dest_margin;
903 /* use slower SSE2 unaligned fetch and store */
904 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
907 p_line2 += p_dest->p->i_pitch;
910 p_y2 += p_source->p[Y_PLANE].i_pitch;
912 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
914 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
916 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
921 p_y1 += i_source_margin;
922 p_y2 += i_source_margin;
923 p_u += i_source_margin_c;
924 p_v += i_source_margin_c;
925 p_line1 += i_dest_margin;
926 p_line2 += i_dest_margin;
929 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
931 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
933 /*****************************************************************************
934 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
935 *****************************************************************************/
936 #if defined (MODULE_NAME_IS_i420_yuy2)
937 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
/* Plain-C only converter (see Activate). Two source rows per iteration,
 * 8 source pixels per inner-loop step; Y211 subsamples both luma and
 * chroma horizontally relative to the 4:2:0 input. */
940 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
941 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
942 uint8_t *p_u = p_source->U_PIXELS;
943 uint8_t *p_v = p_source->V_PIXELS;
/* Row padding (pitch minus visible pitch) for each plane. */
947 const int i_source_margin = p_source->p[0].i_pitch
948 - p_source->p[0].i_visible_pitch;
949 const int i_source_margin_c = p_source->p[1].i_pitch
950 - p_source->p[1].i_visible_pitch;
951 const int i_dest_margin = p_dest->p->i_pitch
952 - p_dest->p->i_visible_pitch;
954 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
957 p_line2 += p_dest->p->i_pitch;
960 p_y2 += p_source->p[Y_PLANE].i_pitch;
962 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
/* Advance all plane pointers past the row padding to the next rows. */
968 p_y1 += i_source_margin;
969 p_y2 += i_source_margin;
970 p_u += i_source_margin_c;
971 p_v += i_source_margin_c;
972 p_line1 += i_dest_margin;
973 p_line2 += i_dest_margin;