1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
29 #include <stdlib.h> /* malloc(), free() */
34 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #include "i420_yuy2.h"
40 #define SRC_FOURCC "I420,IYUV,YV12"
42 #if defined (MODULE_NAME_IS_i420_yuy2)
43 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
52 /*****************************************************************************
53 * Local and extern prototypes.
54 *****************************************************************************/
55 static int Activate ( vlc_object_t * );
57 static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
58 static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
59 static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
60 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
61 static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
62 static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
64 #if defined (MODULE_NAME_IS_i420_yuy2)
65 static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * );
68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
69 /* Initialize MMX-specific constants */
70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
71 static const uint64_t i_80w = 0x0000000080808080ULL;
74 /*****************************************************************************
76 *****************************************************************************/
/* Module descriptor: one source file builds four module variants (C, MMX,
 * SSE2, AltiVec); the capability score decides which one VLC prefers at
 * runtime.  NOTE(review): vlc_module_begin/end lines are outside this view. */
78 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Plain C build: lowest priority (80) so SIMD builds win when available */
79 set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
80 set_capability( "chroma", 80 );
81 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
82 set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
83 set_capability( "chroma", 100 );
84 add_requirement( MMX );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
/* SSE2 build: highest priority (120) of the four variants */
86 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87 set_capability( "chroma", 120 );
88 add_requirement( SSE2 );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
91 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
92 set_capability( "chroma", 100 );
93 add_requirement( ALTIVEC );
/* No deactivation callback is needed: Activate only assigns a function ptr */
95 set_callbacks( Activate, NULL );
98 /*****************************************************************************
99 * Activate: allocate a chroma function
100 *****************************************************************************
101 * This function allocates and initializes a chroma function
102 *****************************************************************************/
103 static int Activate( vlc_object_t *p_this )
105 vout_thread_t *p_vout = (vout_thread_t *)p_this;
/* 4:2:0 chroma planes are subsampled 2x2, so odd dimensions cannot be
 * converted.  NOTE(review): the early-return body is on elided lines —
 * presumably returns an error code; confirm against the full file. */
107 if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
112 switch( p_vout->render.i_chroma )
/* The three fourccs below all denote planar YUV 4:2:0 layouts */
114 case VLC_FOURCC('Y','V','1','2'):
115 case VLC_FOURCC('I','4','2','0'):
116 case VLC_FOURCC('I','Y','U','V'):
/* Install the converter matching the requested packed output format */
117 switch( p_vout->output.i_chroma )
119 case VLC_FOURCC('Y','U','Y','2'):
120 case VLC_FOURCC('Y','U','N','V'):
121 p_vout->chroma.pf_convert = I420_YUY2;
124 case VLC_FOURCC('Y','V','Y','U'):
125 p_vout->chroma.pf_convert = I420_YVYU;
128 case VLC_FOURCC('U','Y','V','Y'):
129 case VLC_FOURCC('U','Y','N','V'):
130 case VLC_FOURCC('Y','4','2','2'):
131 p_vout->chroma.pf_convert = I420_UYVY;
/* IUYV and cyuv converters are not compiled into the AltiVec variant */
133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
134 case VLC_FOURCC('I','U','Y','V'):
135 p_vout->chroma.pf_convert = I420_IUYV;
138 case VLC_FOURCC('c','y','u','v'):
139 p_vout->chroma.pf_convert = I420_cyuv;
/* Y211 output exists only in the plain C build (see DEST_FOURCC above) */
143 #if defined (MODULE_NAME_IS_i420_yuy2)
144 case VLC_FOURCC('Y','2','1','1'):
145 p_vout->chroma.pf_convert = I420_Y211;
/* Reads the x86 time-stamp counter via RDTSC.  The "=A" constraint pairs
 * EDX:EAX into the 64-bit result, which is only valid on 32-bit x86.
 * NOTE(review): the return statement is on a line elided from this view —
 * presumably `return v;`. */
162 static inline unsigned long long read_cycles(void)
164 unsigned long long v;
165 __asm__ __volatile__("rdtsc" : "=A" (v): );
171 /* Following functions are local */
172 /*****************************************************************************
173 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
174 *****************************************************************************/
/* Planar I420 -> packed YUY2 (byte order Y0 U0 Y1 V0).  One chroma row
 * feeds two luma rows, so every path below consumes the picture two
 * lines at a time. */
175 static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
178 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
179 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
180 uint8_t *p_u = p_source->U_PIXELS;
181 uint8_t *p_v = p_source->V_PIXELS;
185 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of rows */
186 #define VEC_NEXT_LINES( ) \
188 p_line2 += p_dest->p->i_pitch; \
190 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V bytes — enough chroma for 32 output pixels */
192 #define VEC_LOAD_UV( ) \
193 u_vec = vec_ld( 0, p_u ); p_u += 16; \
194 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Interleave Y with the UV vector and store 32 bytes per output line;
 * 'a' picks the high (vec_mergeh) or low (vec_mergel) chroma half.
 * Y comes first in each pair, giving the YUYV byte order. */
196 #define VEC_MERGE( a ) \
197 uv_vec = a( u_vec, v_vec ); \
198 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
199 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
200 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
201 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
202 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
203 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
205 vector unsigned char u_vec;
206 vector unsigned char v_vec;
207 vector unsigned char uv_vec;
208 vector unsigned char y_vec;
/* NOTE(review): vec_ld/vec_st need 16-byte-aligned addresses; only sizes
 * are checked here, so alignment is presumably guaranteed by the picture
 * allocator — TODO confirm. */
210 if( !( ( p_vout->render.i_width % 32 ) |
211 ( p_vout->render.i_height % 2 ) ) )
213 /* Width is a multiple of 32, we take 2 lines at a time */
214 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
217 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
220 VEC_MERGE( vec_mergeh );
221 VEC_MERGE( vec_mergel );
225 else if( !( ( p_vout->render.i_width % 16 ) |
226 ( p_vout->render.i_height % 4 ) ) )
228 /* Width is only a multiple of 16, we take 4 lines at a time */
229 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
231 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
233 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
236 VEC_MERGE( vec_mergeh );
237 VEC_MERGE( vec_mergel );
240 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
242 VEC_MERGE( vec_mergeh );
244 /* Line 3 and 4, pixels 0 to 16 */
246 VEC_MERGE( vec_mergel );
248 /* Line 3 and 4, pixels 16 to ( width ) */
249 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
252 VEC_MERGE( vec_mergeh );
253 VEC_MERGE( vec_mergel );
259 /* Crap, use the C version */
260 #undef VEC_NEXT_LINES
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
265 const int i_source_margin = p_source->p[0].i_pitch
266 - p_source->p[0].i_visible_pitch;
267 const int i_source_margin_c = p_source->p[1].i_pitch
268 - p_source->p[1].i_visible_pitch;
269 const int i_dest_margin = p_dest->p->i_pitch
270 - p_dest->p->i_visible_pitch;
/* C and MMX paths: two rows per iteration, 8 pixels per inner step */
272 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
273 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
276 p_line2 += p_dest->p->i_pitch;
279 p_y2 += p_source->p[Y_PLANE].i_pitch;
281 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
282 for( i_x = p_vout->render.i_width / 8; i_x-- ; )
290 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
292 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover pixels (width % 8) are finished in plain C, 2 at a time */
295 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
300 p_y1 += i_source_margin;
301 p_y2 += i_source_margin;
302 p_u += i_source_margin_c;
303 p_v += i_source_margin_c;
304 p_line1 += i_dest_margin;
305 p_line2 += i_dest_margin;
308 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
309 /* re-enable FPU registers */
310 __asm__ __volatile__ ( "emms" );
313 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
317 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
319 ** SSE2 128 bits fetch/store instructions are faster
320 ** if memory access is 16 bytes aligned
/* NOTE(review): casting a pointer to int truncates on LP64 targets;
 * harmless here (only the low 4 bits are tested) but uintptr_t would be
 * the correct type. */
323 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
324 ((int)p_line2|(int)p_y2))) )
326 /* use faster SSE2 aligned fetch and store */
327 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
330 p_line2 += p_dest->p->i_pitch;
333 p_y2 += p_source->p[Y_PLANE].i_pitch;
335 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
337 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
/* Tail pixels (width % 16) handled in C */
339 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
344 p_y1 += i_source_margin;
345 p_y2 += i_source_margin;
346 p_u += i_source_margin_c;
347 p_v += i_source_margin_c;
348 p_line1 += i_dest_margin;
349 p_line2 += i_dest_margin;
351 /* make sure all SSE2 stores are visible thereafter */
352 __asm__ __volatile__ ( "sfence" );
356 /* use slower SSE2 unaligned fetch and store */
357 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
360 p_line2 += p_dest->p->i_pitch;
363 p_y2 += p_source->p[Y_PLANE].i_pitch;
365 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
367 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
369 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
374 p_y1 += i_source_margin;
375 p_y2 += i_source_margin;
376 p_u += i_source_margin_c;
377 p_v += i_source_margin_c;
378 p_line1 += i_dest_margin;
379 p_line2 += i_dest_margin;
383 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
386 /*****************************************************************************
387 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
388 *****************************************************************************/
/* Planar I420 -> packed YVYU (byte order Y0 V0 Y1 U0).  Identical
 * structure to I420_YUY2 except the chroma pair is merged V-first
 * (vu_vec) instead of U-first. */
389 static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
392 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
393 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
394 uint8_t *p_u = p_source->U_PIXELS;
395 uint8_t *p_v = p_source->V_PIXELS;
399 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of rows */
400 #define VEC_NEXT_LINES( ) \
402 p_line2 += p_dest->p->i_pitch; \
404 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V bytes — enough chroma for 32 output pixels */
406 #define VEC_LOAD_UV( ) \
407 u_vec = vec_ld( 0, p_u ); p_u += 16; \
408 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* V is merged before U here, producing the YVYU byte order */
410 #define VEC_MERGE( a ) \
411 vu_vec = a( v_vec, u_vec ); \
412 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
413 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
414 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
415 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
416 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
417 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
419 vector unsigned char u_vec;
420 vector unsigned char v_vec;
421 vector unsigned char vu_vec;
422 vector unsigned char y_vec;
424 if( !( ( p_vout->render.i_width % 32 ) |
425 ( p_vout->render.i_height % 2 ) ) )
427 /* Width is a multiple of 32, we take 2 lines at a time */
428 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
431 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
434 VEC_MERGE( vec_mergeh );
435 VEC_MERGE( vec_mergel );
439 else if( !( ( p_vout->render.i_width % 16 ) |
440 ( p_vout->render.i_height % 4 ) ) )
442 /* Width is only a multiple of 16, we take 4 lines at a time */
443 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
445 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
447 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
450 VEC_MERGE( vec_mergeh );
451 VEC_MERGE( vec_mergel );
454 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
456 VEC_MERGE( vec_mergeh );
458 /* Line 3 and 4, pixels 0 to 16 */
460 VEC_MERGE( vec_mergel );
462 /* Line 3 and 4, pixels 16 to ( width ) */
463 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
466 VEC_MERGE( vec_mergeh );
467 VEC_MERGE( vec_mergel );
473 /* Crap, use the C version */
474 #undef VEC_NEXT_LINES
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
479 const int i_source_margin = p_source->p[0].i_pitch
480 - p_source->p[0].i_visible_pitch;
481 const int i_source_margin_c = p_source->p[1].i_pitch
482 - p_source->p[1].i_visible_pitch;
483 const int i_dest_margin = p_dest->p->i_pitch
484 - p_dest->p->i_visible_pitch;
/* C and MMX paths: two rows per iteration, 8 pixels per inner step */
486 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
487 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
490 p_line2 += p_dest->p->i_pitch;
493 p_y2 += p_source->p[Y_PLANE].i_pitch;
495 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
497 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
503 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover pixels (width % 8) are finished in plain C, 2 at a time */
506 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
511 p_y1 += i_source_margin;
512 p_y2 += i_source_margin;
513 p_u += i_source_margin_c;
514 p_v += i_source_margin_c;
515 p_line1 += i_dest_margin;
516 p_line2 += i_dest_margin;
519 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
520 /* re-enable FPU registers */
521 __asm__ __volatile__ ( "emms" );
524 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
528 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
530 ** SSE2 128 bits fetch/store instructions are faster
531 ** if memory access is 16 bytes aligned
/* NOTE(review): (int) pointer cast truncates on LP64; uintptr_t is the
 * correct type, though only the low 4 bits matter here. */
533 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
534 ((int)p_line2|(int)p_y2))) )
536 /* use faster SSE2 aligned fetch and store */
537 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
540 p_line2 += p_dest->p->i_pitch;
543 p_y2 += p_source->p[Y_PLANE].i_pitch;
545 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
547 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
549 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
554 p_y1 += i_source_margin;
555 p_y2 += i_source_margin;
556 p_u += i_source_margin_c;
557 p_v += i_source_margin_c;
558 p_line1 += i_dest_margin;
559 p_line2 += i_dest_margin;
561 /* make sure all SSE2 stores are visible thereafter */
562 __asm__ __volatile__ ( "sfence" );
566 /* use slower SSE2 unaligned fetch and store */
567 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
570 p_line2 += p_dest->p->i_pitch;
573 p_y2 += p_source->p[Y_PLANE].i_pitch;
575 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
577 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
579 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
584 p_y1 += i_source_margin;
585 p_y2 += i_source_margin;
586 p_u += i_source_margin_c;
587 p_v += i_source_margin_c;
588 p_line1 += i_dest_margin;
589 p_line2 += i_dest_margin;
592 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
595 /*****************************************************************************
596 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
597 *****************************************************************************/
/* Planar I420 -> packed UYVY (byte order U0 Y0 V0 Y1).  Same structure
 * as I420_YUY2 but the merge order is swapped: chroma byte first, then
 * luma (vec_mergeh(uv_vec, y_vec)). */
598 static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
601 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
602 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
603 uint8_t *p_u = p_source->U_PIXELS;
604 uint8_t *p_v = p_source->V_PIXELS;
608 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Advance destination and luma pointers to the next pair of rows */
609 #define VEC_NEXT_LINES( ) \
611 p_line2 += p_dest->p->i_pitch; \
613 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Fetch 16 U and 16 V bytes — enough chroma for 32 output pixels */
615 #define VEC_LOAD_UV( ) \
616 u_vec = vec_ld( 0, p_u ); p_u += 16; \
617 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* Chroma vector is the first merge operand, giving the UYVY byte order */
619 #define VEC_MERGE( a ) \
620 uv_vec = a( u_vec, v_vec ); \
621 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
622 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
623 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
624 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
625 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
626 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
628 vector unsigned char u_vec;
629 vector unsigned char v_vec;
630 vector unsigned char uv_vec;
631 vector unsigned char y_vec;
633 if( !( ( p_vout->render.i_width % 32 ) |
634 ( p_vout->render.i_height % 2 ) ) )
636 /* Width is a multiple of 32, we take 2 lines at a time */
637 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
640 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
643 VEC_MERGE( vec_mergeh );
644 VEC_MERGE( vec_mergel );
648 else if( !( ( p_vout->render.i_width % 16 ) |
649 ( p_vout->render.i_height % 4 ) ) )
651 /* Width is only a multiple of 16, we take 4 lines at a time */
652 for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
654 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
656 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
659 VEC_MERGE( vec_mergeh );
660 VEC_MERGE( vec_mergel );
663 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
665 VEC_MERGE( vec_mergeh );
667 /* Line 3 and 4, pixels 0 to 16 */
669 VEC_MERGE( vec_mergel );
671 /* Line 3 and 4, pixels 16 to ( width ) */
672 for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
675 VEC_MERGE( vec_mergeh );
676 VEC_MERGE( vec_mergel );
682 /* Crap, use the C version */
683 #undef VEC_NEXT_LINES
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
688 const int i_source_margin = p_source->p[0].i_pitch
689 - p_source->p[0].i_visible_pitch;
690 const int i_source_margin_c = p_source->p[1].i_pitch
691 - p_source->p[1].i_visible_pitch;
692 const int i_dest_margin = p_dest->p->i_pitch
693 - p_dest->p->i_visible_pitch;
/* C and MMX paths: two rows per iteration, 8 pixels per inner step */
695 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
696 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
699 p_line2 += p_dest->p->i_pitch;
702 p_y2 += p_source->p[Y_PLANE].i_pitch;
704 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
706 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
712 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover pixels (width % 8) are finished in plain C, 2 at a time */
715 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
720 p_y1 += i_source_margin;
721 p_y2 += i_source_margin;
722 p_u += i_source_margin_c;
723 p_v += i_source_margin_c;
724 p_line1 += i_dest_margin;
725 p_line2 += i_dest_margin;
728 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
729 /* re-enable FPU registers */
730 __asm__ __volatile__ ( "emms" );
733 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
737 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
739 ** SSE2 128 bits fetch/store instructions are faster
740 ** if memory access is 16 bytes aligned
/* NOTE(review): (int) pointer cast truncates on LP64; uintptr_t is the
 * correct type, though only the low 4 bits matter here. */
742 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
743 ((int)p_line2|(int)p_y2))) )
745 /* use faster SSE2 aligned fetch and store */
746 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
749 p_line2 += p_dest->p->i_pitch;
752 p_y2 += p_source->p[Y_PLANE].i_pitch;
754 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
756 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
758 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
763 p_y1 += i_source_margin;
764 p_y2 += i_source_margin;
765 p_u += i_source_margin_c;
766 p_v += i_source_margin_c;
767 p_line1 += i_dest_margin;
768 p_line2 += i_dest_margin;
770 /* make sure all SSE2 stores are visible thereafter */
771 __asm__ __volatile__ ( "sfence" );
775 /* use slower SSE2 unaligned fetch and store */
776 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
779 p_line2 += p_dest->p->i_pitch;
782 p_y2 += p_source->p[Y_PLANE].i_pitch;
784 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
786 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
788 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
793 p_y1 += i_source_margin;
794 p_y2 += i_source_margin;
795 p_u += i_source_margin_c;
796 p_v += i_source_margin_c;
797 p_line1 += i_dest_margin;
798 p_line2 += i_dest_margin;
801 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
804 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
805 /*****************************************************************************
806 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
807 *****************************************************************************/
/* Stub: IUYV (interleaved UYVY) output is not implemented; it only logs
 * an error.  Callers still reach it via Activate's fourcc table. */
808 static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
812 msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
815 /*****************************************************************************
816 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
817 *****************************************************************************/
/* Planar I420 -> upside-down packed UYVY ("cyuv"): destination rows are
 * written bottom-up.  The line pointers start one/two rows past the end
 * of the visible destination and the C path moves them backwards. */
818 static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
821 uint8_t *p_line1 = p_dest->p->p_pixels +
822 p_dest->p->i_visible_lines * p_dest->p->i_pitch
823 + p_dest->p->i_pitch;
824 uint8_t *p_line2 = p_dest->p->p_pixels +
825 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
826 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
827 uint8_t *p_u = p_source->U_PIXELS;
828 uint8_t *p_v = p_source->V_PIXELS;
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
832 const int i_source_margin = p_source->p[0].i_pitch
833 - p_source->p[0].i_visible_pitch;
834 const int i_source_margin_c = p_source->p[1].i_pitch
835 - p_source->p[1].i_visible_pitch;
836 const int i_dest_margin = p_dest->p->i_pitch
837 - p_dest->p->i_visible_pitch;
839 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
840 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* Step back 3 pitches, then write 2 rows forward: net -2 rows per
 * iteration, which walks the destination upward (upside-down output) */
842 p_line1 -= 3 * p_dest->p->i_pitch;
843 p_line2 -= 3 * p_dest->p->i_pitch;
846 p_y2 += p_source->p[Y_PLANE].i_pitch;
848 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
850 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Pixel packing itself is plain UYVY; only row order differs */
856 MMX_CALL( MMX_YUV420_UYVY );
859 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
864 p_y1 += i_source_margin;
865 p_y2 += i_source_margin;
866 p_u += i_source_margin_c;
867 p_v += i_source_margin_c;
/* NOTE(review): adding the forward dest margin while walking upward
 * looks suspect for padded pictures — verify against the full file. */
868 p_line1 += i_dest_margin;
869 p_line2 += i_dest_margin;
872 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
873 /* re-enable FPU registers */
874 __asm__ __volatile__ ( "emms" );
877 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
879 ** SSE2 128 bits fetch/store instructions are faster
880 ** if memory access is 16 bytes aligned
882 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
883 ((int)p_line2|(int)p_y2))) )
885 /* use faster SSE2 aligned fetch and store */
886 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
/* NOTE(review): this SSE2 path advances the line pointers forward even
 * though they start at the bottom of the picture — either elided lines
 * rewind them or this is a latent copy-paste bug from I420_UYVY.
 * TODO confirm against the full file. */
889 p_line2 += p_dest->p->i_pitch;
892 p_y2 += p_source->p[Y_PLANE].i_pitch;
894 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
896 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
898 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
903 p_y1 += i_source_margin;
904 p_y2 += i_source_margin;
905 p_u += i_source_margin_c;
906 p_v += i_source_margin_c;
907 p_line1 += i_dest_margin;
908 p_line2 += i_dest_margin;
910 /* make sure all SSE2 stores are visible thereafter */
911 __asm__ __volatile__ ( "sfence" );
915 /* use slower SSE2 unaligned fetch and store */
916 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
919 p_line2 += p_dest->p->i_pitch;
922 p_y2 += p_source->p[Y_PLANE].i_pitch;
924 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
926 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
928 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
933 p_y1 += i_source_margin;
934 p_y2 += i_source_margin;
935 p_u += i_source_margin_c;
936 p_v += i_source_margin_c;
937 p_line1 += i_dest_margin;
938 p_line2 += i_dest_margin;
941 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
943 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
945 /*****************************************************************************
946 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
947 *****************************************************************************/
948 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Planar I420 -> packed Y211 (plain C build only).  Processes two rows
 * per iteration, 8 input pixels per inner step; the packing statements
 * themselves are on lines elided from this view. */
949 static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
952 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
953 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
954 uint8_t *p_u = p_source->U_PIXELS;
955 uint8_t *p_v = p_source->V_PIXELS;
/* pitch - visible_pitch = padding bytes to skip at the end of each row */
959 const int i_source_margin = p_source->p[0].i_pitch
960 - p_source->p[0].i_visible_pitch;
961 const int i_source_margin_c = p_source->p[1].i_pitch
962 - p_source->p[1].i_visible_pitch;
963 const int i_dest_margin = p_dest->p->i_pitch
964 - p_dest->p->i_visible_pitch;
966 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
969 p_line2 += p_dest->p->i_pitch;
972 p_y2 += p_source->p[Y_PLANE].i_pitch;
974 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
/* Skip row padding on every plane before the next pair of rows */
980 p_y1 += i_source_margin;
981 p_y2 += i_source_margin;
982 p_u += i_source_margin_c;
983 p_v += i_source_margin_c;
984 p_line1 += i_dest_margin;
985 p_line2 += i_dest_margin;