1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
42 #include "i420_yuy2.h"
/* FOURCC lists that are spliced into the module description strings below:
 * the accepted source chromas, then the destination chromas of the variant
 * being built (selected by the MODULE_NAME_IS_* macro). */
44 #define SRC_FOURCC "I420,IYUV,YV12"
46 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Unsuffixed variant: the only one that also lists Y211. */
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec variant: no IUYV, cyuv or Y211. */
53 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
56 /*****************************************************************************
57 * Local prototypes (every declaration below is static).
58 *****************************************************************************/
/* Module entry point, registered through set_callbacks(). */
59 static int Activate ( vlc_object_t * );
/* One converter per destination chroma, taking (filter, source picture,
 * destination picture), followed by the matching *_Filter functions that
 * Activate() installs as pf_video_filter. */
61 static void I420_YUY2 ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * );
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* IUYV and cyuv are not declared for the AltiVec variant. */
68 static void I420_IUYV ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * );
73 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Y211 is declared only for the unsuffixed i420_yuy2 variant. */
74 static void I420_Y211 ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter ( filter_t *, picture_t * );
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes with the upper 32 bits zero. Neither is
 * referenced in the visible part of this file; presumably they are consumed
 * by the MMX macros from i420_yuy2.h - TODO confirm. */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
81 static const uint64_t i_80w = 0x0000000080808080ULL;
84 /*****************************************************************************
86 *****************************************************************************/
/* Module descriptor. Every variant registers the "video filter2" capability
 * with its own score (80 unsuffixed, 160 MMX, 250 SSE2 and AltiVec); the SIMD
 * variants additionally declare the CPU feature they require. The description
 * string is assembled from SRC_FOURCC and DEST_FOURCC.
 * NOTE(review): presumably a higher score makes a variant preferred when
 * several are usable - confirm against the module loader. */
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
90 set_capability( "video filter2", 80 );
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
93 set_capability( "video filter2", 160 );
94 add_requirement( MMX );
95 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
96 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
97 set_capability( "video filter2", 250 );
98 add_requirement( SSE2 );
99 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* (The line opening this set_description() call is not visible here.) */
101 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
102 set_capability( "video filter2", 250 );
103 add_requirement( ALTIVEC );
/* Activate is the open callback; no close callback is registered. */
105 set_callbacks( Activate, NULL );
108 /*****************************************************************************
109 * Activate: select the conversion function for the requested chromas
110 *****************************************************************************
111 * Checks the picture dimensions, then stores the matching *_Filter function
112 * in p_filter->pf_video_filter. Nothing is allocated in the visible code.
    *****************************************************************************/
113 static int Activate( vlc_object_t *p_this )
115 filter_t *p_filter = (filter_t *)p_this;
/* Test for an odd input width or height. The statement guarded by this test
 * is not visible in this excerpt; presumably it refuses the format - confirm. */
117 if( p_filter->fmt_in.video.i_width & 1
118 || p_filter->fmt_in.video.i_height & 1 )
/* Test for input and output dimensions that differ (guarded statement not
 * visible either; the converters below only ever read fmt_in dimensions). */
123 if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
124 || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
/* Dispatch: the three planar input chromas share one set of converters. */
127 switch( p_filter->fmt_in.video.i_chroma )
129 case VLC_FOURCC('Y','V','1','2'):
130 case VLC_FOURCC('I','4','2','0'):
131 case VLC_FOURCC('I','Y','U','V'):
132 switch( p_filter->fmt_out.video.i_chroma )
/* YUY2 and YUNV share the YUY2 converter. */
134 case VLC_FOURCC('Y','U','Y','2'):
135 case VLC_FOURCC('Y','U','N','V'):
136 p_filter->pf_video_filter = I420_YUY2_Filter;
139 case VLC_FOURCC('Y','V','Y','U'):
140 p_filter->pf_video_filter = I420_YVYU_Filter;
/* UYVY, UYNV and Y422 share the UYVY converter. */
143 case VLC_FOURCC('U','Y','V','Y'):
144 case VLC_FOURCC('U','Y','N','V'):
145 case VLC_FOURCC('Y','4','2','2'):
146 p_filter->pf_video_filter = I420_UYVY_Filter;
148 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* NOTE(review): the IUYV converter in this file only logs an error, yet the
 * chroma is still accepted here. */
149 case VLC_FOURCC('I','U','Y','V'):
150 p_filter->pf_video_filter = I420_IUYV_Filter;
153 case VLC_FOURCC('c','y','u','v'):
154 p_filter->pf_video_filter = I420_cyuv_Filter;
158 #if defined (MODULE_NAME_IS_i420_yuy2)
159 case VLC_FOURCC('Y','2','1','1'):
160 p_filter->pf_video_filter = I420_Y211_Filter;
/* Reads the CPU time-stamp counter with the x86 RDTSC instruction into v.
 * Not called anywhere in the visible part of this file.
 * NOTE(review): the "=A" constraint denotes the EDX:EAX register pair only on
 * 32-bit x86. On x86-64 GCC does not bind a 64-bit operand to both registers,
 * so the upper half of the counter would be lost there - confirm this helper
 * is only built for 32-bit targets (or is dead code). */
177 static inline unsigned long long read_cycles(void)
179 unsigned long long v;
180 __asm__ __volatile__("rdtsc" : "=A" (v): );
186 /* Following functions are local */
/* VIDEO_FILTER_WRAPPER is defined outside this excerpt. Presumably each use
 * expands to the I420_xxx_Filter function declared earlier, wrapping the
 * I420_xxx converter of the same name - TODO confirm against vlc_filter.h.
 * The preprocessor guards mirror those on the prototypes. */
188 VIDEO_FILTER_WRAPPER( I420_YUY2 )
189 VIDEO_FILTER_WRAPPER( I420_YVYU )
190 VIDEO_FILTER_WRAPPER( I420_UYVY )
191 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
192 VIDEO_FILTER_WRAPPER( I420_IUYV )
193 VIDEO_FILTER_WRAPPER( I420_cyuv )
195 #if defined (MODULE_NAME_IS_i420_yuy2)
196 VIDEO_FILTER_WRAPPER( I420_Y211 )
199 /*****************************************************************************
200 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
201 *****************************************************************************/
/* Reads the Y, U and V planes of p_source and writes the single packed plane
 * p_dest->p, walking the picture two lines at a time. Width and height come
 * from p_filter->fmt_in.video.
 * NOTE(review): several original lines of this function (braces, some
 * statements and loop bodies) are absent from this excerpt; the comments
 * below describe only what is visible. */
202 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
/* Only p_line2 and p_y2 are initialised here (start of the destination
 * pixels and of the luma plane); every loop below advances them by one pitch
 * per line pair. */
205 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
206 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
207 uint8_t *p_u = p_source->U_PIXELS;
208 uint8_t *p_v = p_source->V_PIXELS;
212 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec helpers. VEC_LOAD_UV reads 16 bytes from each chroma plane and
 * advances both chroma pointers. VEC_MERGE(a) combines the two chroma
 * vectors with a( u_vec, v_vec ); then, for each of the two current luma
 * lines, it loads 16 luma bytes and stores the vec_mergeh and vec_mergel
 * results of (luma, chroma) - 32 bytes - to the matching output line. */
213 #define VEC_NEXT_LINES( ) \
215 p_line2 += p_dest->p->i_pitch; \
217 p_y2 += p_source->p[Y_PLANE].i_pitch;
219 #define VEC_LOAD_UV( ) \
220 u_vec = vec_ld( 0, p_u ); p_u += 16; \
221 v_vec = vec_ld( 0, p_v ); p_v += 16;
223 #define VEC_MERGE( a ) \
224 uv_vec = a( u_vec, v_vec ); \
225 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
226 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
227 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
228 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
229 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
230 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
232 vector unsigned char u_vec;
233 vector unsigned char v_vec;
234 vector unsigned char uv_vec;
235 vector unsigned char y_vec;
/* The two remainders are OR-ed together, so the test passes only when both
 * are zero. */
237 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
238 ( p_filter->fmt_in.video.i_height % 2 ) ) )
240 /* Width is a multiple of 32, we take 2 lines at a time */
241 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
/* Each inner iteration issues two VEC_MERGE calls, i.e. 32 luma bytes per
 * line. */
244 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
247 VEC_MERGE( vec_mergeh );
248 VEC_MERGE( vec_mergel );
252 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
253 ( p_filter->fmt_in.video.i_height % 4 ) ) )
255 /* Width is only a multiple of 16, we take 4 lines at a time */
256 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
258 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
260 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
263 VEC_MERGE( vec_mergeh );
264 VEC_MERGE( vec_mergel );
267 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
269 VEC_MERGE( vec_mergeh );
271 /* Line 3 and 4, pixels 0 to 16 */
273 VEC_MERGE( vec_mergel );
275 /* Line 3 and 4, pixels 16 to ( width ) */
276 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
279 VEC_MERGE( vec_mergeh );
280 VEC_MERGE( vec_mergel );
286 /* Dimensions fit neither vector layout: fall back to the C version */
287 #undef VEC_NEXT_LINES
/* Per-line padding (pitch minus visible pitch) of the luma plane, of the
 * chroma planes (taken from plane 1) and of the destination; added after each
 * line pair to step over the padding. */
292 const int i_source_margin = p_source->p[0].i_pitch
293 - p_source->p[0].i_visible_pitch;
294 const int i_source_margin_c = p_source->p[1].i_pitch
295 - p_source->p[1].i_visible_pitch;
296 const int i_dest_margin = p_dest->p->i_pitch
297 - p_dest->p->i_visible_pitch;
299 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* C / MMX path: one outer iteration per pair of picture lines. */
300 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
303 p_line2 += p_dest->p->i_pitch;
306 p_y2 += p_source->p[Y_PLANE].i_pitch;
308 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Plain C: width / 8 iterations per line pair (loop body not visible). */
309 for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
/* MMX: same iteration count, body supplied by the MMX_YUV420_YUYV macro. */
317 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
319 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover columns when the width is not a multiple of 8, one iteration per
 * two pixels (loop body not visible). */
322 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Step every pointer over its line padding. */
327 p_y1 += i_source_margin;
328 p_y2 += i_source_margin;
329 p_u += i_source_margin_c;
330 p_v += i_source_margin_c;
331 p_line1 += i_dest_margin;
332 p_line2 += i_dest_margin;
335 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
336 /* re-enable FPU registers */
340 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
344 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
346 ** SSE2 128 bits fetch/store instructions are faster
347 ** if memory access is 16 bytes aligned
/* The aligned path is taken only when the luma pitch, the destination pitch
 * and both start pointers are all multiples of 16 (low four bits of their OR
 * are zero). */
350 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
351 ((intptr_t)p_line2|(intptr_t)p_y2))) )
353 /* use faster SSE2 aligned fetch and store */
354 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
357 p_line2 += p_dest->p->i_pitch;
360 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 16 SSE2 iterations per line pair, then the leftover columns two
 * pixels at a time (leftover body not visible). */
362 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
364 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
366 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
371 p_y1 += i_source_margin;
372 p_y2 += i_source_margin;
373 p_u += i_source_margin_c;
374 p_v += i_source_margin_c;
375 p_line1 += i_dest_margin;
376 p_line2 += i_dest_margin;
381 /* use slower SSE2 unaligned fetch and store */
382 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
385 p_line2 += p_dest->p->i_pitch;
388 p_y2 += p_source->p[Y_PLANE].i_pitch;
390 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
392 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
394 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
399 p_y1 += i_source_margin;
400 p_y2 += i_source_margin;
401 p_u += i_source_margin_c;
402 p_v += i_source_margin_c;
403 p_line1 += i_dest_margin;
404 p_line2 += i_dest_margin;
407 /* make sure all SSE2 stores are visible thereafter */
410 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
413 /*****************************************************************************
414 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
415 *****************************************************************************/
/* Same structure as the YUYV converter, with the two chroma components
 * swapped in the output: reads the Y, U and V planes of p_source and writes
 * the packed plane p_dest->p two lines at a time, using the dimensions in
 * p_filter->fmt_in.video.
 * NOTE(review): several original lines of this function (braces, some
 * statements and loop bodies) are absent from this excerpt; the comments
 * below describe only what is visible. */
416 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
/* Only p_line2 and p_y2 are initialised here; the loops below advance them
 * by one pitch per line pair. */
419 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
420 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
421 uint8_t *p_u = p_source->U_PIXELS;
422 uint8_t *p_v = p_source->V_PIXELS;
426 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec helpers. VEC_LOAD_UV reads 16 bytes from each chroma plane.
 * VEC_MERGE(a) combines the chroma vectors as a( v_vec, u_vec ) - V first,
 * unlike the YUYV converter - and then merges the result after the luma
 * bytes of each of the two current lines, storing 32 bytes per line. */
427 #define VEC_NEXT_LINES( ) \
429 p_line2 += p_dest->p->i_pitch; \
431 p_y2 += p_source->p[Y_PLANE].i_pitch;
433 #define VEC_LOAD_UV( ) \
434 u_vec = vec_ld( 0, p_u ); p_u += 16; \
435 v_vec = vec_ld( 0, p_v ); p_v += 16;
437 #define VEC_MERGE( a ) \
438 vu_vec = a( v_vec, u_vec ); \
439 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
440 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
441 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
442 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
443 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
444 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
446 vector unsigned char u_vec;
447 vector unsigned char v_vec;
448 vector unsigned char vu_vec;
449 vector unsigned char y_vec;
/* The two remainders are OR-ed together, so the test passes only when both
 * are zero. */
451 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
452 ( p_filter->fmt_in.video.i_height % 2 ) ) )
454 /* Width is a multiple of 32, we take 2 lines at a time */
455 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
458 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
461 VEC_MERGE( vec_mergeh );
462 VEC_MERGE( vec_mergel );
466 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
467 ( p_filter->fmt_in.video.i_height % 4 ) ) )
469 /* Width is only a multiple of 16, we take 4 lines at a time */
470 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
472 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
474 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
477 VEC_MERGE( vec_mergeh );
478 VEC_MERGE( vec_mergel );
481 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
483 VEC_MERGE( vec_mergeh );
485 /* Line 3 and 4, pixels 0 to 16 */
487 VEC_MERGE( vec_mergel );
489 /* Line 3 and 4, pixels 16 to ( width ) */
490 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
493 VEC_MERGE( vec_mergeh );
494 VEC_MERGE( vec_mergel );
500 /* Dimensions fit neither vector layout: fall back to the C version */
501 #undef VEC_NEXT_LINES
/* Per-line padding (pitch minus visible pitch) of the luma plane, of the
 * chroma planes (taken from plane 1) and of the destination. */
506 const int i_source_margin = p_source->p[0].i_pitch
507 - p_source->p[0].i_visible_pitch;
508 const int i_source_margin_c = p_source->p[1].i_pitch
509 - p_source->p[1].i_visible_pitch;
510 const int i_dest_margin = p_dest->p->i_pitch
511 - p_dest->p->i_visible_pitch;
513 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* C / MMX path: one outer iteration per pair of picture lines. */
514 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
517 p_line2 += p_dest->p->i_pitch;
520 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations per line pair; the body is chosen by the preprocessor
 * (the plain C alternative is not visible, the MMX one is the macro below). */
522 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
524 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
530 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover columns when the width is not a multiple of 8, one iteration per
 * two pixels (loop body not visible). */
533 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Step every pointer over its line padding. */
538 p_y1 += i_source_margin;
539 p_y2 += i_source_margin;
540 p_u += i_source_margin_c;
541 p_v += i_source_margin_c;
542 p_line1 += i_dest_margin;
543 p_line2 += i_dest_margin;
546 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
547 /* re-enable FPU registers */
551 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
555 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
557 ** SSE2 128 bits fetch/store instructions are faster
558 ** if memory access is 16 bytes aligned
/* The aligned path is taken only when the luma pitch, the destination pitch
 * and both start pointers are all multiples of 16. */
560 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
561 ((intptr_t)p_line2|(intptr_t)p_y2))) )
563 /* use faster SSE2 aligned fetch and store */
564 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
567 p_line2 += p_dest->p->i_pitch;
570 p_y2 += p_source->p[Y_PLANE].i_pitch;
572 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
574 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
576 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
581 p_y1 += i_source_margin;
582 p_y2 += i_source_margin;
583 p_u += i_source_margin_c;
584 p_v += i_source_margin_c;
585 p_line1 += i_dest_margin;
586 p_line2 += i_dest_margin;
591 /* use slower SSE2 unaligned fetch and store */
592 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
595 p_line2 += p_dest->p->i_pitch;
598 p_y2 += p_source->p[Y_PLANE].i_pitch;
600 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
602 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
604 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
609 p_y1 += i_source_margin;
610 p_y2 += i_source_margin;
611 p_u += i_source_margin_c;
612 p_v += i_source_margin_c;
613 p_line1 += i_dest_margin;
614 p_line2 += i_dest_margin;
617 /* make sure all SSE2 stores are visible thereafter */
619 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
622 /*****************************************************************************
623 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
624 *****************************************************************************/
/* Same structure as the YUYV converter, but each chroma byte precedes its
 * luma byte in the output: reads the Y, U and V planes of p_source and writes
 * the packed plane p_dest->p two lines at a time, using the dimensions in
 * p_filter->fmt_in.video.
 * NOTE(review): several original lines of this function (braces, some
 * statements and loop bodies) are absent from this excerpt; the comments
 * below describe only what is visible. */
625 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
/* Only p_line2 and p_y2 are initialised here; the loops below advance them
 * by one pitch per line pair. */
628 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
629 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
630 uint8_t *p_u = p_source->U_PIXELS;
631 uint8_t *p_v = p_source->V_PIXELS;
635 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec helpers. VEC_LOAD_UV reads 16 bytes from each chroma plane.
 * VEC_MERGE(a) combines the chroma vectors as a( u_vec, v_vec ) and then
 * merges with the chroma vector as FIRST operand and luma as second (the
 * reverse of the YUYV converter), storing 32 bytes per output line. */
636 #define VEC_NEXT_LINES( ) \
638 p_line2 += p_dest->p->i_pitch; \
640 p_y2 += p_source->p[Y_PLANE].i_pitch;
642 #define VEC_LOAD_UV( ) \
643 u_vec = vec_ld( 0, p_u ); p_u += 16; \
644 v_vec = vec_ld( 0, p_v ); p_v += 16;
646 #define VEC_MERGE( a ) \
647 uv_vec = a( u_vec, v_vec ); \
648 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
649 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
650 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
651 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
652 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
653 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
655 vector unsigned char u_vec;
656 vector unsigned char v_vec;
657 vector unsigned char uv_vec;
658 vector unsigned char y_vec;
/* The two remainders are OR-ed together, so the test passes only when both
 * are zero. */
660 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
661 ( p_filter->fmt_in.video.i_height % 2 ) ) )
663 /* Width is a multiple of 32, we take 2 lines at a time */
664 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
667 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
670 VEC_MERGE( vec_mergeh );
671 VEC_MERGE( vec_mergel );
675 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
676 ( p_filter->fmt_in.video.i_height % 4 ) ) )
678 /* Width is only a multiple of 16, we take 4 lines at a time */
679 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
681 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
683 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
686 VEC_MERGE( vec_mergeh );
687 VEC_MERGE( vec_mergel );
690 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
692 VEC_MERGE( vec_mergeh );
694 /* Line 3 and 4, pixels 0 to 16 */
696 VEC_MERGE( vec_mergel );
698 /* Line 3 and 4, pixels 16 to ( width ) */
699 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
702 VEC_MERGE( vec_mergeh );
703 VEC_MERGE( vec_mergel );
709 /* Dimensions fit neither vector layout: fall back to the C version */
710 #undef VEC_NEXT_LINES
/* Per-line padding (pitch minus visible pitch) of the luma plane, of the
 * chroma planes (taken from plane 1) and of the destination. */
715 const int i_source_margin = p_source->p[0].i_pitch
716 - p_source->p[0].i_visible_pitch;
717 const int i_source_margin_c = p_source->p[1].i_pitch
718 - p_source->p[1].i_visible_pitch;
719 const int i_dest_margin = p_dest->p->i_pitch
720 - p_dest->p->i_visible_pitch;
722 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* C / MMX path: one outer iteration per pair of picture lines. */
723 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
726 p_line2 += p_dest->p->i_pitch;
729 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations per line pair; the body is chosen by the preprocessor
 * (the plain C alternative is not visible, the MMX one is the macro below). */
731 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
733 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
739 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover columns when the width is not a multiple of 8, one iteration per
 * two pixels (loop body not visible). */
742 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
/* Step every pointer over its line padding. */
747 p_y1 += i_source_margin;
748 p_y2 += i_source_margin;
749 p_u += i_source_margin_c;
750 p_v += i_source_margin_c;
751 p_line1 += i_dest_margin;
752 p_line2 += i_dest_margin;
755 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
756 /* re-enable FPU registers */
760 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
764 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
766 ** SSE2 128 bits fetch/store instructions are faster
767 ** if memory access is 16 bytes aligned
/* The aligned path is taken only when the luma pitch, the destination pitch
 * and both start pointers are all multiples of 16. */
769 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
770 ((intptr_t)p_line2|(intptr_t)p_y2))) )
772 /* use faster SSE2 aligned fetch and store */
773 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
776 p_line2 += p_dest->p->i_pitch;
779 p_y2 += p_source->p[Y_PLANE].i_pitch;
781 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
783 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
785 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
790 p_y1 += i_source_margin;
791 p_y2 += i_source_margin;
792 p_u += i_source_margin_c;
793 p_v += i_source_margin_c;
794 p_line1 += i_dest_margin;
795 p_line2 += i_dest_margin;
800 /* use slower SSE2 unaligned fetch and store */
801 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
804 p_line2 += p_dest->p->i_pitch;
807 p_y2 += p_source->p[Y_PLANE].i_pitch;
809 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
811 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
813 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
818 p_y1 += i_source_margin;
819 p_y2 += i_source_margin;
820 p_u += i_source_margin_c;
821 p_v += i_source_margin_c;
822 p_line1 += i_dest_margin;
823 p_line2 += i_dest_margin;
826 /* make sure all SSE2 stores are visible thereafter */
828 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
831 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
832 /*****************************************************************************
833 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
834 *****************************************************************************/
/* Not implemented: both pictures are marked unused and an error is logged
 * through msg_Err(), so as far as the visible code goes the destination
 * picture is never written.
 * NOTE(review): IUYV is nevertheless listed in DEST_FOURCC and dispatched by
 * Activate() in the non-AltiVec variants - confirm whether the chroma should
 * still be advertised. */
835 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
838 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
840 msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
843 /*****************************************************************************
844 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
845 *****************************************************************************/
/* Reads the Y, U and V planes of p_source and writes the packed plane
 * p_dest->p two lines at a time, using the UYVY conversion macros; the
 * non-SSE2 loop walks the destination backwards to flip the picture.
 * NOTE(review): several original lines of this function (braces, some
 * statements and loop bodies) are absent from this excerpt; the comments
 * below describe only what is visible. */
846 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
/* Both output pointers start at or beyond the end of the visible lines:
 * p_line2 at p_pixels + visible_lines * pitch, p_line1 one pitch further. */
849 uint8_t *p_line1 = p_dest->p->p_pixels +
850 p_dest->p->i_visible_lines * p_dest->p->i_pitch
851 + p_dest->p->i_pitch;
852 uint8_t *p_line2 = p_dest->p->p_pixels +
853 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
854 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
855 uint8_t *p_u = p_source->U_PIXELS;
856 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding (pitch minus visible pitch) of the luma plane, of the
 * chroma planes (taken from plane 1) and of the destination. */
860 const int i_source_margin = p_source->p[0].i_pitch
861 - p_source->p[0].i_visible_pitch;
862 const int i_source_margin_c = p_source->p[1].i_pitch
863 - p_source->p[1].i_visible_pitch;
864 const int i_dest_margin = p_dest->p->i_pitch
865 - p_dest->p->i_visible_pitch;
867 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
868 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
/* Step both output pointers back by three pitches before each line pair;
 * the source pointers still move forwards.
 * NOTE(review): the exact destination rows reached depend on how far the
 * (not visible) conversion body advances the pointers - verify that the
 * first and last rows stay inside the picture. */
870 p_line1 -= 3 * p_dest->p->i_pitch;
871 p_line2 -= 3 * p_dest->p->i_pitch;
874 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations per line pair; the body is chosen by the preprocessor
 * (the plain C alternative is not visible, the MMX one is the macro below). */
876 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
878 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
884 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover columns when the width is not a multiple of 8, one iteration per
 * two pixels (loop body not visible). */
887 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Step every pointer over its line padding. */
892 p_y1 += i_source_margin;
893 p_y2 += i_source_margin;
894 p_u += i_source_margin_c;
895 p_v += i_source_margin_c;
896 p_line1 += i_dest_margin;
897 p_line2 += i_dest_margin;
900 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
901 /* re-enable FPU registers */
905 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
907 ** SSE2 128 bits fetch/store instructions are faster
908 ** if memory access is 16 bytes aligned
/* The aligned path is taken only when the luma pitch, the destination pitch
 * and both start pointers are all multiples of 16.
 * NOTE(review): unlike the non-SSE2 loop above, both SSE2 loops below move
 * p_line2 FORWARD by one pitch per line pair, although it starts at
 * p_pixels + visible_lines * pitch. As visible, this neither flips the
 * picture nor stays inside the destination's visible lines - looks like an
 * out-of-bounds write; confirm against the lines missing from this excerpt
 * and align with the backwards stepping used above. */
910 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
911 ((intptr_t)p_line2|(intptr_t)p_y2))) )
913 /* use faster SSE2 aligned fetch and store */
914 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
917 p_line2 += p_dest->p->i_pitch;
920 p_y2 += p_source->p[Y_PLANE].i_pitch;
922 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
924 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
926 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
931 p_y1 += i_source_margin;
932 p_y2 += i_source_margin;
933 p_u += i_source_margin_c;
934 p_v += i_source_margin_c;
935 p_line1 += i_dest_margin;
936 p_line2 += i_dest_margin;
941 /* use slower SSE2 unaligned fetch and store */
942 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
945 p_line2 += p_dest->p->i_pitch;
948 p_y2 += p_source->p[Y_PLANE].i_pitch;
950 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
952 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
954 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
959 p_y1 += i_source_margin;
960 p_y2 += i_source_margin;
961 p_u += i_source_margin_c;
962 p_v += i_source_margin_c;
963 p_line1 += i_dest_margin;
964 p_line2 += i_dest_margin;
967 /* make sure all SSE2 stores are visible thereafter */
969 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
971 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
973 /*****************************************************************************
974 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
975 *****************************************************************************/
976 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Built only for the unsuffixed i420_yuy2 variant. Reads the Y, U and V
 * planes of p_source and writes the packed plane p_dest->p two lines at a
 * time, using the dimensions in p_filter->fmt_in.video.
 * NOTE(review): the inner loop body and the end of this function are not
 * visible in this excerpt; the comments below describe only what is shown. */
977 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
/* Only p_line2 and p_y2 are initialised here; the loop below advances them
 * by one pitch per line pair. */
980 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
981 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
982 uint8_t *p_u = p_source->U_PIXELS;
983 uint8_t *p_v = p_source->V_PIXELS;
/* Per-line padding (pitch minus visible pitch) of the luma plane, of the
 * chroma planes (taken from plane 1) and of the destination. */
987 const int i_source_margin = p_source->p[0].i_pitch
988 - p_source->p[0].i_visible_pitch;
989 const int i_source_margin_c = p_source->p[1].i_pitch
990 - p_source->p[1].i_visible_pitch;
991 const int i_dest_margin = p_dest->p->i_pitch
992 - p_dest->p->i_visible_pitch;
/* One outer iteration per pair of picture lines. */
994 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
997 p_line2 += p_dest->p->i_pitch;
1000 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* width / 8 iterations per line pair. No leftover-column loop is visible
 * here, unlike the other converters - presumably widths that are not a
 * multiple of 8 leave the last columns unconverted; confirm. */
1002 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
/* Step every pointer over its line padding. */
1008 p_y1 += i_source_margin;
1009 p_y2 += i_source_margin;
1010 p_u += i_source_margin_c;
1011 p_v += i_source_margin_c;
1012 p_line1 += i_dest_margin;
1013 p_line2 += i_dest_margin;