1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
42 #include "i420_yuy2.h"
/* Source chromas handled by every flavour of this module. */
44 #define SRC_FOURCC "I420,IYUV,YV12"
/* Destination chromas named in the module description.  The list depends on
 * the flavour being compiled: only the plain C build lists Y211, and the
 * AltiVec build lists neither IUYV nor cyuv. */
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
56 /*****************************************************************************
57 * Local and extern prototypes.
58 *****************************************************************************/
/* Callback registered with set_callbacks(); selects the conversion routine. */
59 static int Activate ( vlc_object_t * );
/* Conversion workers, called as ( filter, source picture, destination
 * picture ), followed by the picture_t-returning entry points that
 * Activate() stores in pf_video_filter. */
61 static void I420_YUY2 ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * );
/* IUYV and cyuv output is not compiled for the AltiVec flavour. */
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * );
/* Y211 output exists only in the plain C flavour. */
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211 ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter ( filter_t *, picture_t * );
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes with the upper 32 bits zero.
 * NOTE(review): neither constant is referenced in the code of this file;
 * presumably they are used by the MMX macros from i420_yuy2.h - confirm. */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
81 static const uint64_t i_80w = 0x0000000080808080ULL;
84 /*****************************************************************************
86 *****************************************************************************/
/* Every flavour registers the "video filter2" capability with its own score
 * (plain C: 80, MMX: 100, SSE2: 120, AltiVec: 100); the SIMD flavours also
 * declare the CPU feature they require.  The description string is built
 * from SRC_FOURCC and DEST_FOURCC. */
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
90 set_capability( "video filter2", 80 );
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
93 set_capability( "video filter2", 100 );
94 add_requirement( MMX );
95 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
96 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
97 set_capability( "video filter2", 120 );
98 add_requirement( SSE2 );
99 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
101 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
102 set_capability( "video filter2", 100 );
103 add_requirement( ALTIVEC );
/* Activate is the only callback registered; the second slot is NULL. */
105 set_callbacks( Activate, NULL );
108 /*****************************************************************************
109 * Activate: allocate a chroma function
110 *****************************************************************************
111 * This function allocates and initializes a chroma function
 *
 * p_this is cast to a filter_t pointer.  For an I420, IYUV or YV12 input, the
 * output chroma selects which _Filter entry point is stored in
 * p_filter->pf_video_filter.  The IUYV and cyuv cases are compiled out of the
 * AltiVec flavour and the Y211 case exists only in the plain C flavour.
112 *****************************************************************************/
113 static int Activate( vlc_object_t *p_this )
115 filter_t *p_filter = (filter_t *)p_this;
/* Odd picture dimensions get special treatment (presumably they are
 * rejected - confirm): the conversion loops consume two lines and two
 * pixels at a time. */
117 if( p_filter->fmt_in.video.i_width & 1
118 || p_filter->fmt_in.video.i_height & 1 )
/* Outer switch: accepted input chromas.  Inner switch: output chroma. */
123 switch( p_filter->fmt_in.video.i_chroma )
125 case VLC_FOURCC('Y','V','1','2'):
126 case VLC_FOURCC('I','4','2','0'):
127 case VLC_FOURCC('I','Y','U','V'):
128 switch( p_filter->fmt_out.video.i_chroma )
/* YUY2 and YUNV share one converter. */
130 case VLC_FOURCC('Y','U','Y','2'):
131 case VLC_FOURCC('Y','U','N','V'):
132 p_filter->pf_video_filter = I420_YUY2_Filter;
135 case VLC_FOURCC('Y','V','Y','U'):
136 p_filter->pf_video_filter = I420_YVYU_Filter;
/* UYVY, UYNV and Y422 share one converter. */
139 case VLC_FOURCC('U','Y','V','Y'):
140 case VLC_FOURCC('U','Y','N','V'):
141 case VLC_FOURCC('Y','4','2','2'):
142 p_filter->pf_video_filter = I420_UYVY_Filter;
144 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
145 case VLC_FOURCC('I','U','Y','V'):
146 p_filter->pf_video_filter = I420_IUYV_Filter;
149 case VLC_FOURCC('c','y','u','v'):
150 p_filter->pf_video_filter = I420_cyuv_Filter;
154 #if defined (MODULE_NAME_IS_i420_yuy2)
155 case VLC_FOURCC('Y','2','1','1'):
156 p_filter->pf_video_filter = I420_Y211_Filter;
/* Reads the CPU time-stamp counter into v with the x86 "rdtsc" instruction.
 * NOTE(review): the "=A" output constraint denotes the EDX:EAX register pair
 * only on 32-bit x86; on x86-64 GCC does not treat it as a pair, so the
 * upper half of the counter would be lost - confirm this helper is only
 * built for 32-bit targets. */
173 static inline unsigned long long read_cycles(void)
175 unsigned long long v;
176 __asm__ __volatile__("rdtsc" : "=A" (v): );
182 /* Following functions are local */
/* One VIDEO_FILTER_WRAPPER() invocation per conversion worker, guarded by
 * the same flavour conditions as the workers themselves.  Presumably each
 * expands to the matching _Filter entry point that Activate() installs -
 * confirm against the macro definition in the VLC headers. */
184 VIDEO_FILTER_WRAPPER( I420_YUY2 )
185 VIDEO_FILTER_WRAPPER( I420_YVYU )
186 VIDEO_FILTER_WRAPPER( I420_UYVY )
187 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
188 VIDEO_FILTER_WRAPPER( I420_IUYV )
189 VIDEO_FILTER_WRAPPER( I420_cyuv )
191 #if defined (MODULE_NAME_IS_i420_yuy2)
192 VIDEO_FILTER_WRAPPER( I420_Y211 )
195 /*****************************************************************************
196 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *
 * p_filter supplies the input width and height, p_source the Y, U and V
 * planes, and p_dest the plane that is written.
 * The picture is processed two lines at a time: p_y1/p_y2 and
 * p_line1/p_line2 address the current pair of luma and destination lines,
 * while one pair of chroma cursors (p_u, p_v) serves both lines.
 * The MODULE_NAME_IS_ macros choose between the AltiVec, plain C, MMX and
 * SSE2 inner loops.
197 *****************************************************************************/
198 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
/* Second-line cursors start at the beginning of the destination and luma
 * data; the chroma cursors start at the beginning of the U and V data. */
201 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
202 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
203 uint8_t *p_u = p_source->U_PIXELS;
204 uint8_t *p_v = p_source->V_PIXELS;
208 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Moves the second-line cursors (p_line2, p_y2) down by one pitch. */
209 #define VEC_NEXT_LINES( ) \
211 p_line2 += p_dest->p->i_pitch; \
213 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Loads one vector of U and one of V and advances both cursors by 16. */
215 #define VEC_LOAD_UV( ) \
216 u_vec = vec_ld( 0, p_u ); p_u += 16; \
217 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* a is vec_mergeh or vec_mergel at the call sites and interleaves U with V.
 * The result is merged with one luma vector from each of the two lines,
 * luma first, and stored as two vectors per destination line. */
219 #define VEC_MERGE( a ) \
220 uv_vec = a( u_vec, v_vec ); \
221 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
222 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
223 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
224 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
225 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
226 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
228 vector unsigned char u_vec;
229 vector unsigned char v_vec;
230 vector unsigned char uv_vec;
231 vector unsigned char y_vec;
/* The vector paths are taken only for the width/height multiples tested
 * below; these paths do not add the line margins computed for the C path. */
233 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
234 ( p_filter->fmt_in.video.i_height % 2 ) ) )
236 /* Width is a multiple of 32, we take 2 lines at a time */
237 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
240 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
243 VEC_MERGE( vec_mergeh );
244 VEC_MERGE( vec_mergel );
248 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
249 ( p_filter->fmt_in.video.i_height % 4 ) ) )
251 /* Width is only a multiple of 16, we take 4 lines at a time */
252 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
254 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
256 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
259 VEC_MERGE( vec_mergeh );
260 VEC_MERGE( vec_mergel );
263 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
265 VEC_MERGE( vec_mergeh );
267 /* Line 3 and 4, pixels 0 to 16 */
269 VEC_MERGE( vec_mergel );
271 /* Line 3 and 4, pixels 16 to ( width ) */
272 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
275 VEC_MERGE( vec_mergeh );
276 VEC_MERGE( vec_mergel );
282 /* Crap, use the C version */
283 #undef VEC_NEXT_LINES
/* Line padding (pitch minus visible pitch) of source plane 0, of source
 * plane 1 (applied to both p_u and p_v) and of the destination plane. */
288 const int i_source_margin = p_source->p[0].i_pitch
289 - p_source->p[0].i_visible_pitch;
290 const int i_source_margin_c = p_source->p[1].i_pitch
291 - p_source->p[1].i_visible_pitch;
292 const int i_dest_margin = p_dest->p->i_pitch
293 - p_dest->p->i_visible_pitch;
295 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of lines. */
296 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
299 p_line2 += p_dest->p->i_pitch;
302 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 8 pixels first; the tail loop further down covers the
 * remaining width % 8 pixels two at a time. */
304 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
305 for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
313 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
315 MMX_CALL( MMX_YUV420_YUYV );
318 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Skip the padding at the end of every line just processed. */
323 p_y1 += i_source_margin;
324 p_y2 += i_source_margin;
325 p_u += i_source_margin_c;
326 p_v += i_source_margin_c;
327 p_line1 += i_dest_margin;
328 p_line2 += i_dest_margin;
331 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
332 /* re-enable FPU registers */
336 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
340 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
342 ** SSE2 128 bits fetch/store instructions are faster
343 ** if memory access is 16 bytes aligned
/* The aligned variant is used only when the luma pitch, the destination
 * pitch and both starting pointers are all multiples of 16. */
346 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
347 ((intptr_t)p_line2|(intptr_t)p_y2))) )
349 /* use faster SSE2 aligned fetch and store */
350 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
353 p_line2 += p_dest->p->i_pitch;
356 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 16 pixels, then the remaining width % 16 two at a time. */
358 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
360 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
362 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
367 p_y1 += i_source_margin;
368 p_y2 += i_source_margin;
369 p_u += i_source_margin_c;
370 p_v += i_source_margin_c;
371 p_line1 += i_dest_margin;
372 p_line2 += i_dest_margin;
377 /* use slower SSE2 unaligned fetch and store */
378 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
381 p_line2 += p_dest->p->i_pitch;
384 p_y2 += p_source->p[Y_PLANE].i_pitch;
386 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
388 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
390 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
395 p_y1 += i_source_margin;
396 p_y2 += i_source_margin;
397 p_u += i_source_margin_c;
398 p_v += i_source_margin_c;
399 p_line1 += i_dest_margin;
400 p_line2 += i_dest_margin;
403 /* make sure all SSE2 stores are visible thereafter */
406 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
409 /*****************************************************************************
410 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *
 * Same structure as I420_YUY2, with the chroma order swapped: the AltiVec
 * path merges V before U, and the MMX/SSE2 paths use the YVYU macros.
 * p_filter supplies the input width and height, p_source the Y, U and V
 * planes, and p_dest the plane that is written, two lines per pass.
411 *****************************************************************************/
412 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
/* Second-line cursors start at the beginning of the destination and luma
 * data; the chroma cursors start at the beginning of the U and V data. */
415 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
416 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
417 uint8_t *p_u = p_source->U_PIXELS;
418 uint8_t *p_v = p_source->V_PIXELS;
422 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Moves the second-line cursors (p_line2, p_y2) down by one pitch. */
423 #define VEC_NEXT_LINES( ) \
425 p_line2 += p_dest->p->i_pitch; \
427 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Loads one vector of U and one of V and advances both cursors by 16. */
429 #define VEC_LOAD_UV( ) \
430 u_vec = vec_ld( 0, p_u ); p_u += 16; \
431 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* a is vec_mergeh or vec_mergel at the call sites and interleaves V with U
 * (V first).  The result is merged with one luma vector from each of the
 * two lines, luma first, and stored as two vectors per destination line. */
433 #define VEC_MERGE( a ) \
434 vu_vec = a( v_vec, u_vec ); \
435 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
436 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
437 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
438 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
439 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
440 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
442 vector unsigned char u_vec;
443 vector unsigned char v_vec;
444 vector unsigned char vu_vec;
445 vector unsigned char y_vec;
/* The vector paths are taken only for the width/height multiples tested
 * below. */
447 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
448 ( p_filter->fmt_in.video.i_height % 2 ) ) )
450 /* Width is a multiple of 32, we take 2 lines at a time */
451 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
454 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
457 VEC_MERGE( vec_mergeh );
458 VEC_MERGE( vec_mergel );
462 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
463 ( p_filter->fmt_in.video.i_height % 4 ) ) )
465 /* Width is only a multiple of 16, we take 4 lines at a time */
466 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
468 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
470 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
473 VEC_MERGE( vec_mergeh );
474 VEC_MERGE( vec_mergel );
477 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
479 VEC_MERGE( vec_mergeh );
481 /* Line 3 and 4, pixels 0 to 16 */
483 VEC_MERGE( vec_mergel );
485 /* Line 3 and 4, pixels 16 to ( width ) */
486 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
489 VEC_MERGE( vec_mergeh );
490 VEC_MERGE( vec_mergel );
496 /* Crap, use the C version */
497 #undef VEC_NEXT_LINES
/* Line padding (pitch minus visible pitch) of source plane 0, of source
 * plane 1 (applied to both p_u and p_v) and of the destination plane. */
502 const int i_source_margin = p_source->p[0].i_pitch
503 - p_source->p[0].i_visible_pitch;
504 const int i_source_margin_c = p_source->p[1].i_pitch
505 - p_source->p[1].i_visible_pitch;
506 const int i_dest_margin = p_dest->p->i_pitch
507 - p_dest->p->i_visible_pitch;
509 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of lines. */
510 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
513 p_line2 += p_dest->p->i_pitch;
516 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 8 pixels first; the tail loop further down covers the
 * remaining width % 8 pixels two at a time. */
518 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
520 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
526 MMX_CALL( MMX_YUV420_YVYU );
529 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Skip the padding at the end of every line just processed. */
534 p_y1 += i_source_margin;
535 p_y2 += i_source_margin;
536 p_u += i_source_margin_c;
537 p_v += i_source_margin_c;
538 p_line1 += i_dest_margin;
539 p_line2 += i_dest_margin;
542 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
543 /* re-enable FPU registers */
547 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
551 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
553 ** SSE2 128 bits fetch/store instructions are faster
554 ** if memory access is 16 bytes aligned
/* The aligned variant is used only when the luma pitch, the destination
 * pitch and both starting pointers are all multiples of 16. */
556 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
557 ((intptr_t)p_line2|(intptr_t)p_y2))) )
559 /* use faster SSE2 aligned fetch and store */
560 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
563 p_line2 += p_dest->p->i_pitch;
566 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 16 pixels, then the remaining width % 16 two at a time. */
568 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
570 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
572 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
577 p_y1 += i_source_margin;
578 p_y2 += i_source_margin;
579 p_u += i_source_margin_c;
580 p_v += i_source_margin_c;
581 p_line1 += i_dest_margin;
582 p_line2 += i_dest_margin;
587 /* use slower SSE2 unaligned fetch and store */
588 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
591 p_line2 += p_dest->p->i_pitch;
594 p_y2 += p_source->p[Y_PLANE].i_pitch;
596 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
598 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
600 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
605 p_y1 += i_source_margin;
606 p_y2 += i_source_margin;
607 p_u += i_source_margin_c;
608 p_v += i_source_margin_c;
609 p_line1 += i_dest_margin;
610 p_line2 += i_dest_margin;
613 /* make sure all SSE2 stores are visible thereafter */
615 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
618 /*****************************************************************************
619 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *
 * Same structure as I420_YUY2, with chroma placed before luma: the AltiVec
 * path merges the U/V vector ahead of the luma vector, and the MMX/SSE2
 * paths use the UYVY macros.
 * p_filter supplies the input width and height, p_source the Y, U and V
 * planes, and p_dest the plane that is written, two lines per pass.
620 *****************************************************************************/
621 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
/* Second-line cursors start at the beginning of the destination and luma
 * data; the chroma cursors start at the beginning of the U and V data. */
624 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
625 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
626 uint8_t *p_u = p_source->U_PIXELS;
627 uint8_t *p_v = p_source->V_PIXELS;
631 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Moves the second-line cursors (p_line2, p_y2) down by one pitch. */
632 #define VEC_NEXT_LINES( ) \
634 p_line2 += p_dest->p->i_pitch; \
636 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Loads one vector of U and one of V and advances both cursors by 16. */
638 #define VEC_LOAD_UV( ) \
639 u_vec = vec_ld( 0, p_u ); p_u += 16; \
640 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* a is vec_mergeh or vec_mergel at the call sites and interleaves U with V.
 * The result is merged with one luma vector from each of the two lines,
 * chroma first, and stored as two vectors per destination line. */
642 #define VEC_MERGE( a ) \
643 uv_vec = a( u_vec, v_vec ); \
644 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
645 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
646 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
647 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
648 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
649 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
651 vector unsigned char u_vec;
652 vector unsigned char v_vec;
653 vector unsigned char uv_vec;
654 vector unsigned char y_vec;
/* The vector paths are taken only for the width/height multiples tested
 * below. */
656 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
657 ( p_filter->fmt_in.video.i_height % 2 ) ) )
659 /* Width is a multiple of 32, we take 2 lines at a time */
660 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
663 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
666 VEC_MERGE( vec_mergeh );
667 VEC_MERGE( vec_mergel );
671 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
672 ( p_filter->fmt_in.video.i_height % 4 ) ) )
674 /* Width is only a multiple of 16, we take 4 lines at a time */
675 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
677 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
679 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
682 VEC_MERGE( vec_mergeh );
683 VEC_MERGE( vec_mergel );
686 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
688 VEC_MERGE( vec_mergeh );
690 /* Line 3 and 4, pixels 0 to 16 */
692 VEC_MERGE( vec_mergel );
694 /* Line 3 and 4, pixels 16 to ( width ) */
695 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
698 VEC_MERGE( vec_mergeh );
699 VEC_MERGE( vec_mergel );
705 /* Crap, use the C version */
706 #undef VEC_NEXT_LINES
/* Line padding (pitch minus visible pitch) of source plane 0, of source
 * plane 1 (applied to both p_u and p_v) and of the destination plane. */
711 const int i_source_margin = p_source->p[0].i_pitch
712 - p_source->p[0].i_visible_pitch;
713 const int i_source_margin_c = p_source->p[1].i_pitch
714 - p_source->p[1].i_visible_pitch;
715 const int i_dest_margin = p_dest->p->i_pitch
716 - p_dest->p->i_visible_pitch;
718 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of lines. */
719 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
722 p_line2 += p_dest->p->i_pitch;
725 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 8 pixels first; the tail loop further down covers the
 * remaining width % 8 pixels two at a time. */
727 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
729 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
735 MMX_CALL( MMX_YUV420_UYVY );
738 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
/* Skip the padding at the end of every line just processed. */
743 p_y1 += i_source_margin;
744 p_y2 += i_source_margin;
745 p_u += i_source_margin_c;
746 p_v += i_source_margin_c;
747 p_line1 += i_dest_margin;
748 p_line2 += i_dest_margin;
751 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
752 /* re-enable FPU registers */
756 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
760 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
762 ** SSE2 128 bits fetch/store instructions are faster
763 ** if memory access is 16 bytes aligned
/* The aligned variant is used only when the luma pitch, the destination
 * pitch and both starting pointers are all multiples of 16. */
765 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
766 ((intptr_t)p_line2|(intptr_t)p_y2))) )
768 /* use faster SSE2 aligned fetch and store */
769 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
772 p_line2 += p_dest->p->i_pitch;
775 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 16 pixels, then the remaining width % 16 two at a time. */
777 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
779 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
781 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
786 p_y1 += i_source_margin;
787 p_y2 += i_source_margin;
788 p_u += i_source_margin_c;
789 p_v += i_source_margin_c;
790 p_line1 += i_dest_margin;
791 p_line2 += i_dest_margin;
796 /* use slower SSE2 unaligned fetch and store */
797 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
800 p_line2 += p_dest->p->i_pitch;
803 p_y2 += p_source->p[Y_PLANE].i_pitch;
805 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
807 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
809 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
814 p_y1 += i_source_margin;
815 p_y2 += i_source_margin;
816 p_u += i_source_margin_c;
817 p_v += i_source_margin_c;
818 p_line1 += i_dest_margin;
819 p_line2 += i_dest_margin;
822 /* make sure all SSE2 stores are visible thereafter */
824 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
827 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
828 /*****************************************************************************
829 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 *
 * Not implemented: both pictures are marked unused and an error is logged
 * on p_filter through msg_Err().
830 *****************************************************************************/
831 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
834 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
836 msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
839 /*****************************************************************************
840 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 *
 * Uses the same MMX/SSE2 UYVY macros as I420_UYVY, but the destination
 * cursors start at the end of the visible picture so that the output can be
 * written bottom-up.
 * p_filter supplies the input width and height, p_source the Y, U and V
 * planes, and p_dest the plane that is written.
841 *****************************************************************************/
842 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
/* p_line2 starts just past the last visible destination line and p_line1
 * one pitch beyond that; the loops below are expected to move them back
 * into the picture before anything is written. */
845 uint8_t *p_line1 = p_dest->p->p_pixels +
846 p_dest->p->i_visible_lines * p_dest->p->i_pitch
847 + p_dest->p->i_pitch;
848 uint8_t *p_line2 = p_dest->p->p_pixels +
849 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
850 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
851 uint8_t *p_u = p_source->U_PIXELS;
852 uint8_t *p_v = p_source->V_PIXELS;
/* Line padding (pitch minus visible pitch) of source plane 0, of source
 * plane 1 (applied to both p_u and p_v) and of the destination plane. */
856 const int i_source_margin = p_source->p[0].i_pitch
857 - p_source->p[0].i_visible_pitch;
858 const int i_source_margin_c = p_source->p[1].i_pitch
859 - p_source->p[1].i_visible_pitch;
860 const int i_dest_margin = p_dest->p->i_pitch
861 - p_dest->p->i_visible_pitch;
863 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of lines; both destination cursors step back three
 * pitches before each pair.
 * NOTE(review): if each pass advances the cursors by one full pitch, the
 * first pair lands on rows (visible_lines - 2) and (visible_lines - 3) and
 * the last pair on rows 0 and -1 - verify the row arithmetic against the
 * picture bounds. */
864 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
866 p_line1 -= 3 * p_dest->p->i_pitch;
867 p_line2 -= 3 * p_dest->p->i_pitch;
870 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 8 pixels first; the tail loop further down covers the
 * remaining width % 8 pixels two at a time. */
872 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
874 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
880 MMX_CALL( MMX_YUV420_UYVY );
883 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Skip the padding at the end of every line just processed. */
888 p_y1 += i_source_margin;
889 p_y2 += i_source_margin;
890 p_u += i_source_margin_c;
891 p_v += i_source_margin_c;
892 p_line1 += i_dest_margin;
893 p_line2 += i_dest_margin;
896 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
897 /* re-enable FPU registers */
901 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
903 ** SSE2 128 bits fetch/store instructions are faster
904 ** if memory access is 16 bytes aligned
/* NOTE(review): both SSE2 loops below only ever move p_line2 forward
 * (+= i_pitch, += i_dest_margin), yet p_line2 starts just past the last
 * visible line.  Unlike the non-SSE2 branch there is no step back by three
 * pitches, so this looks like it writes beyond the destination picture
 * instead of flipping it - confirm and align with the branch above. */
906 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
907 ((intptr_t)p_line2|(intptr_t)p_y2))) )
909 /* use faster SSE2 aligned fetch and store */
910 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
913 p_line2 += p_dest->p->i_pitch;
916 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Groups of 16 pixels, then the remaining width % 16 two at a time. */
918 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
920 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
922 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
927 p_y1 += i_source_margin;
928 p_y2 += i_source_margin;
929 p_u += i_source_margin_c;
930 p_v += i_source_margin_c;
931 p_line1 += i_dest_margin;
932 p_line2 += i_dest_margin;
937 /* use slower SSE2 unaligned fetch and store */
938 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
941 p_line2 += p_dest->p->i_pitch;
944 p_y2 += p_source->p[Y_PLANE].i_pitch;
946 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
948 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
950 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
955 p_y1 += i_source_margin;
956 p_y2 += i_source_margin;
957 p_u += i_source_margin_c;
958 p_v += i_source_margin_c;
959 p_line1 += i_dest_margin;
960 p_line2 += i_dest_margin;
963 /* make sure all SSE2 stores are visible thereafter */
965 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
967 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
969 /*****************************************************************************
970 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *
 * Compiled only in the plain C flavour.  p_filter supplies the input width
 * and height, p_source the Y, U and V planes, and p_dest the plane that is
 * written, two lines per pass.
971 *****************************************************************************/
972 #if defined (MODULE_NAME_IS_i420_yuy2)
973 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
/* Second-line cursors start at the beginning of the destination and luma
 * data; the chroma cursors start at the beginning of the U and V data. */
976 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
977 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
978 uint8_t *p_u = p_source->U_PIXELS;
979 uint8_t *p_v = p_source->V_PIXELS;
/* Line padding (pitch minus visible pitch) of source plane 0, of source
 * plane 1 (applied to both p_u and p_v) and of the destination plane. */
983 const int i_source_margin = p_source->p[0].i_pitch
984 - p_source->p[0].i_visible_pitch;
985 const int i_source_margin_c = p_source->p[1].i_pitch
986 - p_source->p[1].i_visible_pitch;
987 const int i_dest_margin = p_dest->p->i_pitch
988 - p_dest->p->i_visible_pitch;
/* One pass per pair of lines. */
990 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
993 p_line2 += p_dest->p->i_pitch;
996 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* NOTE(review): only whole groups of 8 pixels are handled; unlike the other
 * converters there is no tail loop for width % 8, so trailing pixels are
 * presumably left unconverted - confirm this is intended. */
998 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
/* Skip the padding at the end of every line just processed. */
1004 p_y1 += i_source_margin;
1005 p_y2 += i_source_margin;
1006 p_u += i_source_margin_c;
1007 p_v += i_source_margin_c;
1008 p_line1 += i_dest_margin;
1009 p_line2 += i_dest_margin;