1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 VLC authors and VideoLAN
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
42 #include "i420_yuy2.h"
/* Source chroma list; pasted into the module description strings. */
44 #define SRC_FOURCC "I420,IYUV,YV12"
/* DEST_FOURCC (and VLC_TARGET for the MMX/SSE2 variants) depends on which
 * MODULE_NAME_IS_* macro is defined for this compilation. */
46 #if defined (MODULE_NAME_IS_i420_yuy2)
/* Plain C variant: the only list that contains Y211. */
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
50 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 # define VLC_TARGET VLC_MMX
52 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
53 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
/* NOTE(review): the SSE2 variant selects VLC_SSE, not an SSE2-specific
 * target macro -- confirm this is intended. */
54 # define VLC_TARGET VLC_SSE
55 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* AltiVec variant: the list omits IUYV, cyuv and Y211. */
56 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
60 /*****************************************************************************
61 * Local and extern prototypes.
62 *****************************************************************************/
/* Module activation callback (registered through set_callbacks). */
63 static int Activate ( vlc_object_t * );
/* Converters declared for every variant, followed by the picture_t-returning
 * *_Filter entry points that Activate stores in pf_video_filter. */
65 static void I420_YUY2 ( filter_t *, picture_t *, picture_t * );
66 static void I420_YVYU ( filter_t *, picture_t *, picture_t * );
67 static void I420_UYVY ( filter_t *, picture_t *, picture_t * );
68 static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * );
69 static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * );
70 static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * );
/* IUYV and cyuv are not declared for the AltiVec variant. */
71 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
72 static void I420_IUYV ( filter_t *, picture_t *, picture_t * );
73 static void I420_cyuv ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * );
75 static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * );
/* Y211 is declared only for the plain C variant. */
77 #if defined (MODULE_NAME_IS_i420_yuy2)
78 static void I420_Y211 ( filter_t *, picture_t *, picture_t * );
79 static picture_t *I420_Y211_Filter ( filter_t *, picture_t * );
82 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
83 /* Initialize MMX-specific constants */
/* i_00ffw: the value 0x00ff in each of the four 16-bit words. */
84 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* i_80w: 0x80 in each of the four low bytes; the four high bytes are zero.
 * Neither constant is referenced by the code visible in this file; presumably
 * the MMX macros from i420_yuy2.h use them -- TODO confirm. */
85 static const uint64_t i_80w = 0x0000000080808080ULL;
88 /*****************************************************************************
90 *****************************************************************************/
/* Module descriptor entries. Each variant sets its own description string,
 * registers the "video filter2" capability with its own numeric value
 * (80 for plain C, 160 for MMX, 250 for SSE2 and AltiVec -- presumably the
 * selection score, TODO confirm), and defines vlc_CPU_capable() as the CPU
 * check that Activate evaluates. */
92 #if defined (MODULE_NAME_IS_i420_yuy2)
93 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
94 set_capability( "video filter2", 80 )
/* The plain C variant has no CPU requirement. */
95 # define vlc_CPU_capable() (true)
96 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
97 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
98 set_capability( "video filter2", 160 )
99 # define vlc_CPU_capable() vlc_CPU_MMX()
100 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
101 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
102 set_capability( "video filter2", 250 )
103 # define vlc_CPU_capable() vlc_CPU_SSE2()
104 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
106 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
107 set_capability( "video filter2", 250 )
108 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
/* Activate is the open callback; no close callback is registered (NULL). */
110 set_callbacks( Activate, NULL )
113 /*****************************************************************************
114 * Activate: allocate a chroma function
115 *****************************************************************************
116 * This function allocates and initializes a chroma function
117 *****************************************************************************/
/* Activate: checks the CPU and the input/output video formats, then stores
 * the converter matching the output chroma in p_filter->pf_video_filter.
 * p_this is cast to filter_t; the return type is int (status code). */
118 static int Activate( vlc_object_t *p_this )
120 filter_t *p_filter = (filter_t *)p_this;
/* Condition: the CPU lacks the feature this module variant was built for. */
122 if( !vlc_CPU_capable() )
/* Condition: input width or height is odd. */
124 if( p_filter->fmt_in.video.i_width & 1
125 || p_filter->fmt_in.video.i_height & 1 )
/* Condition: input and output differ in width, height or orientation
 * (these converters change the chroma only). */
130 if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
131 || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height
132 || p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation )
/* Outer dispatch on the input chroma, inner dispatch on the output chroma. */
135 switch( p_filter->fmt_in.video.i_chroma )
139 switch( p_filter->fmt_out.video.i_chroma )
142 p_filter->pf_video_filter = I420_YUY2_Filter;
146 p_filter->pf_video_filter = I420_YVYU_Filter;
150 p_filter->pf_video_filter = I420_UYVY_Filter;
/* IUYV and cyuv outputs are not offered by the AltiVec variant. */
152 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
153 case VLC_FOURCC('I','U','Y','V'):
154 p_filter->pf_video_filter = I420_IUYV_Filter;
158 p_filter->pf_video_filter = I420_cyuv_Filter;
/* Y211 output is offered by the plain C variant only. */
162 #if defined (MODULE_NAME_IS_i420_yuy2)
164 p_filter->pf_video_filter = I420_Y211_Filter;
/* read_cycles: executes the x86 "rdtsc" instruction and receives its result
 * in v through the "=A" output constraint.
 * NOTE(review): in GCC, "=A" denotes the edx:eax register pair only on 32-bit
 * x86; on x86-64 it does not yield the combined 64-bit counter -- confirm this
 * helper is only compiled for 32-bit targets. */
181 static inline unsigned long long read_cycles(void)
183 unsigned long long v;
184 __asm__ __volatile__("rdtsc" : "=A" (v): );
190 /* Following functions are local */
/* One VIDEO_FILTER_WRAPPER instantiation per converter. The macro is defined
 * outside this file; presumably it generates the matching *_Filter function
 * declared in the prototypes -- TODO confirm. */
192 VIDEO_FILTER_WRAPPER( I420_YUY2 )
193 VIDEO_FILTER_WRAPPER( I420_YVYU )
194 VIDEO_FILTER_WRAPPER( I420_UYVY )
/* Not instantiated for the AltiVec variant. */
195 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
196 VIDEO_FILTER_WRAPPER( I420_IUYV )
197 VIDEO_FILTER_WRAPPER( I420_cyuv )
/* Plain C variant only. */
199 #if defined (MODULE_NAME_IS_i420_yuy2)
200 VIDEO_FILTER_WRAPPER( I420_Y211 )
203 /*****************************************************************************
204 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
205 *****************************************************************************/
/* I420_YUY2: reads the Y, U and V planes of p_source and writes the single
 * packed plane of p_dest. Each outer-loop iteration works on two destination
 * rows (p_line1/p_line2) and two luma rows (p_y1/p_y2). The code path
 * (AltiVec, C, MMX or SSE2) is chosen at compile time through the
 * MODULE_NAME_IS_* macros; dimensions come from p_filter->fmt_in.video. */
207 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
/* p_line2 and p_y2 start at the beginning of the destination and luma planes. */
210 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
211 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
212 uint8_t *p_u = p_source->U_PIXELS;
213 uint8_t *p_v = p_source->V_PIXELS;
/* AltiVec helper macros.
 * VEC_NEXT_LINES: advances p_line2 and p_y2 by one pitch.
 * VEC_LOAD_UV: loads one vector each from p_u and p_v, advancing both by 16.
 * VEC_MERGE(a): combines u_vec and v_vec with merge operation `a`, then for
 * each of the two luma rows loads one Y vector and stores two merged
 * vectors (luma first, chroma second) to the matching output row. */
217 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
218 #define VEC_NEXT_LINES( ) \
220 p_line2 += p_dest->p->i_pitch; \
222 p_y2 += p_source->p[Y_PLANE].i_pitch;
224 #define VEC_LOAD_UV( ) \
225 u_vec = vec_ld( 0, p_u ); p_u += 16; \
226 v_vec = vec_ld( 0, p_v ); p_v += 16;
228 #define VEC_MERGE( a ) \
229 uv_vec = a( u_vec, v_vec ); \
230 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
231 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
232 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
233 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
234 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
235 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
237 vector unsigned char u_vec;
238 vector unsigned char v_vec;
239 vector unsigned char uv_vec;
240 vector unsigned char y_vec;
/* Vector path 1: taken when width % 32 == 0 and height % 2 == 0. */
242 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
243 ( p_filter->fmt_in.video.i_height % 2 ) ) )
245 /* Width is a multiple of 32, we take 2 lines at a time */
246 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
249 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
/* The high and low halves of the U/V vectors cover 16 luma columns each. */
252 VEC_MERGE( vec_mergeh );
253 VEC_MERGE( vec_mergel );
257 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
/* Vector path 2: taken when width % 16 == 0 and height % 4 == 0. */
259 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
260 ( p_filter->fmt_in.video.i_height % 4 ) ) )
262 /* Width is only a multiple of 16, we take 4 lines at a time */
263 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
265 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
267 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
270 VEC_MERGE( vec_mergeh );
271 VEC_MERGE( vec_mergel );
274 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
276 VEC_MERGE( vec_mergeh );
278 /* Line 3 and 4, pixels 0 to 16 */
280 VEC_MERGE( vec_mergel );
282 /* Line 3 and 4, pixels 16 to ( width ) */
283 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
286 VEC_MERGE( vec_mergeh );
287 VEC_MERGE( vec_mergel );
294 /* Crap, use the C version */
295 #undef VEC_NEXT_LINES
/* Per-row margins: pitch minus visible pitch, for the luma plane (p[0]),
 * the first chroma plane (p[1]) and the destination plane. */
300 const int i_source_margin = p_source->p[0].i_pitch
301 - p_source->p[0].i_visible_pitch;
302 const int i_source_margin_c = p_source->p[1].i_pitch
303 - p_source->p[1].i_visible_pitch;
304 const int i_dest_margin = p_dest->p->i_pitch
305 - p_dest->p->i_visible_pitch;
/* Non-SSE2 builds (C, MMX, and the AltiVec fallback). */
307 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
308 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
311 p_line2 += p_dest->p->i_pitch;
314 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 8 iterations, in C or via the MMX macro. */
316 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
317 for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
325 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
327 MMX_CALL( MMX_YUV420_YUYV );
/* Leftover loop: ( width % 8 ) / 2 iterations. */
330 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Step every pointer over its plane's margin. */
335 p_y1 += i_source_margin;
336 p_y2 += i_source_margin;
337 p_u += i_source_margin_c;
338 p_v += i_source_margin_c;
339 p_line1 += i_dest_margin;
340 p_line2 += i_dest_margin;
343 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
344 /* re-enable FPU registers */
348 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
352 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
354 ** SSE2 128 bits fetch/store instructions are faster
355 ** if memory access is 16 bytes aligned
/* Aligned path requires the luma pitch, the destination pitch and both start
 * pointers (p_line2, p_y2) to be multiples of 16. */
358 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
359 ((intptr_t)p_line2|(intptr_t)p_y2))) )
361 /* use faster SSE2 aligned fetch and store */
362 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
365 p_line2 += p_dest->p->i_pitch;
368 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 16 iterations of the SSE2 macro. */
370 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
372 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
/* Leftover loop: ( width % 16 ) / 2 iterations. */
374 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
379 p_y1 += i_source_margin;
380 p_y2 += i_source_margin;
381 p_u += i_source_margin_c;
382 p_v += i_source_margin_c;
383 p_line1 += i_dest_margin;
384 p_line2 += i_dest_margin;
389 /* use slower SSE2 unaligned fetch and store */
390 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
393 p_line2 += p_dest->p->i_pitch;
396 p_y2 += p_source->p[Y_PLANE].i_pitch;
398 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
400 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
402 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
407 p_y1 += i_source_margin;
408 p_y2 += i_source_margin;
409 p_u += i_source_margin_c;
410 p_v += i_source_margin_c;
411 p_line1 += i_dest_margin;
412 p_line2 += i_dest_margin;
415 /* make sure all SSE2 stores are visible thereafter */
418 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
421 /*****************************************************************************
422 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
423 *****************************************************************************/
/* I420_YVYU: same structure as the YUYV converter, but the chroma vectors
 * are merged as ( v_vec, u_vec ) and the MMX/SSE2 paths use the *_YVYU
 * macros. Reads the Y, U and V planes of p_source and writes the packed
 * plane of p_dest, two rows per outer-loop iteration. */
425 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
428 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
429 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
430 uint8_t *p_u = p_source->U_PIXELS;
431 uint8_t *p_v = p_source->V_PIXELS;
/* AltiVec helper macros.
 * VEC_NEXT_LINES: advances p_line2 and p_y2 by one pitch.
 * VEC_LOAD_UV: loads one vector each from p_u and p_v, advancing both by 16.
 * VEC_MERGE(a): builds vu_vec = a( v_vec, u_vec ) (V before U), then for each
 * of the two luma rows stores two vectors merging luma with vu_vec. */
435 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
436 #define VEC_NEXT_LINES( ) \
438 p_line2 += p_dest->p->i_pitch; \
440 p_y2 += p_source->p[Y_PLANE].i_pitch;
442 #define VEC_LOAD_UV( ) \
443 u_vec = vec_ld( 0, p_u ); p_u += 16; \
444 v_vec = vec_ld( 0, p_v ); p_v += 16;
446 #define VEC_MERGE( a ) \
447 vu_vec = a( v_vec, u_vec ); \
448 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
449 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
450 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
451 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
452 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
453 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
455 vector unsigned char u_vec;
456 vector unsigned char v_vec;
457 vector unsigned char vu_vec;
458 vector unsigned char y_vec;
/* Vector path 1: taken when width % 32 == 0 and height % 2 == 0. */
460 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
461 ( p_filter->fmt_in.video.i_height % 2 ) ) )
463 /* Width is a multiple of 32, we take 2 lines at a time */
464 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
467 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
470 VEC_MERGE( vec_mergeh );
471 VEC_MERGE( vec_mergel );
/* Vector path 2: taken when width % 16 == 0 and height % 4 == 0. */
475 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
476 ( p_filter->fmt_in.video.i_height % 4 ) ) )
478 /* Width is only a multiple of 16, we take 4 lines at a time */
479 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
481 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
483 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
486 VEC_MERGE( vec_mergeh );
487 VEC_MERGE( vec_mergel );
490 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
492 VEC_MERGE( vec_mergeh );
494 /* Line 3 and 4, pixels 0 to 16 */
496 VEC_MERGE( vec_mergel );
498 /* Line 3 and 4, pixels 16 to ( width ) */
499 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
502 VEC_MERGE( vec_mergeh );
503 VEC_MERGE( vec_mergel );
509 /* Crap, use the C version */
510 #undef VEC_NEXT_LINES
/* Per-row margins: pitch minus visible pitch, for the luma plane (p[0]),
 * the first chroma plane (p[1]) and the destination plane. */
515 const int i_source_margin = p_source->p[0].i_pitch
516 - p_source->p[0].i_visible_pitch;
517 const int i_source_margin_c = p_source->p[1].i_pitch
518 - p_source->p[1].i_visible_pitch;
519 const int i_dest_margin = p_dest->p->i_pitch
520 - p_dest->p->i_visible_pitch;
/* Non-SSE2 builds (C, MMX, and the AltiVec fallback). */
522 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
523 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
526 p_line2 += p_dest->p->i_pitch;
529 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 8 iterations, in C or via the MMX macro. */
531 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
533 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
539 MMX_CALL( MMX_YUV420_YVYU );
/* Leftover loop: ( width % 8 ) / 2 iterations. */
542 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Step every pointer over its plane's margin. */
547 p_y1 += i_source_margin;
548 p_y2 += i_source_margin;
549 p_u += i_source_margin_c;
550 p_v += i_source_margin_c;
551 p_line1 += i_dest_margin;
552 p_line2 += i_dest_margin;
555 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
556 /* re-enable FPU registers */
560 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
564 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
566 ** SSE2 128 bits fetch/store instructions are faster
567 ** if memory access is 16 bytes aligned
/* Aligned path requires the luma pitch, the destination pitch and both start
 * pointers (p_line2, p_y2) to be multiples of 16. */
569 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
570 ((intptr_t)p_line2|(intptr_t)p_y2))) )
572 /* use faster SSE2 aligned fetch and store */
573 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
576 p_line2 += p_dest->p->i_pitch;
579 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 16 iterations of the SSE2 macro. */
581 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
583 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
/* Leftover loop: ( width % 16 ) / 2 iterations. */
585 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
590 p_y1 += i_source_margin;
591 p_y2 += i_source_margin;
592 p_u += i_source_margin_c;
593 p_v += i_source_margin_c;
594 p_line1 += i_dest_margin;
595 p_line2 += i_dest_margin;
600 /* use slower SSE2 unaligned fetch and store */
601 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
604 p_line2 += p_dest->p->i_pitch;
607 p_y2 += p_source->p[Y_PLANE].i_pitch;
609 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
611 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
613 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
618 p_y1 += i_source_margin;
619 p_y2 += i_source_margin;
620 p_u += i_source_margin_c;
621 p_v += i_source_margin_c;
622 p_line1 += i_dest_margin;
623 p_line2 += i_dest_margin;
626 /* make sure all SSE2 stores are visible thereafter */
628 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
631 /*****************************************************************************
632 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
633 *****************************************************************************/
/* I420_UYVY: same structure as the YUYV converter, but each output pair puts
 * the chroma byte before the luma byte (the vector merges take uv_vec as the
 * first operand) and the MMX/SSE2 paths use the *_UYVY macros. Reads the
 * Y, U and V planes of p_source and writes the packed plane of p_dest, two
 * rows per outer-loop iteration. */
635 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
638 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
639 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
640 uint8_t *p_u = p_source->U_PIXELS;
641 uint8_t *p_v = p_source->V_PIXELS;
/* AltiVec helper macros.
 * VEC_NEXT_LINES: advances p_line2 and p_y2 by one pitch.
 * VEC_LOAD_UV: loads one vector each from p_u and p_v, advancing both by 16.
 * VEC_MERGE(a): builds uv_vec = a( u_vec, v_vec ), then for each of the two
 * luma rows stores two vectors merging uv_vec (first) with luma (second). */
645 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
646 #define VEC_NEXT_LINES( ) \
648 p_line2 += p_dest->p->i_pitch; \
650 p_y2 += p_source->p[Y_PLANE].i_pitch;
652 #define VEC_LOAD_UV( ) \
653 u_vec = vec_ld( 0, p_u ); p_u += 16; \
654 v_vec = vec_ld( 0, p_v ); p_v += 16;
656 #define VEC_MERGE( a ) \
657 uv_vec = a( u_vec, v_vec ); \
658 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
659 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
660 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
661 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
662 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
663 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
665 vector unsigned char u_vec;
666 vector unsigned char v_vec;
667 vector unsigned char uv_vec;
668 vector unsigned char y_vec;
/* Vector path 1: taken when width % 32 == 0 and height % 2 == 0. */
670 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
671 ( p_filter->fmt_in.video.i_height % 2 ) ) )
673 /* Width is a multiple of 32, we take 2 lines at a time */
674 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
677 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
680 VEC_MERGE( vec_mergeh );
681 VEC_MERGE( vec_mergel );
/* Vector path 2: taken when width % 16 == 0 and height % 4 == 0. */
685 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
686 ( p_filter->fmt_in.video.i_height % 4 ) ) )
688 /* Width is only a multiple of 16, we take 4 lines at a time */
689 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
691 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
693 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
696 VEC_MERGE( vec_mergeh );
697 VEC_MERGE( vec_mergel );
700 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
702 VEC_MERGE( vec_mergeh );
704 /* Line 3 and 4, pixels 0 to 16 */
706 VEC_MERGE( vec_mergel );
708 /* Line 3 and 4, pixels 16 to ( width ) */
709 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
712 VEC_MERGE( vec_mergeh );
713 VEC_MERGE( vec_mergel );
719 /* Crap, use the C version */
720 #undef VEC_NEXT_LINES
/* Per-row margins: pitch minus visible pitch, for the luma plane (p[0]),
 * the first chroma plane (p[1]) and the destination plane. */
725 const int i_source_margin = p_source->p[0].i_pitch
726 - p_source->p[0].i_visible_pitch;
727 const int i_source_margin_c = p_source->p[1].i_pitch
728 - p_source->p[1].i_visible_pitch;
729 const int i_dest_margin = p_dest->p->i_pitch
730 - p_dest->p->i_visible_pitch;
/* Non-SSE2 builds (C, MMX, and the AltiVec fallback). */
732 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
733 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
736 p_line2 += p_dest->p->i_pitch;
739 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 8 iterations, in C or via the MMX macro. */
741 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
743 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
749 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover loop: ( width % 8 ) / 2 iterations. */
752 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
/* Step every pointer over its plane's margin. */
757 p_y1 += i_source_margin;
758 p_y2 += i_source_margin;
759 p_u += i_source_margin_c;
760 p_v += i_source_margin_c;
761 p_line1 += i_dest_margin;
762 p_line2 += i_dest_margin;
765 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
766 /* re-enable FPU registers */
770 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
774 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
776 ** SSE2 128 bits fetch/store instructions are faster
777 ** if memory access is 16 bytes aligned
/* Aligned path requires the luma pitch, the destination pitch and both start
 * pointers (p_line2, p_y2) to be multiples of 16. */
779 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
780 ((intptr_t)p_line2|(intptr_t)p_y2))) )
782 /* use faster SSE2 aligned fetch and store */
783 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
786 p_line2 += p_dest->p->i_pitch;
789 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 16 iterations of the SSE2 macro. */
791 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
793 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
/* Leftover loop: ( width % 16 ) / 2 iterations. */
795 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
800 p_y1 += i_source_margin;
801 p_y2 += i_source_margin;
802 p_u += i_source_margin_c;
803 p_v += i_source_margin_c;
804 p_line1 += i_dest_margin;
805 p_line2 += i_dest_margin;
810 /* use slower SSE2 unaligned fetch and store */
811 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
814 p_line2 += p_dest->p->i_pitch;
817 p_y2 += p_source->p[Y_PLANE].i_pitch;
819 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
821 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
823 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
828 p_y1 += i_source_margin;
829 p_y2 += i_source_margin;
830 p_u += i_source_margin_c;
831 p_v += i_source_margin_c;
832 p_line1 += i_dest_margin;
833 p_line2 += i_dest_margin;
836 /* make sure all SSE2 stores are visible thereafter */
838 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
841 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
842 /*****************************************************************************
843 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
844 *****************************************************************************/
/* I420_IUYV: placeholder. The visible body marks p_source and p_dest as
 * unused and logs an error through msg_Err; no pixel data is written. */
845 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
848 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
850 msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
853 /*****************************************************************************
854 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
855 *****************************************************************************/
/* I420_cyuv: writes UYVY-ordered output (it reuses the MMX_YUV420_UYVY and
 * SSE2_YUV420_UYVY_* macros) while walking the destination from its end
 * towards its start in the non-SSE2 path. Reads the Y, U and V planes of
 * p_source; dimensions come from p_filter->fmt_in.video. */
857 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
/* p_line2 starts one row past the last visible destination row
 * ( p_pixels + i_visible_lines * i_pitch ); p_line1 starts one pitch
 * beyond that. */
860 uint8_t *p_line1 = p_dest->p->p_pixels +
861 p_dest->p->i_visible_lines * p_dest->p->i_pitch
862 + p_dest->p->i_pitch;
863 uint8_t *p_line2 = p_dest->p->p_pixels +
864 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
865 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
866 uint8_t *p_u = p_source->U_PIXELS;
867 uint8_t *p_v = p_source->V_PIXELS;
/* Per-row margins: pitch minus visible pitch, for the luma plane (p[0]),
 * the first chroma plane (p[1]) and the destination plane. */
871 const int i_source_margin = p_source->p[0].i_pitch
872 - p_source->p[0].i_visible_pitch;
873 const int i_source_margin_c = p_source->p[1].i_pitch
874 - p_source->p[1].i_visible_pitch;
875 const int i_dest_margin = p_dest->p->i_pitch
876 - p_dest->p->i_visible_pitch;
/* Non-SSE2 builds (C and MMX). */
878 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
879 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
/* Both output pointers move back three pitches at the start of each row
 * pair, so the destination is filled from the bottom upwards. */
881 p_line1 -= 3 * p_dest->p->i_pitch;
882 p_line2 -= 3 * p_dest->p->i_pitch;
885 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 8 iterations, in C or via the MMX macro. */
887 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
889 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
895 MMX_CALL( MMX_YUV420_UYVY );
/* Leftover loop: ( width % 8 ) / 2 iterations. */
898 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
/* Step every pointer over its plane's margin. */
903 p_y1 += i_source_margin;
904 p_y2 += i_source_margin;
905 p_u += i_source_margin_c;
906 p_v += i_source_margin_c;
907 p_line1 += i_dest_margin;
908 p_line2 += i_dest_margin;
911 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
912 /* re-enable FPU registers */
916 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
918 ** SSE2 128 bits fetch/store instructions are faster
919 ** if memory access is 16 bytes aligned
/* Aligned path requires the luma pitch, the destination pitch and both start
 * pointers (p_line2, p_y2) to be multiples of 16. */
921 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
922 ((intptr_t)p_line2|(intptr_t)p_y2))) )
924 /* use faster SSE2 aligned fetch and store */
925 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
/* NOTE(review): unlike the non-SSE2 path, which steps back by three pitches,
 * this path advances p_line2 forward by one pitch although p_line2 starts at
 * the end of the visible area -- verify this does not write past the
 * destination buffer and that the output is actually upside-down. */
928 p_line2 += p_dest->p->i_pitch;
931 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Main column loop: width / 16 iterations of the SSE2 macro. */
933 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
935 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
/* Leftover loop: ( width % 16 ) / 2 iterations. */
937 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
942 p_y1 += i_source_margin;
943 p_y2 += i_source_margin;
944 p_u += i_source_margin_c;
945 p_v += i_source_margin_c;
946 p_line1 += i_dest_margin;
947 p_line2 += i_dest_margin;
952 /* use slower SSE2 unaligned fetch and store */
953 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
/* NOTE(review): same forward step as the aligned path -- see the concern
 * about the destination direction and bounds noted there. */
956 p_line2 += p_dest->p->i_pitch;
959 p_y2 += p_source->p[Y_PLANE].i_pitch;
961 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
963 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
965 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
970 p_y1 += i_source_margin;
971 p_y2 += i_source_margin;
972 p_u += i_source_margin_c;
973 p_v += i_source_margin_c;
974 p_line1 += i_dest_margin;
975 p_line2 += i_dest_margin;
978 /* make sure all SSE2 stores are visible thereafter */
980 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
982 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
984 /*****************************************************************************
985 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
986 *****************************************************************************/
987 #if defined (MODULE_NAME_IS_i420_yuy2)
/* I420_Y211: plain-C-only converter. Reads the Y, U and V planes of p_source
 * and writes the packed plane of p_dest, two rows per outer-loop iteration;
 * dimensions come from p_filter->fmt_in.video. */
988 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
/* p_line2 and p_y2 start at the beginning of the destination and luma planes. */
991 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
992 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
993 uint8_t *p_u = p_source->U_PIXELS;
994 uint8_t *p_v = p_source->V_PIXELS;
/* Per-row margins: pitch minus visible pitch, for the luma plane (p[0]),
 * the first chroma plane (p[1]) and the destination plane. */
998 const int i_source_margin = p_source->p[0].i_pitch
999 - p_source->p[0].i_visible_pitch;
1000 const int i_source_margin_c = p_source->p[1].i_pitch
1001 - p_source->p[1].i_visible_pitch;
1002 const int i_dest_margin = p_dest->p->i_pitch
1003 - p_dest->p->i_visible_pitch;
1005 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
1008 p_line2 += p_dest->p->i_pitch;
1011 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* Column loop: width / 8 iterations.
 * NOTE(review): no leftover loop for width % 8 columns is visible here,
 * unlike the other converters -- confirm whether widths that are not a
 * multiple of 8 are handled. */
1013 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
/* Step every pointer over its plane's margin. */
1019 p_y1 += i_source_margin;
1020 p_y2 += i_source_margin;
1021 p_u += i_source_margin_c;
1022 p_v += i_source_margin_c;
1023 p_line1 += i_dest_margin;
1024 p_line2 += i_dest_margin;