]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
15f5ac2fee6d469c27339e27e161b761f1ba043c
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_vout.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
/* Source chromas handled by every build variant of this module. In the code
 * visible here the two lists below only feed the description strings of the
 * module descriptor; the real dispatch is the switch in Activate(). */
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
/* Destination chromas advertised by each build variant: the plain C build
 * additionally lists Y211, the MMX and SSE2 builds share the same list, and
 * the AltiVec build leaves out IUYV and cyuv. */
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 #endif
55
56 /*****************************************************************************
57  * Local and extern prototypes.
58  *****************************************************************************/
/* Module entry point, handed to set_callbacks() in the module descriptor. */
59 static int  Activate ( vlc_object_t * );
60
/* One converter per packed output layout, each paired with the *_Filter
 * entry point that Activate() stores in pf_video_filter. */
61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
/* IUYV and cyuv are declared for every variant except the AltiVec build. */
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
72 #endif
/* Y211 is declared for the plain C build only. */
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
76 #endif
77
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* NOTE(review): nothing in the visible part of this file reads these two
 * 64-bit constants; presumably the MMX macros pulled in from i420_yuy2.h
 * reference them by name -- confirm before renaming or removing them. */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
81 static const uint64_t i_80w   = 0x0000000080808080ULL;
82 #endif
83
84 /*****************************************************************************
85  * Module descriptor.
86  *****************************************************************************/
87 vlc_module_begin ()
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
90     set_capability( "video filter2", 80 )
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
93     set_capability( "video filter2", 160 )
94     add_requirement( MMX )
95 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
96     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
97     set_capability( "video filter2", 250 )
98     add_requirement( SSE2 )
99 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
100     set_description(
101             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
102     set_capability( "video filter2", 250 )
103     add_requirement( ALTIVEC )
104 #endif
105     set_callbacks( Activate, NULL )
106 vlc_module_end ()
107
108 /*****************************************************************************
109  * Activate: allocate a chroma function
110  *****************************************************************************
111  * This function allocates and initializes a chroma function
112  *****************************************************************************/
113 static int Activate( vlc_object_t *p_this )
114 {
115     filter_t *p_filter = (filter_t *)p_this;
116
117     if( p_filter->fmt_in.video.i_width & 1
118      || p_filter->fmt_in.video.i_height & 1 )
119     {
120         return -1;
121     }
122
123     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
124      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
125         return -1;
126
127     switch( p_filter->fmt_in.video.i_chroma )
128     {
129         case VLC_FOURCC('Y','V','1','2'):
130         case VLC_FOURCC('I','4','2','0'):
131         case VLC_FOURCC('I','Y','U','V'):
132             switch( p_filter->fmt_out.video.i_chroma )
133             {
134                 case VLC_FOURCC('Y','U','Y','2'):
135                 case VLC_FOURCC('Y','U','N','V'):
136                     p_filter->pf_video_filter = I420_YUY2_Filter;
137                     break;
138
139                 case VLC_FOURCC('Y','V','Y','U'):
140                     p_filter->pf_video_filter = I420_YVYU_Filter;
141                     break;
142
143                 case VLC_FOURCC('U','Y','V','Y'):
144                 case VLC_FOURCC('U','Y','N','V'):
145                 case VLC_FOURCC('Y','4','2','2'):
146                     p_filter->pf_video_filter = I420_UYVY_Filter;
147                     break;
148 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
149                 case VLC_FOURCC('I','U','Y','V'):
150                     p_filter->pf_video_filter = I420_IUYV_Filter;
151                     break;
152
153                 case VLC_FOURCC('c','y','u','v'):
154                     p_filter->pf_video_filter = I420_cyuv_Filter;
155                     break;
156 #endif
157
158 #if defined (MODULE_NAME_IS_i420_yuy2)
159                 case VLC_FOURCC('Y','2','1','1'):
160                     p_filter->pf_video_filter = I420_Y211_Filter;
161                     break;
162 #endif
163
164                 default:
165                     return -1;
166             }
167             break;
168
169         default:
170             return -1;
171     }
172
173     return 0;
174 }
175
/* Disabled helper (compiled out by the #if 0): executes the x86 "rdtsc"
 * instruction through inline assembly and returns the value it produces.
 * Nothing in the visible part of this file calls it. */
176 #if 0
177 static inline unsigned long long read_cycles(void)
178 {
179     unsigned long long v;
180     __asm__ __volatile__("rdtsc" : "=A" (v): );
181
182     return v;
183 }
184 #endif
185
186 /* Following functions are local */
187
/* NOTE(review): VIDEO_FILTER_WRAPPER is not defined in this file. Each use
 * presumably expands to the I420_xxx_Filter() function declared above,
 * wrapping the I420_xxx() converter of the same name -- confirm against the
 * macro definition. The conditionals mirror those around the prototypes. */
188 VIDEO_FILTER_WRAPPER( I420_YUY2 )
189 VIDEO_FILTER_WRAPPER( I420_YVYU )
190 VIDEO_FILTER_WRAPPER( I420_UYVY )
191 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
192 VIDEO_FILTER_WRAPPER( I420_IUYV )
193 VIDEO_FILTER_WRAPPER( I420_cyuv )
194 #endif
195 #if defined (MODULE_NAME_IS_i420_yuy2)
196 VIDEO_FILTER_WRAPPER( I420_Y211 )
197 #endif
198
199 /*****************************************************************************
200  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *****************************************************************************
 * Converts p_source into p_dest two rows at a time: p_y1/p_y2 read two
 * consecutive luma rows, p_u/p_v read the chroma planes and p_line1/p_line2
 * write the two matching packed rows. Width and height come from
 * p_filter->fmt_in.video; Activate() refuses odd or mismatching dimensions.
 *
 * The body is chosen at compile time:
 *  - AltiVec build: two vector branches, with the generic loop as fallback;
 *  - SSE2 build: an aligned and an unaligned loop, 16 pixels per call;
 *  - otherwise: the generic loop, using MMX_CALL (MMX build) or the
 *    C_YUV420_YUYV macro for each group of 8 pixels.
 * The C_*, MMX_* and SSE2_* macros are not defined in this file (presumably
 * they come from i420_yuy2.h). The loops never advance the pointers
 * themselves, so the macros are expected to do it -- confirm there.
201  *****************************************************************************/
202 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
203                                            picture_t *p_dest )
204 {
205     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
206     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
207     uint8_t *p_u = p_source->U_PIXELS;
208     uint8_t *p_v = p_source->V_PIXELS;
209
210     int i_x, i_y;
211
212 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of rows: line1/y1 take the current line2/y2
 * position, then line2/y2 move one full pitch further. */
213 #define VEC_NEXT_LINES( ) \
214     p_line1  = p_line2; \
215     p_line2 += p_dest->p->i_pitch; \
216     p_y1     = p_y2; \
217     p_y2    += p_source->p[Y_PLANE].i_pitch;
218
/* Load 16 bytes of U and 16 bytes of V, advancing both pointers by 16. */
219 #define VEC_LOAD_UV( ) \
220     u_vec = vec_ld( 0, p_u ); p_u += 16; \
221     v_vec = vec_ld( 0, p_v ); p_v += 16;
222
/* "a" is vec_mergeh or vec_mergel: it picks which half of the loaded U/V
 * vectors is combined into uv_vec. The same uv_vec is then merged with 16
 * luma bytes from each of the two rows, storing 32 bytes per output row. */
223 #define VEC_MERGE( a ) \
224     uv_vec = a( u_vec, v_vec ); \
225     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
226     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
227     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
228     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
229     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
230     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
231
232     vector unsigned char u_vec;
233     vector unsigned char v_vec;
234     vector unsigned char uv_vec;
235     vector unsigned char y_vec;
236
    /* NOTE(review): neither vector branch adds the pitch / visible-pitch
     * margins that the fallback loop applies after each row pair, so both
     * appear to assume i_pitch == i_visible_pitch on every plane -- confirm. */
237     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
238            ( p_filter->fmt_in.video.i_height % 2 ) ) )
239     {
240         /* Width is a multiple of 32, we take 2 lines at a time */
241         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
242         {
243             VEC_NEXT_LINES( );
244             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
245             {
246                 VEC_LOAD_UV( );
247                 VEC_MERGE( vec_mergeh );
248                 VEC_MERGE( vec_mergel );
249             }
250         }
251     }
252     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
253                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
254     {
255         /* Width is only a multiple of 16, we take 4 lines at a time */
256         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
257         {
258             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
259             VEC_NEXT_LINES( );
260             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
261             {
262                 VEC_LOAD_UV( );
263                 VEC_MERGE( vec_mergeh );
264                 VEC_MERGE( vec_mergel );
265             }
266
            /* The U/V vectors loaded here are consumed in two steps: the
             * vec_mergeh half ends lines 1-2, the vec_mergel half starts
             * lines 3-4 after VEC_NEXT_LINES(). Presumably this relies on
             * chroma rows being contiguous in memory -- confirm. */
267             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
268             VEC_LOAD_UV( );
269             VEC_MERGE( vec_mergeh );
270
271             /* Line 3 and 4, pixels 0 to 16 */
272             VEC_NEXT_LINES( );
273             VEC_MERGE( vec_mergel );
274
275             /* Line 3 and 4, pixels 16 to ( width ) */
276             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
277             {
278                 VEC_LOAD_UV( );
279                 VEC_MERGE( vec_mergeh );
280                 VEC_MERGE( vec_mergel );
281             }
282         }
283     }
284     else
285     {
286         /* Crap, use the C version */
287 #undef VEC_NEXT_LINES
288 #undef VEC_LOAD_UV
289 #undef VEC_MERGE
290 #endif
291
    /* Bytes to skip at the end of each row to reach the next one: full pitch
     * minus visible pitch, for luma, chroma (taken from plane 1) and the
     * packed destination. */
292     const int i_source_margin = p_source->p[0].i_pitch
293                                  - p_source->p[0].i_visible_pitch;
294     const int i_source_margin_c = p_source->p[1].i_pitch
295                                  - p_source->p[1].i_visible_pitch;
296     const int i_dest_margin = p_dest->p->i_pitch
297                                - p_dest->p->i_visible_pitch;
298
    /* Generic loop: plain C, MMX, and the AltiVec fallback. */
299 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
300     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
301     {
302         p_line1 = p_line2;
303         p_line2 += p_dest->p->i_pitch;
304
305         p_y1 = p_y2;
306         p_y2 += p_source->p[Y_PLANE].i_pitch;
307
        /* 8 pixels per iteration: four C macro calls or one MMX call */
308 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
309         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
310         {
311             C_YUV420_YUYV( );
312             C_YUV420_YUYV( );
313             C_YUV420_YUYV( );
314             C_YUV420_YUYV( );
315         }
316 #else
317         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
318         {
319             MMX_CALL( MMX_YUV420_YUYV );
320         }
321 #endif
        /* Leftover pixels when the width is not a multiple of 8, two per call */
322         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
323         {
324             C_YUV420_YUYV( );
325         }
326
        /* Skip the row padding on every plane */
327         p_y1 += i_source_margin;
328         p_y2 += i_source_margin;
329         p_u += i_source_margin_c;
330         p_v += i_source_margin_c;
331         p_line1 += i_dest_margin;
332         p_line2 += i_dest_margin;
333     }
334
335 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
336     /* re-enable FPU registers */
337     MMX_END;
338 #endif
339
    /* Closes the "else" block opened in the AltiVec section above */
340 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
341     }
342 #endif
343
344 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
345     /*
346     ** SSE2 128 bits fetch/store instructions are faster
347     ** if memory access is 16 bytes aligned
348     */
349
    /* The aligned path needs both pitches and both start pointers (packed
     * destination and luma) to be multiples of 16; the chroma pointers are
     * not part of this test. */
350     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
351         ((intptr_t)p_line2|(intptr_t)p_y2))) )
352     {
353         /* use faster SSE2 aligned fetch and store */
354         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
355         {
356             p_line1 = p_line2;
357             p_line2 += p_dest->p->i_pitch;
358
359             p_y1 = p_y2;
360             p_y2 += p_source->p[Y_PLANE].i_pitch;
361
            /* 16 pixels per SSE2 call, remainder two pixels at a time in C */
362             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
363             {
364                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
365             }
366             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
367             {
368                 C_YUV420_YUYV( );
369             }
370
371             p_y1 += i_source_margin;
372             p_y2 += i_source_margin;
373             p_u += i_source_margin_c;
374             p_v += i_source_margin_c;
375             p_line1 += i_dest_margin;
376             p_line2 += i_dest_margin;
377         }
378     }
379     else
380     {
381         /* use slower SSE2 unaligned fetch and store */
382         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
383         {
384             p_line1 = p_line2;
385             p_line2 += p_dest->p->i_pitch;
386
387             p_y1 = p_y2;
388             p_y2 += p_source->p[Y_PLANE].i_pitch;
389
390             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
391             {
392                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
393             }
394             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
395             {
396                 C_YUV420_YUYV( );
397             }
398
399             p_y1 += i_source_margin;
400             p_y2 += i_source_margin;
401             p_u += i_source_margin_c;
402             p_v += i_source_margin_c;
403             p_line1 += i_dest_margin;
404             p_line2 += i_dest_margin;
405         }
406     }
407     /* make sure all SSE2 stores are visible thereafter */
408     SSE2_END;
409
410 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
411 }
412
413 /*****************************************************************************
414  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *****************************************************************************
 * Same row-pair walk as the other converters in this file: p_y1/p_y2 read
 * two consecutive luma rows, p_u/p_v read the chroma planes and
 * p_line1/p_line2 write the two matching packed rows. Width and height come
 * from p_filter->fmt_in.video; Activate() refuses odd or mismatching
 * dimensions.
 *
 * The body is chosen at compile time (AltiVec vector branches with generic
 * fallback, SSE2 aligned/unaligned loops, or the generic C/MMX loop). The
 * C_*, MMX_* and SSE2_* macros are not defined in this file (presumably
 * they come from i420_yuy2.h) and are expected to advance the pointers
 * themselves -- confirm there.
415  *****************************************************************************/
416 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
417                                            picture_t *p_dest )
418 {
419     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
420     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
421     uint8_t *p_u = p_source->U_PIXELS;
422     uint8_t *p_v = p_source->V_PIXELS;
423
424     int i_x, i_y;
425
426 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of rows: line1/y1 take the current line2/y2
 * position, then line2/y2 move one full pitch further. */
427 #define VEC_NEXT_LINES( ) \
428     p_line1  = p_line2; \
429     p_line2 += p_dest->p->i_pitch; \
430     p_y1     = p_y2; \
431     p_y2    += p_source->p[Y_PLANE].i_pitch;
432
/* Load 16 bytes of U and 16 bytes of V, advancing both pointers by 16. */
433 #define VEC_LOAD_UV( ) \
434     u_vec = vec_ld( 0, p_u ); p_u += 16; \
435     v_vec = vec_ld( 0, p_v ); p_v += 16;
436
/* "a" is vec_mergeh or vec_mergel. Unlike the YUYV variant, V is passed
 * first when building vu_vec; the result is merged with 16 luma bytes from
 * each of the two rows, storing 32 bytes per output row. */
437 #define VEC_MERGE( a ) \
438     vu_vec = a( v_vec, u_vec ); \
439     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
440     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
441     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
442     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
443     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
444     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
445
446     vector unsigned char u_vec;
447     vector unsigned char v_vec;
448     vector unsigned char vu_vec;
449     vector unsigned char y_vec;
450
    /* NOTE(review): neither vector branch adds the pitch / visible-pitch
     * margins that the fallback loop applies after each row pair, so both
     * appear to assume i_pitch == i_visible_pitch on every plane -- confirm. */
451     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
452            ( p_filter->fmt_in.video.i_height % 2 ) ) )
453     {
454         /* Width is a multiple of 32, we take 2 lines at a time */
455         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
456         {
457             VEC_NEXT_LINES( );
458             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
459             {
460                 VEC_LOAD_UV( );
461                 VEC_MERGE( vec_mergeh );
462                 VEC_MERGE( vec_mergel );
463             }
464         }
465     }
466     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
467                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
468     {
469         /* Width is only a multiple of 16, we take 4 lines at a time */
470         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
471         {
472             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
473             VEC_NEXT_LINES( );
474             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
475             {
476                 VEC_LOAD_UV( );
477                 VEC_MERGE( vec_mergeh );
478                 VEC_MERGE( vec_mergel );
479             }
480
            /* The U/V vectors loaded here are consumed in two steps: the
             * vec_mergeh half ends lines 1-2, the vec_mergel half starts
             * lines 3-4 after VEC_NEXT_LINES(). Presumably this relies on
             * chroma rows being contiguous in memory -- confirm. */
481             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
482             VEC_LOAD_UV( );
483             VEC_MERGE( vec_mergeh );
484
485             /* Line 3 and 4, pixels 0 to 16 */
486             VEC_NEXT_LINES( );
487             VEC_MERGE( vec_mergel );
488
489             /* Line 3 and 4, pixels 16 to ( width ) */
490             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
491             {
492                 VEC_LOAD_UV( );
493                 VEC_MERGE( vec_mergeh );
494                 VEC_MERGE( vec_mergel );
495             }
496         }
497     }
498     else
499     {
500         /* Crap, use the C version */
501 #undef VEC_NEXT_LINES
502 #undef VEC_LOAD_UV
503 #undef VEC_MERGE
504 #endif
505
    /* Bytes to skip at the end of each row to reach the next one: full pitch
     * minus visible pitch, for luma, chroma (taken from plane 1) and the
     * packed destination. */
506     const int i_source_margin = p_source->p[0].i_pitch
507                                  - p_source->p[0].i_visible_pitch;
508     const int i_source_margin_c = p_source->p[1].i_pitch
509                                  - p_source->p[1].i_visible_pitch;
510     const int i_dest_margin = p_dest->p->i_pitch
511                                - p_dest->p->i_visible_pitch;
512
    /* Generic loop: plain C, MMX, and the AltiVec fallback. */
513 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
514     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
515     {
516         p_line1 = p_line2;
517         p_line2 += p_dest->p->i_pitch;
518
519         p_y1 = p_y2;
520         p_y2 += p_source->p[Y_PLANE].i_pitch;
521
        /* 8 pixels per iteration: four C macro calls or one MMX call */
522         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
523         {
524 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
525             C_YUV420_YVYU( );
526             C_YUV420_YVYU( );
527             C_YUV420_YVYU( );
528             C_YUV420_YVYU( );
529 #else
530             MMX_CALL( MMX_YUV420_YVYU );
531 #endif
532         }
        /* Leftover pixels when the width is not a multiple of 8, two per call */
533         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
534         {
535             C_YUV420_YVYU( );
536         }
537
        /* Skip the row padding on every plane */
538         p_y1 += i_source_margin;
539         p_y2 += i_source_margin;
540         p_u += i_source_margin_c;
541         p_v += i_source_margin_c;
542         p_line1 += i_dest_margin;
543         p_line2 += i_dest_margin;
544     }
545
546 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
547     /* re-enable FPU registers */
548     MMX_END;
549 #endif
550
    /* Closes the "else" block opened in the AltiVec section above */
551 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
552     }
553 #endif
554
555 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
556     /*
557     ** SSE2 128 bits fetch/store instructions are faster
558     ** if memory access is 16 bytes aligned
559     */
    /* The aligned path needs both pitches and both start pointers (packed
     * destination and luma) to be multiples of 16; the chroma pointers are
     * not part of this test. */
560     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
561         ((intptr_t)p_line2|(intptr_t)p_y2))) )
562     {
563         /* use faster SSE2 aligned fetch and store */
564         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
565         {
566             p_line1 = p_line2;
567             p_line2 += p_dest->p->i_pitch;
568
569             p_y1 = p_y2;
570             p_y2 += p_source->p[Y_PLANE].i_pitch;
571
            /* 16 pixels per SSE2 call, remainder two pixels at a time in C */
572             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
573             {
574                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
575             }
576             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
577             {
578                 C_YUV420_YVYU( );
579             }
580
581             p_y1 += i_source_margin;
582             p_y2 += i_source_margin;
583             p_u += i_source_margin_c;
584             p_v += i_source_margin_c;
585             p_line1 += i_dest_margin;
586             p_line2 += i_dest_margin;
587         }
588     }
589     else
590     {
591         /* use slower SSE2 unaligned fetch and store */
592         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
593         {
594             p_line1 = p_line2;
595             p_line2 += p_dest->p->i_pitch;
596
597             p_y1 = p_y2;
598             p_y2 += p_source->p[Y_PLANE].i_pitch;
599
600             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
601             {
602                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
603             }
604             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
605             {
606                 C_YUV420_YVYU( );
607             }
608
609             p_y1 += i_source_margin;
610             p_y2 += i_source_margin;
611             p_u += i_source_margin_c;
612             p_v += i_source_margin_c;
613             p_line1 += i_dest_margin;
614             p_line2 += i_dest_margin;
615         }
616     }
617     /* make sure all SSE2 stores are visible thereafter */
618     SSE2_END;
619 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
620 }
621
622 /*****************************************************************************
623  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *****************************************************************************
 * Same row-pair walk as the other converters in this file: p_y1/p_y2 read
 * two consecutive luma rows, p_u/p_v read the chroma planes and
 * p_line1/p_line2 write the two matching packed rows. Width and height come
 * from p_filter->fmt_in.video; Activate() refuses odd or mismatching
 * dimensions. Activate() also selects this routine for the UYNV and Y422
 * destination chromas.
 *
 * The body is chosen at compile time (AltiVec vector branches with generic
 * fallback, SSE2 aligned/unaligned loops, or the generic C/MMX loop). The
 * C_*, MMX_* and SSE2_* macros are not defined in this file (presumably
 * they come from i420_yuy2.h) and are expected to advance the pointers
 * themselves -- confirm there.
624  *****************************************************************************/
625 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
626                                            picture_t *p_dest )
627 {
628     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
629     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
630     uint8_t *p_u = p_source->U_PIXELS;
631     uint8_t *p_v = p_source->V_PIXELS;
632
633     int i_x, i_y;
634
635 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of rows: line1/y1 take the current line2/y2
 * position, then line2/y2 move one full pitch further. */
636 #define VEC_NEXT_LINES( ) \
637     p_line1  = p_line2; \
638     p_line2 += p_dest->p->i_pitch; \
639     p_y1     = p_y2; \
640     p_y2    += p_source->p[Y_PLANE].i_pitch;
641
/* Load 16 bytes of U and 16 bytes of V, advancing both pointers by 16. */
642 #define VEC_LOAD_UV( ) \
643     u_vec = vec_ld( 0, p_u ); p_u += 16; \
644     v_vec = vec_ld( 0, p_v ); p_v += 16;
645
/* "a" is vec_mergeh or vec_mergel. uv_vec is built as in the YUYV variant,
 * but here it is the first operand of the final merges and the luma vector
 * the second, so chroma precedes luma in the stored bytes. */
646 #define VEC_MERGE( a ) \
647     uv_vec = a( u_vec, v_vec ); \
648     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
649     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
650     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
651     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
652     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
653     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
654
655     vector unsigned char u_vec;
656     vector unsigned char v_vec;
657     vector unsigned char uv_vec;
658     vector unsigned char y_vec;
659
    /* NOTE(review): neither vector branch adds the pitch / visible-pitch
     * margins that the fallback loop applies after each row pair, so both
     * appear to assume i_pitch == i_visible_pitch on every plane -- confirm. */
660     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
661            ( p_filter->fmt_in.video.i_height % 2 ) ) )
662     {
663         /* Width is a multiple of 32, we take 2 lines at a time */
664         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
665         {
666             VEC_NEXT_LINES( );
667             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
668             {
669                 VEC_LOAD_UV( );
670                 VEC_MERGE( vec_mergeh );
671                 VEC_MERGE( vec_mergel );
672             }
673         }
674     }
675     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
676                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
677     {
678         /* Width is only a multiple of 16, we take 4 lines at a time */
679         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
680         {
681             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
682             VEC_NEXT_LINES( );
683             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
684             {
685                 VEC_LOAD_UV( );
686                 VEC_MERGE( vec_mergeh );
687                 VEC_MERGE( vec_mergel );
688             }
689
            /* The U/V vectors loaded here are consumed in two steps: the
             * vec_mergeh half ends lines 1-2, the vec_mergel half starts
             * lines 3-4 after VEC_NEXT_LINES(). Presumably this relies on
             * chroma rows being contiguous in memory -- confirm. */
690             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
691             VEC_LOAD_UV( );
692             VEC_MERGE( vec_mergeh );
693
694             /* Line 3 and 4, pixels 0 to 16 */
695             VEC_NEXT_LINES( );
696             VEC_MERGE( vec_mergel );
697
698             /* Line 3 and 4, pixels 16 to ( width ) */
699             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
700             {
701                 VEC_LOAD_UV( );
702                 VEC_MERGE( vec_mergeh );
703                 VEC_MERGE( vec_mergel );
704             }
705         }
706     }
707     else
708     {
709         /* Crap, use the C version */
710 #undef VEC_NEXT_LINES
711 #undef VEC_LOAD_UV
712 #undef VEC_MERGE
713 #endif
714
    /* Bytes to skip at the end of each row to reach the next one: full pitch
     * minus visible pitch, for luma, chroma (taken from plane 1) and the
     * packed destination. */
715     const int i_source_margin = p_source->p[0].i_pitch
716                                  - p_source->p[0].i_visible_pitch;
717     const int i_source_margin_c = p_source->p[1].i_pitch
718                                  - p_source->p[1].i_visible_pitch;
719     const int i_dest_margin = p_dest->p->i_pitch
720                                - p_dest->p->i_visible_pitch;
721
    /* Generic loop: plain C, MMX, and the AltiVec fallback. */
722 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
723     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
724     {
725         p_line1 = p_line2;
726         p_line2 += p_dest->p->i_pitch;
727
728         p_y1 = p_y2;
729         p_y2 += p_source->p[Y_PLANE].i_pitch;
730
        /* 8 pixels per iteration: four C macro calls or one MMX call */
731         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
732         {
733 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
734             C_YUV420_UYVY( );
735             C_YUV420_UYVY( );
736             C_YUV420_UYVY( );
737             C_YUV420_UYVY( );
738 #else
739             MMX_CALL( MMX_YUV420_UYVY );
740 #endif
741         }
        /* Leftover pixels when the width is not a multiple of 8, two per call */
742         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
743         {
744             C_YUV420_UYVY( );
745         }
746
        /* Skip the row padding on every plane */
747         p_y1 += i_source_margin;
748         p_y2 += i_source_margin;
749         p_u += i_source_margin_c;
750         p_v += i_source_margin_c;
751         p_line1 += i_dest_margin;
752         p_line2 += i_dest_margin;
753     }
754
755 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
756     /* re-enable FPU registers */
757     MMX_END;
758 #endif
759
    /* Closes the "else" block opened in the AltiVec section above */
760 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
761     }
762 #endif
763
764 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
765     /*
766     ** SSE2 128 bits fetch/store instructions are faster
767     ** if memory access is 16 bytes aligned
768     */
    /* The aligned path needs both pitches and both start pointers (packed
     * destination and luma) to be multiples of 16; the chroma pointers are
     * not part of this test. */
769     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
770         ((intptr_t)p_line2|(intptr_t)p_y2))) )
771     {
772         /* use faster SSE2 aligned fetch and store */
773         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
774         {
775             p_line1 = p_line2;
776             p_line2 += p_dest->p->i_pitch;
777
778             p_y1 = p_y2;
779             p_y2 += p_source->p[Y_PLANE].i_pitch;
780
            /* 16 pixels per SSE2 call, remainder two pixels at a time in C */
781             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
782             {
783                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
784             }
785             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
786             {
787                 C_YUV420_UYVY( );
788             }
789
790             p_y1 += i_source_margin;
791             p_y2 += i_source_margin;
792             p_u += i_source_margin_c;
793             p_v += i_source_margin_c;
794             p_line1 += i_dest_margin;
795             p_line2 += i_dest_margin;
796         }
797     }
798     else
799     {
800         /* use slower SSE2 unaligned fetch and store */
801         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
802         {
803             p_line1 = p_line2;
804             p_line2 += p_dest->p->i_pitch;
805
806             p_y1 = p_y2;
807             p_y2 += p_source->p[Y_PLANE].i_pitch;
808
809             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
810             {
811                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
812             }
813             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
814             {
815                 C_YUV420_UYVY( );
816             }
817
818             p_y1 += i_source_margin;
819             p_y2 += i_source_margin;
820             p_u += i_source_margin_c;
821             p_v += i_source_margin_c;
822             p_line1 += i_dest_margin;
823             p_line2 += i_dest_margin;
824         }
825     }
826     /* make sure all SSE2 stores are visible thereafter */
827     SSE2_END;
828 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
829 }
830
831 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
832 /*****************************************************************************
833  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
834  *****************************************************************************/
835 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
836                                            picture_t *p_dest )
837 {
838     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
839     /* FIXME: TODO ! */
840     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
841 }
842
843 /*****************************************************************************
844  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
845  *****************************************************************************/
846 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
847                                            picture_t *p_dest )
848 {
849     uint8_t *p_line1 = p_dest->p->p_pixels +
850                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
851                        + p_dest->p->i_pitch;
852     uint8_t *p_line2 = p_dest->p->p_pixels +
853                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
854     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
855     uint8_t *p_u = p_source->U_PIXELS;
856     uint8_t *p_v = p_source->V_PIXELS;
857
858     int i_x, i_y;
859
860     const int i_source_margin = p_source->p[0].i_pitch
861                                  - p_source->p[0].i_visible_pitch;
862     const int i_source_margin_c = p_source->p[1].i_pitch
863                                  - p_source->p[1].i_visible_pitch;
864     const int i_dest_margin = p_dest->p->i_pitch
865                                - p_dest->p->i_visible_pitch;
866
867 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
868     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
869     {
870         p_line1 -= 3 * p_dest->p->i_pitch;
871         p_line2 -= 3 * p_dest->p->i_pitch;
872
873         p_y1 = p_y2;
874         p_y2 += p_source->p[Y_PLANE].i_pitch;
875
876         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
877         {
878 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
879             C_YUV420_UYVY( );
880             C_YUV420_UYVY( );
881             C_YUV420_UYVY( );
882             C_YUV420_UYVY( );
883 #else
884             MMX_CALL( MMX_YUV420_UYVY );
885 #endif
886         }
887         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
888         {
889             C_YUV420_UYVY( );
890         }
891
892         p_y1 += i_source_margin;
893         p_y2 += i_source_margin;
894         p_u += i_source_margin_c;
895         p_v += i_source_margin_c;
896         p_line1 += i_dest_margin;
897         p_line2 += i_dest_margin;
898     }
899
900 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
901     /* re-enable FPU registers */
902     MMX_END;
903 #endif
904
905 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
906     /*
907     ** SSE2 128 bits fetch/store instructions are faster
908     ** if memory access is 16 bytes aligned
909     */
910     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
911         ((intptr_t)p_line2|(intptr_t)p_y2))) )
912     {
913         /* use faster SSE2 aligned fetch and store */
914         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
915         {
916             p_line1 = p_line2;
917             p_line2 += p_dest->p->i_pitch;
918
919             p_y1 = p_y2;
920             p_y2 += p_source->p[Y_PLANE].i_pitch;
921
922             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
923             {
924                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
925             }
926             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
927             {
928                 C_YUV420_UYVY( );
929             }
930
931             p_y1 += i_source_margin;
932             p_y2 += i_source_margin;
933             p_u += i_source_margin_c;
934             p_v += i_source_margin_c;
935             p_line1 += i_dest_margin;
936             p_line2 += i_dest_margin;
937         }
938     }
939     else
940     {
941         /* use slower SSE2 unaligned fetch and store */
942         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
943         {
944             p_line1 = p_line2;
945             p_line2 += p_dest->p->i_pitch;
946
947             p_y1 = p_y2;
948             p_y2 += p_source->p[Y_PLANE].i_pitch;
949
950             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
951             {
952                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
953             }
954             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
955             {
956                 C_YUV420_UYVY( );
957             }
958
959             p_y1 += i_source_margin;
960             p_y2 += i_source_margin;
961             p_u += i_source_margin_c;
962             p_v += i_source_margin_c;
963             p_line1 += i_dest_margin;
964             p_line2 += i_dest_margin;
965         }
966     }
967     /* make sure all SSE2 stores are visible thereafter */
968     SSE2_END;
969 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
970 }
971 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
972
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Walks the source two luma lines at a time and fills two consecutive
 * destination lines per pass. Each inner step accounts for 8 pixels of width
 * and runs the conversion macro twice; there is no remainder loop, so a
 * trailing width % 8 is not handled here.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                       picture_t *p_dest )
{
    /* Line strides, hoisted out of the loops */
    const int i_dest_pitch = p_dest->p->i_pitch;
    const int i_luma_pitch = p_source->p[Y_PLANE].i_pitch;

    /* Padding found after the visible part of each line */
    const int i_dest_margin = i_dest_pitch - p_dest->p->i_visible_pitch;
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;

    /* C_YUV420_Y211 takes no argument: it is expected to work on the six
     * pointers below by name, so these identifiers must be kept. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_pairs_left = p_filter->fmt_in.video.i_height / 2;
    int i_blocks_left;

    while( i_pairs_left-- )
    {
        /* Current pair: line1 is where line2 stopped, line2 one line below */
        p_line1 = p_line2;
        p_line2 = p_line1 + i_dest_pitch;

        p_y1 = p_y2;
        p_y2 = p_y1 + i_luma_pitch;

        i_blocks_left = p_filter->fmt_in.video.i_width / 8;
        while( i_blocks_left-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Step over the padding of every plane before the next pair */
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
    }
}
#endif