]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
auhal: fix inverted logic (and potential NULL pointer deref).
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_cpu.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
/* Comma-separated FOURCC lists. In the visible part of this file they are
 * only pasted into the module description strings of the descriptor below;
 * the conversions actually accepted are decided by the switch in Activate(). */
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
/* The advertised output list depends on which variant of the module is
 * being built (plain C, MMX, SSE2 or AltiVec). */
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 #endif
55
56 /*****************************************************************************
57  * Local and extern prototypes.
58  *****************************************************************************/
/* Module entry point, registered with set_callbacks() in the descriptor. */
59 static int  Activate ( vlc_object_t * );
60
/* Conversion kernels take (filter, source picture, destination picture).
 * The matching *_Filter functions are what Activate() stores in
 * pf_video_filter. */
61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
/* IUYV and cyuv outputs are left out of the AltiVec variant. */
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
72 #endif
/* Y211 output is only built into the plain C variant. */
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
76 #endif
77
/* 64-bit constants compiled only into the MMX variant. Nothing in the
 * visible part of this file references them by name; presumably the MMX
 * conversion macros (i420_yuy2.h?) do - TODO confirm. */
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* i_00ffw: 0x00ff in each of the four 16-bit words. */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* i_80w: 0x80 in each of the four low bytes, upper 32 bits zero. */
81 static const uint64_t i_80w   = 0x0000000080808080ULL;
82 #endif
83
84 /*****************************************************************************
85  * Module descriptor.
86  *****************************************************************************/
/* Module descriptor. Exactly one MODULE_NAME_IS_* branch is compiled in.
 * Each branch sets the human-readable description, registers the module
 * for the "video filter2" capability (the numeric second argument is
 * presumably the selection score: 80 for plain C, 160 for MMX, 250 for
 * SSE2 and AltiVec), and defines vlc_CPU_capable(), the run-time CPU test
 * that Activate() evaluates first. All branches mark the description with
 * N_() and omit a trailing semicolon after the descriptor macros. */
87 vlc_module_begin ()
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
90     set_capability( "video filter2", 80 )
    /* The plain C variant has no CPU requirement. */
91 # define vlc_CPU_capable() (true)
92 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
93     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
94     set_capability( "video filter2", 160 )
95 # define vlc_CPU_capable() vlc_CPU_MMX()
96 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
97     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
98     set_capability( "video filter2", 250 )
99 # define vlc_CPU_capable() vlc_CPU_SSE2()
100 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
101     set_description(
102             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) )
103     set_capability( "video filter2", 250 )
104 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
105 #endif
    /* Activate is the open callback; no close callback is registered. */
106     set_callbacks( Activate, NULL )
107 vlc_module_end ()
108
109 /*****************************************************************************
110  * Activate: allocate a chroma function
111  *****************************************************************************
112  * This function allocates and initializes a chroma function
113  *****************************************************************************/
114 static int Activate( vlc_object_t *p_this )
115 {
116     filter_t *p_filter = (filter_t *)p_this;
117
118     if( !vlc_CPU_capable() )
119         return VLC_EGENERIC;
120     if( p_filter->fmt_in.video.i_width & 1
121      || p_filter->fmt_in.video.i_height & 1 )
122     {
123         return -1;
124     }
125
126     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
127      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
128         return -1;
129
130     switch( p_filter->fmt_in.video.i_chroma )
131     {
132         case VLC_CODEC_YV12:
133         case VLC_CODEC_I420:
134             switch( p_filter->fmt_out.video.i_chroma )
135             {
136                 case VLC_CODEC_YUYV:
137                     p_filter->pf_video_filter = I420_YUY2_Filter;
138                     break;
139
140                 case VLC_CODEC_YVYU:
141                     p_filter->pf_video_filter = I420_YVYU_Filter;
142                     break;
143
144                 case VLC_CODEC_UYVY:
145                     p_filter->pf_video_filter = I420_UYVY_Filter;
146                     break;
147 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
148                 case VLC_FOURCC('I','U','Y','V'):
149                     p_filter->pf_video_filter = I420_IUYV_Filter;
150                     break;
151
152                 case VLC_CODEC_CYUV:
153                     p_filter->pf_video_filter = I420_cyuv_Filter;
154                     break;
155 #endif
156
157 #if defined (MODULE_NAME_IS_i420_yuy2)
158                 case VLC_CODEC_Y211:
159                     p_filter->pf_video_filter = I420_Y211_Filter;
160                     break;
161 #endif
162
163                 default:
164                     return -1;
165             }
166             break;
167
168         default:
169             return -1;
170     }
171
172     return 0;
173 }
174
/* Compiled out by the "#if 0" below and not called anywhere in the visible
 * part of this file. It returns the value produced by the x86 "rdtsc"
 * instruction; the "=A" output constraint is GCC inline-asm syntax.
 * NOTE(review): looks like a leftover profiling helper - confirm before
 * re-enabling. */
175 #if 0
176 static inline unsigned long long read_cycles(void)
177 {
178     unsigned long long v;
179     __asm__ __volatile__("rdtsc" : "=A" (v): );
180
181     return v;
182 }
183 #endif
184
185 /* Following functions are local */
186
/* VIDEO_FILTER_WRAPPER is not defined in the visible part of this file.
 * Presumably each invocation emits the I420_xxx_Filter() function that is
 * prototyped near the top of the file and that Activate() installs, built
 * around the I420_xxx() kernel named in the argument - TODO confirm against
 * the macro definition. The #if guards mirror those on the prototypes. */
187 VIDEO_FILTER_WRAPPER( I420_YUY2 )
188 VIDEO_FILTER_WRAPPER( I420_YVYU )
189 VIDEO_FILTER_WRAPPER( I420_UYVY )
190 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
191 VIDEO_FILTER_WRAPPER( I420_IUYV )
192 VIDEO_FILTER_WRAPPER( I420_cyuv )
193 #endif
194 #if defined (MODULE_NAME_IS_i420_yuy2)
195 VIDEO_FILTER_WRAPPER( I420_Y211 )
196 #endif
197
198 /*****************************************************************************
199  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
200  *****************************************************************************
 * p_filter: filter instance; only fmt_in.video.i_width and i_height are read.
 * p_source: input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS.
 * p_dest:   output picture, written through its first plane (p_dest->p).
 *
 * Each pass of an outer loop handles one pair of lines: luma is read from
 * p_y1 / p_y2, output goes to p_line1 / p_line2, and a single pair of chroma
 * pointers p_u / p_v serves both lines. The inner kernel depends on which
 * MODULE_NAME_IS_* macro the file is built with (plain C, MMX, SSE2, AltiVec).
 *
 * C_YUV420_YUYV, MMX_CALL, MMX_YUV420_YUYV, MMX_END, SSE2_CALL,
 * SSE2_YUV420_YUYV_ALIGNED / _UNALIGNED and SSE2_END are not defined in the
 * visible part of this file (presumably they come from i420_yuy2.h).
 *****************************************************************************/
201 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
202                                            picture_t *p_dest )
203 {
    /* p_line2 and p_y2 start on the first line and act as running cursors;
     * p_line1 and p_y1 are assigned from them at the start of each pair. */
204     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
205     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
206     uint8_t *p_u = p_source->U_PIXELS;
207     uint8_t *p_v = p_source->V_PIXELS;
208
209     int i_x, i_y;
210
211 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* VEC_NEXT_LINES: move both output and both luma pointers to the next
     * pair of lines, using the full pitch for the second line. */
212 #define VEC_NEXT_LINES( ) \
213     p_line1  = p_line2; \
214     p_line2 += p_dest->p->i_pitch; \
215     p_y1     = p_y2; \
216     p_y2    += p_source->p[Y_PLANE].i_pitch;
217
    /* VEC_LOAD_UV: load 16 bytes from each chroma plane, advance both by 16. */
218 #define VEC_LOAD_UV( ) \
219     u_vec = vec_ld( 0, p_u ); p_u += 16; \
220     v_vec = vec_ld( 0, p_v ); p_v += 16;
221
    /* VEC_MERGE( a ): "a" is vec_mergeh or vec_mergel and picks which half
     * of the loaded U/V vectors gets interleaved (U first). That chroma is
     * then merged with 16 luma bytes of each line (luma first), storing
     * 32 bytes to p_line1 and 32 bytes to p_line2. */
222 #define VEC_MERGE( a ) \
223     uv_vec = a( u_vec, v_vec ); \
224     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
225     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
226     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
227     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
228     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
229     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
230
231     vector unsigned char u_vec;
232     vector unsigned char v_vec;
233     vector unsigned char uv_vec;
234     vector unsigned char y_vec;
235
    /* Vector path: one inner iteration covers 32 pixels on both lines.
     * NOTE(review): these loops never add the pitch margins that the scalar
     * path below applies, so they appear to assume pitch == visible pitch
     * on every plane - confirm. */
236     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
237            ( p_filter->fmt_in.video.i_height % 2 ) ) )
238     {
239         /* Width is a multiple of 32, we take 2 lines at a time */
240         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
241         {
242             VEC_NEXT_LINES( );
243             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
244             {
245                 VEC_LOAD_UV( );
246                 VEC_MERGE( vec_mergeh );
247                 VEC_MERGE( vec_mergel );
248             }
249         }
250     }
    /* The width % 16 variant below is compiled out; such widths fall
     * through to the scalar "else" branch. */
251 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
252 #if 0
253     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
254                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
255     {
256         /* Width is only a multiple of 16, we take 4 lines at a time */
257         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
258         {
259             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
260             VEC_NEXT_LINES( );
261             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
262             {
263                 VEC_LOAD_UV( );
264                 VEC_MERGE( vec_mergeh );
265                 VEC_MERGE( vec_mergel );
266             }
267
268             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
269             VEC_LOAD_UV( );
270             VEC_MERGE( vec_mergeh );
271
272             /* Line 3 and 4, pixels 0 to 16 */
273             VEC_NEXT_LINES( );
274             VEC_MERGE( vec_mergel );
275
276             /* Line 3 and 4, pixels 16 to ( width ) */
277             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
278             {
279                 VEC_LOAD_UV( );
280                 VEC_MERGE( vec_mergeh );
281                 VEC_MERGE( vec_mergel );
282             }
283         }
284     }
285 #endif
    /* In the AltiVec build this "else {" wraps the scalar code that follows;
     * its closing brace sits inside the non-SSE2 branch further down. */
286     else
287     {
288         /* Crap, use the C version */
289 #undef VEC_NEXT_LINES
290 #undef VEC_LOAD_UV
291 #undef VEC_MERGE
292 #endif
293
    /* Padding bytes at the end of each line: pitch minus visible pitch, for
     * the luma plane (p[0]), a chroma plane (p[1]) and the output plane. */
294     const int i_source_margin = p_source->p[0].i_pitch
295                                  - p_source->p[0].i_visible_pitch;
296     const int i_source_margin_c = p_source->p[1].i_pitch
297                                  - p_source->p[1].i_visible_pitch;
298     const int i_dest_margin = p_dest->p->i_pitch
299                                - p_dest->p->i_visible_pitch;
300
301 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback builds: one pass per pair of lines. */
302     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
303     {
304         p_line1 = p_line2;
305         p_line2 += p_dest->p->i_pitch;
306
307         p_y1 = p_y2;
308         p_y2 += p_source->p[Y_PLANE].i_pitch;
309
        /* Main loop: 8 pixels per iteration, either four C macro
         * invocations or one MMX invocation. */
310 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
311         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
312         {
313             C_YUV420_YUYV( );
314             C_YUV420_YUYV( );
315             C_YUV420_YUYV( );
316             C_YUV420_YUYV( );
317         }
318 #else
319         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
320         {
321             MMX_CALL( MMX_YUV420_YUYV );
322         }
323 #endif
        /* Remainder: the leftover width % 8 pixels, two per invocation. */
324         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
325         {
326             C_YUV420_YUYV( );
327         }
328
        /* Add each plane's end-of-line padding to its pointers. */
329         p_y1 += i_source_margin;
330         p_y2 += i_source_margin;
331         p_u += i_source_margin_c;
332         p_v += i_source_margin_c;
333         p_line1 += i_dest_margin;
334         p_line2 += i_dest_margin;
335     }
336
337 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
338     /* re-enable FPU registers */
339     MMX_END;
340 #endif
341
    /* Closes the AltiVec "else {" opened before the margin declarations. */
342 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
343     }
344 #endif
345
346 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
347     /*
348     ** SSE2 128 bits fetch/store instructions are faster
349     ** if memory access is 16 bytes aligned
350     */
351
    /* The aligned kernel is chosen only when the luma pitch, the output
     * pitch and both start addresses have their low four bits clear.
     * NOTE(review): p_u, p_v and the chroma pitch are not part of this
     * test - confirm the aligned macro does not need them aligned. */
352     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
353         ((intptr_t)p_line2|(intptr_t)p_y2))) )
354     {
355         /* use faster SSE2 aligned fetch and store */
356         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
357         {
358             p_line1 = p_line2;
359             p_line2 += p_dest->p->i_pitch;
360
361             p_y1 = p_y2;
362             p_y2 += p_source->p[Y_PLANE].i_pitch;
363
            /* 16 pixels per SSE2 invocation, then the scalar remainder. */
364             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
365             {
366                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
367             }
368             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
369             {
370                 C_YUV420_YUYV( );
371             }
372
373             p_y1 += i_source_margin;
374             p_y2 += i_source_margin;
375             p_u += i_source_margin_c;
376             p_v += i_source_margin_c;
377             p_line1 += i_dest_margin;
378             p_line2 += i_dest_margin;
379         }
380     }
381     else
382     {
383         /* use slower SSE2 unaligned fetch and store */
384         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
385         {
386             p_line1 = p_line2;
387             p_line2 += p_dest->p->i_pitch;
388
389             p_y1 = p_y2;
390             p_y2 += p_source->p[Y_PLANE].i_pitch;
391
392             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
393             {
394                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
395             }
396             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
397             {
398                 C_YUV420_YUYV( );
399             }
400
401             p_y1 += i_source_margin;
402             p_y2 += i_source_margin;
403             p_u += i_source_margin_c;
404             p_v += i_source_margin_c;
405             p_line1 += i_dest_margin;
406             p_line2 += i_dest_margin;
407         }
408     }
409     /* make sure all SSE2 stores are visible thereafter */
410     SSE2_END;
411
412 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
413 }
414
415 /*****************************************************************************
416  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
417  *****************************************************************************
 * p_filter: filter instance; only fmt_in.video.i_width and i_height are read.
 * p_source: input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS.
 * p_dest:   output picture, written through its first plane (p_dest->p).
 *
 * Each pass of an outer loop handles one pair of lines: luma is read from
 * p_y1 / p_y2, output goes to p_line1 / p_line2, and a single pair of chroma
 * pointers p_u / p_v serves both lines. The inner kernel depends on which
 * MODULE_NAME_IS_* macro the file is built with (plain C, MMX, SSE2, AltiVec).
 *
 * C_YUV420_YVYU, MMX_CALL, MMX_YUV420_YVYU, MMX_END, SSE2_CALL,
 * SSE2_YUV420_YVYU_ALIGNED / _UNALIGNED and SSE2_END are not defined in the
 * visible part of this file (presumably they come from i420_yuy2.h).
 *****************************************************************************/
418 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
419                                            picture_t *p_dest )
420 {
    /* p_line2 and p_y2 start on the first line and act as running cursors;
     * p_line1 and p_y1 are assigned from them at the start of each pair. */
421     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
422     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
423     uint8_t *p_u = p_source->U_PIXELS;
424     uint8_t *p_v = p_source->V_PIXELS;
425
426     int i_x, i_y;
427
428 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* VEC_NEXT_LINES: move both output and both luma pointers to the next
     * pair of lines, using the full pitch for the second line. */
429 #define VEC_NEXT_LINES( ) \
430     p_line1  = p_line2; \
431     p_line2 += p_dest->p->i_pitch; \
432     p_y1     = p_y2; \
433     p_y2    += p_source->p[Y_PLANE].i_pitch;
434
    /* VEC_LOAD_UV: load 16 bytes from each chroma plane, advance both by 16. */
435 #define VEC_LOAD_UV( ) \
436     u_vec = vec_ld( 0, p_u ); p_u += 16; \
437     v_vec = vec_ld( 0, p_v ); p_v += 16;
438
    /* VEC_MERGE( a ): "a" is vec_mergeh or vec_mergel and picks which half
     * of the loaded chroma vectors gets interleaved, V first for this
     * layout. That chroma is then merged with 16 luma bytes of each line
     * (luma first), storing 32 bytes to p_line1 and 32 bytes to p_line2. */
439 #define VEC_MERGE( a ) \
440     vu_vec = a( v_vec, u_vec ); \
441     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
442     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
443     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
444     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
445     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
446     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
447
448     vector unsigned char u_vec;
449     vector unsigned char v_vec;
450     vector unsigned char vu_vec;
451     vector unsigned char y_vec;
452
    /* Vector path: one inner iteration covers 32 pixels on both lines.
     * NOTE(review): these loops never add the pitch margins that the scalar
     * path below applies, so they appear to assume pitch == visible pitch
     * on every plane - confirm. */
453     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
454            ( p_filter->fmt_in.video.i_height % 2 ) ) )
455     {
456         /* Width is a multiple of 32, we take 2 lines at a time */
457         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
458         {
459             VEC_NEXT_LINES( );
460             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
461             {
462                 VEC_LOAD_UV( );
463                 VEC_MERGE( vec_mergeh );
464                 VEC_MERGE( vec_mergel );
465             }
466         }
467     }
    /* NOTE(review): the structurally identical width % 16 branch in
     * I420_YUY2 is compiled out behind a FIXME saying it is broken on
     * AltiVec; here it is still live - confirm whether it should be
     * disabled as well. */
468     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
469                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
470     {
471         /* Width is only a multiple of 16, we take 4 lines at a time */
472         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
473         {
474             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
475             VEC_NEXT_LINES( );
476             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
477             {
478                 VEC_LOAD_UV( );
479                 VEC_MERGE( vec_mergeh );
480                 VEC_MERGE( vec_mergel );
481             }
482
483             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
484             VEC_LOAD_UV( );
485             VEC_MERGE( vec_mergeh );
486
487             /* Line 3 and 4, pixels 0 to 16 */
            /* Reuses the second half of the chroma vectors loaded above. */
488             VEC_NEXT_LINES( );
489             VEC_MERGE( vec_mergel );
490
491             /* Line 3 and 4, pixels 16 to ( width ) */
492             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
493             {
494                 VEC_LOAD_UV( );
495                 VEC_MERGE( vec_mergeh );
496                 VEC_MERGE( vec_mergel );
497             }
498         }
499     }
    /* In the AltiVec build this "else {" wraps the scalar code that follows;
     * its closing brace sits inside the non-SSE2 branch further down. */
500     else
501     {
502         /* Crap, use the C version */
503 #undef VEC_NEXT_LINES
504 #undef VEC_LOAD_UV
505 #undef VEC_MERGE
506 #endif
507
    /* Padding bytes at the end of each line: pitch minus visible pitch, for
     * the luma plane (p[0]), a chroma plane (p[1]) and the output plane. */
508     const int i_source_margin = p_source->p[0].i_pitch
509                                  - p_source->p[0].i_visible_pitch;
510     const int i_source_margin_c = p_source->p[1].i_pitch
511                                  - p_source->p[1].i_visible_pitch;
512     const int i_dest_margin = p_dest->p->i_pitch
513                                - p_dest->p->i_visible_pitch;
514
515 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback builds: one pass per pair of lines. */
516     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
517     {
518         p_line1 = p_line2;
519         p_line2 += p_dest->p->i_pitch;
520
521         p_y1 = p_y2;
522         p_y2 += p_source->p[Y_PLANE].i_pitch;
523
        /* Main loop: 8 pixels per iteration, either four C macro
         * invocations or one MMX invocation. */
524         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
525         {
526 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
527             C_YUV420_YVYU( );
528             C_YUV420_YVYU( );
529             C_YUV420_YVYU( );
530             C_YUV420_YVYU( );
531 #else
532             MMX_CALL( MMX_YUV420_YVYU );
533 #endif
534         }
        /* Remainder: the leftover width % 8 pixels, two per invocation. */
535         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
536         {
537             C_YUV420_YVYU( );
538         }
539
        /* Add each plane's end-of-line padding to its pointers. */
540         p_y1 += i_source_margin;
541         p_y2 += i_source_margin;
542         p_u += i_source_margin_c;
543         p_v += i_source_margin_c;
544         p_line1 += i_dest_margin;
545         p_line2 += i_dest_margin;
546     }
547
548 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
549     /* re-enable FPU registers */
550     MMX_END;
551 #endif
552
    /* Closes the AltiVec "else {" opened before the margin declarations. */
553 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
554     }
555 #endif
556
557 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
558     /*
559     ** SSE2 128 bits fetch/store instructions are faster
560     ** if memory access is 16 bytes aligned
561     */
    /* The aligned kernel is chosen only when the luma pitch, the output
     * pitch and both start addresses have their low four bits clear.
     * NOTE(review): p_u, p_v and the chroma pitch are not part of this
     * test - confirm the aligned macro does not need them aligned. */
562     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
563         ((intptr_t)p_line2|(intptr_t)p_y2))) )
564     {
565         /* use faster SSE2 aligned fetch and store */
566         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
567         {
568             p_line1 = p_line2;
569             p_line2 += p_dest->p->i_pitch;
570
571             p_y1 = p_y2;
572             p_y2 += p_source->p[Y_PLANE].i_pitch;
573
            /* 16 pixels per SSE2 invocation, then the scalar remainder. */
574             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
575             {
576                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
577             }
578             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
579             {
580                 C_YUV420_YVYU( );
581             }
582
583             p_y1 += i_source_margin;
584             p_y2 += i_source_margin;
585             p_u += i_source_margin_c;
586             p_v += i_source_margin_c;
587             p_line1 += i_dest_margin;
588             p_line2 += i_dest_margin;
589         }
590     }
591     else
592     {
593         /* use slower SSE2 unaligned fetch and store */
594         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
595         {
596             p_line1 = p_line2;
597             p_line2 += p_dest->p->i_pitch;
598
599             p_y1 = p_y2;
600             p_y2 += p_source->p[Y_PLANE].i_pitch;
601
602             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
603             {
604                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
605             }
606             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
607             {
608                 C_YUV420_YVYU( );
609             }
610
611             p_y1 += i_source_margin;
612             p_y2 += i_source_margin;
613             p_u += i_source_margin_c;
614             p_v += i_source_margin_c;
615             p_line1 += i_dest_margin;
616             p_line2 += i_dest_margin;
617         }
618     }
619     /* make sure all SSE2 stores are visible thereafter */
620     SSE2_END;
621 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
622 }
623
624 /*****************************************************************************
625  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
626  *****************************************************************************
 * p_filter: filter instance; only fmt_in.video.i_width and i_height are read.
 * p_source: input picture, read through Y_PIXELS / U_PIXELS / V_PIXELS.
 * p_dest:   output picture, written through its first plane (p_dest->p).
 *
 * Each pass of an outer loop handles one pair of lines: luma is read from
 * p_y1 / p_y2, output goes to p_line1 / p_line2, and a single pair of chroma
 * pointers p_u / p_v serves both lines. The inner kernel depends on which
 * MODULE_NAME_IS_* macro the file is built with (plain C, MMX, SSE2, AltiVec).
 *
 * C_YUV420_UYVY, MMX_CALL, MMX_YUV420_UYVY, MMX_END, SSE2_CALL,
 * SSE2_YUV420_UYVY_ALIGNED / _UNALIGNED and SSE2_END are not defined in the
 * visible part of this file (presumably they come from i420_yuy2.h).
 *****************************************************************************/
627 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
628                                            picture_t *p_dest )
629 {
    /* p_line2 and p_y2 start on the first line and act as running cursors;
     * p_line1 and p_y1 are assigned from them at the start of each pair. */
630     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
631     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
632     uint8_t *p_u = p_source->U_PIXELS;
633     uint8_t *p_v = p_source->V_PIXELS;
634
635     int i_x, i_y;
636
637 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* VEC_NEXT_LINES: move both output and both luma pointers to the next
     * pair of lines, using the full pitch for the second line. */
638 #define VEC_NEXT_LINES( ) \
639     p_line1  = p_line2; \
640     p_line2 += p_dest->p->i_pitch; \
641     p_y1     = p_y2; \
642     p_y2    += p_source->p[Y_PLANE].i_pitch;
643
    /* VEC_LOAD_UV: load 16 bytes from each chroma plane, advance both by 16. */
644 #define VEC_LOAD_UV( ) \
645     u_vec = vec_ld( 0, p_u ); p_u += 16; \
646     v_vec = vec_ld( 0, p_v ); p_v += 16;
647
    /* VEC_MERGE( a ): "a" is vec_mergeh or vec_mergel and picks which half
     * of the loaded U/V vectors gets interleaved (U first). For this layout
     * the chroma vector is the first merge operand and luma the second,
     * storing 32 bytes to p_line1 and 32 bytes to p_line2. */
648 #define VEC_MERGE( a ) \
649     uv_vec = a( u_vec, v_vec ); \
650     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
651     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
652     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
653     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
654     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
655     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
656
657     vector unsigned char u_vec;
658     vector unsigned char v_vec;
659     vector unsigned char uv_vec;
660     vector unsigned char y_vec;
661
    /* Vector path: one inner iteration covers 32 pixels on both lines.
     * NOTE(review): these loops never add the pitch margins that the scalar
     * path below applies, so they appear to assume pitch == visible pitch
     * on every plane - confirm. */
662     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
663            ( p_filter->fmt_in.video.i_height % 2 ) ) )
664     {
665         /* Width is a multiple of 32, we take 2 lines at a time */
666         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
667         {
668             VEC_NEXT_LINES( );
669             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
670             {
671                 VEC_LOAD_UV( );
672                 VEC_MERGE( vec_mergeh );
673                 VEC_MERGE( vec_mergel );
674             }
675         }
676     }
    /* NOTE(review): the structurally identical width % 16 branch in
     * I420_YUY2 is compiled out behind a FIXME saying it is broken on
     * AltiVec; here it is still live - confirm whether it should be
     * disabled as well. */
677     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
678                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
679     {
680         /* Width is only a multiple of 16, we take 4 lines at a time */
681         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
682         {
683             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
684             VEC_NEXT_LINES( );
685             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
686             {
687                 VEC_LOAD_UV( );
688                 VEC_MERGE( vec_mergeh );
689                 VEC_MERGE( vec_mergel );
690             }
691
692             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
693             VEC_LOAD_UV( );
694             VEC_MERGE( vec_mergeh );
695
696             /* Line 3 and 4, pixels 0 to 16 */
            /* Reuses the second half of the chroma vectors loaded above. */
697             VEC_NEXT_LINES( );
698             VEC_MERGE( vec_mergel );
699
700             /* Line 3 and 4, pixels 16 to ( width ) */
701             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
702             {
703                 VEC_LOAD_UV( );
704                 VEC_MERGE( vec_mergeh );
705                 VEC_MERGE( vec_mergel );
706             }
707         }
708     }
    /* In the AltiVec build this "else {" wraps the scalar code that follows;
     * its closing brace sits inside the non-SSE2 branch further down. */
709     else
710     {
711         /* Crap, use the C version */
712 #undef VEC_NEXT_LINES
713 #undef VEC_LOAD_UV
714 #undef VEC_MERGE
715 #endif
716
    /* Padding bytes at the end of each line: pitch minus visible pitch, for
     * the luma plane (p[0]), a chroma plane (p[1]) and the output plane. */
717     const int i_source_margin = p_source->p[0].i_pitch
718                                  - p_source->p[0].i_visible_pitch;
719     const int i_source_margin_c = p_source->p[1].i_pitch
720                                  - p_source->p[1].i_visible_pitch;
721     const int i_dest_margin = p_dest->p->i_pitch
722                                - p_dest->p->i_visible_pitch;
723
724 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback builds: one pass per pair of lines. */
725     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
726     {
727         p_line1 = p_line2;
728         p_line2 += p_dest->p->i_pitch;
729
730         p_y1 = p_y2;
731         p_y2 += p_source->p[Y_PLANE].i_pitch;
732
        /* Main loop: 8 pixels per iteration, either four C macro
         * invocations or one MMX invocation. */
733         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
734         {
735 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
736             C_YUV420_UYVY( );
737             C_YUV420_UYVY( );
738             C_YUV420_UYVY( );
739             C_YUV420_UYVY( );
740 #else
741             MMX_CALL( MMX_YUV420_UYVY );
742 #endif
743         }
        /* Remainder: the leftover width % 8 pixels, two per invocation. */
744         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
745         {
746             C_YUV420_UYVY( );
747         }
748
        /* Add each plane's end-of-line padding to its pointers. */
749         p_y1 += i_source_margin;
750         p_y2 += i_source_margin;
751         p_u += i_source_margin_c;
752         p_v += i_source_margin_c;
753         p_line1 += i_dest_margin;
754         p_line2 += i_dest_margin;
755     }
756
757 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
758     /* re-enable FPU registers */
759     MMX_END;
760 #endif
761
    /* Closes the AltiVec "else {" opened before the margin declarations. */
762 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
763     }
764 #endif
765
766 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
767     /*
768     ** SSE2 128 bits fetch/store instructions are faster
769     ** if memory access is 16 bytes aligned
770     */
    /* The aligned kernel is chosen only when the luma pitch, the output
     * pitch and both start addresses have their low four bits clear.
     * NOTE(review): p_u, p_v and the chroma pitch are not part of this
     * test - confirm the aligned macro does not need them aligned. */
771     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
772         ((intptr_t)p_line2|(intptr_t)p_y2))) )
773     {
774         /* use faster SSE2 aligned fetch and store */
775         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
776         {
777             p_line1 = p_line2;
778             p_line2 += p_dest->p->i_pitch;
779
780             p_y1 = p_y2;
781             p_y2 += p_source->p[Y_PLANE].i_pitch;
782
            /* 16 pixels per SSE2 invocation, then the scalar remainder. */
783             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
784             {
785                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
786             }
787             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
788             {
789                 C_YUV420_UYVY( );
790             }
791
792             p_y1 += i_source_margin;
793             p_y2 += i_source_margin;
794             p_u += i_source_margin_c;
795             p_v += i_source_margin_c;
796             p_line1 += i_dest_margin;
797             p_line2 += i_dest_margin;
798         }
799     }
800     else
801     {
802         /* use slower SSE2 unaligned fetch and store */
803         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
804         {
805             p_line1 = p_line2;
806             p_line2 += p_dest->p->i_pitch;
807
808             p_y1 = p_y2;
809             p_y2 += p_source->p[Y_PLANE].i_pitch;
810
811             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
812             {
813                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
814             }
815             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
816             {
817                 C_YUV420_UYVY( );
818             }
819
820             p_y1 += i_source_margin;
821             p_y2 += i_source_margin;
822             p_u += i_source_margin_c;
823             p_v += i_source_margin_c;
824             p_line1 += i_dest_margin;
825             p_line2 += i_dest_margin;
826         }
827     }
828     /* make sure all SSE2 stores are visible thereafter */
829     SSE2_END;
830 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
831 }
832
833 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
834 /*****************************************************************************
835  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
836  *****************************************************************************
 * Unimplemented stub: it neither reads p_source nor writes p_dest, and only
 * logs an error through msg_Err() each time it is called.
 *
 * NOTE(review): Activate() still installs I420_IUYV_Filter for the
 * 'I','U','Y','V' output FOURCC, so that conversion is accepted even though
 * no pixels are produced here - confirm this is intended.
 *
 * p_filter: filter instance, used only as the logging object.
 * p_source: unused.
 * p_dest:   unused.
 *****************************************************************************/
837 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
838                                            picture_t *p_dest )
839 {
840     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
841     /* FIXME: TODO ! */
842     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
843 }
844
845 /*****************************************************************************
846  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
847  *****************************************************************************/
848 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
849                                            picture_t *p_dest )
850 {
851     uint8_t *p_line1 = p_dest->p->p_pixels +
852                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
853                        + p_dest->p->i_pitch;
854     uint8_t *p_line2 = p_dest->p->p_pixels +
855                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
856     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
857     uint8_t *p_u = p_source->U_PIXELS;
858     uint8_t *p_v = p_source->V_PIXELS;
859
860     int i_x, i_y;
861
862     const int i_source_margin = p_source->p[0].i_pitch
863                                  - p_source->p[0].i_visible_pitch;
864     const int i_source_margin_c = p_source->p[1].i_pitch
865                                  - p_source->p[1].i_visible_pitch;
866     const int i_dest_margin = p_dest->p->i_pitch
867                                - p_dest->p->i_visible_pitch;
868
869 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
870     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
871     {
872         p_line1 -= 3 * p_dest->p->i_pitch;
873         p_line2 -= 3 * p_dest->p->i_pitch;
874
875         p_y1 = p_y2;
876         p_y2 += p_source->p[Y_PLANE].i_pitch;
877
878         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
879         {
880 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
881             C_YUV420_UYVY( );
882             C_YUV420_UYVY( );
883             C_YUV420_UYVY( );
884             C_YUV420_UYVY( );
885 #else
886             MMX_CALL( MMX_YUV420_UYVY );
887 #endif
888         }
889         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
890         {
891             C_YUV420_UYVY( );
892         }
893
894         p_y1 += i_source_margin;
895         p_y2 += i_source_margin;
896         p_u += i_source_margin_c;
897         p_v += i_source_margin_c;
898         p_line1 += i_dest_margin;
899         p_line2 += i_dest_margin;
900     }
901
902 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
903     /* re-enable FPU registers */
904     MMX_END;
905 #endif
906
907 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
908     /*
909     ** SSE2 128 bits fetch/store instructions are faster
910     ** if memory access is 16 bytes aligned
911     */
912     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
913         ((intptr_t)p_line2|(intptr_t)p_y2))) )
914     {
915         /* use faster SSE2 aligned fetch and store */
916         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
917         {
918             p_line1 = p_line2;
919             p_line2 += p_dest->p->i_pitch;
920
921             p_y1 = p_y2;
922             p_y2 += p_source->p[Y_PLANE].i_pitch;
923
924             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
925             {
926                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
927             }
928             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
929             {
930                 C_YUV420_UYVY( );
931             }
932
933             p_y1 += i_source_margin;
934             p_y2 += i_source_margin;
935             p_u += i_source_margin_c;
936             p_v += i_source_margin_c;
937             p_line1 += i_dest_margin;
938             p_line2 += i_dest_margin;
939         }
940     }
941     else
942     {
943         /* use slower SSE2 unaligned fetch and store */
944         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
945         {
946             p_line1 = p_line2;
947             p_line2 += p_dest->p->i_pitch;
948
949             p_y1 = p_y2;
950             p_y2 += p_source->p[Y_PLANE].i_pitch;
951
952             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
953             {
954                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
955             }
956             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
957             {
958                 C_YUV420_UYVY( );
959             }
960
961             p_y1 += i_source_margin;
962             p_y2 += i_source_margin;
963             p_u += i_source_margin_c;
964             p_v += i_source_margin_c;
965             p_line1 += i_dest_margin;
966             p_line2 += i_dest_margin;
967         }
968     }
969     /* make sure all SSE2 stores are visible thereafter */
970     SSE2_END;
971 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
972 }
973 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
974
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Only compiled for the plain C flavour of the module.
 * The picture is walked top-down, two source luma rows and two destination
 * rows per pass; the per-pixel work is done by C_YUV420_Y211() (defined in
 * i420_yuy2.h), which refers to p_line1, p_line2, p_y1, p_y2, p_u and p_v by
 * name, so these locals must keep their names.
 *
 * p_filter: provides the frame size (fmt_in.video.i_width / i_height)
 * p_source: planar input picture
 * p_dest:   packed output picture (written through its first plane)
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                       picture_t *p_dest )
{
    /* distance between the starts of two consecutive rows */
    const int i_luma_step = p_source->p[Y_PLANE].i_pitch;
    const int i_out_step  = p_dest->p->i_pitch;

    /* padding found after the visible part of each row */
    const int i_luma_pad   = p_source->p[0].i_pitch
                              - p_source->p[0].i_visible_pitch;
    const int i_chroma_pad = p_source->p[1].i_pitch
                              - p_source->p[1].i_visible_pitch;
    const int i_out_pad    = p_dest->p->i_pitch
                              - p_dest->p->i_visible_pitch;

    uint8_t *p_line1;
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_pairs_left;   /* row pairs still to convert */
    int i_groups_left;  /* groups of 8 pixels still to convert in the row */

    for( i_pairs_left = p_filter->fmt_in.video.i_height / 2;
         i_pairs_left != 0; i_pairs_left-- )
    {
        /* the second row of the previous pass ended where the first row of
         * this pass begins */
        p_line1 = p_line2;
        p_line2 = p_line2 + i_out_step;

        p_y1 = p_y2;
        p_y2 = p_y2 + i_luma_step;

        for( i_groups_left = p_filter->fmt_in.video.i_width / 8;
             i_groups_left != 0; i_groups_left-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* step over the padding of every plane */
        p_line2 += i_out_pad;
        p_line1 += i_out_pad;
        p_v += i_chroma_pad;
        p_u += i_chroma_pad;
        p_y2 += i_luma_pad;
        p_y1 += i_luma_pad;
    }
}
#endif