]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Qt: ToolbarEditDialog: show tooltip on widgets list
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 VLC authors and VideoLAN
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_cpu.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #    define VLC_TARGET
49 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #    define VLC_TARGET VLC_MMX
52 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
54 #    define VLC_TARGET VLC_SSE
55 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
56 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
57 #    define VLC_TARGET
58 #endif
59
60 /*****************************************************************************
61  * Local and extern prototypes.
62  *****************************************************************************/
63 static int  Activate ( vlc_object_t * );
64
65 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
66 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
67 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
68 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
69 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
70 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
71 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
72 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
73 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
75 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
76 #endif
77 #if defined (MODULE_NAME_IS_i420_yuy2)
78 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
79 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
80 #endif
81
82 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
83 /* Initialize MMX-specific constants */
84 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
85 static const uint64_t i_80w   = 0x0000000080808080ULL;
86 #endif
87
88 /*****************************************************************************
89  * Module descriptor: description, capability and vlc_CPU_capable() (used by Activate) are chosen per MODULE_NAME_IS_* build; every variant marks its description with N_().
90  *****************************************************************************/
91 vlc_module_begin ()
92 #if defined (MODULE_NAME_IS_i420_yuy2)
93     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
94     set_capability( "video filter2", 80 )
95 # define vlc_CPU_capable() (true)
96 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
97     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
98     set_capability( "video filter2", 160 )
99 # define vlc_CPU_capable() vlc_CPU_MMX()
100 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
101     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
102     set_capability( "video filter2", 250 )
103 # define vlc_CPU_capable() vlc_CPU_SSE2()
104 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
105     set_description(
106             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) )
107     set_capability( "video filter2", 250 )
108 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
109 #endif
110     set_callbacks( Activate, NULL )
111 vlc_module_end ()
112
113 /*****************************************************************************
114  * Activate: allocate a chroma function
115  *****************************************************************************
116  * This function allocates and initializes a chroma function
117  *****************************************************************************/
118 static int Activate( vlc_object_t *p_this )
119 {
120     filter_t *p_filter = (filter_t *)p_this;
121
122     if( !vlc_CPU_capable() )
123         return VLC_EGENERIC;
124     if( p_filter->fmt_in.video.i_width & 1
125      || p_filter->fmt_in.video.i_height & 1 )
126     {
127         return -1;
128     }
129
130     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
131      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
132         return -1;
133
134     switch( p_filter->fmt_in.video.i_chroma )
135     {
136         case VLC_CODEC_YV12:
137         case VLC_CODEC_I420:
138             switch( p_filter->fmt_out.video.i_chroma )
139             {
140                 case VLC_CODEC_YUYV:
141                     p_filter->pf_video_filter = I420_YUY2_Filter;
142                     break;
143
144                 case VLC_CODEC_YVYU:
145                     p_filter->pf_video_filter = I420_YVYU_Filter;
146                     break;
147
148                 case VLC_CODEC_UYVY:
149                     p_filter->pf_video_filter = I420_UYVY_Filter;
150                     break;
151 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
152                 case VLC_FOURCC('I','U','Y','V'):
153                     p_filter->pf_video_filter = I420_IUYV_Filter;
154                     break;
155
156                 case VLC_CODEC_CYUV:
157                     p_filter->pf_video_filter = I420_cyuv_Filter;
158                     break;
159 #endif
160
161 #if defined (MODULE_NAME_IS_i420_yuy2)
162                 case VLC_CODEC_Y211:
163                     p_filter->pf_video_filter = I420_Y211_Filter;
164                     break;
165 #endif
166
167                 default:
168                     return -1;
169             }
170             break;
171
172         default:
173             return -1;
174     }
175
176     return 0;
177 }
178
179 #if 0 /* disabled x86-only profiling helper: returns the time-stamp counter read with RDTSC */
180 static inline unsigned long long read_cycles(void)
181 {
182     unsigned int i_lo, i_hi; /* RDTSC leaves the low half in EAX and the high half in EDX */
183     __asm__ __volatile__("rdtsc" : "=a" (i_lo), "=d" (i_hi)); /* "=A" only names the EDX:EAX pair on 32-bit x86 */
184
185     return ((unsigned long long)i_hi << 32) | i_lo;
186 }
187 #endif
188
189 /* Following functions are local. VIDEO_FILTER_WRAPPER is defined outside this file; it presumably generates the I420_*_Filter entry points that Activate() installs - TODO confirm */
190
191 VIDEO_FILTER_WRAPPER( I420_YUY2 )
192 VIDEO_FILTER_WRAPPER( I420_YVYU )
193 VIDEO_FILTER_WRAPPER( I420_UYVY )
194 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
195 VIDEO_FILTER_WRAPPER( I420_IUYV )
196 VIDEO_FILTER_WRAPPER( I420_cyuv )
197 #endif
198 #if defined (MODULE_NAME_IS_i420_yuy2)
199 VIDEO_FILTER_WRAPPER( I420_Y211 )
200 #endif
201
202 /*****************************************************************************
203  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 (converts p_source into p_dest, two lines per pass)
204  *****************************************************************************/
205 VLC_TARGET
206 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
207                                            picture_t *p_dest )
208 {
209     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; /* output cursors for the two lines of the current pair */
210     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; /* luma cursors for the two lines of the current pair */
211     uint8_t *p_u = p_source->U_PIXELS; /* chroma cursors: the C/MMX/SSE2 loops only add the line margin to them, */
212     uint8_t *p_v = p_source->V_PIXELS; /* so the conversion macros presumably advance them - TODO confirm */
213
214     int i_x, i_y;
215
216 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
217 #define VEC_NEXT_LINES( ) \
218     p_line1  = p_line2; \
219     p_line2 += p_dest->p->i_pitch; \
220     p_y1     = p_y2; \
221     p_y2    += p_source->p[Y_PLANE].i_pitch;
222
223 #define VEC_LOAD_UV( ) \
224     u_vec = vec_ld( 0, p_u ); p_u += 16; \
225     v_vec = vec_ld( 0, p_v ); p_v += 16;
226
227 #define VEC_MERGE( a ) \
228     uv_vec = a( u_vec, v_vec ); \
229     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
230     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
231     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
232     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
233     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
234     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
235
236     vector unsigned char u_vec;
237     vector unsigned char v_vec;
238     vector unsigned char uv_vec;
239     vector unsigned char y_vec;
240
241     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
242            ( p_filter->fmt_in.video.i_height % 2 ) ) )
243     {
244         /* Width is a multiple of 32, we take 2 lines at a time. NOTE(review): unlike the C path below, no pitch margin is applied here, so this looks like it assumes pitch == visible width - confirm */
245         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
246         {
247             VEC_NEXT_LINES( );
248             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
249             {
250                 VEC_LOAD_UV( );
251                 VEC_MERGE( vec_mergeh );
252                 VEC_MERGE( vec_mergel );
253             }
254         }
255     }
256 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
257 #if 0
258     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
259                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
260     {
261         /* Width is only a multiple of 16, we take 4 lines at a time */
262         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
263         {
264             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
265             VEC_NEXT_LINES( );
266             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
267             {
268                 VEC_LOAD_UV( );
269                 VEC_MERGE( vec_mergeh );
270                 VEC_MERGE( vec_mergel );
271             }
272
273             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
274             VEC_LOAD_UV( );
275             VEC_MERGE( vec_mergeh );
276
277             /* Line 3 and 4, pixels 0 to 16 */
278             VEC_NEXT_LINES( );
279             VEC_MERGE( vec_mergel );
280
281             /* Line 3 and 4, pixels 16 to ( width ) */
282             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
283             {
284                 VEC_LOAD_UV( );
285                 VEC_MERGE( vec_mergeh );
286                 VEC_MERGE( vec_mergel );
287             }
288         }
289     }
290 #endif
291     else
292     {
293         /* No vector case matched: use the C version */
294 #undef VEC_NEXT_LINES
295 #undef VEC_LOAD_UV
296 #undef VEC_MERGE
297 #endif
298
299     const int i_source_margin = p_source->p[0].i_pitch
300                                  - p_source->p[0].i_visible_pitch; /* luma bytes between the visible part of a line and the next line */
301     const int i_source_margin_c = p_source->p[1].i_pitch
302                                  - p_source->p[1].i_visible_pitch; /* same for the chroma plane p[1]; also applied to p_v */
303     const int i_dest_margin = p_dest->p->i_pitch
304                                - p_dest->p->i_visible_pitch; /* same for the packed output */
305
306 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
307     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) /* one pass per pair of lines */
308     {
309         p_line1 = p_line2;
310         p_line2 += p_dest->p->i_pitch;
311
312         p_y1 = p_y2;
313         p_y2 += p_source->p[Y_PLANE].i_pitch;
314
315 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
316         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; ) /* 8 pixels of each line per iteration */
317         {
318             C_YUV420_YUYV( );
319             C_YUV420_YUYV( );
320             C_YUV420_YUYV( );
321             C_YUV420_YUYV( );
322         }
323 #else
324         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) /* 8 pixels of each line per iteration */
325         {
326             MMX_CALL( MMX_YUV420_YUYV );
327         }
328 #endif
329         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) /* remaining ( width % 8 ) pixels, two per call */
330         {
331             C_YUV420_YUYV( );
332         }
333
334         p_y1 += i_source_margin; /* skip the padding after the visible part of each line */
335         p_y2 += i_source_margin;
336         p_u += i_source_margin_c;
337         p_v += i_source_margin_c;
338         p_line1 += i_dest_margin;
339         p_line2 += i_dest_margin;
340     }
341
342 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
343     /* re-enable FPU registers */
344     MMX_END;
345 #endif
346
347 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
348     } /* closes the C fallback opened by the AltiVec else branch above */
349 #endif
350
351 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
352     /*
353     ** SSE2 128 bits fetch/store instructions are faster
354     ** if memory access is 16 bytes aligned
355     */
356
357     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
358         ((intptr_t)p_line2|(intptr_t)p_y2))) ) /* true when both pitches and both start addresses are multiples of 16 */
359     {
360         /* use faster SSE2 aligned fetch and store */
361         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
362         {
363             p_line1 = p_line2;
364             p_line2 += p_dest->p->i_pitch;
365
366             p_y1 = p_y2;
367             p_y2 += p_source->p[Y_PLANE].i_pitch;
368
369             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) /* 16 pixels of each line per iteration */
370             {
371                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
372             }
373             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) /* remaining ( width % 16 ) pixels with the C macro, two per call */
374             {
375                 C_YUV420_YUYV( );
376             }
377
378             p_y1 += i_source_margin;
379             p_y2 += i_source_margin;
380             p_u += i_source_margin_c;
381             p_v += i_source_margin_c;
382             p_line1 += i_dest_margin;
383             p_line2 += i_dest_margin;
384         }
385     }
386     else
387     {
388         /* use slower SSE2 unaligned fetch and store */
389         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
390         {
391             p_line1 = p_line2;
392             p_line2 += p_dest->p->i_pitch;
393
394             p_y1 = p_y2;
395             p_y2 += p_source->p[Y_PLANE].i_pitch;
396
397             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
398             {
399                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
400             }
401             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
402             {
403                 C_YUV420_YUYV( );
404             }
405
406             p_y1 += i_source_margin;
407             p_y2 += i_source_margin;
408             p_u += i_source_margin_c;
409             p_v += i_source_margin_c;
410             p_line1 += i_dest_margin;
411             p_line2 += i_dest_margin;
412         }
413     }
414     /* make sure all SSE2 stores are visible thereafter */
415     SSE2_END;
416
417 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
418 }
419
420 /*****************************************************************************
421  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2 (converts p_source into p_dest, two lines per pass)
422  *****************************************************************************/
423 VLC_TARGET
424 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
425                                            picture_t *p_dest )
426 {
427     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; /* output cursors for the two lines of the current pair */
428     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; /* luma cursors for the two lines of the current pair */
429     uint8_t *p_u = p_source->U_PIXELS; /* chroma cursors: the C/MMX/SSE2 loops only add the line margin to them, */
430     uint8_t *p_v = p_source->V_PIXELS; /* so the conversion macros presumably advance them - TODO confirm */
431
432     int i_x, i_y;
433
434 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
435 #define VEC_NEXT_LINES( ) \
436     p_line1  = p_line2; \
437     p_line2 += p_dest->p->i_pitch; \
438     p_y1     = p_y2; \
439     p_y2    += p_source->p[Y_PLANE].i_pitch;
440
441 #define VEC_LOAD_UV( ) \
442     u_vec = vec_ld( 0, p_u ); p_u += 16; \
443     v_vec = vec_ld( 0, p_v ); p_v += 16;
444
445 #define VEC_MERGE( a ) \
446     vu_vec = a( v_vec, u_vec ); \
447     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
448     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
449     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
450     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
451     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
452     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
453
454     vector unsigned char u_vec;
455     vector unsigned char v_vec;
456     vector unsigned char vu_vec;
457     vector unsigned char y_vec;
458
459     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
460            ( p_filter->fmt_in.video.i_height % 2 ) ) )
461     {
462         /* Width is a multiple of 32, we take 2 lines at a time. NOTE(review): unlike the C path below, no pitch margin is applied here, so this looks like it assumes pitch == visible width - confirm */
463         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
464         {
465             VEC_NEXT_LINES( );
466             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
467             {
468                 VEC_LOAD_UV( );
469                 VEC_MERGE( vec_mergeh );
470                 VEC_MERGE( vec_mergel );
471             }
472         }
473     }
474     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
475                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
476     {
477         /* Width is only a multiple of 16, we take 4 lines at a time. NOTE(review): I420_YUY2 compiles the same branch out with a FIXME saying it is broken on AltiVec - confirm whether this one is affected too */
478         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
479         {
480             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
481             VEC_NEXT_LINES( );
482             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
483             {
484                 VEC_LOAD_UV( );
485                 VEC_MERGE( vec_mergeh );
486                 VEC_MERGE( vec_mergel );
487             }
488
489             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
490             VEC_LOAD_UV( );
491             VEC_MERGE( vec_mergeh );
492
493             /* Line 3 and 4, pixels 0 to 16; reuses the second half of the chroma vectors loaded just above */
494             VEC_NEXT_LINES( );
495             VEC_MERGE( vec_mergel );
496
497             /* Line 3 and 4, pixels 16 to ( width ) */
498             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
499             {
500                 VEC_LOAD_UV( );
501                 VEC_MERGE( vec_mergeh );
502                 VEC_MERGE( vec_mergel );
503             }
504         }
505     }
506     else
507     {
508         /* No vector case matched: use the C version */
509 #undef VEC_NEXT_LINES
510 #undef VEC_LOAD_UV
511 #undef VEC_MERGE
512 #endif
513
514     const int i_source_margin = p_source->p[0].i_pitch
515                                  - p_source->p[0].i_visible_pitch; /* luma bytes between the visible part of a line and the next line */
516     const int i_source_margin_c = p_source->p[1].i_pitch
517                                  - p_source->p[1].i_visible_pitch; /* same for the chroma plane p[1]; also applied to p_v */
518     const int i_dest_margin = p_dest->p->i_pitch
519                                - p_dest->p->i_visible_pitch; /* same for the packed output */
520
521 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
522     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) /* one pass per pair of lines */
523     {
524         p_line1 = p_line2;
525         p_line2 += p_dest->p->i_pitch;
526
527         p_y1 = p_y2;
528         p_y2 += p_source->p[Y_PLANE].i_pitch;
529
530         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) /* 8 pixels of each line per iteration */
531         {
532 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
533             C_YUV420_YVYU( );
534             C_YUV420_YVYU( );
535             C_YUV420_YVYU( );
536             C_YUV420_YVYU( );
537 #else
538             MMX_CALL( MMX_YUV420_YVYU );
539 #endif
540         }
541         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) /* remaining ( width % 8 ) pixels, two per call */
542         {
543             C_YUV420_YVYU( );
544         }
545
546         p_y1 += i_source_margin; /* skip the padding after the visible part of each line */
547         p_y2 += i_source_margin;
548         p_u += i_source_margin_c;
549         p_v += i_source_margin_c;
550         p_line1 += i_dest_margin;
551         p_line2 += i_dest_margin;
552     }
553
554 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
555     /* re-enable FPU registers */
556     MMX_END;
557 #endif
558
559 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
560     } /* closes the C fallback opened by the AltiVec else branch above */
561 #endif
562
563 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
564     /*
565     ** SSE2 128 bits fetch/store instructions are faster
566     ** if memory access is 16 bytes aligned
567     */
568     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
569         ((intptr_t)p_line2|(intptr_t)p_y2))) ) /* true when both pitches and both start addresses are multiples of 16 */
570     {
571         /* use faster SSE2 aligned fetch and store */
572         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
573         {
574             p_line1 = p_line2;
575             p_line2 += p_dest->p->i_pitch;
576
577             p_y1 = p_y2;
578             p_y2 += p_source->p[Y_PLANE].i_pitch;
579
580             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) /* 16 pixels of each line per iteration */
581             {
582                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
583             }
584             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) /* remaining ( width % 16 ) pixels with the C macro, two per call */
585             {
586                 C_YUV420_YVYU( );
587             }
588
589             p_y1 += i_source_margin;
590             p_y2 += i_source_margin;
591             p_u += i_source_margin_c;
592             p_v += i_source_margin_c;
593             p_line1 += i_dest_margin;
594             p_line2 += i_dest_margin;
595         }
596     }
597     else
598     {
599         /* use slower SSE2 unaligned fetch and store */
600         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
601         {
602             p_line1 = p_line2;
603             p_line2 += p_dest->p->i_pitch;
604
605             p_y1 = p_y2;
606             p_y2 += p_source->p[Y_PLANE].i_pitch;
607
608             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
609             {
610                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
611             }
612             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
613             {
614                 C_YUV420_YVYU( );
615             }
616
617             p_y1 += i_source_margin;
618             p_y2 += i_source_margin;
619             p_u += i_source_margin_c;
620             p_v += i_source_margin_c;
621             p_line1 += i_dest_margin;
622             p_line2 += i_dest_margin;
623         }
624     }
625     /* make sure all SSE2 stores are visible thereafter */
626     SSE2_END;
627 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
628 }
629
630 /*****************************************************************************
631  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2 (converts p_source into p_dest, two lines per pass)
632  *****************************************************************************/
633 VLC_TARGET
634 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
635                                            picture_t *p_dest )
636 {
637     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; /* output cursors for the two lines of the current pair */
638     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; /* luma cursors for the two lines of the current pair */
639     uint8_t *p_u = p_source->U_PIXELS; /* chroma cursors: the C/MMX/SSE2 loops only add the line margin to them, */
640     uint8_t *p_v = p_source->V_PIXELS; /* so the conversion macros presumably advance them - TODO confirm */
641
642     int i_x, i_y;
643
644 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
645 #define VEC_NEXT_LINES( ) \
646     p_line1  = p_line2; \
647     p_line2 += p_dest->p->i_pitch; \
648     p_y1     = p_y2; \
649     p_y2    += p_source->p[Y_PLANE].i_pitch;
650
651 #define VEC_LOAD_UV( ) \
652     u_vec = vec_ld( 0, p_u ); p_u += 16; \
653     v_vec = vec_ld( 0, p_v ); p_v += 16;
654
655 #define VEC_MERGE( a ) \
656     uv_vec = a( u_vec, v_vec ); \
657     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
658     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
659     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
660     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
661     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
662     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
663
664     vector unsigned char u_vec;
665     vector unsigned char v_vec;
666     vector unsigned char uv_vec;
667     vector unsigned char y_vec;
668
669     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
670            ( p_filter->fmt_in.video.i_height % 2 ) ) )
671     {
672         /* Width is a multiple of 32, we take 2 lines at a time. NOTE(review): unlike the C path below, no pitch margin is applied here, so this looks like it assumes pitch == visible width - confirm */
673         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
674         {
675             VEC_NEXT_LINES( );
676             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
677             {
678                 VEC_LOAD_UV( );
679                 VEC_MERGE( vec_mergeh );
680                 VEC_MERGE( vec_mergel );
681             }
682         }
683     }
684     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
685                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
686     {
687         /* Width is only a multiple of 16, we take 4 lines at a time. NOTE(review): I420_YUY2 compiles the same branch out with a FIXME saying it is broken on AltiVec - confirm whether this one is affected too */
688         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
689         {
690             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
691             VEC_NEXT_LINES( );
692             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
693             {
694                 VEC_LOAD_UV( );
695                 VEC_MERGE( vec_mergeh );
696                 VEC_MERGE( vec_mergel );
697             }
698
699             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
700             VEC_LOAD_UV( );
701             VEC_MERGE( vec_mergeh );
702
703             /* Line 3 and 4, pixels 0 to 16; reuses the second half of the chroma vectors loaded just above */
704             VEC_NEXT_LINES( );
705             VEC_MERGE( vec_mergel );
706
707             /* Line 3 and 4, pixels 16 to ( width ) */
708             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
709             {
710                 VEC_LOAD_UV( );
711                 VEC_MERGE( vec_mergeh );
712                 VEC_MERGE( vec_mergel );
713             }
714         }
715     }
716     else
717     {
718         /* No vector case matched: use the C version */
719 #undef VEC_NEXT_LINES
720 #undef VEC_LOAD_UV
721 #undef VEC_MERGE
722 #endif
723
724     const int i_source_margin = p_source->p[0].i_pitch
725                                  - p_source->p[0].i_visible_pitch; /* luma bytes between the visible part of a line and the next line */
726     const int i_source_margin_c = p_source->p[1].i_pitch
727                                  - p_source->p[1].i_visible_pitch; /* same for the chroma plane p[1]; also applied to p_v */
728     const int i_dest_margin = p_dest->p->i_pitch
729                                - p_dest->p->i_visible_pitch; /* same for the packed output */
730
731 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
732     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) /* one pass per pair of lines */
733     {
734         p_line1 = p_line2;
735         p_line2 += p_dest->p->i_pitch;
736
737         p_y1 = p_y2;
738         p_y2 += p_source->p[Y_PLANE].i_pitch;
739
740         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) /* 8 pixels of each line per iteration */
741         {
742 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
743             C_YUV420_UYVY( );
744             C_YUV420_UYVY( );
745             C_YUV420_UYVY( );
746             C_YUV420_UYVY( );
747 #else
748             MMX_CALL( MMX_YUV420_UYVY );
749 #endif
750         }
751         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; ) /* remaining ( width % 8 ) pixels, two per call */
752         {
753             C_YUV420_UYVY( );
754         }
755
756         p_y1 += i_source_margin; /* skip the padding after the visible part of each line */
757         p_y2 += i_source_margin;
758         p_u += i_source_margin_c;
759         p_v += i_source_margin_c;
760         p_line1 += i_dest_margin;
761         p_line2 += i_dest_margin;
762     }
763
764 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
765     /* re-enable FPU registers */
766     MMX_END;
767 #endif
768
769 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
770     } /* closes the C fallback opened by the AltiVec else branch above */
771 #endif
772
773 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
774     /*
775     ** SSE2 128 bits fetch/store instructions are faster
776     ** if memory access is 16 bytes aligned
777     */
778     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
779         ((intptr_t)p_line2|(intptr_t)p_y2))) ) /* true when both pitches and both start addresses are multiples of 16 */
780     {
781         /* use faster SSE2 aligned fetch and store */
782         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
783         {
784             p_line1 = p_line2;
785             p_line2 += p_dest->p->i_pitch;
786
787             p_y1 = p_y2;
788             p_y2 += p_source->p[Y_PLANE].i_pitch;
789
790             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) /* 16 pixels of each line per iteration */
791             {
792                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
793             }
794             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) /* remaining ( width % 16 ) pixels with the C macro, two per call */
795             {
796                 C_YUV420_UYVY( );
797             }
798
799             p_y1 += i_source_margin;
800             p_y2 += i_source_margin;
801             p_u += i_source_margin_c;
802             p_v += i_source_margin_c;
803             p_line1 += i_dest_margin;
804             p_line2 += i_dest_margin;
805         }
806     }
807     else
808     {
809         /* use slower SSE2 unaligned fetch and store */
810         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
811         {
812             p_line1 = p_line2;
813             p_line2 += p_dest->p->i_pitch;
814
815             p_y1 = p_y2;
816             p_y2 += p_source->p[Y_PLANE].i_pitch;
817
818             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
819             {
820                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
821             }
822             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
823             {
824                 C_YUV420_UYVY( );
825             }
826
827             p_y1 += i_source_margin;
828             p_y2 += i_source_margin;
829             p_u += i_source_margin_c;
830             p_v += i_source_margin_c;
831             p_line1 += i_dest_margin;
832             p_line2 += i_dest_margin;
833         }
834     }
835     /* make sure all SSE2 stores are visible thereafter */
836     SSE2_END;
837 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
838 }
839
840 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
841 /*****************************************************************************
842  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2 (placeholder, converts nothing)
843  *****************************************************************************/
844 static void I420_IUYV( filter_t *p_filter,
845                        picture_t *p_source, picture_t *p_dest )
846 {
847     VLC_UNUSED(p_dest); VLC_UNUSED(p_source); /* neither picture is read or written */
848     /* Not implemented yet: the only effect is the error message below */
849     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
850 }
851
852 /*****************************************************************************
853  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
854  *****************************************************************************/
855 VLC_TARGET
856 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
857                                            picture_t *p_dest )
858 {
859     uint8_t *p_line1 = p_dest->p->p_pixels +
860                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
861                        + p_dest->p->i_pitch;
862     uint8_t *p_line2 = p_dest->p->p_pixels +
863                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
864     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
865     uint8_t *p_u = p_source->U_PIXELS;
866     uint8_t *p_v = p_source->V_PIXELS;
867
868     int i_x, i_y;
869
870     const int i_source_margin = p_source->p[0].i_pitch
871                                  - p_source->p[0].i_visible_pitch;
872     const int i_source_margin_c = p_source->p[1].i_pitch
873                                  - p_source->p[1].i_visible_pitch;
874     const int i_dest_margin = p_dest->p->i_pitch
875                                - p_dest->p->i_visible_pitch;
876
877 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
878     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
879     {
880         p_line1 -= 3 * p_dest->p->i_pitch;
881         p_line2 -= 3 * p_dest->p->i_pitch;
882
883         p_y1 = p_y2;
884         p_y2 += p_source->p[Y_PLANE].i_pitch;
885
886         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
887         {
888 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
889             C_YUV420_UYVY( );
890             C_YUV420_UYVY( );
891             C_YUV420_UYVY( );
892             C_YUV420_UYVY( );
893 #else
894             MMX_CALL( MMX_YUV420_UYVY );
895 #endif
896         }
897         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
898         {
899             C_YUV420_UYVY( );
900         }
901
902         p_y1 += i_source_margin;
903         p_y2 += i_source_margin;
904         p_u += i_source_margin_c;
905         p_v += i_source_margin_c;
906         p_line1 += i_dest_margin;
907         p_line2 += i_dest_margin;
908     }
909
910 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
911     /* re-enable FPU registers */
912     MMX_END;
913 #endif
914
915 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
916     /*
917     ** SSE2 128 bits fetch/store instructions are faster
918     ** if memory access is 16 bytes aligned
919     */
920     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
921         ((intptr_t)p_line2|(intptr_t)p_y2))) )
922     {
923         /* use faster SSE2 aligned fetch and store */
924         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
925         {
926             p_line1 = p_line2;
927             p_line2 += p_dest->p->i_pitch;
928
929             p_y1 = p_y2;
930             p_y2 += p_source->p[Y_PLANE].i_pitch;
931
932             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
933             {
934                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
935             }
936             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
937             {
938                 C_YUV420_UYVY( );
939             }
940
941             p_y1 += i_source_margin;
942             p_y2 += i_source_margin;
943             p_u += i_source_margin_c;
944             p_v += i_source_margin_c;
945             p_line1 += i_dest_margin;
946             p_line2 += i_dest_margin;
947         }
948     }
949     else
950     {
951         /* use slower SSE2 unaligned fetch and store */
952         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
953         {
954             p_line1 = p_line2;
955             p_line2 += p_dest->p->i_pitch;
956
957             p_y1 = p_y2;
958             p_y2 += p_source->p[Y_PLANE].i_pitch;
959
960             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
961             {
962                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
963             }
964             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
965             {
966                 C_YUV420_UYVY( );
967             }
968
969             p_y1 += i_source_margin;
970             p_y2 += i_source_margin;
971             p_u += i_source_margin_c;
972             p_v += i_source_margin_c;
973             p_line1 += i_dest_margin;
974             p_line2 += i_dest_margin;
975         }
976     }
977     /* make sure all SSE2 stores are visible thereafter */
978     SSE2_END;
979 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
980 }
981 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
982
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Walks the source two luma lines at a time and fills two destination lines
 * per pass, top to bottom. Only built for the plain C module.
 *
 * NOTE(review): the inner loop runs i_width / 8 times and there is no tail
 * loop, so a width that is not a multiple of 8 leaves its last pixels
 * unconverted - confirm callers guarantee such widths.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* Padding between the visible end of a line and the start of the next */
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;

    /* Chroma cursors, shared by both lines of a pair */
    uint8_t *p_v = p_source->V_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;

    /* Luma and destination cursors: *1 is the upper line, *2 the lower one.
     * These names are the ones C_YUV420_Y211 operates on. */
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;

    int i_x, i_y;

    i_y = p_filter->fmt_in.video.i_height / 2;
    while( i_y-- )
    {
        /* the previous lower line position becomes the new upper line */
        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        /* two macro invocations per group of 8 source pixels */
        i_x = p_filter->fmt_in.video.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* skip the line padding on every plane */
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
    }
}
#endif