]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Use var_Inherit* instead of var_CreateGet*.
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #   include <altivec.h>
39 #endif
40
41 #include "i420_yuy2.h"
42
43 #define SRC_FOURCC  "I420,IYUV,YV12"
44
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
53 #endif
54
55 /*****************************************************************************
56  * Local and extern prototypes.
57  *****************************************************************************/
58 static int  Activate ( vlc_object_t * );
59
60 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
61 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
62 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
63 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
64 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
66 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
67 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
68 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
69 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
70 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
71 #endif
72 #if defined (MODULE_NAME_IS_i420_yuy2)
73 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
75 #endif
76
77 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
78 /* Initialize MMX-specific constants */
/* These 64-bit constants exist in the MMX build variant only. They are not
 * referenced by name in the part of the file reviewed here; presumably they
 * are operands of the MMX assembly macros pulled in from i420_yuy2.h --
 * TODO confirm before renaming or removing them. */
/* 0x00ff repeated in each of the four 16-bit words */
79 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* 0x80 in each of the four low bytes; the upper 32 bits are zero */
80 static const uint64_t i_80w   = 0x0000000080808080ULL;
81 #endif
82
83 /*****************************************************************************
84  * Module descriptor.
85  *****************************************************************************/
86 vlc_module_begin ()
87 #if defined (MODULE_NAME_IS_i420_yuy2)
88     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
89     set_capability( "video filter2", 80 )
90 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
91     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
92     set_capability( "video filter2", 160 )
93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
94     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
95     set_capability( "video filter2", 250 )
96 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
97     set_description(
98             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
99     set_capability( "video filter2", 250 )
100 #endif
101     set_callbacks( Activate, NULL )
102 vlc_module_end ()
103
104 /*****************************************************************************
105  * Activate: allocate a chroma function
106  *****************************************************************************
107  * This function allocates and initializes a chroma function
108  *****************************************************************************/
109 static int Activate( vlc_object_t *p_this )
110 {
111     filter_t *p_filter = (filter_t *)p_this;
112
113     if( p_filter->fmt_in.video.i_width & 1
114      || p_filter->fmt_in.video.i_height & 1 )
115     {
116         return -1;
117     }
118
119     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
120      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
121         return -1;
122
123     switch( p_filter->fmt_in.video.i_chroma )
124     {
125         case VLC_CODEC_YV12:
126         case VLC_CODEC_I420:
127             switch( p_filter->fmt_out.video.i_chroma )
128             {
129                 case VLC_CODEC_YUYV:
130                     p_filter->pf_video_filter = I420_YUY2_Filter;
131                     break;
132
133                 case VLC_CODEC_YVYU:
134                     p_filter->pf_video_filter = I420_YVYU_Filter;
135                     break;
136
137                 case VLC_CODEC_UYVY:
138                     p_filter->pf_video_filter = I420_UYVY_Filter;
139                     break;
140 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
141                 case VLC_FOURCC('I','U','Y','V'):
142                     p_filter->pf_video_filter = I420_IUYV_Filter;
143                     break;
144
145                 case VLC_CODEC_CYUV:
146                     p_filter->pf_video_filter = I420_cyuv_Filter;
147                     break;
148 #endif
149
150 #if defined (MODULE_NAME_IS_i420_yuy2)
151                 case VLC_CODEC_Y211:
152                     p_filter->pf_video_filter = I420_Y211_Filter;
153                     break;
154 #endif
155
156                 default:
157                     return -1;
158             }
159             break;
160
161         default:
162             return -1;
163     }
164
165     return 0;
166 }
167
168 #if 0
/* Disabled helper: the surrounding #if 0 keeps it out of every build.
 * It executes the x86 "rdtsc" instruction and returns the value it yields.
 * NOTE(review): the "=A" output constraint designates the edx:eax register
 * pair on 32-bit x86 only; check the constraint before re-enabling this on
 * x86-64. */
169 static inline unsigned long long read_cycles(void)
170 {
171     unsigned long long v;
172     __asm__ __volatile__("rdtsc" : "=A" (v): );
173
174     return v;
175 }
176 #endif
177
178 /* Following functions are local */
179
/* VIDEO_FILTER_WRAPPER is not defined in this file.  Presumably each use
 * expands to the static <name>_Filter() function that is declared in the
 * prototypes section and installed by Activate() -- TODO confirm against
 * vlc_filter.h.  The preprocessor guards below are the same as the ones
 * around the corresponding prototypes. */
180 VIDEO_FILTER_WRAPPER( I420_YUY2 )
181 VIDEO_FILTER_WRAPPER( I420_YVYU )
182 VIDEO_FILTER_WRAPPER( I420_UYVY )
183 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
184 VIDEO_FILTER_WRAPPER( I420_IUYV )
185 VIDEO_FILTER_WRAPPER( I420_cyuv )
186 #endif
187 #if defined (MODULE_NAME_IS_i420_yuy2)
188 VIDEO_FILTER_WRAPPER( I420_Y211 )
189 #endif
190
191 /*****************************************************************************
192  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
193  *****************************************************************************/
/*
 * Converts p_source into the single packed plane of p_dest, two lines per
 * outer iteration: p_y1/p_y2 point into two consecutive luma lines and
 * p_line1/p_line2 into the two matching destination lines, while p_u/p_v
 * are walked once per pair of lines.  Dimensions come from
 * p_filter->fmt_in.video.
 *
 * The body is shared by four build variants through the preprocessor:
 *   - AltiVec: vector path when the width is a multiple of 32, otherwise the
 *     generic loop inside the trailing "else { ... }";
 *   - MMX:     generic loop with MMX_CALL() for blocks of 8 pixels;
 *   - SSE2:    dedicated aligned / unaligned loops, blocks of 16 pixels;
 *   - plain C: generic loop with C_YUV420_YUYV().
 *
 * C_YUV420_YUYV, MMX_CALL, MMX_YUV420_YUYV, MMX_END, SSE2_CALL,
 * SSE2_YUV420_YUYV_* and SSE2_END are defined outside this file (presumably
 * in i420_yuy2.h).  They take no arguments here, so they presumably read and
 * advance p_y1, p_y2, p_u, p_v, p_line1 and p_line2 by name -- keep these
 * local names and the statement order unchanged unless that is verified.
 */
194 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
195                                            picture_t *p_dest )
196 {
197     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
198     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
199     uint8_t *p_u = p_source->U_PIXELS;
200     uint8_t *p_v = p_source->V_PIXELS;
201
202     int i_x, i_y;
203
204 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Start a new pair of lines.  The first-line pointers take over the
     * current second-line pointers exactly as VEC_MERGE left them; no
     * pitch / visible-pitch margin is added anywhere in the vector path. */
205 #define VEC_NEXT_LINES( ) \
206     p_line1  = p_line2; \
207     p_line2 += p_dest->p->i_pitch; \
208     p_y1     = p_y2; \
209     p_y2    += p_source->p[Y_PLANE].i_pitch;
210
    /* Load one vector from each chroma plane and advance both pointers by 16 */
211 #define VEC_LOAD_UV( ) \
212     u_vec = vec_ld( 0, p_u ); p_u += 16; \
213     v_vec = vec_ld( 0, p_v ); p_v += 16;
214
    /* "a" is vec_mergeh or vec_mergel and picks which half of u_vec/v_vec is
     * interleaved.  The result is merged with 16 luma bytes from each of the
     * two source lines and stored as two vectors per destination line. */
215 #define VEC_MERGE( a ) \
216     uv_vec = a( u_vec, v_vec ); \
217     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
218     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
219     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
220     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
221     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
222     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
223
224     vector unsigned char u_vec;
225     vector unsigned char v_vec;
226     vector unsigned char uv_vec;
227     vector unsigned char y_vec;
228
229     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
230            ( p_filter->fmt_in.video.i_height % 2 ) ) )
231     {
232         /* Width is a multiple of 32, we take 2 lines at a time */
233         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
234         {
235             VEC_NEXT_LINES( );
236             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
237             {
238                 VEC_LOAD_UV( );
239                 VEC_MERGE( vec_mergeh );
240                 VEC_MERGE( vec_mergel );
241             }
242         }
243     }
    /* The width % 16 variant below is compiled out; widths that are not a
     * multiple of 32 therefore always take the C fallback in this function. */
244 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
245 #if 0
246     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
247                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
248     {
249         /* Width is only a multiple of 16, we take 4 lines at a time */
250         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
251         {
252             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
253             VEC_NEXT_LINES( );
254             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
255             {
256                 VEC_LOAD_UV( );
257                 VEC_MERGE( vec_mergeh );
258                 VEC_MERGE( vec_mergel );
259             }
260
261             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
262             VEC_LOAD_UV( );
263             VEC_MERGE( vec_mergeh );
264
265             /* Line 3 and 4, pixels 0 to 16 */
266             VEC_NEXT_LINES( );
267             VEC_MERGE( vec_mergel );
268
269             /* Line 3 and 4, pixels 16 to ( width ) */
270             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
271             {
272                 VEC_LOAD_UV( );
273                 VEC_MERGE( vec_mergeh );
274                 VEC_MERGE( vec_mergel );
275             }
276         }
277     }
278 #endif
    /* AltiVec build only: this else wraps the generic loop further down and
     * is closed by the '}' guarded by the AltiVec #if after that loop. */
279     else
280     {
281         /* Crap, use the C version */
282 #undef VEC_NEXT_LINES
283 #undef VEC_LOAD_UV
284 #undef VEC_MERGE
285 #endif
286
    /* Per-line padding: bytes between the end of the visible pixels and the
     * start of the next line.  Plane 1's margin is applied to both p_u and
     * p_v below. */
287     const int i_source_margin = p_source->p[0].i_pitch
288                                  - p_source->p[0].i_visible_pitch;
289     const int i_source_margin_c = p_source->p[1].i_pitch
290                                  - p_source->p[1].i_visible_pitch;
291     const int i_dest_margin = p_dest->p->i_pitch
292                                - p_dest->p->i_visible_pitch;
293
294 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Generic loop (plain C, MMX, and AltiVec fallback): one iteration per
     * pair of lines. */
295     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
296     {
297         p_line1 = p_line2;
298         p_line2 += p_dest->p->i_pitch;
299
300         p_y1 = p_y2;
301         p_y2 += p_source->p[Y_PLANE].i_pitch;
302
        /* Blocks of 8 pixels: four C macro calls or one MMX call */
303 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
304         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
305         {
306             C_YUV420_YUYV( );
307             C_YUV420_YUYV( );
308             C_YUV420_YUYV( );
309             C_YUV420_YUYV( );
310         }
311 #else
312         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
313         {
314             MMX_CALL( MMX_YUV420_YUYV );
315         }
316 #endif
        /* Remaining ( width % 8 ) pixels, one macro call per 2 pixels */
317         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
318         {
319             C_YUV420_YUYV( );
320         }
321
        /* Step over the padding at the end of each line */
322         p_y1 += i_source_margin;
323         p_y2 += i_source_margin;
324         p_u += i_source_margin_c;
325         p_v += i_source_margin_c;
326         p_line1 += i_dest_margin;
327         p_line2 += i_dest_margin;
328     }
329
330 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
331     /* re-enable FPU registers */
332     MMX_END;
333 #endif
334
335 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
336     }
337 #endif
338
339 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
340     /*
341     ** SSE2 128 bits fetch/store instructions are faster
342     ** if memory access is 16 bytes aligned
343     */
344
    /* Aligned variant only when the luma pitch, the destination pitch and
     * both start addresses are all multiples of 16.  The two loops below are
     * identical except for the SSE2 macro they invoke. */
345     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
346         ((intptr_t)p_line2|(intptr_t)p_y2))) )
347     {
348         /* use faster SSE2 aligned fetch and store */
349         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
350         {
351             p_line1 = p_line2;
352             p_line2 += p_dest->p->i_pitch;
353
354             p_y1 = p_y2;
355             p_y2 += p_source->p[Y_PLANE].i_pitch;
356
            /* Blocks of 16 pixels, then the remainder 2 pixels at a time */
357             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
358             {
359                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
360             }
361             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
362             {
363                 C_YUV420_YUYV( );
364             }
365
366             p_y1 += i_source_margin;
367             p_y2 += i_source_margin;
368             p_u += i_source_margin_c;
369             p_v += i_source_margin_c;
370             p_line1 += i_dest_margin;
371             p_line2 += i_dest_margin;
372         }
373     }
374     else
375     {
376         /* use slower SSE2 unaligned fetch and store */
377         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
378         {
379             p_line1 = p_line2;
380             p_line2 += p_dest->p->i_pitch;
381
382             p_y1 = p_y2;
383             p_y2 += p_source->p[Y_PLANE].i_pitch;
384
385             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
386             {
387                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
388             }
389             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
390             {
391                 C_YUV420_YUYV( );
392             }
393
394             p_y1 += i_source_margin;
395             p_y2 += i_source_margin;
396             p_u += i_source_margin_c;
397             p_v += i_source_margin_c;
398             p_line1 += i_dest_margin;
399             p_line2 += i_dest_margin;
400         }
401     }
402     /* make sure all SSE2 stores are visible thereafter */
403     SSE2_END;
404
405 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
406 }
407
408 /*****************************************************************************
409  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
410  *****************************************************************************/
/*
 * Same traversal as I420_YUY2: two source luma lines (p_y1/p_y2) and two
 * destination lines (p_line1/p_line2) per outer iteration, with p_u/p_v
 * walked once per pair of lines.  The only visible difference is the chroma
 * order: the AltiVec macro merges ( v_vec, u_vec ) and the scalar / SIMD
 * macros are the *_YVYU ones.
 *
 * C_YUV420_YVYU, MMX_CALL, MMX_YUV420_YVYU, MMX_END, SSE2_CALL,
 * SSE2_YUV420_YVYU_* and SSE2_END are defined outside this file (presumably
 * in i420_yuy2.h) and presumably use the local pointer names directly --
 * keep names and statement order unchanged unless that is verified.
 */
411 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
412                                            picture_t *p_dest )
413 {
414     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
415     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
416     uint8_t *p_u = p_source->U_PIXELS;
417     uint8_t *p_v = p_source->V_PIXELS;
418
419     int i_x, i_y;
420
421 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Start a new pair of lines from wherever VEC_MERGE left the second-line
     * pointers; no pitch / visible-pitch margin is added in the vector path. */
422 #define VEC_NEXT_LINES( ) \
423     p_line1  = p_line2; \
424     p_line2 += p_dest->p->i_pitch; \
425     p_y1     = p_y2; \
426     p_y2    += p_source->p[Y_PLANE].i_pitch;
427
    /* Load one vector from each chroma plane and advance both pointers by 16 */
428 #define VEC_LOAD_UV( ) \
429     u_vec = vec_ld( 0, p_u ); p_u += 16; \
430     v_vec = vec_ld( 0, p_v ); p_v += 16;
431
    /* "a" is vec_mergeh or vec_mergel; V comes first in the interleave.  The
     * result is merged behind 16 luma bytes from each source line and stored
     * as two vectors per destination line. */
432 #define VEC_MERGE( a ) \
433     vu_vec = a( v_vec, u_vec ); \
434     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
435     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
436     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
437     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
438     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
439     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
440
441     vector unsigned char u_vec;
442     vector unsigned char v_vec;
443     vector unsigned char vu_vec;
444     vector unsigned char y_vec;
445
446     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
447            ( p_filter->fmt_in.video.i_height % 2 ) ) )
448     {
449         /* Width is a multiple of 32, we take 2 lines at a time */
450         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
451         {
452             VEC_NEXT_LINES( );
453             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
454             {
455                 VEC_LOAD_UV( );
456                 VEC_MERGE( vec_mergeh );
457                 VEC_MERGE( vec_mergel );
458             }
459         }
460     }
    /* NOTE(review): the structurally identical branch in I420_YUY2 is
     * compiled out with a FIXME stating that it is broken on AltiVec; this
     * one is still enabled -- confirm whether it has the same problem. */
461     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
462                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
463     {
464         /* Width is only a multiple of 16, we take 4 lines at a time */
465         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
466         {
467             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
468             VEC_NEXT_LINES( );
469             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
470             {
471                 VEC_LOAD_UV( );
472                 VEC_MERGE( vec_mergeh );
473                 VEC_MERGE( vec_mergel );
474             }
475
476             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
477             VEC_LOAD_UV( );
478             VEC_MERGE( vec_mergeh );
479
480             /* Line 3 and 4, pixels 0 to 16 */
            /* Reuses the low half of the chroma vectors loaded just above */
481             VEC_NEXT_LINES( );
482             VEC_MERGE( vec_mergel );
483
484             /* Line 3 and 4, pixels 16 to ( width ) */
485             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
486             {
487                 VEC_LOAD_UV( );
488                 VEC_MERGE( vec_mergeh );
489                 VEC_MERGE( vec_mergel );
490             }
491         }
492     }
    /* AltiVec build only: this else wraps the generic loop further down and
     * is closed by the '}' guarded by the AltiVec #if after that loop. */
493     else
494     {
495         /* Crap, use the C version */
496 #undef VEC_NEXT_LINES
497 #undef VEC_LOAD_UV
498 #undef VEC_MERGE
499 #endif
500
    /* Per-line padding: bytes between the end of the visible pixels and the
     * start of the next line.  Plane 1's margin is applied to both p_u and
     * p_v below. */
501     const int i_source_margin = p_source->p[0].i_pitch
502                                  - p_source->p[0].i_visible_pitch;
503     const int i_source_margin_c = p_source->p[1].i_pitch
504                                  - p_source->p[1].i_visible_pitch;
505     const int i_dest_margin = p_dest->p->i_pitch
506                                - p_dest->p->i_visible_pitch;
507
508 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Generic loop (plain C, MMX, and AltiVec fallback): one iteration per
     * pair of lines. */
509     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
510     {
511         p_line1 = p_line2;
512         p_line2 += p_dest->p->i_pitch;
513
514         p_y1 = p_y2;
515         p_y2 += p_source->p[Y_PLANE].i_pitch;
516
        /* Blocks of 8 pixels: four C macro calls or one MMX call */
517         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
518         {
519 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
520             C_YUV420_YVYU( );
521             C_YUV420_YVYU( );
522             C_YUV420_YVYU( );
523             C_YUV420_YVYU( );
524 #else
525             MMX_CALL( MMX_YUV420_YVYU );
526 #endif
527         }
        /* Remaining ( width % 8 ) pixels, one macro call per 2 pixels */
528         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
529         {
530             C_YUV420_YVYU( );
531         }
532
        /* Step over the padding at the end of each line */
533         p_y1 += i_source_margin;
534         p_y2 += i_source_margin;
535         p_u += i_source_margin_c;
536         p_v += i_source_margin_c;
537         p_line1 += i_dest_margin;
538         p_line2 += i_dest_margin;
539     }
540
541 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
542     /* re-enable FPU registers */
543     MMX_END;
544 #endif
545
546 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
547     }
548 #endif
549
550 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
551     /*
552     ** SSE2 128 bits fetch/store instructions are faster
553     ** if memory access is 16 bytes aligned
554     */
    /* Aligned variant only when the luma pitch, the destination pitch and
     * both start addresses are all multiples of 16.  The two loops below are
     * identical except for the SSE2 macro they invoke. */
555     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
556         ((intptr_t)p_line2|(intptr_t)p_y2))) )
557     {
558         /* use faster SSE2 aligned fetch and store */
559         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
560         {
561             p_line1 = p_line2;
562             p_line2 += p_dest->p->i_pitch;
563
564             p_y1 = p_y2;
565             p_y2 += p_source->p[Y_PLANE].i_pitch;
566
            /* Blocks of 16 pixels, then the remainder 2 pixels at a time */
567             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
568             {
569                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
570             }
571             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
572             {
573                 C_YUV420_YVYU( );
574             }
575
576             p_y1 += i_source_margin;
577             p_y2 += i_source_margin;
578             p_u += i_source_margin_c;
579             p_v += i_source_margin_c;
580             p_line1 += i_dest_margin;
581             p_line2 += i_dest_margin;
582         }
583     }
584     else
585     {
586         /* use slower SSE2 unaligned fetch and store */
587         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
588         {
589             p_line1 = p_line2;
590             p_line2 += p_dest->p->i_pitch;
591
592             p_y1 = p_y2;
593             p_y2 += p_source->p[Y_PLANE].i_pitch;
594
595             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
596             {
597                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
598             }
599             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
600             {
601                 C_YUV420_YVYU( );
602             }
603
604             p_y1 += i_source_margin;
605             p_y2 += i_source_margin;
606             p_u += i_source_margin_c;
607             p_v += i_source_margin_c;
608             p_line1 += i_dest_margin;
609             p_line2 += i_dest_margin;
610         }
611     }
612     /* make sure all SSE2 stores are visible thereafter */
613     SSE2_END;
614 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
615 }
616
617 /*****************************************************************************
618  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
619  *****************************************************************************/
/*
 * Same traversal as I420_YUY2: two source luma lines (p_y1/p_y2) and two
 * destination lines (p_line1/p_line2) per outer iteration, with p_u/p_v
 * walked once per pair of lines.  The visible difference is the byte order:
 * the AltiVec macro merges the chroma vector in front of the luma vector,
 * and the scalar / SIMD macros are the *_UYVY ones.
 *
 * C_YUV420_UYVY, MMX_CALL, MMX_YUV420_UYVY, MMX_END, SSE2_CALL,
 * SSE2_YUV420_UYVY_* and SSE2_END are defined outside this file (presumably
 * in i420_yuy2.h) and presumably use the local pointer names directly --
 * keep names and statement order unchanged unless that is verified.
 */
620 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
621                                            picture_t *p_dest )
622 {
623     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
624     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
625     uint8_t *p_u = p_source->U_PIXELS;
626     uint8_t *p_v = p_source->V_PIXELS;
627
628     int i_x, i_y;
629
630 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Start a new pair of lines from wherever VEC_MERGE left the second-line
     * pointers; no pitch / visible-pitch margin is added in the vector path. */
631 #define VEC_NEXT_LINES( ) \
632     p_line1  = p_line2; \
633     p_line2 += p_dest->p->i_pitch; \
634     p_y1     = p_y2; \
635     p_y2    += p_source->p[Y_PLANE].i_pitch;
636
    /* Load one vector from each chroma plane and advance both pointers by 16 */
637 #define VEC_LOAD_UV( ) \
638     u_vec = vec_ld( 0, p_u ); p_u += 16; \
639     v_vec = vec_ld( 0, p_v ); p_v += 16;
640
    /* "a" is vec_mergeh or vec_mergel.  Unlike the YUYV / YVYU variants the
     * chroma vector is the first operand of the final merges, so a chroma
     * byte precedes each luma byte in the output. */
641 #define VEC_MERGE( a ) \
642     uv_vec = a( u_vec, v_vec ); \
643     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
644     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
645     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
646     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
647     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
648     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
649
650     vector unsigned char u_vec;
651     vector unsigned char v_vec;
652     vector unsigned char uv_vec;
653     vector unsigned char y_vec;
654
655     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
656            ( p_filter->fmt_in.video.i_height % 2 ) ) )
657     {
658         /* Width is a multiple of 32, we take 2 lines at a time */
659         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
660         {
661             VEC_NEXT_LINES( );
662             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
663             {
664                 VEC_LOAD_UV( );
665                 VEC_MERGE( vec_mergeh );
666                 VEC_MERGE( vec_mergel );
667             }
668         }
669     }
    /* NOTE(review): the structurally identical branch in I420_YUY2 is
     * compiled out with a FIXME stating that it is broken on AltiVec; this
     * one is still enabled -- confirm whether it has the same problem. */
670     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
671                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
672     {
673         /* Width is only a multiple of 16, we take 4 lines at a time */
674         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
675         {
676             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
677             VEC_NEXT_LINES( );
678             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
679             {
680                 VEC_LOAD_UV( );
681                 VEC_MERGE( vec_mergeh );
682                 VEC_MERGE( vec_mergel );
683             }
684
685             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
686             VEC_LOAD_UV( );
687             VEC_MERGE( vec_mergeh );
688
689             /* Line 3 and 4, pixels 0 to 16 */
            /* Reuses the low half of the chroma vectors loaded just above */
690             VEC_NEXT_LINES( );
691             VEC_MERGE( vec_mergel );
692
693             /* Line 3 and 4, pixels 16 to ( width ) */
694             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
695             {
696                 VEC_LOAD_UV( );
697                 VEC_MERGE( vec_mergeh );
698                 VEC_MERGE( vec_mergel );
699             }
700         }
701     }
    /* AltiVec build only: this else wraps the generic loop further down and
     * is closed by the '}' guarded by the AltiVec #if after that loop. */
702     else
703     {
704         /* Crap, use the C version */
705 #undef VEC_NEXT_LINES
706 #undef VEC_LOAD_UV
707 #undef VEC_MERGE
708 #endif
709
    /* Per-line padding: bytes between the end of the visible pixels and the
     * start of the next line.  Plane 1's margin is applied to both p_u and
     * p_v below. */
710     const int i_source_margin = p_source->p[0].i_pitch
711                                  - p_source->p[0].i_visible_pitch;
712     const int i_source_margin_c = p_source->p[1].i_pitch
713                                  - p_source->p[1].i_visible_pitch;
714     const int i_dest_margin = p_dest->p->i_pitch
715                                - p_dest->p->i_visible_pitch;
716
717 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Generic loop (plain C, MMX, and AltiVec fallback): one iteration per
     * pair of lines. */
718     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
719     {
720         p_line1 = p_line2;
721         p_line2 += p_dest->p->i_pitch;
722
723         p_y1 = p_y2;
724         p_y2 += p_source->p[Y_PLANE].i_pitch;
725
        /* Blocks of 8 pixels: four C macro calls or one MMX call */
726         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
727         {
728 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
729             C_YUV420_UYVY( );
730             C_YUV420_UYVY( );
731             C_YUV420_UYVY( );
732             C_YUV420_UYVY( );
733 #else
734             MMX_CALL( MMX_YUV420_UYVY );
735 #endif
736         }
        /* Remaining ( width % 8 ) pixels, one macro call per 2 pixels */
737         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
738         {
739             C_YUV420_UYVY( );
740         }
741
        /* Step over the padding at the end of each line */
742         p_y1 += i_source_margin;
743         p_y2 += i_source_margin;
744         p_u += i_source_margin_c;
745         p_v += i_source_margin_c;
746         p_line1 += i_dest_margin;
747         p_line2 += i_dest_margin;
748     }
749
750 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
751     /* re-enable FPU registers */
752     MMX_END;
753 #endif
754
755 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
756     }
757 #endif
758
759 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
760     /*
761     ** SSE2 128 bits fetch/store instructions are faster
762     ** if memory access is 16 bytes aligned
763     */
    /* Aligned variant only when the luma pitch, the destination pitch and
     * both start addresses are all multiples of 16.  The two loops below are
     * identical except for the SSE2 macro they invoke. */
764     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
765         ((intptr_t)p_line2|(intptr_t)p_y2))) )
766     {
767         /* use faster SSE2 aligned fetch and store */
768         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
769         {
770             p_line1 = p_line2;
771             p_line2 += p_dest->p->i_pitch;
772
773             p_y1 = p_y2;
774             p_y2 += p_source->p[Y_PLANE].i_pitch;
775
            /* Blocks of 16 pixels, then the remainder 2 pixels at a time */
776             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
777             {
778                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
779             }
780             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
781             {
782                 C_YUV420_UYVY( );
783             }
784
785             p_y1 += i_source_margin;
786             p_y2 += i_source_margin;
787             p_u += i_source_margin_c;
788             p_v += i_source_margin_c;
789             p_line1 += i_dest_margin;
790             p_line2 += i_dest_margin;
791         }
792     }
793     else
794     {
795         /* use slower SSE2 unaligned fetch and store */
796         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
797         {
798             p_line1 = p_line2;
799             p_line2 += p_dest->p->i_pitch;
800
801             p_y1 = p_y2;
802             p_y2 += p_source->p[Y_PLANE].i_pitch;
803
804             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
805             {
806                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
807             }
808             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
809             {
810                 C_YUV420_UYVY( );
811             }
812
813             p_y1 += i_source_margin;
814             p_y2 += i_source_margin;
815             p_u += i_source_margin_c;
816             p_v += i_source_margin_c;
817             p_line1 += i_dest_margin;
818             p_line2 += i_dest_margin;
819         }
820     }
821     /* make sure all SSE2 stores are visible thereafter */
822     SSE2_END;
823 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
824 }
825
826 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
827 /*****************************************************************************
828  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
829  *****************************************************************************/
830 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
831                                            picture_t *p_dest )
832 {
833     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
834     /* FIXME: TODO ! */
835     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
836 }
837
838 /*****************************************************************************
839  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
840  *****************************************************************************/
841 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
842                                            picture_t *p_dest )
843 {
844     uint8_t *p_line1 = p_dest->p->p_pixels +
845                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
846                        + p_dest->p->i_pitch;
847     uint8_t *p_line2 = p_dest->p->p_pixels +
848                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
849     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
850     uint8_t *p_u = p_source->U_PIXELS;
851     uint8_t *p_v = p_source->V_PIXELS;
852
853     int i_x, i_y;
854
855     const int i_source_margin = p_source->p[0].i_pitch
856                                  - p_source->p[0].i_visible_pitch;
857     const int i_source_margin_c = p_source->p[1].i_pitch
858                                  - p_source->p[1].i_visible_pitch;
859     const int i_dest_margin = p_dest->p->i_pitch
860                                - p_dest->p->i_visible_pitch;
861
862 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
863     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
864     {
865         p_line1 -= 3 * p_dest->p->i_pitch;
866         p_line2 -= 3 * p_dest->p->i_pitch;
867
868         p_y1 = p_y2;
869         p_y2 += p_source->p[Y_PLANE].i_pitch;
870
871         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
872         {
873 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
874             C_YUV420_UYVY( );
875             C_YUV420_UYVY( );
876             C_YUV420_UYVY( );
877             C_YUV420_UYVY( );
878 #else
879             MMX_CALL( MMX_YUV420_UYVY );
880 #endif
881         }
882         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
883         {
884             C_YUV420_UYVY( );
885         }
886
887         p_y1 += i_source_margin;
888         p_y2 += i_source_margin;
889         p_u += i_source_margin_c;
890         p_v += i_source_margin_c;
891         p_line1 += i_dest_margin;
892         p_line2 += i_dest_margin;
893     }
894
895 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
896     /* re-enable FPU registers */
897     MMX_END;
898 #endif
899
900 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
901     /*
902     ** SSE2 128 bits fetch/store instructions are faster
903     ** if memory access is 16 bytes aligned
904     */
905     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
906         ((intptr_t)p_line2|(intptr_t)p_y2))) )
907     {
908         /* use faster SSE2 aligned fetch and store */
909         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
910         {
911             p_line1 = p_line2;
912             p_line2 += p_dest->p->i_pitch;
913
914             p_y1 = p_y2;
915             p_y2 += p_source->p[Y_PLANE].i_pitch;
916
917             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
918             {
919                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
920             }
921             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
922             {
923                 C_YUV420_UYVY( );
924             }
925
926             p_y1 += i_source_margin;
927             p_y2 += i_source_margin;
928             p_u += i_source_margin_c;
929             p_v += i_source_margin_c;
930             p_line1 += i_dest_margin;
931             p_line2 += i_dest_margin;
932         }
933     }
934     else
935     {
936         /* use slower SSE2 unaligned fetch and store */
937         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
938         {
939             p_line1 = p_line2;
940             p_line2 += p_dest->p->i_pitch;
941
942             p_y1 = p_y2;
943             p_y2 += p_source->p[Y_PLANE].i_pitch;
944
945             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
946             {
947                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
948             }
949             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
950             {
951                 C_YUV420_UYVY( );
952             }
953
954             p_y1 += i_source_margin;
955             p_y2 += i_source_margin;
956             p_u += i_source_margin_c;
957             p_v += i_source_margin_c;
958             p_line1 += i_dest_margin;
959             p_line2 += i_dest_margin;
960         }
961     }
962     /* make sure all SSE2 stores are visible thereafter */
963     SSE2_END;
964 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
965 }
966 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
967
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Two source luma lines and one chroma line are converted per iteration
 * into two destination lines, using the C_YUV420_Y211 macro from
 * i420_yuy2.h. The macro operates on the locals p_line1, p_line2, p_y1,
 * p_y2, p_u and p_v, whose names must therefore not change.
 *
 * Pixels are handled in groups of 8 (two macro calls per group). All line
 * pointers are recomputed from the plane bases on every iteration, so a
 * width that is not a multiple of 8 no longer leaves the pointers short of
 * the line end and progressively shifts the following lines. The trailing
 * (width % 8) pixels of each line are still left unconverted.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2;
    uint8_t *p_y1, *p_y2;
    uint8_t *p_u, *p_v;

    int i_x, i_y;

    const int i_pairs = p_filter->fmt_in.video.i_height / 2;

    for( i_y = 0 ; i_y < i_pairs ; i_y++ )
    {
        /* two destination lines per iteration */
        p_line1 = p_dest->p->p_pixels + 2 * i_y * p_dest->p->i_pitch;
        p_line2 = p_line1 + p_dest->p->i_pitch;

        /* two luma lines per iteration */
        p_y1 = p_source->Y_PIXELS + 2 * i_y * p_source->p[Y_PLANE].i_pitch;
        p_y2 = p_y1 + p_source->p[Y_PLANE].i_pitch;

        /* one chroma line per iteration; both chroma planes are stepped
         * with the pitch of plane 1, as the margin arithmetic always did */
        p_u = p_source->U_PIXELS + i_y * p_source->p[1].i_pitch;
        p_v = p_source->V_PIXELS + i_y * p_source->p[1].i_pitch;

        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }
    }
}
#endif