]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Chroma API change. Chromas are now normal video filters (almost).
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_vout.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
44 #define SRC_FOURCC  "I420,IYUV,YV12" /* source chroma names, pasted into the module description strings */
45
   /* Destination chroma names, one list per build variant; also pasted into
    * the module description strings. The plain C build additionally lists
    * Y211, and the AltiVec build omits IUYV and cyuv, which matches the #if
    * guards around the corresponding cases in Activate(). */
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 #endif
55
56 /*****************************************************************************
57  * Local and extern prototypes.
    *
    * Every converter takes (filter, source picture, destination picture), so
    * Activate() can store any of them in p_filter->pf_video_filter_io.
    * I420_IUYV and I420_cyuv are not declared for the AltiVec build, and
    * I420_Y211 is only declared for the plain C build.
58  *****************************************************************************/
59 static int  Activate ( vlc_object_t * );
60
61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
64 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
65 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
66 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
67 #endif
68 #if defined (MODULE_NAME_IS_i420_yuy2)
69 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
70 #endif
71
72 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
73 /* MMX-specific 64-bit constants, only defined for the MMX build.
    * NOTE(review): nothing in the code visible here names them directly;
    * presumably they are referenced by the MMX macros from i420_yuy2.h --
    * confirm before renaming or removing. */
74 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; /* 0x00ff in each of the four 16-bit words */
75 static const uint64_t i_80w   = 0x0000000080808080ULL; /* 0x80 in each of the four low bytes, upper 32 bits zero */
76 #endif
77
78 /*****************************************************************************
79  * Module descriptor.
    *
    * Exactly one variant is described, selected by the MODULE_NAME_IS_* macro
    * of the current build. Each variant registers the "chroma" capability with
    * its own numeric score (80 for plain C, 100 for MMX and AltiVec, 120 for
    * SSE2) and, for the accelerated builds, the matching CPU requirement.
    * NOTE(review): the score is presumably the module selection priority --
    * confirm against vlc_plugin.h.
    *
    * All descriptions are wrapped in N_() so that every variant hands the
    * descriptor the same kind of string; N_() is presumably the no-op gettext
    * marker (string is extracted for translation but not translated here).
    *
    * Activate is the open callback; there is no close callback (NULL).
80  *****************************************************************************/
81 vlc_module_begin();
82 #if defined (MODULE_NAME_IS_i420_yuy2)
83     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
84     set_capability( "chroma", 80 );
85 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
86     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
87     set_capability( "chroma", 100 );
88     add_requirement( MMX );
89 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
90     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
91     set_capability( "chroma", 120 );
92     add_requirement( SSE2 );
93 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
94     set_description(
95             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
96     set_capability( "chroma", 100 );
97     add_requirement( ALTIVEC );
98 #endif
99     set_callbacks( Activate, NULL );
100 vlc_module_end();
101
102 /*****************************************************************************
103  * Activate: select the conversion routine for the requested chroma pair
104  *****************************************************************************
105  * p_this is used as a filter_t. Nothing is allocated here: on success the
     * matching I420_* routine is stored in p_filter->pf_video_filter_io and 0
     * is returned. -1 is returned when the input width or height is odd, when
     * the input chroma is not YV12/I420/IYUV, or when the output chroma is not
     * handled by this build variant.
106  *****************************************************************************/
107 static int Activate( vlc_object_t *p_this )
108 {
109     filter_t *p_filter = (filter_t *)p_this;
110
        /* Odd dimensions are refused: the converters below walk the picture
         * two lines at a time and consume width in steps of two pixels. */
111     if( p_filter->fmt_in.video.i_width & 1
112      || p_filter->fmt_in.video.i_height & 1 )
113     {
114         return -1;
115     }
116
117     switch( p_filter->fmt_in.video.i_chroma )
118     {
119         case VLC_FOURCC('Y','V','1','2'):
120         case VLC_FOURCC('I','4','2','0'):
121         case VLC_FOURCC('I','Y','U','V'):
                /* All three planar inputs share the same set of converters */
122             switch( p_filter->fmt_out.video.i_chroma )
123             {
124                 case VLC_FOURCC('Y','U','Y','2'):
125                 case VLC_FOURCC('Y','U','N','V'):
126                     p_filter->pf_video_filter_io = I420_YUY2;
127                     break;
128
129                 case VLC_FOURCC('Y','V','Y','U'):
130                     p_filter->pf_video_filter_io = I420_YVYU;
131                     break;
132
133                 case VLC_FOURCC('U','Y','V','Y'):
134                 case VLC_FOURCC('U','Y','N','V'):
135                 case VLC_FOURCC('Y','4','2','2'):
136                     p_filter->pf_video_filter_io = I420_UYVY;
137                     break;
                    /* IUYV and cyuv outputs are not offered by the AltiVec build */
138 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
139                 case VLC_FOURCC('I','U','Y','V'):
140                     p_filter->pf_video_filter_io = I420_IUYV;
141                     break;
142
143                 case VLC_FOURCC('c','y','u','v'):
144                     p_filter->pf_video_filter_io = I420_cyuv;
145                     break;
146 #endif
147
                    /* Y211 output is only offered by the plain C build */
148 #if defined (MODULE_NAME_IS_i420_yuy2)
149                 case VLC_FOURCC('Y','2','1','1'):
150                     p_filter->pf_video_filter_io = I420_Y211;
151                     break;
152 #endif
153
154                 default:
155                     return -1;
156             }
157             break;
158
159         default:
160             return -1;
161     }
162
163     return 0;
164 }
165
166 #if 0
    /* Compiled out by the surrounding #if 0. Executes the x86 "rdtsc"
     * instruction and returns the value it produces as an unsigned long long.
     * NOTE(review): presumably a leftover benchmarking helper; nothing in the
     * code visible here calls it. */
167 static inline unsigned long long read_cycles(void)
168 {
169     unsigned long long v;
170     __asm__ __volatile__("rdtsc" : "=A" (v): );
171
172     return v;
173 }
174 #endif
175
176 /* Following functions are local */
177 /*****************************************************************************
178  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
179  *****************************************************************************/
180 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
181                                            picture_t *p_dest )
182 {
183     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
184     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
185     uint8_t *p_u = p_source->U_PIXELS;
186     uint8_t *p_v = p_source->V_PIXELS;
187
188     int i_x, i_y;
189
190 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
191 #define VEC_NEXT_LINES( ) \
192     p_line1  = p_line2; \
193     p_line2 += p_dest->p->i_pitch; \
194     p_y1     = p_y2; \
195     p_y2    += p_source->p[Y_PLANE].i_pitch;
196
197 #define VEC_LOAD_UV( ) \
198     u_vec = vec_ld( 0, p_u ); p_u += 16; \
199     v_vec = vec_ld( 0, p_v ); p_v += 16;
200
201 #define VEC_MERGE( a ) \
202     uv_vec = a( u_vec, v_vec ); \
203     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
204     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
205     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
206     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
207     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
208     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
209
210     vector unsigned char u_vec;
211     vector unsigned char v_vec;
212     vector unsigned char uv_vec;
213     vector unsigned char y_vec;
214
215     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
216            ( p_filter->fmt_in.video.i_height % 2 ) ) )
217     {
218         /* Width is a multiple of 32, we take 2 lines at a time */
219         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
220         {
221             VEC_NEXT_LINES( );
222             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
223             {
224                 VEC_LOAD_UV( );
225                 VEC_MERGE( vec_mergeh );
226                 VEC_MERGE( vec_mergel );
227             }
228         }
229     }
230     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
231                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
232     {
233         /* Width is only a multiple of 16, we take 4 lines at a time */
234         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
235         {
236             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
237             VEC_NEXT_LINES( );
238             for( i_x = p_fiter->fmt_in.video.i_width / 32 ; i_x-- ; )
239             {
240                 VEC_LOAD_UV( );
241                 VEC_MERGE( vec_mergeh );
242                 VEC_MERGE( vec_mergel );
243             }
244
245             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
246             VEC_LOAD_UV( );
247             VEC_MERGE( vec_mergeh );
248
249             /* Line 3 and 4, pixels 0 to 16 */
250             VEC_NEXT_LINES( );
251             VEC_MERGE( vec_mergel );
252
253             /* Line 3 and 4, pixels 16 to ( width ) */
254             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
255             {
256                 VEC_LOAD_UV( );
257                 VEC_MERGE( vec_mergeh );
258                 VEC_MERGE( vec_mergel );
259             }
260         }
261     }
262     else
263     {
264         /* Crap, use the C version */
265 #undef VEC_NEXT_LINES
266 #undef VEC_LOAD_UV
267 #undef VEC_MERGE
268 #endif
269
270     const int i_source_margin = p_source->p[0].i_pitch
271                                  - p_source->p[0].i_visible_pitch;
272     const int i_source_margin_c = p_source->p[1].i_pitch
273                                  - p_source->p[1].i_visible_pitch;
274     const int i_dest_margin = p_dest->p->i_pitch
275                                - p_dest->p->i_visible_pitch;
276
277 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
278     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
279     {
280         p_line1 = p_line2;
281         p_line2 += p_dest->p->i_pitch;
282
283         p_y1 = p_y2;
284         p_y2 += p_source->p[Y_PLANE].i_pitch;
285
286 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
287         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
288         {
289             C_YUV420_YUYV( );
290             C_YUV420_YUYV( );
291             C_YUV420_YUYV( );
292             C_YUV420_YUYV( );
293         }
294 #else
295         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
296         {
297             MMX_CALL( MMX_YUV420_YUYV );
298         }
299 #endif
300         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
301         {
302             C_YUV420_YUYV( );
303         }
304
305         p_y1 += i_source_margin;
306         p_y2 += i_source_margin;
307         p_u += i_source_margin_c;
308         p_v += i_source_margin_c;
309         p_line1 += i_dest_margin;
310         p_line2 += i_dest_margin;
311     }
312
313 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
314     /* re-enable FPU registers */
315     MMX_END;
316 #endif
317
318 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
319     }
320 #endif
321
322 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
323     /*
324     ** SSE2 128 bits fetch/store instructions are faster
325     ** if memory access is 16 bytes aligned
326     */
327
328     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
329         ((intptr_t)p_line2|(intptr_t)p_y2))) )
330     {
331         /* use faster SSE2 aligned fetch and store */
332         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
333         {
334             p_line1 = p_line2;
335             p_line2 += p_dest->p->i_pitch;
336
337             p_y1 = p_y2;
338             p_y2 += p_source->p[Y_PLANE].i_pitch;
339
340             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
341             {
342                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
343             }
344             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
345             {
346                 C_YUV420_YUYV( );
347             }
348
349             p_y1 += i_source_margin;
350             p_y2 += i_source_margin;
351             p_u += i_source_margin_c;
352             p_v += i_source_margin_c;
353             p_line1 += i_dest_margin;
354             p_line2 += i_dest_margin;
355         }
356     }
357     else
358     {
359         /* use slower SSE2 unaligned fetch and store */
360         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
361         {
362             p_line1 = p_line2;
363             p_line2 += p_dest->p->i_pitch;
364
365             p_y1 = p_y2;
366             p_y2 += p_source->p[Y_PLANE].i_pitch;
367
368             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
369             {
370                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
371             }
372             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
373             {
374                 C_YUV420_YUYV( );
375             }
376
377             p_y1 += i_source_margin;
378             p_y2 += i_source_margin;
379             p_u += i_source_margin_c;
380             p_v += i_source_margin_c;
381             p_line1 += i_dest_margin;
382             p_line2 += i_dest_margin;
383         }
384     }
385     /* make sure all SSE2 stores are visible thereafter */
386     SSE2_END;
387
388 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
389 }
390
391 /*****************************************************************************
392  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
     *****************************************************************************
     * p_filter: supplies fmt_in.video.i_width / i_height, which drive all loops
     * p_source: planar input, read through Y_PIXELS, U_PIXELS and V_PIXELS
     * p_dest:   packed output, written through p_dest->p->p_pixels
     *
     * Same structure as I420_YUY2, with V placed before U: each pass handles
     * two luma lines and two output lines, and p_u / p_v are only moved on by
     * the chroma margin once per pass. The per-pixel work is done by macros
     * not defined in this file (C_YUV420_YVYU, MMX_CALL, SSE2_CALL --
     * presumably from i420_yuy2.h), which are expected to advance the pointers.
393  *****************************************************************************/
394 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
395                                            picture_t *p_dest )
396 {
397     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
398     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
399     uint8_t *p_u = p_source->U_PIXELS;
400     uint8_t *p_v = p_source->V_PIXELS;
401
402     int i_x, i_y;
403
404 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step to the next pair of output lines and the next pair of luma lines */
405 #define VEC_NEXT_LINES( ) \
406     p_line1  = p_line2; \
407     p_line2 += p_dest->p->i_pitch; \
408     p_y1     = p_y2; \
409     p_y2    += p_source->p[Y_PLANE].i_pitch;
410
    /* Load one vector from each chroma plane, advancing both pointers by 16 */
411 #define VEC_LOAD_UV( ) \
412     u_vec = vec_ld( 0, p_u ); p_u += 16; \
413     v_vec = vec_ld( 0, p_v ); p_v += 16;
414
    /* Interleave V (first) and U (second) with merge op `a`, then interleave
     * luma (first) with that chroma (second) for each of the two lines; two
     * vectors are stored per output line. */
415 #define VEC_MERGE( a ) \
416     vu_vec = a( v_vec, u_vec ); \
417     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
418     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
419     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
420     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
421     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
422     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
423
424     vector unsigned char u_vec;
425     vector unsigned char v_vec;
426     vector unsigned char vu_vec;
427     vector unsigned char y_vec;
428
        /* NOTE(review): neither vector path adds the source/destination
         * margins computed further down, so both appear to assume that
         * pitch == visible pitch for every plane -- confirm. */
429     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
430            ( p_filter->fmt_in.video.i_height % 2 ) ) )
431     {
432         /* Width is a multiple of 32, we take 2 lines at a time */
433         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
434         {
435             VEC_NEXT_LINES( );
436             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
437             {
                    /* one chroma load feeds 32 pixels on each of the two lines */
438                 VEC_LOAD_UV( );
439                 VEC_MERGE( vec_mergeh );
440                 VEC_MERGE( vec_mergel );
441             }
442         }
443     }
444     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
445                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
446     {
447         /* Width is only a multiple of 16, we take 4 lines at a time */
448         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
449         {
450             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
451             VEC_NEXT_LINES( );
452             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
453             {
454                 VEC_LOAD_UV( );
455                 VEC_MERGE( vec_mergeh );
456                 VEC_MERGE( vec_mergel );
457             }
458
459             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
                /* only the first half of this chroma vector is used here ... */
460             VEC_LOAD_UV( );
461             VEC_MERGE( vec_mergeh );
462
463             /* Line 3 and 4, pixels 0 to 16 */
                /* ... its second half is consumed here, without reloading */
464             VEC_NEXT_LINES( );
465             VEC_MERGE( vec_mergel );
466
467             /* Line 3 and 4, pixels 16 to ( width ) */
468             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
469             {
470                 VEC_LOAD_UV( );
471                 VEC_MERGE( vec_mergeh );
472                 VEC_MERGE( vec_mergel );
473             }
474         }
475     }
476     else
477     {
478         /* Crap, use the C version */
479 #undef VEC_NEXT_LINES
480 #undef VEC_LOAD_UV
481 #undef VEC_MERGE
482 #endif
483
        /* Margins: bytes between the end of the visible part of a row and the
         * start of the next row (pitch minus visible pitch). The chroma margin
         * is taken from plane 1 and applied to both p_u and p_v. */
484     const int i_source_margin = p_source->p[0].i_pitch
485                                  - p_source->p[0].i_visible_pitch;
486     const int i_source_margin_c = p_source->p[1].i_pitch
487                                  - p_source->p[1].i_visible_pitch;
488     const int i_dest_margin = p_dest->p->i_pitch
489                                - p_dest->p->i_visible_pitch;
490
491 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
492     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
493     {
494         p_line1 = p_line2;
495         p_line2 += p_dest->p->i_pitch;
496
497         p_y1 = p_y2;
498         p_y2 += p_source->p[Y_PLANE].i_pitch;
499
            /* 8 pixels per iteration: four scalar calls or one MMX call */
500         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
501         {
502 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
503             C_YUV420_YVYU( );
504             C_YUV420_YVYU( );
505             C_YUV420_YVYU( );
506             C_YUV420_YVYU( );
507 #else
508             MMX_CALL( MMX_YUV420_YVYU );
509 #endif
510         }
            /* leftover pixels when the width is not a multiple of 8 */
511         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
512         {
513             C_YUV420_YVYU( );
514         }
515
            /* skip the non-visible tail of every row just processed */
516         p_y1 += i_source_margin;
517         p_y2 += i_source_margin;
518         p_u += i_source_margin_c;
519         p_v += i_source_margin_c;
520         p_line1 += i_dest_margin;
521         p_line2 += i_dest_margin;
522     }
523
524 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
525     /* re-enable FPU registers */
526     MMX_END;
527 #endif
528
529 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
        /* closes the "use the C version" else branch opened above */
530     }
531 #endif
532
533 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
534     /*
535     ** SSE2 128 bits fetch/store instructions are faster
536     ** if memory access is 16 bytes aligned
537     */
        /* aligned path only when the luma pitch, the destination pitch and
         * both start pointers all have their low four bits clear */
538     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
539         ((intptr_t)p_line2|(intptr_t)p_y2))) )
540     {
541         /* use faster SSE2 aligned fetch and store */
542         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
543         {
544             p_line1 = p_line2;
545             p_line2 += p_dest->p->i_pitch;
546
547             p_y1 = p_y2;
548             p_y2 += p_source->p[Y_PLANE].i_pitch;
549
550             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
551             {
552                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
553             }
                /* leftover pixels when the width is not a multiple of 16 */
554             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
555             {
556                 C_YUV420_YVYU( );
557             }
558
559             p_y1 += i_source_margin;
560             p_y2 += i_source_margin;
561             p_u += i_source_margin_c;
562             p_v += i_source_margin_c;
563             p_line1 += i_dest_margin;
564             p_line2 += i_dest_margin;
565         }
566     }
567     else
568     {
569         /* use slower SSE2 unaligned fetch and store */
570         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
571         {
572             p_line1 = p_line2;
573             p_line2 += p_dest->p->i_pitch;
574
575             p_y1 = p_y2;
576             p_y2 += p_source->p[Y_PLANE].i_pitch;
577
578             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
579             {
580                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
581             }
582             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
583             {
584                 C_YUV420_YVYU( );
585             }
586
587             p_y1 += i_source_margin;
588             p_y2 += i_source_margin;
589             p_u += i_source_margin_c;
590             p_v += i_source_margin_c;
591             p_line1 += i_dest_margin;
592             p_line2 += i_dest_margin;
593         }
594     }
595     /* make sure all SSE2 stores are visible thereafter */
596     SSE2_END;
597 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
598 }
599
600 /*****************************************************************************
601  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
     *****************************************************************************
     * p_filter: supplies fmt_in.video.i_width / i_height, which drive all loops
     * p_source: planar input, read through Y_PIXELS, U_PIXELS and V_PIXELS
     * p_dest:   packed output, written through p_dest->p->p_pixels
     *
     * Same structure as I420_YUY2, with chroma placed before luma: each pass
     * handles two luma lines and two output lines, and p_u / p_v are only
     * moved on by the chroma margin once per pass. The per-pixel work is done
     * by macros not defined in this file (C_YUV420_UYVY, MMX_CALL, SSE2_CALL
     * -- presumably from i420_yuy2.h), which are expected to advance the
     * pointers.
602  *****************************************************************************/
603 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
604                                            picture_t *p_dest )
605 {
606     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
607     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
608     uint8_t *p_u = p_source->U_PIXELS;
609     uint8_t *p_v = p_source->V_PIXELS;
610
611     int i_x, i_y;
612
613 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Step to the next pair of output lines and the next pair of luma lines */
614 #define VEC_NEXT_LINES( ) \
615     p_line1  = p_line2; \
616     p_line2 += p_dest->p->i_pitch; \
617     p_y1     = p_y2; \
618     p_y2    += p_source->p[Y_PLANE].i_pitch;
619
    /* Load one vector from each chroma plane, advancing both pointers by 16 */
620 #define VEC_LOAD_UV( ) \
621     u_vec = vec_ld( 0, p_u ); p_u += 16; \
622     v_vec = vec_ld( 0, p_v ); p_v += 16;
623
    /* Interleave U and V with merge op `a`, then interleave that chroma
     * (first) with luma (second) for each of the two lines; two vectors are
     * stored per output line. */
624 #define VEC_MERGE( a ) \
625     uv_vec = a( u_vec, v_vec ); \
626     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
627     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
628     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
629     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
630     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
631     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
632
633     vector unsigned char u_vec;
634     vector unsigned char v_vec;
635     vector unsigned char uv_vec;
636     vector unsigned char y_vec;
637
        /* NOTE(review): neither vector path adds the source/destination
         * margins computed further down, so both appear to assume that
         * pitch == visible pitch for every plane -- confirm. */
638     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
639            ( p_filter->fmt_in.video.i_height % 2 ) ) )
640     {
641         /* Width is a multiple of 32, we take 2 lines at a time */
642         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
643         {
644             VEC_NEXT_LINES( );
645             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
646             {
                    /* one chroma load feeds 32 pixels on each of the two lines */
647                 VEC_LOAD_UV( );
648                 VEC_MERGE( vec_mergeh );
649                 VEC_MERGE( vec_mergel );
650             }
651         }
652     }
653     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
654                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
655     {
656         /* Width is only a multiple of 16, we take 4 lines at a time */
657         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
658         {
659             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
660             VEC_NEXT_LINES( );
661             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
662             {
663                 VEC_LOAD_UV( );
664                 VEC_MERGE( vec_mergeh );
665                 VEC_MERGE( vec_mergel );
666             }
667
668             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
                /* only the first half of this chroma vector is used here ... */
669             VEC_LOAD_UV( );
670             VEC_MERGE( vec_mergeh );
671
672             /* Line 3 and 4, pixels 0 to 16 */
                /* ... its second half is consumed here, without reloading */
673             VEC_NEXT_LINES( );
674             VEC_MERGE( vec_mergel );
675
676             /* Line 3 and 4, pixels 16 to ( width ) */
677             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
678             {
679                 VEC_LOAD_UV( );
680                 VEC_MERGE( vec_mergeh );
681                 VEC_MERGE( vec_mergel );
682             }
683         }
684     }
685     else
686     {
687         /* Crap, use the C version */
688 #undef VEC_NEXT_LINES
689 #undef VEC_LOAD_UV
690 #undef VEC_MERGE
691 #endif
692
        /* Margins: bytes between the end of the visible part of a row and the
         * start of the next row (pitch minus visible pitch). The chroma margin
         * is taken from plane 1 and applied to both p_u and p_v. */
693     const int i_source_margin = p_source->p[0].i_pitch
694                                  - p_source->p[0].i_visible_pitch;
695     const int i_source_margin_c = p_source->p[1].i_pitch
696                                  - p_source->p[1].i_visible_pitch;
697     const int i_dest_margin = p_dest->p->i_pitch
698                                - p_dest->p->i_visible_pitch;
699
700 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
701     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
702     {
703         p_line1 = p_line2;
704         p_line2 += p_dest->p->i_pitch;
705
706         p_y1 = p_y2;
707         p_y2 += p_source->p[Y_PLANE].i_pitch;
708
            /* 8 pixels per iteration: four scalar calls or one MMX call */
709         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
710         {
711 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
712             C_YUV420_UYVY( );
713             C_YUV420_UYVY( );
714             C_YUV420_UYVY( );
715             C_YUV420_UYVY( );
716 #else
717             MMX_CALL( MMX_YUV420_UYVY );
718 #endif
719         }
            /* leftover pixels when the width is not a multiple of 8 */
720         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
721         {
722             C_YUV420_UYVY( );
723         }
724
            /* skip the non-visible tail of every row just processed */
725         p_y1 += i_source_margin;
726         p_y2 += i_source_margin;
727         p_u += i_source_margin_c;
728         p_v += i_source_margin_c;
729         p_line1 += i_dest_margin;
730         p_line2 += i_dest_margin;
731     }
732
733 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
734     /* re-enable FPU registers */
735     MMX_END;
736 #endif
737
738 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
        /* closes the "use the C version" else branch opened above */
739     }
740 #endif
741
742 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
743     /*
744     ** SSE2 128 bits fetch/store instructions are faster
745     ** if memory access is 16 bytes aligned
746     */
        /* aligned path only when the luma pitch, the destination pitch and
         * both start pointers all have their low four bits clear */
747     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
748         ((intptr_t)p_line2|(intptr_t)p_y2))) )
749     {
750         /* use faster SSE2 aligned fetch and store */
751         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
752         {
753             p_line1 = p_line2;
754             p_line2 += p_dest->p->i_pitch;
755
756             p_y1 = p_y2;
757             p_y2 += p_source->p[Y_PLANE].i_pitch;
758
759             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
760             {
761                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
762             }
                /* leftover pixels when the width is not a multiple of 16 */
763             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
764             {
765                 C_YUV420_UYVY( );
766             }
767
768             p_y1 += i_source_margin;
769             p_y2 += i_source_margin;
770             p_u += i_source_margin_c;
771             p_v += i_source_margin_c;
772             p_line1 += i_dest_margin;
773             p_line2 += i_dest_margin;
774         }
775     }
776     else
777     {
778         /* use slower SSE2 unaligned fetch and store */
779         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
780         {
781             p_line1 = p_line2;
782             p_line2 += p_dest->p->i_pitch;
783
784             p_y1 = p_y2;
785             p_y2 += p_source->p[Y_PLANE].i_pitch;
786
787             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
788             {
789                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
790             }
791             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
792             {
793                 C_YUV420_UYVY( );
794             }
795
796             p_y1 += i_source_margin;
797             p_y2 += i_source_margin;
798             p_u += i_source_margin_c;
799             p_v += i_source_margin_c;
800             p_line1 += i_dest_margin;
801             p_line2 += i_dest_margin;
802         }
803     }
804     /* make sure all SSE2 stores are visible thereafter */
805     SSE2_END;
806 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
807 }
808
809 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
810 /*****************************************************************************
811  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
     *****************************************************************************
     * Unimplemented stub: the source and destination pictures are ignored, so
     * p_dest is left untouched, and an error is logged on p_filter at every
     * call. Activate() still installs this routine when the output chroma is
     * 'IUYV' (non-AltiVec builds only).
812  *****************************************************************************/
813 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
814                                            picture_t *p_dest )
815 {
        /* silence unused-parameter warnings until the conversion is written */
816     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
817     /* FIXME: TODO ! */
818     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
819 }
820
821 /*****************************************************************************
822  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
823  *****************************************************************************/
824 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
825                                            picture_t *p_dest )
826 {
827     uint8_t *p_line1 = p_dest->p->p_pixels +
828                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
829                        + p_dest->p->i_pitch;
830     uint8_t *p_line2 = p_dest->p->p_pixels +
831                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
832     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
833     uint8_t *p_u = p_source->U_PIXELS;
834     uint8_t *p_v = p_source->V_PIXELS;
835
836     int i_x, i_y;
837
838     const int i_source_margin = p_source->p[0].i_pitch
839                                  - p_source->p[0].i_visible_pitch;
840     const int i_source_margin_c = p_source->p[1].i_pitch
841                                  - p_source->p[1].i_visible_pitch;
842     const int i_dest_margin = p_dest->p->i_pitch
843                                - p_dest->p->i_visible_pitch;
844
845 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
846     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
847     {
848         p_line1 -= 3 * p_dest->p->i_pitch;
849         p_line2 -= 3 * p_dest->p->i_pitch;
850
851         p_y1 = p_y2;
852         p_y2 += p_source->p[Y_PLANE].i_pitch;
853
854         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
855         {
856 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
857             C_YUV420_UYVY( );
858             C_YUV420_UYVY( );
859             C_YUV420_UYVY( );
860             C_YUV420_UYVY( );
861 #else
862             MMX_CALL( MMX_YUV420_UYVY );
863 #endif
864         }
865         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
866         {
867             C_YUV420_UYVY( );
868         }
869
870         p_y1 += i_source_margin;
871         p_y2 += i_source_margin;
872         p_u += i_source_margin_c;
873         p_v += i_source_margin_c;
874         p_line1 += i_dest_margin;
875         p_line2 += i_dest_margin;
876     }
877
878 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
879     /* re-enable FPU registers */
880     MMX_END;
881 #endif
882
883 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
884     /*
885     ** SSE2 128 bits fetch/store instructions are faster
886     ** if memory access is 16 bytes aligned
887     */
888     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
889         ((intptr_t)p_line2|(intptr_t)p_y2))) )
890     {
891         /* use faster SSE2 aligned fetch and store */
892         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
893         {
894             p_line1 = p_line2;
895             p_line2 += p_dest->p->i_pitch;
896
897             p_y1 = p_y2;
898             p_y2 += p_source->p[Y_PLANE].i_pitch;
899
900             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
901             {
902                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
903             }
904             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
905             {
906                 C_YUV420_UYVY( );
907             }
908
909             p_y1 += i_source_margin;
910             p_y2 += i_source_margin;
911             p_u += i_source_margin_c;
912             p_v += i_source_margin_c;
913             p_line1 += i_dest_margin;
914             p_line2 += i_dest_margin;
915         }
916     }
917     else
918     {
919         /* use slower SSE2 unaligned fetch and store */
920         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
921         {
922             p_line1 = p_line2;
923             p_line2 += p_dest->p->i_pitch;
924
925             p_y1 = p_y2;
926             p_y2 += p_source->p[Y_PLANE].i_pitch;
927
928             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
929             {
930                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
931             }
932             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
933             {
934                 C_YUV420_UYVY( );
935             }
936
937             p_y1 += i_source_margin;
938             p_y2 += i_source_margin;
939             p_u += i_source_margin_c;
940             p_v += i_source_margin_c;
941             p_line1 += i_dest_margin;
942             p_line2 += i_dest_margin;
943         }
944     }
945     /* make sure all SSE2 stores are visible thereafter */
946     SSE2_END;
947 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
948 }
949 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
950
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Walks the source two luma lines at a time and fills two destination lines
 * per pass, top to bottom. Only built for the plain C module.
 *
 * NOTE(review): the inner loop runs i_width / 8 times and there is no tail
 * loop, so this looks like it expects a width that is a multiple of 8 --
 * confirm against the callers.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* C_YUV420_Y211 takes no arguments; it presumably reads and advances
     * these locals by name, so the names must stay as they are. */
    uint8_t *p_line1;
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x;
    int i_y;

    /* Full line strides */
    const int i_dest_pitch = p_dest->p->i_pitch;
    const int i_luma_pitch = p_source->p[Y_PLANE].i_pitch;

    /* Padding between the visible end of one line and the next line */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = i_dest_pitch
                               - p_dest->p->i_visible_pitch;

    i_y = p_filter->fmt_in.video.i_height / 2;
    while( i_y-- )
    {
        /* Upper line of the pair continues where the previous lower line
         * ended up; the lower line starts one stride further. */
        p_line1 = p_line2;
        p_line2 += i_dest_pitch;

        p_y1 = p_y2;
        p_y2 += i_luma_pitch;

        /* Two macro invocations per group of 8 pixels */
        i_x = p_filter->fmt_in.video.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Skip the padding. p_y1 and p_line1 are reassigned at the top of
         * the next pass, so only the carried-over pointers are adjusted. */
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line2 += i_dest_margin;
    }
}
#endif