]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Small simplifications (freetype).
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_cpu.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 #endif
55
56 /*****************************************************************************
57  * Local and extern prototypes.
58  *****************************************************************************/
59 static int  Activate ( vlc_object_t * );
60
61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
72 #endif
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
76 #endif
77
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* 0x00ff in each of the four 16-bit lanes of a 64-bit word */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
/* 0x80 in each of the four low bytes; the upper 32 bits are zero */
81 static const uint64_t i_80w   = 0x0000000080808080ULL;
/* NOTE(review): neither constant is referenced by the code visible in this
 * file; presumably the MMX macros from i420_yuy2.h use them by name, so the
 * identifiers must stay as they are -- confirm. */
82 #endif
83
84 /*****************************************************************************
85  * Module descriptor.
 *****************************************************************************
 * Exactly one branch is compiled, depending on which MODULE_NAME_IS_* macro
 * the build defines. Each branch sets the module description, registers the
 * "video filter2" capability (80 for plain C, 160 for MMX, 250 for SSE2 and
 * AltiVec) and defines CPU_CAPABILITY, which Activate() tests against
 * vlc_CPU(). Activate is the only callback; the second one is NULL.
 *
 * All descriptions are wrapped in N_() so the string is only marked for
 * translation; the AltiVec branch used _() and a stray ';' before.
86  *****************************************************************************/
87 vlc_module_begin ()
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
90     set_capability( "video filter2", 80 )
91 # define CPU_CAPABILITY 0
92 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
93     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
94     set_capability( "video filter2", 160 )
95 # define CPU_CAPABILITY CPU_CAPABILITY_MMX
96 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
97     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
98     set_capability( "video filter2", 250 )
99 # define CPU_CAPABILITY CPU_CAPABILITY_SSE2
100 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
101     set_description(
102             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) )
103     set_capability( "video filter2", 250 )
104 # define CPU_CAPABILITY CPU_CAPABILITY_ALTIVEC
105 #endif
106     set_callbacks( Activate, NULL )
107 vlc_module_end ()
108
109 /*****************************************************************************
110  * Activate: select the conversion routine for this filter
111  *****************************************************************************
112  * Checks that this build can run on the current CPU and that the requested
 * conversion is supported, then stores the matching routine in
 * p_filter->pf_video_filter. Nothing is allocated.
 *
 * p_this: the filter object, used as a filter_t
 *
 * Returns VLC_SUCCESS once pf_video_filter has been set. Returns
 * VLC_EGENERIC when the CPU lacks CPU_CAPABILITY (only checked when
 * CPU_CAPABILITY is non-zero), when the input width or height is odd, when
 * input and output dimensions differ, or when the input/output chroma pair
 * is not handled by this build.
113  *****************************************************************************/
114 static int Activate( vlc_object_t *p_this )
115 {
116     filter_t *p_filter = (filter_t *)p_this;
117
118 #if CPU_CAPABILITY
119     if( !(vlc_CPU() & CPU_CAPABILITY) )
120         return VLC_EGENERIC;
121 #endif
    /* The converters consume two lines and two pixels at a time */
122     if( p_filter->fmt_in.video.i_width & 1
123      || p_filter->fmt_in.video.i_height & 1 )
124     {
125         return VLC_EGENERIC;
126     }
127
    /* No scaling here: input and output must have the same size */
128     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
129      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
130         return VLC_EGENERIC;
131
132     switch( p_filter->fmt_in.video.i_chroma )
133     {
134         case VLC_CODEC_YV12:
135         case VLC_CODEC_I420:
136             switch( p_filter->fmt_out.video.i_chroma )
137             {
138                 case VLC_CODEC_YUYV:
139                     p_filter->pf_video_filter = I420_YUY2_Filter;
140                     break;
141
142                 case VLC_CODEC_YVYU:
143                     p_filter->pf_video_filter = I420_YVYU_Filter;
144                     break;
145
146                 case VLC_CODEC_UYVY:
147                     p_filter->pf_video_filter = I420_UYVY_Filter;
148                     break;
149 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
150                 case VLC_FOURCC('I','U','Y','V'):
151                     p_filter->pf_video_filter = I420_IUYV_Filter;
152                     break;
153
154                 case VLC_CODEC_CYUV:
155                     p_filter->pf_video_filter = I420_cyuv_Filter;
156                     break;
157 #endif
158
159 #if defined (MODULE_NAME_IS_i420_yuy2)
160                 case VLC_CODEC_Y211:
161                     p_filter->pf_video_filter = I420_Y211_Filter;
162                     break;
163 #endif
164
165                 default:
166                     return VLC_EGENERIC;
167             }
168             break;
169
170         default:
171             return VLC_EGENERIC;
172     }
173
174     return VLC_SUCCESS;
175 }
176
177 #if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
 * 64-bit value produced by the x86 "rdtsc" instruction. Nothing in the
 * visible code calls it.
 * NOTE(review): the "=A" output constraint names the edx:eax pair only on
 * 32-bit x86; on x86-64 separate "=a" / "=d" outputs would be needed --
 * confirm before re-enabling. */
178 static inline unsigned long long read_cycles(void)
179 {
180     unsigned long long v;
181     __asm__ __volatile__("rdtsc" : "=A" (v): );
182
183     return v;
184 }
185 #endif
186
187 /* Following functions are local */
188
/* One VIDEO_FILTER_WRAPPER() per converter. The macro is not defined in this
 * file; presumably it expands to the matching <name>_Filter function that
 * Activate() stores in pf_video_filter -- confirm against vlc_filter.h.
 * The preprocessor guards are the same as on the prototypes: IUYV and cyuv
 * are left out of the AltiVec build, Y211 exists in the plain C build only. */
189 VIDEO_FILTER_WRAPPER( I420_YUY2 )
190 VIDEO_FILTER_WRAPPER( I420_YVYU )
191 VIDEO_FILTER_WRAPPER( I420_UYVY )
192 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
193 VIDEO_FILTER_WRAPPER( I420_IUYV )
194 VIDEO_FILTER_WRAPPER( I420_cyuv )
195 #endif
196 #if defined (MODULE_NAME_IS_i420_yuy2)
197 VIDEO_FILTER_WRAPPER( I420_Y211 )
198 #endif
199
200 /*****************************************************************************
201  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *****************************************************************************
 * p_filter: supplies the picture width and height (fmt_in.video)
 * p_source: planar input, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest:   packed output, written through p_dest->p
 *
 * Output lines are produced in pairs (p_line1 / p_line2) from two luma
 * lines (p_y1 / p_y2). The per-pixel work is done by the C_*, MMX_* and
 * SSE2_* macros, which are not defined in this file (presumably in
 * i420_yuy2.h). The loops below never step the pointers along a line
 * themselves, so those macros presumably do it.
 *
 * The code path is fixed at build time by the MODULE_NAME_IS_* macros:
 * AltiVec (vector path with a scalar fallback), MMX, SSE2 or plain C.
202  *****************************************************************************/
203 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
204                                            picture_t *p_dest )
205 {
206     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
207     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
208     uint8_t *p_u = p_source->U_PIXELS;
209     uint8_t *p_v = p_source->V_PIXELS;
210
211     int i_x, i_y;
212
/* AltiVec build: vector path first, the scalar code further down is the
 * fallback and sits inside the final else branch */
213 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
214 #define VEC_NEXT_LINES( ) \
215     p_line1  = p_line2; \
216     p_line2 += p_dest->p->i_pitch; \
217     p_y1     = p_y2; \
218     p_y2    += p_source->p[Y_PLANE].i_pitch;
219
220 #define VEC_LOAD_UV( ) \
221     u_vec = vec_ld( 0, p_u ); p_u += 16; \
222     v_vec = vec_ld( 0, p_v ); p_v += 16;
223
224 #define VEC_MERGE( a ) \
225     uv_vec = a( u_vec, v_vec ); \
226     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
227     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
228     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
229     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
230     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
231     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
232
233     vector unsigned char u_vec;
234     vector unsigned char v_vec;
235     vector unsigned char uv_vec;
236     vector unsigned char y_vec;
237
    /* NOTE(review): the vector path never adds the pitch / visible pitch
     * margins computed for the scalar code, so it appears to rely on each
     * pitch being equal to the visible line size -- confirm. */
238     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
239            ( p_filter->fmt_in.video.i_height % 2 ) ) )
240     {
241         /* Width is a multiple of 32, we take 2 lines at a time */
242         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
243         {
244             VEC_NEXT_LINES( );
245             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
246             {
247                 VEC_LOAD_UV( );
248                 VEC_MERGE( vec_mergeh );
249                 VEC_MERGE( vec_mergel );
250             }
251         }
252     }
253 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
254 #if 0
255     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
256                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
257     {
258         /* Width is only a multiple of 16, we take 4 lines at a time */
259         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
260         {
261             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
262             VEC_NEXT_LINES( );
263             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
264             {
265                 VEC_LOAD_UV( );
266                 VEC_MERGE( vec_mergeh );
267                 VEC_MERGE( vec_mergel );
268             }
269
270             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
271             VEC_LOAD_UV( );
272             VEC_MERGE( vec_mergeh );
273
274             /* Line 3 and 4, pixels 0 to 16 */
275             VEC_NEXT_LINES( );
276             VEC_MERGE( vec_mergel );
277
278             /* Line 3 and 4, pixels 16 to ( width ) */
279             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
280             {
281                 VEC_LOAD_UV( );
282                 VEC_MERGE( vec_mergeh );
283                 VEC_MERGE( vec_mergel );
284             }
285         }
286     }
287 #endif
288     else
289     {
290         /* Crap, use the C version */
291 #undef VEC_NEXT_LINES
292 #undef VEC_LOAD_UV
293 #undef VEC_MERGE
294 #endif
295
    /* Bytes between the end of the visible part of a line and the start of
     * the next one, for luma, chroma and the packed destination */
296     const int i_source_margin = p_source->p[0].i_pitch
297                                  - p_source->p[0].i_visible_pitch;
298     const int i_source_margin_c = p_source->p[1].i_pitch
299                                  - p_source->p[1].i_visible_pitch;
300     const int i_dest_margin = p_dest->p->i_pitch
301                                - p_dest->p->i_visible_pitch;
302
303 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback builds: one pass per pair of lines */
304     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
305     {
306         p_line1 = p_line2;
307         p_line2 += p_dest->p->i_pitch;
308
309         p_y1 = p_y2;
310         p_y2 += p_source->p[Y_PLANE].i_pitch;
311
312 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
313         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
314         {
315             C_YUV420_YUYV( );
316             C_YUV420_YUYV( );
317             C_YUV420_YUYV( );
318             C_YUV420_YUYV( );
319         }
320 #else
321         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
322         {
323             MMX_CALL( MMX_YUV420_YUYV );
324         }
325 #endif
        /* Pixels left over after the groups of 8, always done by the C macro */
326         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
327         {
328             C_YUV420_YUYV( );
329         }
330
        /* Skip the padding at the end of every line just processed */
331         p_y1 += i_source_margin;
332         p_y2 += i_source_margin;
333         p_u += i_source_margin_c;
334         p_v += i_source_margin_c;
335         p_line1 += i_dest_margin;
336         p_line2 += i_dest_margin;
337     }
338
339 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
340     /* re-enable FPU registers */
341     MMX_END;
342 #endif
343
344 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* closes the "use the C version" else branch opened above */
345     }
346 #endif
347
348 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
349     /*
350     ** SSE2 128 bits fetch/store instructions are faster
351     ** if memory access is 16 bytes aligned
352     */
353
    /* Aligned path only when the luma pitch, the destination pitch and both
     * start addresses are all multiples of 16 */
354     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
355         ((intptr_t)p_line2|(intptr_t)p_y2))) )
356     {
357         /* use faster SSE2 aligned fetch and store */
358         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
359         {
360             p_line1 = p_line2;
361             p_line2 += p_dest->p->i_pitch;
362
363             p_y1 = p_y2;
364             p_y2 += p_source->p[Y_PLANE].i_pitch;
365
366             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
367             {
368                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
369             }
370             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
371             {
372                 C_YUV420_YUYV( );
373             }
374
375             p_y1 += i_source_margin;
376             p_y2 += i_source_margin;
377             p_u += i_source_margin_c;
378             p_v += i_source_margin_c;
379             p_line1 += i_dest_margin;
380             p_line2 += i_dest_margin;
381         }
382     }
383     else
384     {
385         /* use slower SSE2 unaligned fetch and store */
386         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
387         {
388             p_line1 = p_line2;
389             p_line2 += p_dest->p->i_pitch;
390
391             p_y1 = p_y2;
392             p_y2 += p_source->p[Y_PLANE].i_pitch;
393
394             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
395             {
396                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
397             }
398             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
399             {
400                 C_YUV420_YUYV( );
401             }
402
403             p_y1 += i_source_margin;
404             p_y2 += i_source_margin;
405             p_u += i_source_margin_c;
406             p_v += i_source_margin_c;
407             p_line1 += i_dest_margin;
408             p_line2 += i_dest_margin;
409         }
410     }
411     /* make sure all SSE2 stores are visible thereafter */
412     SSE2_END;
413
414 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
415 }
416
417 /*****************************************************************************
418  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *****************************************************************************
 * p_filter: supplies the picture width and height (fmt_in.video)
 * p_source: planar input, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest:   packed output, written through p_dest->p
 *
 * Output lines are produced in pairs (p_line1 / p_line2) from two luma
 * lines (p_y1 / p_y2). The per-pixel work is done by the *_YUV420_YVYU*
 * macros, which are not defined in this file (presumably in i420_yuy2.h).
 * The loops below never step the pointers along a line themselves, so
 * those macros presumably do it.
 *
 * The code path is fixed at build time by the MODULE_NAME_IS_* macros:
 * AltiVec (vector paths with a scalar fallback), MMX, SSE2 or plain C.
419  *****************************************************************************/
420 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
421                                            picture_t *p_dest )
422 {
423     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
424     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
425     uint8_t *p_u = p_source->U_PIXELS;
426     uint8_t *p_v = p_source->V_PIXELS;
427
428     int i_x, i_y;
429
/* AltiVec build: vector paths first, the scalar code further down is the
 * fallback and sits inside the final else branch */
430 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
431 #define VEC_NEXT_LINES( ) \
432     p_line1  = p_line2; \
433     p_line2 += p_dest->p->i_pitch; \
434     p_y1     = p_y2; \
435     p_y2    += p_source->p[Y_PLANE].i_pitch;
436
437 #define VEC_LOAD_UV( ) \
438     u_vec = vec_ld( 0, p_u ); p_u += 16; \
439     v_vec = vec_ld( 0, p_v ); p_v += 16;
440
441 #define VEC_MERGE( a ) \
442     vu_vec = a( v_vec, u_vec ); \
443     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
444     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
445     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
446     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
447     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
448     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
449
450     vector unsigned char u_vec;
451     vector unsigned char v_vec;
452     vector unsigned char vu_vec;
453     vector unsigned char y_vec;
454
    /* NOTE(review): the vector paths never add the pitch / visible pitch
     * margins computed for the scalar code, so they appear to rely on each
     * pitch being equal to the visible line size -- confirm. */
455     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
456            ( p_filter->fmt_in.video.i_height % 2 ) ) )
457     {
458         /* Width is a multiple of 32, we take 2 lines at a time */
459         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
460         {
461             VEC_NEXT_LINES( );
462             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
463             {
464                 VEC_LOAD_UV( );
465                 VEC_MERGE( vec_mergeh );
466                 VEC_MERGE( vec_mergel );
467             }
468         }
469     }
    /* NOTE(review): I420_YUY2 compiles this same branch out behind a
     * "#warning FIXME ... is broken on altivec"; it is still enabled here --
     * confirm whether that is intended. */
470     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
471                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
472     {
473         /* Width is only a multiple of 16, we take 4 lines at a time */
474         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
475         {
476             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
477             VEC_NEXT_LINES( );
478             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
479             {
480                 VEC_LOAD_UV( );
481                 VEC_MERGE( vec_mergeh );
482                 VEC_MERGE( vec_mergel );
483             }
484
485             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
486             VEC_LOAD_UV( );
487             VEC_MERGE( vec_mergeh );
488
489             /* Line 3 and 4, pixels 0 to 16 */
490             VEC_NEXT_LINES( );
491             VEC_MERGE( vec_mergel );
492
493             /* Line 3 and 4, pixels 16 to ( width ) */
494             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
495             {
496                 VEC_LOAD_UV( );
497                 VEC_MERGE( vec_mergeh );
498                 VEC_MERGE( vec_mergel );
499             }
500         }
501     }
502     else
503     {
504         /* Crap, use the C version */
505 #undef VEC_NEXT_LINES
506 #undef VEC_LOAD_UV
507 #undef VEC_MERGE
508 #endif
509
    /* Bytes between the end of the visible part of a line and the start of
     * the next one, for luma, chroma and the packed destination */
510     const int i_source_margin = p_source->p[0].i_pitch
511                                  - p_source->p[0].i_visible_pitch;
512     const int i_source_margin_c = p_source->p[1].i_pitch
513                                  - p_source->p[1].i_visible_pitch;
514     const int i_dest_margin = p_dest->p->i_pitch
515                                - p_dest->p->i_visible_pitch;
516
517 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback builds: one pass per pair of lines */
518     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
519     {
520         p_line1 = p_line2;
521         p_line2 += p_dest->p->i_pitch;
522
523         p_y1 = p_y2;
524         p_y2 += p_source->p[Y_PLANE].i_pitch;
525
526         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
527         {
528 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
529             C_YUV420_YVYU( );
530             C_YUV420_YVYU( );
531             C_YUV420_YVYU( );
532             C_YUV420_YVYU( );
533 #else
534             MMX_CALL( MMX_YUV420_YVYU );
535 #endif
536         }
        /* Pixels left over after the groups of 8, always done by the C macro */
537         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
538         {
539             C_YUV420_YVYU( );
540         }
541
        /* Skip the padding at the end of every line just processed */
542         p_y1 += i_source_margin;
543         p_y2 += i_source_margin;
544         p_u += i_source_margin_c;
545         p_v += i_source_margin_c;
546         p_line1 += i_dest_margin;
547         p_line2 += i_dest_margin;
548     }
549
550 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
551     /* re-enable FPU registers */
552     MMX_END;
553 #endif
554
555 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* closes the "use the C version" else branch opened above */
556     }
557 #endif
558
559 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
560     /*
561     ** SSE2 128 bits fetch/store instructions are faster
562     ** if memory access is 16 bytes aligned
563     */
    /* Aligned path only when the luma pitch, the destination pitch and both
     * start addresses are all multiples of 16 */
564     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
565         ((intptr_t)p_line2|(intptr_t)p_y2))) )
566     {
567         /* use faster SSE2 aligned fetch and store */
568         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
569         {
570             p_line1 = p_line2;
571             p_line2 += p_dest->p->i_pitch;
572
573             p_y1 = p_y2;
574             p_y2 += p_source->p[Y_PLANE].i_pitch;
575
576             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
577             {
578                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
579             }
580             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
581             {
582                 C_YUV420_YVYU( );
583             }
584
585             p_y1 += i_source_margin;
586             p_y2 += i_source_margin;
587             p_u += i_source_margin_c;
588             p_v += i_source_margin_c;
589             p_line1 += i_dest_margin;
590             p_line2 += i_dest_margin;
591         }
592     }
593     else
594     {
595         /* use slower SSE2 unaligned fetch and store */
596         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
597         {
598             p_line1 = p_line2;
599             p_line2 += p_dest->p->i_pitch;
600
601             p_y1 = p_y2;
602             p_y2 += p_source->p[Y_PLANE].i_pitch;
603
604             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
605             {
606                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
607             }
608             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
609             {
610                 C_YUV420_YVYU( );
611             }
612
613             p_y1 += i_source_margin;
614             p_y2 += i_source_margin;
615             p_u += i_source_margin_c;
616             p_v += i_source_margin_c;
617             p_line1 += i_dest_margin;
618             p_line2 += i_dest_margin;
619         }
620     }
621     /* make sure all SSE2 stores are visible thereafter */
622     SSE2_END;
623 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
624 }
625
626 /*****************************************************************************
627  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *****************************************************************************
 * p_filter: supplies the picture width and height (fmt_in.video)
 * p_source: planar input, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest:   packed output, written through p_dest->p
 *
 * Output lines are produced in pairs (p_line1 / p_line2) from two luma
 * lines (p_y1 / p_y2). The per-pixel work is done by the *_YUV420_UYVY*
 * macros, which are not defined in this file (presumably in i420_yuy2.h).
 * The loops below never step the pointers along a line themselves, so
 * those macros presumably do it.
 *
 * The code path is fixed at build time by the MODULE_NAME_IS_* macros:
 * AltiVec (vector paths with a scalar fallback), MMX, SSE2 or plain C.
628  *****************************************************************************/
629 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
630                                            picture_t *p_dest )
631 {
632     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
633     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
634     uint8_t *p_u = p_source->U_PIXELS;
635     uint8_t *p_v = p_source->V_PIXELS;
636
637     int i_x, i_y;
638
/* AltiVec build: vector paths first, the scalar code further down is the
 * fallback and sits inside the final else branch */
639 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
640 #define VEC_NEXT_LINES( ) \
641     p_line1  = p_line2; \
642     p_line2 += p_dest->p->i_pitch; \
643     p_y1     = p_y2; \
644     p_y2    += p_source->p[Y_PLANE].i_pitch;
645
646 #define VEC_LOAD_UV( ) \
647     u_vec = vec_ld( 0, p_u ); p_u += 16; \
648     v_vec = vec_ld( 0, p_v ); p_v += 16;
649
650 #define VEC_MERGE( a ) \
651     uv_vec = a( u_vec, v_vec ); \
652     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
653     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
654     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
655     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
656     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
657     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
658
659     vector unsigned char u_vec;
660     vector unsigned char v_vec;
661     vector unsigned char uv_vec;
662     vector unsigned char y_vec;
663
    /* NOTE(review): the vector paths never add the pitch / visible pitch
     * margins computed for the scalar code, so they appear to rely on each
     * pitch being equal to the visible line size -- confirm. */
664     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
665            ( p_filter->fmt_in.video.i_height % 2 ) ) )
666     {
667         /* Width is a multiple of 32, we take 2 lines at a time */
668         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
669         {
670             VEC_NEXT_LINES( );
671             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
672             {
673                 VEC_LOAD_UV( );
674                 VEC_MERGE( vec_mergeh );
675                 VEC_MERGE( vec_mergel );
676             }
677         }
678     }
    /* NOTE(review): I420_YUY2 compiles this same branch out behind a
     * "#warning FIXME ... is broken on altivec"; it is still enabled here --
     * confirm whether that is intended. */
679     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
680                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
681     {
682         /* Width is only a multiple of 16, we take 4 lines at a time */
683         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
684         {
685             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
686             VEC_NEXT_LINES( );
687             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
688             {
689                 VEC_LOAD_UV( );
690                 VEC_MERGE( vec_mergeh );
691                 VEC_MERGE( vec_mergel );
692             }
693
694             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
695             VEC_LOAD_UV( );
696             VEC_MERGE( vec_mergeh );
697
698             /* Line 3 and 4, pixels 0 to 16 */
699             VEC_NEXT_LINES( );
700             VEC_MERGE( vec_mergel );
701
702             /* Line 3 and 4, pixels 16 to ( width ) */
703             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
704             {
705                 VEC_LOAD_UV( );
706                 VEC_MERGE( vec_mergeh );
707                 VEC_MERGE( vec_mergel );
708             }
709         }
710     }
711     else
712     {
713         /* Crap, use the C version */
714 #undef VEC_NEXT_LINES
715 #undef VEC_LOAD_UV
716 #undef VEC_MERGE
717 #endif
718
    /* Bytes between the end of the visible part of a line and the start of
     * the next one, for luma, chroma and the packed destination */
719     const int i_source_margin = p_source->p[0].i_pitch
720                                  - p_source->p[0].i_visible_pitch;
721     const int i_source_margin_c = p_source->p[1].i_pitch
722                                  - p_source->p[1].i_visible_pitch;
723     const int i_dest_margin = p_dest->p->i_pitch
724                                - p_dest->p->i_visible_pitch;
725
726 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback builds: one pass per pair of lines */
727     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
728     {
729         p_line1 = p_line2;
730         p_line2 += p_dest->p->i_pitch;
731
732         p_y1 = p_y2;
733         p_y2 += p_source->p[Y_PLANE].i_pitch;
734
735         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
736         {
737 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
738             C_YUV420_UYVY( );
739             C_YUV420_UYVY( );
740             C_YUV420_UYVY( );
741             C_YUV420_UYVY( );
742 #else
743             MMX_CALL( MMX_YUV420_UYVY );
744 #endif
745         }
        /* Pixels left over after the groups of 8, always done by the C macro */
746         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
747         {
748             C_YUV420_UYVY( );
749         }
750
        /* Skip the padding at the end of every line just processed */
751         p_y1 += i_source_margin;
752         p_y2 += i_source_margin;
753         p_u += i_source_margin_c;
754         p_v += i_source_margin_c;
755         p_line1 += i_dest_margin;
756         p_line2 += i_dest_margin;
757     }
758
759 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
760     /* re-enable FPU registers */
761     MMX_END;
762 #endif
763
764 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* closes the "use the C version" else branch opened above */
765     }
766 #endif
767
768 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
769     /*
770     ** SSE2 128 bits fetch/store instructions are faster
771     ** if memory access is 16 bytes aligned
772     */
    /* Aligned path only when the luma pitch, the destination pitch and both
     * start addresses are all multiples of 16 */
773     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
774         ((intptr_t)p_line2|(intptr_t)p_y2))) )
775     {
776         /* use faster SSE2 aligned fetch and store */
777         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
778         {
779             p_line1 = p_line2;
780             p_line2 += p_dest->p->i_pitch;
781
782             p_y1 = p_y2;
783             p_y2 += p_source->p[Y_PLANE].i_pitch;
784
785             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
786             {
787                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
788             }
789             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
790             {
791                 C_YUV420_UYVY( );
792             }
793
794             p_y1 += i_source_margin;
795             p_y2 += i_source_margin;
796             p_u += i_source_margin_c;
797             p_v += i_source_margin_c;
798             p_line1 += i_dest_margin;
799             p_line2 += i_dest_margin;
800         }
801     }
802     else
803     {
804         /* use slower SSE2 unaligned fetch and store */
805         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
806         {
807             p_line1 = p_line2;
808             p_line2 += p_dest->p->i_pitch;
809
810             p_y1 = p_y2;
811             p_y2 += p_source->p[Y_PLANE].i_pitch;
812
813             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
814             {
815                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
816             }
817             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
818             {
819                 C_YUV420_UYVY( );
820             }
821
822             p_y1 += i_source_margin;
823             p_y2 += i_source_margin;
824             p_u += i_source_margin_c;
825             p_v += i_source_margin_c;
826             p_line1 += i_dest_margin;
827             p_line2 += i_dest_margin;
828         }
829     }
830     /* make sure all SSE2 stores are visible thereafter */
831     SSE2_END;
832 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
833 }
834
835 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
836 /*****************************************************************************
837  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 *****************************************************************************
 * Placeholder only: no pixel is read or written. Every call logs an error
 * through p_filter and returns; p_source and p_dest are deliberately unused.
838  *****************************************************************************/
839 static void I420_IUYV( filter_t *p_filter,
840                        picture_t *p_source, picture_t *p_dest )
841 {
842     /* FIXME: the conversion itself is still to be written */
843     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
844     VLC_UNUSED(p_dest); VLC_UNUSED(p_source);
845 }
846
847 /*****************************************************************************
848  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
849  *****************************************************************************/
850 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
851                                            picture_t *p_dest )
852 {
853     uint8_t *p_line1 = p_dest->p->p_pixels +
854                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
855                        + p_dest->p->i_pitch;
856     uint8_t *p_line2 = p_dest->p->p_pixels +
857                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
858     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
859     uint8_t *p_u = p_source->U_PIXELS;
860     uint8_t *p_v = p_source->V_PIXELS;
861
862     int i_x, i_y;
863
864     const int i_source_margin = p_source->p[0].i_pitch
865                                  - p_source->p[0].i_visible_pitch;
866     const int i_source_margin_c = p_source->p[1].i_pitch
867                                  - p_source->p[1].i_visible_pitch;
868     const int i_dest_margin = p_dest->p->i_pitch
869                                - p_dest->p->i_visible_pitch;
870
871 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
872     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
873     {
874         p_line1 -= 3 * p_dest->p->i_pitch;
875         p_line2 -= 3 * p_dest->p->i_pitch;
876
877         p_y1 = p_y2;
878         p_y2 += p_source->p[Y_PLANE].i_pitch;
879
880         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
881         {
882 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
883             C_YUV420_UYVY( );
884             C_YUV420_UYVY( );
885             C_YUV420_UYVY( );
886             C_YUV420_UYVY( );
887 #else
888             MMX_CALL( MMX_YUV420_UYVY );
889 #endif
890         }
891         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
892         {
893             C_YUV420_UYVY( );
894         }
895
896         p_y1 += i_source_margin;
897         p_y2 += i_source_margin;
898         p_u += i_source_margin_c;
899         p_v += i_source_margin_c;
900         p_line1 += i_dest_margin;
901         p_line2 += i_dest_margin;
902     }
903
904 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
905     /* re-enable FPU registers */
906     MMX_END;
907 #endif
908
909 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
910     /*
911     ** SSE2 128 bits fetch/store instructions are faster
912     ** if memory access is 16 bytes aligned
913     */
914     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
915         ((intptr_t)p_line2|(intptr_t)p_y2))) )
916     {
917         /* use faster SSE2 aligned fetch and store */
918         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
919         {
920             p_line1 = p_line2;
921             p_line2 += p_dest->p->i_pitch;
922
923             p_y1 = p_y2;
924             p_y2 += p_source->p[Y_PLANE].i_pitch;
925
926             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
927             {
928                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
929             }
930             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
931             {
932                 C_YUV420_UYVY( );
933             }
934
935             p_y1 += i_source_margin;
936             p_y2 += i_source_margin;
937             p_u += i_source_margin_c;
938             p_v += i_source_margin_c;
939             p_line1 += i_dest_margin;
940             p_line2 += i_dest_margin;
941         }
942     }
943     else
944     {
945         /* use slower SSE2 unaligned fetch and store */
946         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
947         {
948             p_line1 = p_line2;
949             p_line2 += p_dest->p->i_pitch;
950
951             p_y1 = p_y2;
952             p_y2 += p_source->p[Y_PLANE].i_pitch;
953
954             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
955             {
956                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
957             }
958             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
959             {
960                 C_YUV420_UYVY( );
961             }
962
963             p_y1 += i_source_margin;
964             p_y2 += i_source_margin;
965             p_u += i_source_margin_c;
966             p_v += i_source_margin_c;
967             p_line1 += i_dest_margin;
968             p_line2 += i_dest_margin;
969         }
970     }
971     /* make sure all SSE2 stores are visible thereafter */
972     SSE2_END;
973 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
974 }
975 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
976
977 /*****************************************************************************
978  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
979  *****************************************************************************/
980 #if defined (MODULE_NAME_IS_i420_yuy2)
981 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
982                                            picture_t *p_dest )
983 {
984     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
985     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
986     uint8_t *p_u = p_source->U_PIXELS;
987     uint8_t *p_v = p_source->V_PIXELS;
988
989     int i_x, i_y;
990
991     const int i_source_margin = p_source->p[0].i_pitch
992                                  - p_source->p[0].i_visible_pitch;
993     const int i_source_margin_c = p_source->p[1].i_pitch
994                                  - p_source->p[1].i_visible_pitch;
995     const int i_dest_margin = p_dest->p->i_pitch
996                                - p_dest->p->i_visible_pitch;
997
998     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
999     {
1000         p_line1 = p_line2;
1001         p_line2 += p_dest->p->i_pitch;
1002
1003         p_y1 = p_y2;
1004         p_y2 += p_source->p[Y_PLANE].i_pitch;
1005
1006         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
1007         {
1008             C_YUV420_Y211( );
1009             C_YUV420_Y211( );
1010         }
1011
1012         p_y1 += i_source_margin;
1013         p_y2 += i_source_margin;
1014         p_u += i_source_margin_c;
1015         p_v += i_source_margin_c;
1016         p_line1 += i_dest_margin;
1017         p_line2 += i_dest_margin;
1018     }
1019 }
1020 #endif