]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
Move SSE2 chromas to sse2/
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_cpu.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
/* FOURCC lists that are pasted into the module description strings in the
 * module descriptor. SRC_FOURCC is the same for every build; DEST_FOURCC
 * depends on the MODULE_NAME_IS_* variant being compiled: the plain C build
 * additionally lists Y211, and the AltiVec build omits IUYV and cyuv. */
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 #endif
55
56 /*****************************************************************************
57  * Local and extern prototypes.
 *
 * Each conversion is declared as a pair: a worker taking
 * (filter, source picture, destination picture) and a *_Filter function
 * taking (filter, picture), which is what Activate() stores in
 * pf_video_filter. The IUYV and cyuv pairs are not declared for the AltiVec
 * build, and the Y211 pair is only declared for the plain C build.
58  *****************************************************************************/
59 static int  Activate ( vlc_object_t * );
60
61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
72 #endif
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
76 #endif
77
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes and zero in the four high bytes. Neither name
 * is referenced directly in the part of the file reviewed here -- presumably
 * they are consumed by the MMX macros from i420_yuy2.h (TODO confirm). */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
81 static const uint64_t i_80w   = 0x0000000080808080ULL;
82 #endif
83
84 /*****************************************************************************
85  * Module descriptor.
 *
 * Exactly one branch is compiled, selected by the MODULE_NAME_IS_* macro.
 * Each branch registers the "video filter2" capability with its own score
 * (80 for plain C, 160 for MMX, 250 for SSE2 and for AltiVec); the AltiVec
 * branch additionally declares an ALTIVEC requirement. Activate is passed
 * as the first callback and NULL as the second.
 *
 * Every description is marked with N_() and no branch carries a trailing
 * semicolon, so all variants are declared the same way.
86  *****************************************************************************/
87 vlc_module_begin ()
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
90     set_capability( "video filter2", 80 )
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
93     set_capability( "video filter2", 160 )
94 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
95     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
96     set_capability( "video filter2", 250 )
97 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
98     set_description(
99             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) )
100     set_capability( "video filter2", 250 )
101     add_requirement( ALTIVEC )
102 #endif
103     set_callbacks( Activate, NULL )
104 vlc_module_end ()
105
106 /*****************************************************************************
107  * Activate: allocate a chroma function
108  *****************************************************************************
109  * This function allocates and initializes a chroma function
 *
 * In practice nothing is allocated: the function only selects a converter.
 * It returns -1 when the input width or height is odd, when the input and
 * output dimensions differ, or when the (input chroma, output chroma) pair
 * is not handled by this build. Otherwise it stores the matching
 * I420_*_Filter function in p_filter->pf_video_filter and returns 0.
 *
 * p_this is cast to filter_t * without any check.
110  *****************************************************************************/
111 static int Activate( vlc_object_t *p_this )
112 {
113     filter_t *p_filter = (filter_t *)p_this;
114
        /* refuse pictures with an odd width or an odd height */
115     if( p_filter->fmt_in.video.i_width & 1
116      || p_filter->fmt_in.video.i_height & 1 )
117     {
118         return -1;
119     }
120
        /* input and output must have identical dimensions */
121     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
122      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
123         return -1;
124
        /* only YV12 and I420 inputs are accepted; the output chroma then
         * picks the converter */
125     switch( p_filter->fmt_in.video.i_chroma )
126     {
127         case VLC_CODEC_YV12:
128         case VLC_CODEC_I420:
129             switch( p_filter->fmt_out.video.i_chroma )
130             {
131                 case VLC_CODEC_YUYV:
132                     p_filter->pf_video_filter = I420_YUY2_Filter;
133                     break;
134
135                 case VLC_CODEC_YVYU:
136                     p_filter->pf_video_filter = I420_YVYU_Filter;
137                     break;
138
139                 case VLC_CODEC_UYVY:
140                     p_filter->pf_video_filter = I420_UYVY_Filter;
141                     break;
                /* IUYV and cyuv outputs are not offered by the AltiVec build */
142 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
143                 case VLC_FOURCC('I','U','Y','V'):
144                     p_filter->pf_video_filter = I420_IUYV_Filter;
145                     break;
146
147                 case VLC_CODEC_CYUV:
148                     p_filter->pf_video_filter = I420_cyuv_Filter;
149                     break;
150 #endif
151
                /* Y211 output is only offered by the plain C build */
152 #if defined (MODULE_NAME_IS_i420_yuy2)
153                 case VLC_CODEC_Y211:
154                     p_filter->pf_video_filter = I420_Y211_Filter;
155                     break;
156 #endif
157
158                 default:
159                     return -1;
160             }
161             break;
162
163         default:
164             return -1;
165     }
166
167     return 0;
168 }
169
/* Compiled out (#if 0): helper that executes the x86 "rdtsc" instruction and
 * returns the 64-bit value it yields. Nothing in the reviewed part of the
 * file calls it -- presumably left over from timing experiments. */
170 #if 0
171 static inline unsigned long long read_cycles(void)
172 {
173     unsigned long long v;
174     __asm__ __volatile__("rdtsc" : "=A" (v): );
175
176     return v;
177 }
178 #endif
179
180 /* Following functions are local */
181
/* One VIDEO_FILTER_WRAPPER( name ) line per converter, guarded by the same
 * build conditions as the prototypes. The macro is not defined in this file;
 * presumably it generates the name##_Filter function that Activate() installs
 * and that calls the name() worker -- TODO confirm against its definition. */
182 VIDEO_FILTER_WRAPPER( I420_YUY2 )
183 VIDEO_FILTER_WRAPPER( I420_YVYU )
184 VIDEO_FILTER_WRAPPER( I420_UYVY )
185 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
186 VIDEO_FILTER_WRAPPER( I420_IUYV )
187 VIDEO_FILTER_WRAPPER( I420_cyuv )
188 #endif
189 #if defined (MODULE_NAME_IS_i420_yuy2)
190 VIDEO_FILTER_WRAPPER( I420_Y211 )
191 #endif
192
193 /*****************************************************************************
194  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *
 * Reads the Y, U and V planes of p_source and writes the first plane of
 * p_dest, always working on two consecutive rows at once (p_y1/p_y2 feed
 * p_line1/p_line2). Picture dimensions are taken from
 * p_filter->fmt_in.video. The MODULE_NAME_IS_* build variant selects the
 * inner loops:
 *  - AltiVec: vector loops when the width is a multiple of 32 (height even)
 *    or a multiple of 16 (height a multiple of 4), else the scalar loop;
 *  - SSE2: aligned or unaligned SSE2 loop, scalar macro for leftover pixels;
 *  - MMX: MMX loop, scalar macro for leftover pixels;
 *  - plain C: scalar macro only.
 * C_YUV420_YUYV, MMX_CALL/MMX_END and SSE2_CALL/SSE2_END are not defined in
 * this file (presumably in i420_yuy2.h) and are presumed to advance the
 * pixel pointers themselves -- TODO confirm.
195  *****************************************************************************/
196 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
197                                            picture_t *p_dest )
198 {
        /* p_line2 and p_y2 start on the first row; p_line1 and p_y1 are
         * assigned at the top of every row pair */
199     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
200     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
201     uint8_t *p_u = p_source->U_PIXELS;
202     uint8_t *p_v = p_source->V_PIXELS;
203
204     int i_x, i_y;
205
206 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* VEC_NEXT_LINES: make the previous second row the new first row and move the
 *                 second-row pointers one pitch further.
 * VEC_LOAD_UV:    load one vector each from p_u and p_v, advancing both by 16.
 * VEC_MERGE( a ): combine U and V with merge function a, then for each of the
 *                 two rows load 16 luma bytes and store two vectors of
 *                 (luma, chroma) merges, i.e. 32 output bytes per row. */
207 #define VEC_NEXT_LINES( ) \
208     p_line1  = p_line2; \
209     p_line2 += p_dest->p->i_pitch; \
210     p_y1     = p_y2; \
211     p_y2    += p_source->p[Y_PLANE].i_pitch;
212
213 #define VEC_LOAD_UV( ) \
214     u_vec = vec_ld( 0, p_u ); p_u += 16; \
215     v_vec = vec_ld( 0, p_v ); p_v += 16;
216
217 #define VEC_MERGE( a ) \
218     uv_vec = a( u_vec, v_vec ); \
219     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
220     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
221     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
222     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
223     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
224     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
225
226     vector unsigned char u_vec;
227     vector unsigned char v_vec;
228     vector unsigned char uv_vec;
229     vector unsigned char y_vec;
230
        /* NOTE(review): the vector loops below never add the pitch margins
         * that the scalar loop applies after each row pair; they appear to
         * rely on pitch == visible pitch for every plane -- confirm. */
231     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
232            ( p_filter->fmt_in.video.i_height % 2 ) ) )
233     {
234         /* Width is a multiple of 32, we take 2 lines at a time */
235         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
236         {
237             VEC_NEXT_LINES( );
238             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
239             {
                    /* one U/V load serves two merges: high half, then low half */
240                 VEC_LOAD_UV( );
241                 VEC_MERGE( vec_mergeh );
242                 VEC_MERGE( vec_mergel );
243             }
244         }
245     }
246     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
247                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
248     {
249         /* Width is only a multiple of 16, we take 4 lines at a time */
250         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
251         {
252             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
253             VEC_NEXT_LINES( );
254             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
255             {
256                 VEC_LOAD_UV( );
257                 VEC_MERGE( vec_mergeh );
258                 VEC_MERGE( vec_mergel );
259             }
260
261             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
262             VEC_LOAD_UV( );
263             VEC_MERGE( vec_mergeh );
264
                /* the low half of the U/V vectors loaded just above is reused
                 * here for the first 16 pixels of the next row pair */
265             /* Line 3 and 4, pixels 0 to 16 */
266             VEC_NEXT_LINES( );
267             VEC_MERGE( vec_mergel );
268
269             /* Line 3 and 4, pixels 16 to ( width ) */
270             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
271             {
272                 VEC_LOAD_UV( );
273                 VEC_MERGE( vec_mergeh );
274                 VEC_MERGE( vec_mergel );
275             }
276         }
277     }
278     else
279     {
280         /* Crap, use the C version */
281 #undef VEC_NEXT_LINES
282 #undef VEC_LOAD_UV
283 #undef VEC_MERGE
284 #endif
285
        /* bytes between the end of the visible part of a row and the start
         * of the next row, for luma, chroma and the destination */
286     const int i_source_margin = p_source->p[0].i_pitch
287                                  - p_source->p[0].i_visible_pitch;
288     const int i_source_margin_c = p_source->p[1].i_pitch
289                                  - p_source->p[1].i_visible_pitch;
290     const int i_dest_margin = p_dest->p->i_pitch
291                                - p_dest->p->i_visible_pitch;
292
293 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
294     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
295     {
296         p_line1 = p_line2;
297         p_line2 += p_dest->p->i_pitch;
298
299         p_y1 = p_y2;
300         p_y2 += p_source->p[Y_PLANE].i_pitch;
301
            /* width / 8 iterations: four scalar macro calls or one MMX call
             * each; the leftover loop below then runs ( width % 8 ) / 2
             * single scalar calls */
302 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
303         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
304         {
305             C_YUV420_YUYV( );
306             C_YUV420_YUYV( );
307             C_YUV420_YUYV( );
308             C_YUV420_YUYV( );
309         }
310 #else
311         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
312         {
313             MMX_CALL( MMX_YUV420_YUYV );
314         }
315 #endif
316         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
317         {
318             C_YUV420_YUYV( );
319         }
320
            /* skip the padding at the end of each row */
321         p_y1 += i_source_margin;
322         p_y2 += i_source_margin;
323         p_u += i_source_margin_c;
324         p_v += i_source_margin_c;
325         p_line1 += i_dest_margin;
326         p_line2 += i_dest_margin;
327     }
328
329 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
330     /* re-enable FPU registers */
331     MMX_END;
332 #endif
333
    /* closes the AltiVec "else" branch opened before the scalar version */
334 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
335     }
336 #endif
337
338 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
339     /*
340     ** SSE2 128 bits fetch/store instructions are faster
341     ** if memory access is 16 bytes aligned
342     */
343
        /* The aligned loop is taken only when both pitches and both start
         * pointers are multiples of 16; p_u and p_v are not part of this
         * test. */
344     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
345         ((intptr_t)p_line2|(intptr_t)p_y2))) )
346     {
347         /* use faster SSE2 aligned fetch and store */
348         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
349         {
350             p_line1 = p_line2;
351             p_line2 += p_dest->p->i_pitch;
352
353             p_y1 = p_y2;
354             p_y2 += p_source->p[Y_PLANE].i_pitch;
355
                /* width / 16 SSE2 calls, then ( width % 16 ) / 2 scalar calls */
356             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
357             {
358                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
359             }
360             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
361             {
362                 C_YUV420_YUYV( );
363             }
364
365             p_y1 += i_source_margin;
366             p_y2 += i_source_margin;
367             p_u += i_source_margin_c;
368             p_v += i_source_margin_c;
369             p_line1 += i_dest_margin;
370             p_line2 += i_dest_margin;
371         }
372     }
373     else
374     {
375         /* use slower SSE2 unaligned fetch and store */
376         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
377         {
378             p_line1 = p_line2;
379             p_line2 += p_dest->p->i_pitch;
380
381             p_y1 = p_y2;
382             p_y2 += p_source->p[Y_PLANE].i_pitch;
383
384             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
385             {
386                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
387             }
388             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
389             {
390                 C_YUV420_YUYV( );
391             }
392
393             p_y1 += i_source_margin;
394             p_y2 += i_source_margin;
395             p_u += i_source_margin_c;
396             p_v += i_source_margin_c;
397             p_line1 += i_dest_margin;
398             p_line2 += i_dest_margin;
399         }
400     }
401     /* make sure all SSE2 stores are visible thereafter */
402     SSE2_END;
403
404 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
405 }
406
407 /*****************************************************************************
408  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *
 * Reads the Y, U and V planes of p_source and writes the first plane of
 * p_dest, two consecutive rows at a time (p_y1/p_y2 feed p_line1/p_line2).
 * Picture dimensions are taken from p_filter->fmt_in.video. Depending on
 * the MODULE_NAME_IS_* build variant the inner loops are AltiVec vector
 * code (with a scalar fallback when the width/height conditions are not
 * met), SSE2 code (aligned or unaligned), MMX code, or the scalar
 * C_YUV420_YVYU macro alone; the scalar macro also handles leftover pixels
 * in the MMX and SSE2 builds.
 * C_YUV420_YVYU, MMX_CALL/MMX_END and SSE2_CALL/SSE2_END are not defined in
 * this file (presumably in i420_yuy2.h) and are presumed to advance the
 * pixel pointers themselves -- TODO confirm.
409  *****************************************************************************/
410 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
411                                            picture_t *p_dest )
412 {
        /* p_line2 and p_y2 start on the first row; p_line1 and p_y1 are
         * assigned at the top of every row pair */
413     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
414     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
415     uint8_t *p_u = p_source->U_PIXELS;
416     uint8_t *p_v = p_source->V_PIXELS;
417
418     int i_x, i_y;
419
420 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* VEC_NEXT_LINES: make the previous second row the new first row and move the
 *                 second-row pointers one pitch further.
 * VEC_LOAD_UV:    load one vector each from p_u and p_v, advancing both by 16.
 * VEC_MERGE( a ): combine V and U (V first) with merge function a, then for
 *                 each of the two rows load 16 luma bytes and store two
 *                 vectors of (luma, chroma) merges, i.e. 32 output bytes per
 *                 row. */
421 #define VEC_NEXT_LINES( ) \
422     p_line1  = p_line2; \
423     p_line2 += p_dest->p->i_pitch; \
424     p_y1     = p_y2; \
425     p_y2    += p_source->p[Y_PLANE].i_pitch;
426
427 #define VEC_LOAD_UV( ) \
428     u_vec = vec_ld( 0, p_u ); p_u += 16; \
429     v_vec = vec_ld( 0, p_v ); p_v += 16;
430
431 #define VEC_MERGE( a ) \
432     vu_vec = a( v_vec, u_vec ); \
433     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
434     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
435     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
436     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
437     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
438     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
439
440     vector unsigned char u_vec;
441     vector unsigned char v_vec;
442     vector unsigned char vu_vec;
443     vector unsigned char y_vec;
444
        /* NOTE(review): the vector loops below never add the pitch margins
         * that the scalar loop applies after each row pair; they appear to
         * rely on pitch == visible pitch for every plane -- confirm. */
445     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
446            ( p_filter->fmt_in.video.i_height % 2 ) ) )
447     {
448         /* Width is a multiple of 32, we take 2 lines at a time */
449         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
450         {
451             VEC_NEXT_LINES( );
452             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
453             {
                    /* one U/V load serves two merges: high half, then low half */
454                 VEC_LOAD_UV( );
455                 VEC_MERGE( vec_mergeh );
456                 VEC_MERGE( vec_mergel );
457             }
458         }
459     }
460     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
461                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
462     {
463         /* Width is only a multiple of 16, we take 4 lines at a time */
464         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
465         {
466             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
467             VEC_NEXT_LINES( );
468             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
469             {
470                 VEC_LOAD_UV( );
471                 VEC_MERGE( vec_mergeh );
472                 VEC_MERGE( vec_mergel );
473             }
474
475             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
476             VEC_LOAD_UV( );
477             VEC_MERGE( vec_mergeh );
478
                /* the low half of the U/V vectors loaded just above is reused
                 * here for the first 16 pixels of the next row pair */
479             /* Line 3 and 4, pixels 0 to 16 */
480             VEC_NEXT_LINES( );
481             VEC_MERGE( vec_mergel );
482
483             /* Line 3 and 4, pixels 16 to ( width ) */
484             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
485             {
486                 VEC_LOAD_UV( );
487                 VEC_MERGE( vec_mergeh );
488                 VEC_MERGE( vec_mergel );
489             }
490         }
491     }
492     else
493     {
494         /* Crap, use the C version */
495 #undef VEC_NEXT_LINES
496 #undef VEC_LOAD_UV
497 #undef VEC_MERGE
498 #endif
499
        /* bytes between the end of the visible part of a row and the start
         * of the next row, for luma, chroma and the destination */
500     const int i_source_margin = p_source->p[0].i_pitch
501                                  - p_source->p[0].i_visible_pitch;
502     const int i_source_margin_c = p_source->p[1].i_pitch
503                                  - p_source->p[1].i_visible_pitch;
504     const int i_dest_margin = p_dest->p->i_pitch
505                                - p_dest->p->i_visible_pitch;
506
507 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
508     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
509     {
510         p_line1 = p_line2;
511         p_line2 += p_dest->p->i_pitch;
512
513         p_y1 = p_y2;
514         p_y2 += p_source->p[Y_PLANE].i_pitch;
515
            /* width / 8 iterations: four scalar macro calls or one MMX call
             * each; the leftover loop below then runs ( width % 8 ) / 2
             * single scalar calls */
516         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
517         {
518 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
519             C_YUV420_YVYU( );
520             C_YUV420_YVYU( );
521             C_YUV420_YVYU( );
522             C_YUV420_YVYU( );
523 #else
524             MMX_CALL( MMX_YUV420_YVYU );
525 #endif
526         }
527         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
528         {
529             C_YUV420_YVYU( );
530         }
531
            /* skip the padding at the end of each row */
532         p_y1 += i_source_margin;
533         p_y2 += i_source_margin;
534         p_u += i_source_margin_c;
535         p_v += i_source_margin_c;
536         p_line1 += i_dest_margin;
537         p_line2 += i_dest_margin;
538     }
539
540 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
541     /* re-enable FPU registers */
542     MMX_END;
543 #endif
544
    /* closes the AltiVec "else" branch opened before the scalar version */
545 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
546     }
547 #endif
548
549 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
550     /*
551     ** SSE2 128 bits fetch/store instructions are faster
552     ** if memory access is 16 bytes aligned
553     */
        /* The aligned loop is taken only when both pitches and both start
         * pointers are multiples of 16; p_u and p_v are not part of this
         * test. */
554     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
555         ((intptr_t)p_line2|(intptr_t)p_y2))) )
556     {
557         /* use faster SSE2 aligned fetch and store */
558         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
559         {
560             p_line1 = p_line2;
561             p_line2 += p_dest->p->i_pitch;
562
563             p_y1 = p_y2;
564             p_y2 += p_source->p[Y_PLANE].i_pitch;
565
                /* width / 16 SSE2 calls, then ( width % 16 ) / 2 scalar calls */
566             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
567             {
568                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
569             }
570             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
571             {
572                 C_YUV420_YVYU( );
573             }
574
575             p_y1 += i_source_margin;
576             p_y2 += i_source_margin;
577             p_u += i_source_margin_c;
578             p_v += i_source_margin_c;
579             p_line1 += i_dest_margin;
580             p_line2 += i_dest_margin;
581         }
582     }
583     else
584     {
585         /* use slower SSE2 unaligned fetch and store */
586         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
587         {
588             p_line1 = p_line2;
589             p_line2 += p_dest->p->i_pitch;
590
591             p_y1 = p_y2;
592             p_y2 += p_source->p[Y_PLANE].i_pitch;
593
594             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
595             {
596                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
597             }
598             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
599             {
600                 C_YUV420_YVYU( );
601             }
602
603             p_y1 += i_source_margin;
604             p_y2 += i_source_margin;
605             p_u += i_source_margin_c;
606             p_v += i_source_margin_c;
607             p_line1 += i_dest_margin;
608             p_line2 += i_dest_margin;
609         }
610     }
611     /* make sure all SSE2 stores are visible thereafter */
612     SSE2_END;
613 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
614 }
615
616 /*****************************************************************************
617  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *
 * Reads the Y, U and V planes of p_source and writes the first plane of
 * p_dest, two consecutive rows at a time (p_y1/p_y2 feed p_line1/p_line2).
 * Picture dimensions are taken from p_filter->fmt_in.video. Depending on
 * the MODULE_NAME_IS_* build variant the inner loops are AltiVec vector
 * code (with a scalar fallback when the width/height conditions are not
 * met), SSE2 code (aligned or unaligned), MMX code, or the scalar
 * C_YUV420_UYVY macro alone; the scalar macro also handles leftover pixels
 * in the MMX and SSE2 builds.
 * C_YUV420_UYVY, MMX_CALL/MMX_END and SSE2_CALL/SSE2_END are not defined in
 * this file (presumably in i420_yuy2.h) and are presumed to advance the
 * pixel pointers themselves -- TODO confirm.
618  *****************************************************************************/
619 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
620                                            picture_t *p_dest )
621 {
        /* p_line2 and p_y2 start on the first row; p_line1 and p_y1 are
         * assigned at the top of every row pair */
622     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
623     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
624     uint8_t *p_u = p_source->U_PIXELS;
625     uint8_t *p_v = p_source->V_PIXELS;
626
627     int i_x, i_y;
628
629 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* VEC_NEXT_LINES: make the previous second row the new first row and move the
 *                 second-row pointers one pitch further.
 * VEC_LOAD_UV:    load one vector each from p_u and p_v, advancing both by 16.
 * VEC_MERGE( a ): combine U and V with merge function a, then for each of the
 *                 two rows load 16 luma bytes and store two vectors of
 *                 (chroma, luma) merges -- chroma first -- i.e. 32 output
 *                 bytes per row. */
630 #define VEC_NEXT_LINES( ) \
631     p_line1  = p_line2; \
632     p_line2 += p_dest->p->i_pitch; \
633     p_y1     = p_y2; \
634     p_y2    += p_source->p[Y_PLANE].i_pitch;
635
636 #define VEC_LOAD_UV( ) \
637     u_vec = vec_ld( 0, p_u ); p_u += 16; \
638     v_vec = vec_ld( 0, p_v ); p_v += 16;
639
640 #define VEC_MERGE( a ) \
641     uv_vec = a( u_vec, v_vec ); \
642     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
643     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
644     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
645     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
646     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
647     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
648
649     vector unsigned char u_vec;
650     vector unsigned char v_vec;
651     vector unsigned char uv_vec;
652     vector unsigned char y_vec;
653
        /* NOTE(review): the vector loops below never add the pitch margins
         * that the scalar loop applies after each row pair; they appear to
         * rely on pitch == visible pitch for every plane -- confirm. */
654     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
655            ( p_filter->fmt_in.video.i_height % 2 ) ) )
656     {
657         /* Width is a multiple of 32, we take 2 lines at a time */
658         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
659         {
660             VEC_NEXT_LINES( );
661             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
662             {
                    /* one U/V load serves two merges: high half, then low half */
663                 VEC_LOAD_UV( );
664                 VEC_MERGE( vec_mergeh );
665                 VEC_MERGE( vec_mergel );
666             }
667         }
668     }
669     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
670                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
671     {
672         /* Width is only a multiple of 16, we take 4 lines at a time */
673         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
674         {
675             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
676             VEC_NEXT_LINES( );
677             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
678             {
679                 VEC_LOAD_UV( );
680                 VEC_MERGE( vec_mergeh );
681                 VEC_MERGE( vec_mergel );
682             }
683
684             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
685             VEC_LOAD_UV( );
686             VEC_MERGE( vec_mergeh );
687
                /* the low half of the U/V vectors loaded just above is reused
                 * here for the first 16 pixels of the next row pair */
688             /* Line 3 and 4, pixels 0 to 16 */
689             VEC_NEXT_LINES( );
690             VEC_MERGE( vec_mergel );
691
692             /* Line 3 and 4, pixels 16 to ( width ) */
693             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
694             {
695                 VEC_LOAD_UV( );
696                 VEC_MERGE( vec_mergeh );
697                 VEC_MERGE( vec_mergel );
698             }
699         }
700     }
701     else
702     {
703         /* Crap, use the C version */
704 #undef VEC_NEXT_LINES
705 #undef VEC_LOAD_UV
706 #undef VEC_MERGE
707 #endif
708
        /* bytes between the end of the visible part of a row and the start
         * of the next row, for luma, chroma and the destination */
709     const int i_source_margin = p_source->p[0].i_pitch
710                                  - p_source->p[0].i_visible_pitch;
711     const int i_source_margin_c = p_source->p[1].i_pitch
712                                  - p_source->p[1].i_visible_pitch;
713     const int i_dest_margin = p_dest->p->i_pitch
714                                - p_dest->p->i_visible_pitch;
715
716 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
717     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
718     {
719         p_line1 = p_line2;
720         p_line2 += p_dest->p->i_pitch;
721
722         p_y1 = p_y2;
723         p_y2 += p_source->p[Y_PLANE].i_pitch;
724
            /* width / 8 iterations: four scalar macro calls or one MMX call
             * each; the leftover loop below then runs ( width % 8 ) / 2
             * single scalar calls */
725         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
726         {
727 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
728             C_YUV420_UYVY( );
729             C_YUV420_UYVY( );
730             C_YUV420_UYVY( );
731             C_YUV420_UYVY( );
732 #else
733             MMX_CALL( MMX_YUV420_UYVY );
734 #endif
735         }
736         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
737         {
738             C_YUV420_UYVY( );
739         }
740
            /* skip the padding at the end of each row */
741         p_y1 += i_source_margin;
742         p_y2 += i_source_margin;
743         p_u += i_source_margin_c;
744         p_v += i_source_margin_c;
745         p_line1 += i_dest_margin;
746         p_line2 += i_dest_margin;
747     }
748
749 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
750     /* re-enable FPU registers */
751     MMX_END;
752 #endif
753
    /* closes the AltiVec "else" branch opened before the scalar version */
754 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
755     }
756 #endif
757
758 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
759     /*
760     ** SSE2 128 bits fetch/store instructions are faster
761     ** if memory access is 16 bytes aligned
762     */
        /* The aligned loop is taken only when both pitches and both start
         * pointers are multiples of 16; p_u and p_v are not part of this
         * test. */
763     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
764         ((intptr_t)p_line2|(intptr_t)p_y2))) )
765     {
766         /* use faster SSE2 aligned fetch and store */
767         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
768         {
769             p_line1 = p_line2;
770             p_line2 += p_dest->p->i_pitch;
771
772             p_y1 = p_y2;
773             p_y2 += p_source->p[Y_PLANE].i_pitch;
774
                /* width / 16 SSE2 calls, then ( width % 16 ) / 2 scalar calls */
775             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
776             {
777                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
778             }
779             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
780             {
781                 C_YUV420_UYVY( );
782             }
783
784             p_y1 += i_source_margin;
785             p_y2 += i_source_margin;
786             p_u += i_source_margin_c;
787             p_v += i_source_margin_c;
788             p_line1 += i_dest_margin;
789             p_line2 += i_dest_margin;
790         }
791     }
792     else
793     {
794         /* use slower SSE2 unaligned fetch and store */
795         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
796         {
797             p_line1 = p_line2;
798             p_line2 += p_dest->p->i_pitch;
799
800             p_y1 = p_y2;
801             p_y2 += p_source->p[Y_PLANE].i_pitch;
802
803             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
804             {
805                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
806             }
807             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
808             {
809                 C_YUV420_UYVY( );
810             }
811
812             p_y1 += i_source_margin;
813             p_y2 += i_source_margin;
814             p_u += i_source_margin_c;
815             p_v += i_source_margin_c;
816             p_line1 += i_dest_margin;
817             p_line2 += i_dest_margin;
818         }
819     }
820     /* make sure all SSE2 stores are visible thereafter */
821     SSE2_END;
822 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
823 }
824
825 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
826 /*****************************************************************************
827  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 *
 * Not implemented: neither picture is read or written here; the only effect
 * is an error message logged through p_filter on every call. Activate()
 * nevertheless installs I420_IUYV_Filter for VLC_FOURCC('I','U','Y','V')
 * output in the builds that compile this function.
828  *****************************************************************************/
829 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
830                                            picture_t *p_dest )
831 {
        /* both pictures are deliberately unused in this stub */
832     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
833     /* FIXME: TODO ! */
834     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
835 }
836
837 /*****************************************************************************
838  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
839  *****************************************************************************/
840 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
841                                            picture_t *p_dest )
842 {
843     uint8_t *p_line1 = p_dest->p->p_pixels +
844                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
845                        + p_dest->p->i_pitch;
846     uint8_t *p_line2 = p_dest->p->p_pixels +
847                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
848     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
849     uint8_t *p_u = p_source->U_PIXELS;
850     uint8_t *p_v = p_source->V_PIXELS;
851
852     int i_x, i_y;
853
854     const int i_source_margin = p_source->p[0].i_pitch
855                                  - p_source->p[0].i_visible_pitch;
856     const int i_source_margin_c = p_source->p[1].i_pitch
857                                  - p_source->p[1].i_visible_pitch;
858     const int i_dest_margin = p_dest->p->i_pitch
859                                - p_dest->p->i_visible_pitch;
860
861 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
862     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
863     {
864         p_line1 -= 3 * p_dest->p->i_pitch;
865         p_line2 -= 3 * p_dest->p->i_pitch;
866
867         p_y1 = p_y2;
868         p_y2 += p_source->p[Y_PLANE].i_pitch;
869
870         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
871         {
872 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
873             C_YUV420_UYVY( );
874             C_YUV420_UYVY( );
875             C_YUV420_UYVY( );
876             C_YUV420_UYVY( );
877 #else
878             MMX_CALL( MMX_YUV420_UYVY );
879 #endif
880         }
881         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
882         {
883             C_YUV420_UYVY( );
884         }
885
886         p_y1 += i_source_margin;
887         p_y2 += i_source_margin;
888         p_u += i_source_margin_c;
889         p_v += i_source_margin_c;
890         p_line1 += i_dest_margin;
891         p_line2 += i_dest_margin;
892     }
893
894 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
895     /* re-enable FPU registers */
896     MMX_END;
897 #endif
898
899 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
900     /*
901     ** SSE2 128 bits fetch/store instructions are faster
902     ** if memory access is 16 bytes aligned
903     */
904     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
905         ((intptr_t)p_line2|(intptr_t)p_y2))) )
906     {
907         /* use faster SSE2 aligned fetch and store */
908         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
909         {
910             p_line1 = p_line2;
911             p_line2 += p_dest->p->i_pitch;
912
913             p_y1 = p_y2;
914             p_y2 += p_source->p[Y_PLANE].i_pitch;
915
916             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
917             {
918                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
919             }
920             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
921             {
922                 C_YUV420_UYVY( );
923             }
924
925             p_y1 += i_source_margin;
926             p_y2 += i_source_margin;
927             p_u += i_source_margin_c;
928             p_v += i_source_margin_c;
929             p_line1 += i_dest_margin;
930             p_line2 += i_dest_margin;
931         }
932     }
933     else
934     {
935         /* use slower SSE2 unaligned fetch and store */
936         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
937         {
938             p_line1 = p_line2;
939             p_line2 += p_dest->p->i_pitch;
940
941             p_y1 = p_y2;
942             p_y2 += p_source->p[Y_PLANE].i_pitch;
943
944             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
945             {
946                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
947             }
948             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
949             {
950                 C_YUV420_UYVY( );
951             }
952
953             p_y1 += i_source_margin;
954             p_y2 += i_source_margin;
955             p_u += i_source_margin_c;
956             p_v += i_source_margin_c;
957             p_line1 += i_dest_margin;
958             p_line2 += i_dest_margin;
959         }
960     }
961     /* make sure all SSE2 stores are visible thereafter */
962     SSE2_END;
963 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
964 }
965 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
966
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Walks the source two luma rows at a time and fills two destination rows
 * per iteration, top to bottom. Each row is handled in groups of 8 source
 * columns (two C_YUV420_Y211 invocations per group); there is no remainder
 * loop, so columns past the last multiple of 8 are not visited. Likewise
 * only i_height / 2 row pairs are processed.
 *
 * p_filter supplies the picture size (fmt_in.video.i_width / i_height),
 * p_source is the planar input and p_dest the packed output (first plane).
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* Cursors. C_YUV420_Y211 takes no arguments and is expected to read and
     * advance these variables by name (see i420_yuy2.h), so their names must
     * stay as they are. */
    uint8_t *p_line1;
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    /* Distance between the starts of two consecutive rows. */
    const int i_luma_pitch = p_source->p[Y_PLANE].i_pitch;
    const int i_out_pitch = p_dest->p->i_pitch;

    /* Padding between the visible end of a row and the start of the next. */
    const int i_luma_skip = p_source->p[0].i_pitch
                             - p_source->p[0].i_visible_pitch;
    const int i_chroma_skip = p_source->p[1].i_pitch
                               - p_source->p[1].i_visible_pitch;
    const int i_out_skip = p_dest->p->i_pitch
                            - p_dest->p->i_visible_pitch;

    int i_pairs_left = p_filter->fmt_in.video.i_height / 2;

    while( i_pairs_left-- )
    {
        /* The row reached by the second cursor becomes the first row of
         * this pair; the second cursor moves one full row further. */
        p_y1 = p_y2;
        p_y2 += i_luma_pitch;

        p_line1 = p_line2;
        p_line2 += i_out_pitch;

        int i_groups_left = p_filter->fmt_in.video.i_width / 8;

        while( i_groups_left-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Step every cursor over its row padding. */
        p_line1 += i_out_skip;
        p_line2 += i_out_skip;
        p_u += i_chroma_skip;
        p_v += i_chroma_skip;
        p_y1 += i_luma_skip;
        p_y2 += i_luma_skip;
    }
}
#endif