]> git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c
Removed useless include in remoteosd.
[vlc] / modules / video_filter / deinterlace.c
1 /*****************************************************************************
2  * deinterlace.c : deinterlacer plugin for vlc
3  *****************************************************************************
4  * Copyright (C) 2000-2009 the VideoLAN team
5  * $Id$
6  *
7  * Author: Sam Hocevar <sam@zoy.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23
24 /*****************************************************************************
25  * Preamble
26  *****************************************************************************/
27
28 #ifdef HAVE_CONFIG_H
29 # include "config.h"
30 #endif
31
32 #include <assert.h>
33
34 #ifdef HAVE_ALTIVEC_H
35 #   include <altivec.h>
36 #endif
37
38 #include <vlc_common.h>
39 #include <vlc_plugin.h>
40 #include <vlc_filter.h>
41 #include <vlc_cpu.h>
42
43 #ifdef CAN_COMPILE_MMXEXT
44 #   include "mmx.h"
45 #endif
46
47 #define DEINTERLACE_DISCARD 1
48 #define DEINTERLACE_MEAN    2
49 #define DEINTERLACE_BLEND   3
50 #define DEINTERLACE_BOB     4
51 #define DEINTERLACE_LINEAR  5
52 #define DEINTERLACE_X       6
53 #define DEINTERLACE_YADIF   7
54 #define DEINTERLACE_YADIF2X 8
55
56 /*****************************************************************************
57  * Module descriptor
58  *****************************************************************************/
/* Module entry points, registered through set_callbacks() in the descriptor
 * below. */
static int  Open ( vlc_object_t * );
static void Close( vlc_object_t * );

/* Option label and help text. MODE_TEXT / MODE_LONGTEXT are defined here but
 * are not referenced by the descriptor below. */
#define MODE_TEXT N_("Deinterlace mode")
#define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")

#define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
#define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")

/* Prefix prepended to the option name ("mode") to form the configuration
 * variable name declared below. */
#define FILTER_CFG_PREFIX "sout-deinterlace-"

/* Accepted values of the "mode" option and their user-visible labels. The two
 * arrays are parallel: mode_list_text[i] is the label of mode_list[i]. */
static const char *const mode_list[] = {
    "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
static const char *const mode_list_text[] = {
    N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };

vlc_module_begin ()
    set_description( N_("Deinterlacing video filter") )
    set_shortname( N_("Deinterlace" ))
    set_capability( "video filter", 0 )
    set_category( CAT_VIDEO )
    set_subcategory( SUBCAT_VIDEO_VFILTER )

    /* NOTE(review): set_capability() is invoked a second time here with
     * "video filter2"; confirm which of the two calls is meant to apply. */
    set_capability( "video filter2", 0 )
    /* "sout-deinterlace-mode": string option, default "blend", restricted to
     * the values of mode_list. */
    add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT,
                SOUT_MODE_LONGTEXT, false )
        change_string_list( mode_list, mode_list_text, 0 )
        change_safe ()
    add_shortcut( "deinterlace" )
    set_callbacks( Open, Close )
vlc_module_end ()
90
91
92 /*****************************************************************************
93  * Local prototypes
94  *****************************************************************************/
95 static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
96 static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
97 static void RenderMean   ( filter_t *, picture_t *, picture_t * );
98 static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
99 static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
100 static void RenderX      ( picture_t *, picture_t * );
101 static void RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );
102
103 static void MergeGeneric ( void *, const void *, const void *, size_t );
104 #if defined(CAN_COMPILE_C_ALTIVEC)
105 static void MergeAltivec ( void *, const void *, const void *, size_t );
106 #endif
107 #if defined(CAN_COMPILE_MMXEXT)
108 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
109 #endif
110 #if defined(CAN_COMPILE_3DNOW)
111 static void Merge3DNow   ( void *, const void *, const void *, size_t );
112 #endif
113 #if defined(CAN_COMPILE_SSE)
114 static void MergeSSE2    ( void *, const void *, const void *, size_t );
115 #endif
116 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
117 static void EndMMX       ( void );
118 #endif
119 #if defined(CAN_COMPILE_3DNOW)
120 static void End3DNow     ( void );
121 #endif
122 #if defined __ARM_NEON__
123 static void MergeNEON (void *, const void *, const void *, size_t);
124 #endif
125
/* NULL-terminated list of the option names understood by this filter.
 * Presumably combined with FILTER_CFG_PREFIX when the configuration chain is
 * parsed - the parsing code is not visible here. */
static const char *const ppsz_filter_options[] = {
    "mode", NULL
};

/* Number of entries in filter_sys_t::pp_history. */
#define HISTORY_SIZE (3)
/* Private state of one deinterlace filter instance. */
struct filter_sys_t
{
    int  i_mode;        /* Deinterlace mode (one of the DEINTERLACE_* values) */
    bool b_double_rate; /* Shall we double the framerate? */
    bool b_half_height; /* Shall we divide the height by 2? */

    /* Line-merging routine called through the Merge macro, and the optional
     * hook called through EndMerge once a picture has been rendered (may be
     * NULL, EndMerge tests it). */
    void (*pf_merge) ( void *, const void *, const void *, size_t );
    void (*pf_end_merge) ( void );

    /* Yadif */
    picture_t *pp_history[HISTORY_SIZE];
};
143
144 /*****************************************************************************
145  * SetFilterMethod: setup the deinterlace method to use.
146  *****************************************************************************/
147 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
148 {
149     filter_sys_t *p_sys = p_filter->p_sys;
150
151     if( !psz_method )
152         psz_method = "";
153
154     if( !strcmp( psz_method, "mean" ) )
155     {
156         p_sys->i_mode = DEINTERLACE_MEAN;
157         p_sys->b_double_rate = false;
158         p_sys->b_half_height = true;
159     }
160     else if( !strcmp( psz_method, "bob" )
161              || !strcmp( psz_method, "progressive-scan" ) )
162     {
163         p_sys->i_mode = DEINTERLACE_BOB;
164         p_sys->b_double_rate = true;
165         p_sys->b_half_height = false;
166     }
167     else if( !strcmp( psz_method, "linear" ) )
168     {
169         p_sys->i_mode = DEINTERLACE_LINEAR;
170         p_sys->b_double_rate = true;
171         p_sys->b_half_height = false;
172     }
173     else if( !strcmp( psz_method, "x" ) )
174     {
175         p_sys->i_mode = DEINTERLACE_X;
176         p_sys->b_double_rate = false;
177         p_sys->b_half_height = false;
178     }
179     else if( !strcmp( psz_method, "yadif" ) )
180     {
181         p_sys->i_mode = DEINTERLACE_YADIF;
182         p_sys->b_double_rate = false;
183         p_sys->b_half_height = false;
184     }
185     else if( !strcmp( psz_method, "yadif2x" ) )
186     {
187         p_sys->i_mode = DEINTERLACE_YADIF2X;
188         p_sys->b_double_rate = true;
189         p_sys->b_half_height = false;
190     }
191     else if( !strcmp( psz_method, "discard" ) )
192     {
193         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
194                             i_chroma == VLC_CODEC_J422;
195
196         p_sys->i_mode = DEINTERLACE_DISCARD;
197         p_sys->b_double_rate = false;
198         p_sys->b_half_height = !b_i422;
199     }
200     else
201     {
202         if( strcmp( psz_method, "blend" ) )
203             msg_Err( p_filter,
204                      "no valid deinterlace mode provided, using \"blend\"" );
205
206         p_sys->i_mode = DEINTERLACE_BLEND;
207         p_sys->b_double_rate = false;
208         p_sys->b_half_height = false;
209     }
210
211     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
212 }
213
214 static void GetOutputFormat( filter_t *p_filter,
215                              video_format_t *p_dst, const video_format_t *p_src )
216 {
217     filter_sys_t *p_sys = p_filter->p_sys;
218     *p_dst = *p_src;
219
220     if( p_sys->b_half_height )
221     {
222         p_dst->i_height /= 2;
223         p_dst->i_visible_height /= 2;
224         p_dst->i_y_offset /= 2;
225         p_dst->i_sar_den *= 2;
226     }
227
228     if( p_src->i_chroma == VLC_CODEC_I422 ||
229         p_src->i_chroma == VLC_CODEC_J422 )
230     {
231         switch( p_sys->i_mode )
232         {
233         case DEINTERLACE_MEAN:
234         case DEINTERLACE_LINEAR:
235         case DEINTERLACE_X:
236         case DEINTERLACE_YADIF:
237         case DEINTERLACE_YADIF2X:
238             p_dst->i_chroma = p_src->i_chroma;
239             break;
240         default:
241             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
242                                                                   VLC_CODEC_J420;
243             break;
244         }
245     }
246 }
247
248 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
249 {
250     return i_chroma == VLC_CODEC_I420 ||
251            i_chroma == VLC_CODEC_J420 ||
252            i_chroma == VLC_CODEC_YV12 ||
253            i_chroma == VLC_CODEC_I422 ||
254            i_chroma == VLC_CODEC_J422;
255 }
256
257 /*****************************************************************************
258  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
259  *****************************************************************************/
260 static void RenderDiscard( filter_t *p_filter,
261                            picture_t *p_outpic, picture_t *p_pic, int i_field )
262 {
263     int i_plane;
264
265     /* Copy image and skip lines */
266     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
267     {
268         uint8_t *p_in, *p_out_end, *p_out;
269         int i_increment;
270
271         p_in = p_pic->p[i_plane].p_pixels
272                    + i_field * p_pic->p[i_plane].i_pitch;
273
274         p_out = p_outpic->p[i_plane].p_pixels;
275         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
276                              * p_outpic->p[i_plane].i_visible_lines;
277
278         switch( p_filter->fmt_in.video.i_chroma )
279         {
280         case VLC_CODEC_I420:
281         case VLC_CODEC_J420:
282         case VLC_CODEC_YV12:
283
284             for( ; p_out < p_out_end ; )
285             {
286                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
287
288                 p_out += p_outpic->p[i_plane].i_pitch;
289                 p_in += 2 * p_pic->p[i_plane].i_pitch;
290             }
291             break;
292
293         case VLC_CODEC_I422:
294         case VLC_CODEC_J422:
295
296             i_increment = 2 * p_pic->p[i_plane].i_pitch;
297
298             if( i_plane == Y_PLANE )
299             {
300                 for( ; p_out < p_out_end ; )
301                 {
302                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
303                     p_out += p_outpic->p[i_plane].i_pitch;
304                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
305                     p_out += p_outpic->p[i_plane].i_pitch;
306                     p_in += i_increment;
307                 }
308             }
309             else
310             {
311                 for( ; p_out < p_out_end ; )
312                 {
313                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
314                     p_out += p_outpic->p[i_plane].i_pitch;
315                     p_in += i_increment;
316                 }
317             }
318             break;
319
320         default:
321             break;
322         }
323     }
324 }
325
326 /*****************************************************************************
327  * RenderBob: renders a BOB picture - simple copy
328  *****************************************************************************/
329 static void RenderBob( filter_t *p_filter,
330                        picture_t *p_outpic, picture_t *p_pic, int i_field )
331 {
332     int i_plane;
333
334     /* Copy image and skip lines */
335     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
336     {
337         uint8_t *p_in, *p_out_end, *p_out;
338
339         p_in = p_pic->p[i_plane].p_pixels;
340         p_out = p_outpic->p[i_plane].p_pixels;
341         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
342                              * p_outpic->p[i_plane].i_visible_lines;
343
344         switch( p_filter->fmt_in.video.i_chroma )
345         {
346             case VLC_CODEC_I420:
347             case VLC_CODEC_J420:
348             case VLC_CODEC_YV12:
349                 /* For BOTTOM field we need to add the first line */
350                 if( i_field == 1 )
351                 {
352                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
353                     p_in += p_pic->p[i_plane].i_pitch;
354                     p_out += p_outpic->p[i_plane].i_pitch;
355                 }
356
357                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
358
359                 for( ; p_out < p_out_end ; )
360                 {
361                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
362
363                     p_out += p_outpic->p[i_plane].i_pitch;
364
365                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
366
367                     p_in += 2 * p_pic->p[i_plane].i_pitch;
368                     p_out += p_outpic->p[i_plane].i_pitch;
369                 }
370
371                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
372
373                 /* For TOP field we need to add the last line */
374                 if( i_field == 0 )
375                 {
376                     p_in += p_pic->p[i_plane].i_pitch;
377                     p_out += p_outpic->p[i_plane].i_pitch;
378                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
379                 }
380                 break;
381
382             case VLC_CODEC_I422:
383             case VLC_CODEC_J422:
384                 /* For BOTTOM field we need to add the first line */
385                 if( i_field == 1 )
386                 {
387                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
388                     p_in += p_pic->p[i_plane].i_pitch;
389                     p_out += p_outpic->p[i_plane].i_pitch;
390                 }
391
392                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
393
394                 if( i_plane == Y_PLANE )
395                 {
396                     for( ; p_out < p_out_end ; )
397                     {
398                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
399
400                         p_out += p_outpic->p[i_plane].i_pitch;
401
402                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
403
404                         p_in += 2 * p_pic->p[i_plane].i_pitch;
405                         p_out += p_outpic->p[i_plane].i_pitch;
406                     }
407                 }
408                 else
409                 {
410                     for( ; p_out < p_out_end ; )
411                     {
412                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
413
414                         p_out += p_outpic->p[i_plane].i_pitch;
415                         p_in += 2 * p_pic->p[i_plane].i_pitch;
416                     }
417                 }
418
419                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
420
421                 /* For TOP field we need to add the last line */
422                 if( i_field == 0 )
423                 {
424                     p_in += p_pic->p[i_plane].i_pitch;
425                     p_out += p_outpic->p[i_plane].i_pitch;
426                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
427                 }
428                 break;
429         }
430     }
431 }
432
/* Shorthands for the Render* functions; both expect a variable named p_filter
 * to be in scope. Merge designates the merge routine stored in p_sys. */
#define Merge p_filter->p_sys->pf_merge
/* EndMerge(); calls pf_end_merge when it is set.
 * NOTE(review): this expands to a bare `if` statement, so an `else` written
 * right after `EndMerge();` would bind to it. */
#define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
435
436 /*****************************************************************************
437  * RenderLinear: BOB with linear interpolation
438  *****************************************************************************/
439 static void RenderLinear( filter_t *p_filter,
440                           picture_t *p_outpic, picture_t *p_pic, int i_field )
441 {
442     int i_plane;
443
444     /* Copy image and skip lines */
445     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
446     {
447         uint8_t *p_in, *p_out_end, *p_out;
448
449         p_in = p_pic->p[i_plane].p_pixels;
450         p_out = p_outpic->p[i_plane].p_pixels;
451         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
452                              * p_outpic->p[i_plane].i_visible_lines;
453
454         /* For BOTTOM field we need to add the first line */
455         if( i_field == 1 )
456         {
457             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
458             p_in += p_pic->p[i_plane].i_pitch;
459             p_out += p_outpic->p[i_plane].i_pitch;
460         }
461
462         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
463
464         for( ; p_out < p_out_end ; )
465         {
466             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
467
468             p_out += p_outpic->p[i_plane].i_pitch;
469
470             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
471                    p_pic->p[i_plane].i_pitch );
472
473             p_in += 2 * p_pic->p[i_plane].i_pitch;
474             p_out += p_outpic->p[i_plane].i_pitch;
475         }
476
477         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
478
479         /* For TOP field we need to add the last line */
480         if( i_field == 0 )
481         {
482             p_in += p_pic->p[i_plane].i_pitch;
483             p_out += p_outpic->p[i_plane].i_pitch;
484             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
485         }
486     }
487     EndMerge();
488 }
489
490 static void RenderMean( filter_t *p_filter,
491                         picture_t *p_outpic, picture_t *p_pic )
492 {
493     int i_plane;
494
495     /* Copy image and skip lines */
496     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
497     {
498         uint8_t *p_in, *p_out_end, *p_out;
499
500         p_in = p_pic->p[i_plane].p_pixels;
501
502         p_out = p_outpic->p[i_plane].p_pixels;
503         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
504                              * p_outpic->p[i_plane].i_visible_lines;
505
506         /* All lines: mean value */
507         for( ; p_out < p_out_end ; )
508         {
509             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
510                    p_pic->p[i_plane].i_pitch );
511
512             p_out += p_outpic->p[i_plane].i_pitch;
513             p_in += 2 * p_pic->p[i_plane].i_pitch;
514         }
515     }
516     EndMerge();
517 }
518
519 static void RenderBlend( filter_t *p_filter,
520                          picture_t *p_outpic, picture_t *p_pic )
521 {
522     int i_plane;
523
524     /* Copy image and skip lines */
525     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
526     {
527         uint8_t *p_in, *p_out_end, *p_out;
528
529         p_in = p_pic->p[i_plane].p_pixels;
530
531         p_out = p_outpic->p[i_plane].p_pixels;
532         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
533                              * p_outpic->p[i_plane].i_visible_lines;
534
535         switch( p_filter->fmt_in.video.i_chroma )
536         {
537             case VLC_CODEC_I420:
538             case VLC_CODEC_J420:
539             case VLC_CODEC_YV12:
540                 /* First line: simple copy */
541                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
542                 p_out += p_outpic->p[i_plane].i_pitch;
543
544                 /* Remaining lines: mean value */
545                 for( ; p_out < p_out_end ; )
546                 {
547                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
548                            p_pic->p[i_plane].i_pitch );
549
550                     p_out += p_outpic->p[i_plane].i_pitch;
551                     p_in += p_pic->p[i_plane].i_pitch;
552                 }
553                 break;
554
555             case VLC_CODEC_I422:
556             case VLC_CODEC_J422:
557                 /* First line: simple copy */
558                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
559                 p_out += p_outpic->p[i_plane].i_pitch;
560
561                 /* Remaining lines: mean value */
562                 if( i_plane == Y_PLANE )
563                 {
564                     for( ; p_out < p_out_end ; )
565                     {
566                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
567                                p_pic->p[i_plane].i_pitch );
568
569                         p_out += p_outpic->p[i_plane].i_pitch;
570                         p_in += p_pic->p[i_plane].i_pitch;
571                     }
572                 }
573
574                 else
575                 {
576                     for( ; p_out < p_out_end ; )
577                     {
578                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
579                                p_pic->p[i_plane].i_pitch );
580
581                         p_out += p_outpic->p[i_plane].i_pitch;
582                         p_in += 2*p_pic->p[i_plane].i_pitch;
583                     }
584                 }
585                 break;
586         }
587     }
588     EndMerge();
589 }
590
591 #undef Merge
592
/* MergeGeneric: portable C merge routine.
 *
 * Writes i_bytes bytes to _p_dest, each one being the truncated average
 * (a + b) >> 1 of the corresponding bytes of _p_s1 and _p_s2.
 *
 * The loop is driven by a byte count rather than by an end pointer computed
 * as "dest + i_bytes - 8": that expression pointed before the start of the
 * buffer whenever i_bytes was smaller than 8, which is undefined behaviour.
 * Any length, including 0, is handled. */
static void MergeGeneric( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    for( size_t i = 0; i < i_bytes; i++ )
        p_dest[i] = (uint8_t)( ( (unsigned)p_s1[i] + (unsigned)p_s2[i] ) >> 1 );
}
620
621 #if defined(CAN_COMPILE_MMXEXT)
/* MergeMMXEXT: merge routine using the MMXEXT `pavgb` instruction.
 *
 * Writes i_bytes bytes to _p_dest, averaging _p_s1 and _p_s2 byte by byte.
 * Full groups of 8 bytes go through `pavgb`; the remaining 0..7 bytes use the
 * scalar (a + b) >> 1 form.
 *
 * Changes from the previous version:
 *  - the loops are driven by the remaining byte count, so no pointer before
 *    the start of the buffer is formed when i_bytes < 8;
 *  - a final full group of 8 bytes is now handled by the vector loop instead
 *    of falling through to the scalar tail;
 *  - the asm statement carries a "memory" clobber, because it stores 8 bytes
 *    while its output operand only describes the first one.
 *
 * The MMX registers are left in use; callers are expected to run EndMMX()
 * afterwards. */
static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    for( ; i_bytes >= 8; i_bytes -= 8 )
    {
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0"
                               : "=m" (*p_dest)
                               : "m" (*p_s1), "m" (*p_s2)
                               : "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
648 #endif
649
650 #if defined(CAN_COMPILE_3DNOW)
/* Merge3DNow: merge routine using the 3DNow! `pavgusb` instruction.
 *
 * Writes i_bytes bytes to _p_dest, averaging _p_s1 and _p_s2 byte by byte.
 * Full groups of 8 bytes go through `pavgusb`; the remaining 0..7 bytes use
 * the scalar (a + b) >> 1 form.
 *
 * Changes from the previous version:
 *  - the loops are driven by the remaining byte count, so no pointer before
 *    the start of the buffer is formed when i_bytes < 8;
 *  - a final full group of 8 bytes is now handled by the vector loop instead
 *    of falling through to the scalar tail;
 *  - the asm statement carries a "memory" clobber, because it stores 8 bytes
 *    while its output operand only describes the first one.
 *
 * The MMX registers are left in use; callers are expected to run End3DNow()
 * afterwards. */
static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                        size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    for( ; i_bytes >= 8; i_bytes -= 8 )
    {
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0"
                               : "=m" (*p_dest)
                               : "m" (*p_s1), "m" (*p_s2)
                               : "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
677 #endif
678
679 #if defined(CAN_COMPILE_SSE)
/* MergeSSE2: merge routine using the SSE2 `pavgb` instruction.
 *
 * Writes exactly i_bytes bytes to _p_dest, averaging _p_s1 and _p_s2 byte by
 * byte, in three phases:
 *  1. scalar bytes until p_s1 is 16-byte aligned (`pavgb` takes p_s1 as a
 *     memory operand),
 *  2. groups of 16 bytes through `pavgb`,
 *  3. scalar bytes for what is left.
 *
 * The previous version computed its end pointer from the *advanced*
 * destination and the *full* i_bytes after phase 1, so it wrote as many bytes
 * past the end of the destination as phase 1 had consumed; phase 1 was also
 * not bounded by i_bytes. Every phase now decrements the remaining count,
 * which fixes both problems and avoids forming "dest + i_bytes - 16" for
 * short inputs. The asm statement carries a "memory" clobber because it
 * stores 16 bytes while its output operand only describes the first one. */
static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                       size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Phase 1: reach 16-byte alignment of p_s1 without overrunning */
    for( ; i_bytes > 0 && ( (uintptr_t)p_s1 % 16 ); i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;

    /* Phase 2: 16 bytes at a time */
    for( ; i_bytes >= 16; i_bytes -= 16 )
    {
        __asm__  __volatile__( "movdqu %2,%%xmm1;"
                               "pavgb %1, %%xmm1;"
                               "movdqu %%xmm1, %0"
                               : "=m" (*p_dest)
                               : "m" (*p_s1), "m" (*p_s2)
                               : "memory" );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    /* Phase 3: tail */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
711 #endif
712
713 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/* EndMMX: executes the x86 `emms` (Empty MMX State) instruction.
 * Takes no argument and returns nothing, which matches the signature of
 * filter_sys_t::pf_end_merge; presumably installed there together with
 * MergeMMXEXT/MergeSSE2 - the code selecting it is not visible here. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
718 #endif
719
720 #if defined(CAN_COMPILE_3DNOW)
/* End3DNow: executes the 3DNow! `femms` instruction.
 * Takes no argument and returns nothing, which matches the signature of
 * filter_sys_t::pf_end_merge; presumably installed there together with
 * Merge3DNow - the code selecting it is not visible here. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
725 #endif
726
727 #ifdef CAN_COMPILE_C_ALTIVEC
728 static void MergeAltivec( void *_p_dest, const void *_p_s1,
729                           const void *_p_s2, size_t i_bytes )
730 {
731     uint8_t *p_dest = (uint8_t *)_p_dest;
732     uint8_t *p_s1   = (uint8_t *)_p_s1;
733     uint8_t *p_s2   = (uint8_t *)_p_s2;
734     uint8_t *p_end  = p_dest + i_bytes - 15;
735
736     /* Use C until the first 16-bytes aligned destination pixel */
737     while( (uintptr_t)p_dest & 0xF )
738     {
739         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
740     }
741
742     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
743     {
744         /* Unaligned source */
745         vector unsigned char s1v, s2v, destv;
746         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
747         vector unsigned char perm1v, perm2v;
748
749         perm1v = vec_lvsl( 0, p_s1 );
750         perm2v = vec_lvsl( 0, p_s2 );
751         s1oldv = vec_ld( 0, p_s1 );
752         s2oldv = vec_ld( 0, p_s2 );
753
754         while( p_dest < p_end )
755         {
756             s1newv = vec_ld( 16, p_s1 );
757             s2newv = vec_ld( 16, p_s2 );
758             s1v    = vec_perm( s1oldv, s1newv, perm1v );
759             s2v    = vec_perm( s2oldv, s2newv, perm2v );
760             s1oldv = s1newv;
761             s2oldv = s2newv;
762             destv  = vec_avg( s1v, s2v );
763             vec_st( destv, 0, p_dest );
764
765             p_s1   += 16;
766             p_s2   += 16;
767             p_dest += 16;
768         }
769     }
770     else
771     {
772         /* Aligned source */
773         vector unsigned char s1v, s2v, destv;
774
775         while( p_dest < p_end )
776         {
777             s1v   = vec_ld( 0, p_s1 );
778             s2v   = vec_ld( 0, p_s2 );
779             destv = vec_avg( s1v, s2v );
780             vec_st( destv, 0, p_dest );
781
782             p_s1   += 16;
783             p_s2   += 16;
784             p_dest += 16;
785         }
786     }
787
788     p_end += 15;
789
790     while( p_dest < p_end )
791     {
792         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
793     }
794 }
795 #endif
796
797 #ifdef __ARM_NEON__
/* MergeNEON: merge routine using the ARM NEON `vhadd.u8` instruction.
 *
 * Writes n bytes to out, averaging in1 and in2 byte by byte:
 *  1. MergeGeneric() on the leading bytes until the destination is 16-byte
 *     aligned (the vector stores use a :128 alignment qualifier),
 *  2. groups of 64 bytes through NEON, with or without alignment qualifiers
 *     on the loads depending on the alignment of both sources,
 *  3. MergeGeneric() on what is left.
 *
 * Changes from the previous version:
 *  - the prologue length is the distance to the next 16-byte boundary,
 *    (-addr) & 15, not the misalignment itself (addr & 15), which left the
 *    destination unaligned unless the misalignment happened to be 8;
 *  - the prologue is clamped to n, so n can no longer wrap around;
 *  - the vector loop handles 64 bytes per pass, so its byte count is rounded
 *    down to a multiple of 64 instead of 16, which removes an overrun of up
 *    to 48 bytes; the tail now covers up to 63 bytes;
 *  - every register written by the asm (q0-q11) is listed as clobbered. */
static void MergeNEON (void *restrict out, const void *in1,
                       const void *in2, size_t n)
{
    uint8_t *outp = out;
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;

    /* Bytes needed to bring the destination to a 16-byte boundary */
    size_t head = (size_t)(-(uintptr_t)outp) & 15;
    if (head > n)
        head = n;

    if (head)
    {
        MergeGeneric (outp, in1p, in2p, head);
        outp += head;
        in1p += head;
        in2p += head;
        n -= head;
    }

    /* Each asm statement below consumes and produces 64 bytes */
    uint8_t *end = outp + (n & ~(size_t)63);

    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        while (outp < end)
            __asm__ __volatile__ (
                "vld1.u8  {q0-q1}, [%[in1]]!\n"
                "vld1.u8  {q2-q3}, [%[in2]]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1]]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2]]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");
    else
        while (outp < end)
            __asm__ __volatile__ (
                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");

    n &= 63;
    if (n)
        MergeGeneric (outp, in1p, in2p, n);
}
853 #endif
854
855 /*****************************************************************************
856  * RenderX: This algorithm works on an 8x8 block basis: it copies the top
857  * field and applies a process to recreate the bottom field:
858  *  If an 8x8 block is classified as:
859  *   - progressive: it applies a small blend (1,6,1)
860  *   - interlaced:
861  *    * in the MMX version: we do motion estimation (ME) between the 2 fields;
862  *    if there is a good match we use motion compensation (MC) to recreate the
863  *    bottom field (with a small blend (1,6,1) )
864  *    * otherwise: it recreates the bottom field by an edge-oriented
865  *    interpolation.
866   *****************************************************************************/
867
868 /* XDeint8x8Detect: detect if a 8x8 block is interlaced.
869  * XXX: It need to access to 8x10
870  * We use more than 8 lines to help with scrolling (text)
871  * (and because XDeint8x8Frame use line 9)
872  * XXX: smooth/uniform area with noise detection doesn't works well
873  * but it's not really a problem because they don't have much details anyway
874  */
875 static inline int ssd( int a ) { return a*a; }
/* XDeint8x8DetectC: plain C interlacing detector for one 8-pixel-wide block.
 *
 *  src    top-left pixel of the block
 *  i_src  distance in bytes between two consecutive lines of src
 *
 * Four overlapping windows of 4 lines are examined (lines 0-3, 2-5, 4-7 and
 * 6-9, so 10 lines are read in total). For each window two sums of squared
 * differences over the 8 columns are built:
 *  - between neighbouring lines (0/1 and 1/2),
 *  - between lines two apart (0/2 and 1/3).
 * A window is counted when the "two apart" sum is below 6/8 of the
 * "neighbouring" sum and the "neighbouring" sum exceeds 32.
 *
 * Returns non-zero when at least one window was counted, 0 otherwise.
 */
static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
{
    int i_counted = 0;

    for( int i_window = 0; i_window < 4; i_window++ )
    {
        /* The window starts two lines below the previous one */
        const uint8_t *p_line0 = &src[2 * i_window * i_src];
        const uint8_t *p_line1 = &p_line0[1 * i_src];
        const uint8_t *p_line2 = &p_line0[2 * i_src];
        const uint8_t *p_line3 = &p_line0[3 * i_src];

        int i_near = 0; /* neighbouring lines */
        int i_far  = 0; /* lines two apart */

        for( int i_col = 0; i_col < 8; i_col++ )
        {
            i_near += ssd( p_line0[i_col] - p_line1[i_col] ) +
                      ssd( p_line1[i_col] - p_line2[i_col] );
            i_far  += ssd( p_line0[i_col] - p_line2[i_col] ) +
                      ssd( p_line1[i_col] - p_line3[i_col] );
        }

        if( i_near > 32 && i_far < 6 * i_near / 8 )
            i_counted++;
    }

    return i_counted >= 1;
}
902 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8DetectMMXEXT: MMX flavour of the 8-pixel-wide interlacing
 * detector.
 *
 *  src    top-left pixel of the block
 *  i_src  distance in bytes between two consecutive lines of src
 *
 * Returns the number of 4-line windows that passed the interlacing test
 * (so any non-zero value means "interlaced").
 *
 * The *_m2r / *_r2r / *_r2m / *_i2r helpers come from mmx.h, which is not
 * visible here; the comments below assume they wrap the MMX instructions of
 * the same name. No emms() is issued in this function.
 *
 * NOTE(review): the outer loop runs for y = 0, 2, 4, 6, 8 (five windows,
 * lines 0..11 are read) whereas XDeint8x8DetectC uses y < 7 (four windows,
 * lines 0..9) - confirm whether the difference is intended.
 */
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
{

    int y, x;
    int32_t ff, fr;
    int fc;

    /* Detect interlacing */
    fc = 0;
    /* mm7 stays zero: used as the high bytes when unpacking bytes to words */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 9; y += 2 )
    {
        ff = fr = 0;
        /* mm5 accumulates the "neighbouring lines" sum (fr),
         * mm6 the "lines two apart" sum (ff) */
        pxor_r2r( mm5, mm5 );
        pxor_r2r( mm6, mm6 );
        /* 4 pixels per iteration, two iterations cover the 8 columns */
        for( x = 0; x < 8; x+=4 )
        {
            /* Load 4 pixels from each of the 4 lines of the window */
            movd_m2r( src[        x], mm0 );
            movd_m2r( src[1*i_src+x], mm1 );
            movd_m2r( src[2*i_src+x], mm2 );
            movd_m2r( src[3*i_src+x], mm3 );

            /* Widen bytes to 16-bit words */
            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            punpcklbw_r2r( mm7, mm3 );

            movq_r2r( mm0, mm4 );

            /* mm0 = line0 - line1, mm4 = line0 - line2 */
            psubw_r2r( mm1, mm0 );
            psubw_r2r( mm2, mm4 );

            /* mm2 = line2 - line1, mm3 = line3 - line1 */
            psubw_r2r( mm1, mm2 );
            psubw_r2r( mm1, mm3 );

            /* Square the differences (pairs of squares are summed into
             * 32-bit lanes) */
            pmaddwd_r2r( mm0, mm0 );
            pmaddwd_r2r( mm4, mm4 );
            pmaddwd_r2r( mm2, mm2 );
            pmaddwd_r2r( mm3, mm3 );
            /* mm2 = neighbouring-line terms, mm3 = two-apart terms */
            paddd_r2r( mm0, mm2 );
            paddd_r2r( mm4, mm3 );
            paddd_r2r( mm2, mm5 );
            paddd_r2r( mm3, mm6 );
        }

        /* Horizontal add of the two 32-bit lanes of mm5 -> fr */
        movq_r2r( mm5, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm5 );
        movd_r2m( mm5, fr );

        /* Horizontal add of the two 32-bit lanes of mm6 -> ff */
        movq_r2r( mm6, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm6 );
        movd_r2m( mm6, ff );

        /* Same decision rule as the C version */
        if( ff < 6*fr/8 && fr > 32 )
            fc++;

        /* Next window starts two lines further down */
        src += 2*i_src;
    }
    return fc;
}
965 #endif
966
/* XDeint8x8MergeC: build an 8x8 output block for the "progressive" case.
 *
 *  dst, i_dst    output block and its line pitch
 *  src1, i_src1  lines that are copied unchanged, and their pitch
 *  src2, i_src2  lines that are filtered, and their pitch
 *
 * Each of the four iterations produces two output lines: the src1 line is
 * copied as is, then the line below it is computed as the rounded
 * (1,6,1)/8 blend of the current src1 line, the src2 line and the next
 * src1 line. Five src1 lines and four src2 lines are read.
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *p_blend = dst + i_dst;
        const uint8_t *p_next1 = src1 + i_src1;

        /* Kept line */
        memcpy( dst, src1, 8 );

        /* Filtered line */
        for( int i_col = 0; i_col < 8; i_col++ )
        {
            const int i_sum = src1[i_col] + 6*src2[i_col] + p_next1[i_col] + 4;
            p_blend[i_col] = i_sum >> 3;
        }

        dst  += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
987
988 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8MergeMMXEXT: MMX flavour of XDeint8x8MergeC.
 *
 *  dst, i_dst    output block and its line pitch
 *  src1, i_src1  lines that are copied unchanged, and their pitch
 *  src2, i_src2  lines that are filtered, and their pitch
 *
 * For every pair of output lines the src1 line is stored unchanged and the
 * following line is (src1 + 6*src2 + next src1 + 4) >> 3, computed 4 pixels
 * at a time in 16-bit lanes.
 *
 * The MMX helper macros come from mmx.h (not visible here); comments assume
 * they wrap the instructions of the same name. No emms() is issued here.
 */
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    /* Rounding term: the value 4 in each of the four 16-bit lanes */
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    /* mm7 stays zero for byte -> word unpacking and for the final pack */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 8; y += 2 )
    {
        for( x = 0; x < 8; x +=4 )
        {
            /* Copy 4 pixels of the kept line straight to the output */
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            /* mm1 = filtered line, mm2 = next kept line */
            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            /* mm1 = 2*src2, mm3 = 4*src2 */
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            /* mm0 = src1 + next src1 */
            paddw_r2r( mm2, mm0 );
            /* mm1 = 6*src2 + 4 */
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );
            /* mm0 = (src1 + 6*src2 + next src1 + 4) >> 3 */
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            /* Back to bytes, then store on the second line of the pair */
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
1027
1028 #endif
1029
1030 /* For debug */
/* XDeint8x8Set: fill an 8x8 block with a constant value (debug helper).
 *
 *  dst    top-left pixel of the block
 *  i_dst  distance in bytes between two consecutive lines of dst
 *  v      value written to each of the 64 pixels
 */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_line = 0;

    while( i_line < 8 )
    {
        uint8_t *p_line = dst + i_line * i_dst;

        memset( p_line, v, 8 );
        i_line++;
    }
}
1037
/* XDeint8x8FieldE: Simple (1,0,1) deinterlacing for blocks that are missing
 * a neighbour
 * (Uses 8x9 pixels)
 * TODO: a better one for the inner part.
 */
/* XDeint8x8FieldEC: plain C field interpolation that only reads pixels
 * inside the block's own 8 columns.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *
 * Input lines 0, 2, 4 and 6 are copied to the same output lines; each
 * output line in between is the truncated average of the input line above
 * it and the input line two below that one (so input line 8 is read too).
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    int i_pairs_left = 4;

    while( i_pairs_left-- > 0 )
    {
        uint8_t *p_below = src + 2*i_src;
        uint8_t *p_made  = dst + i_dst;

        /* Kept line */
        memcpy( dst, src, 8 );

        /* Interpolated line */
        for( int i_col = 0; i_col < 8; i_col++ )
            p_made[i_col] = (src[i_col] + p_below[i_col] ) >> 1;

        dst += 2*i_dst;
        src  = p_below;
    }
}
1060 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8FieldEMMXEXT: MMX flavour of XDeint8x8FieldEC.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *
 * Each iteration handles a full 8-pixel line per load/store: the input line
 * is stored unchanged, then its average with the input line two below is
 * stored on the next output line.
 *
 * The MMX helper macros come from mmx.h (not visible here); comments assume
 * they wrap the instructions of the same name. No emms() is issued here.
 * NOTE(review): if pavgb_r2r maps to PAVGB, the average is rounded up,
 * whereas the C version truncates - results may differ by one.
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Kept line: 8 pixels copied through mm0 */
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );
        dst += i_dst;

        /* Interpolated line: average of the kept line and the one two below */
        movq_m2r( src[2*i_src], mm1 );
        pavgb_r2r( mm1, mm0 );

        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1082 #endif
1083
1084 /* XDeint8x8Field: Edge oriented interpolation
1085  * (Need -4 and +5 pixels H, +1 line)
1086  */
/* XDeint8x8FieldC: plain C edge-oriented field interpolation of an 8x8 block.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *
 * Input lines 0, 2, 4 and 6 are copied unchanged. For every pixel of the
 * lines in between, three 8-pixel sums of absolute differences are taken
 * between the input line above and the input line two below it, with the
 * lower line shifted by +2, 0 and -2 pixels relative to the upper one.
 * Depending on how the three sums are ordered, the pixel is the truncated
 * average of the upper/lower pixels along one diagonal, the other diagonal,
 * or straight down.
 *
 * Pixels from column x-4 to column x+5 are read on both lines, so the block
 * must have valid pixels on its left and on its right.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_up   = src;
        const uint8_t *p_down = &src[2*i_src];
        uint8_t *p_made = dst + i_dst;

        /* Kept line */
        memcpy( dst, src, 8 );

        for( int i_col = 0; i_col < 8; i_col++ )
        {
            /* 8 taps are used to match the MMX version; 5 would be enough
             * (fewer is not good) */
            int i_sad_plus  = 0; /* lower line shifted by +2 */
            int i_sad_zero  = 0; /* no shift */
            int i_sad_minus = 0; /* lower line shifted by -2 */

            for( int i_tap = 0; i_tap < 8; i_tap++ )
            {
                i_sad_plus  += abs( p_up[i_col - 4 + i_tap] -
                                    p_down[i_col - 2 + i_tap] );
                i_sad_zero  += abs( p_up[i_col - 3 + i_tap] -
                                    p_down[i_col - 3 + i_tap] );
                i_sad_minus += abs( p_up[i_col - 2 + i_tap] -
                                    p_down[i_col - 4 + i_tap] );
            }

            int i_above, i_below;
            if( i_sad_plus < i_sad_zero && i_sad_zero <= i_sad_minus )
            {
                i_above = p_up[i_col - 1];
                i_below = p_down[i_col + 1];
            }
            else if( i_sad_minus < i_sad_zero && i_sad_zero <= i_sad_plus )
            {
                i_above = p_up[i_col + 1];
                i_below = p_down[i_col - 1];
            }
            else
            {
                i_above = p_up[i_col];
                i_below = p_down[i_col];
            }
            p_made[i_col] = (i_above + i_below) >> 1;
        }

        dst += 2*i_dst;
        src += 2*i_src;
    }
}
1130 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8FieldMMXEXT: edge-oriented field interpolation, with the three
 * 8-pixel sums of absolute differences computed through psadbw.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *
 * Same decision rule and same pixel accesses (columns x-4 .. x+5) as
 * XDeint8x8FieldC; only the computation of c0/c1/c2 differs.
 *
 * The MMX helper macros come from mmx.h (not visible here); comments assume
 * they wrap the instructions of the same name. No emms() is issued here.
 */
static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src, int i_src )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Kept line */
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( x = 0; x < 8; x++ )
        {
            /* Input line two below the kept one */
            uint8_t *src2 = &src[2*i_src];
            int32_t c0, c1, c2;

            /* 8 upper-line pixels starting at x-2, x-3 and x-4 */
            movq_m2r( src[x-2], mm0 );
            movq_m2r( src[x-3], mm1 );
            movq_m2r( src[x-4], mm2 );

            /* Sum of absolute differences against the lower line starting
             * at x-4, x-3 and x-2 respectively */
            psadbw_m2r( src2[x-4], mm0 );
            psadbw_m2r( src2[x-3], mm1 );
            psadbw_m2r( src2[x-2], mm2 );

            /* mm0 -> c2, mm1 -> c1, mm2 -> c0: same pairing as the C version */
            movd_r2m( mm0, c2 );
            movd_r2m( mm1, c1 );
            movd_r2m( mm2, c0 );

            /* Pick the interpolation direction from the ordering of the sums */
            if( c0 < c1 && c1 <= c2 )
                dst[x] = (src[x-1] + src2[x+1]) >> 1;
            else if( c2 < c1 && c1 <= c0 )
                dst[x] = (src[x+1] + src2[x-1]) >> 1;
            else
                dst[x] = (src[x+0] + src2[x+0]) >> 1;
        }

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1171 #endif
1172
/* NxN: arbitrary block size (these functions then only use pixels inside
 * the NxN block)
 */
/* XDeintNxNDetect: interlacing detector for a block of arbitrary size.
 *
 *  src       top-left pixel of the block
 *  i_src     distance in bytes between two consecutive lines of src
 *  i_height  number of lines of the block
 *  i_width   number of columns of the block
 *
 * NOTE: i_height comes BEFORE i_width here, unlike in the other XDeintNxN*
 * functions.
 *
 * For every even line y with y < i_height - 2, the squared differences with
 * the next line and with the line two below are added to two running sums.
 * The sums are never reset, so each test sees everything accumulated so far.
 * A line is counted when the "two below" sum is smaller than the "next line"
 * sum and the latter exceeds i_width / 2.
 *
 * Returns non-zero when at least two lines were counted, 0 otherwise.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    int i_near = 0;    /* running sum: line vs next line */
    int i_far = 0;     /* running sum: line vs line two below */
    int i_counted = 0;

    for( int i_line = 0; i_line < i_height - 2; i_line += 2 )
    {
        const uint8_t *p_line0 = &src[i_line*i_src];
        const uint8_t *p_line1 = &p_line0[1*i_src];
        const uint8_t *p_line2 = &p_line0[2*i_src];

        for( int i_col = 0; i_col < i_width; i_col++ )
        {
            i_near += ssd( p_line0[i_col] - p_line1[i_col] );
            i_far  += ssd( p_line0[i_col] - p_line2[i_col] );
        }

        if( i_near > i_width / 2 && i_far < i_near )
            i_counted++;
    }

    return i_counted >= 2;
}
1201
/* XDeintNxNFrame: "progressive" rendering of a block of arbitrary size.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *  i_width     number of columns of the block
 *  i_height    number of lines of the block
 *
 * Even lines are copied. Odd lines are the rounded (1,2,1)/4 blend of the
 * input lines above, at and below them; the last odd line, which has no
 * line below inside the block, is the truncated average of the two last
 * input lines.
 *
 * Only lines 0 .. i_height-1 are read and written: when i_height is odd the
 * final even line is copied and nothing is produced after it.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Progressive */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: line y was the last one of the block, so there is no
         * odd line left to produce (and no input line y+1 to read). */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1229
/* XDeintNxNField: "interlaced" rendering of a block of arbitrary size.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *  i_width     number of columns of the block
 *  i_height    number of lines of the block
 *
 * Even lines are copied. Odd lines are the truncated average of the even
 * input lines above and below them; the last odd line, which has no even
 * line below inside the block, is the truncated average of the two last
 * input lines.
 *
 * Only lines 0 .. i_height-1 are read and written: when i_height is odd the
 * final even line is copied and nothing is produced after it.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Interlaced */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: line y was the last one of the block, so there is no
         * odd line left to produce (and no input line y+1 to read). */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1257
/* XDeintNxN: deinterlace a block of arbitrary size.
 *
 *  dst, i_dst  output block and its line pitch
 *  src, i_src  input block and its line pitch
 *  i_width     number of columns of the block
 *  i_height    number of lines of the block
 *
 * Runs the NxN detector on the block and renders it with XDeintNxNField
 * when it is reported as interlaced, with XDeintNxNFrame otherwise.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect() takes the height BEFORE the width, unlike the two
     * renderers: passing them in the wrong order makes it scan a transposed
     * area that can extend outside the block. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1266
1267
/* median: middle value of three integers, obtained by removing the smallest
 * and the largest from their sum. */
static inline int median( int a, int b, int c )
{
    int i_lo, i_hi;

    /* Order the first two values */
    if( b < a )
    {
        i_lo = b;
        i_hi = a;
    }
    else
    {
        i_lo = a;
        i_hi = b;
    }

    /* Let the third one widen the range if it falls outside of it */
    if( c < i_lo )
        i_lo = c;
    else if( c > i_hi )
        i_hi = c;

    return a + b + c - i_lo - i_hi;
}
1283
1284
1285 /* XDeintBand8x8:
1286  */
/* XDeintBand8x8C: deinterlace one band of 8 lines with the plain C kernels.
 *
 *  dst, i_dst  first output line of the band and the output pitch
 *  src, i_src  first input line of the band and the input pitch
 *  i_mbx       number of complete 8-pixel-wide blocks in the band
 *  i_modx      width of the remaining partial block (0 if none)
 *
 * Each complete block goes through the 8x8 detector. Blocks not reported as
 * interlaced are merged (even input lines kept, odd ones filtered). Blocks
 * reported as interlaced are interpolated from the even lines only; the
 * first and the last complete blocks use the variant that does not read
 * pixels outside the block's columns. The partial block, if any, is handled
 * by XDeintNxN.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;
    int i_block = 0;

    while( i_block < i_mbx )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Even input lines are kept, odd input lines are filtered */
            XDeint8x8MergeC( dst, i_dst,
                             src,         2*i_src,
                             src + i_src, 2*i_src );
        }
        else if( i_block != 0 && i_block != i_last )
        {
            XDeint8x8FieldC( dst, i_dst, src, i_src );
        }
        else
        {
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        }

        dst += 8;
        src += 8;
        i_block++;
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1317 #ifdef CAN_COMPILE_MMXEXT
/* XDeintBand8x8MMXEXT: deinterlace one band of 8 lines with the MMXEXT
 * kernels.
 *
 *  dst, i_dst  first output line of the band and the output pitch
 *  src, i_src  first input line of the band and the input pitch
 *  i_mbx       number of complete 8-pixel-wide blocks in the band
 *  i_modx      width of the remaining partial block (0 if none)
 *
 * Same dispatch as XDeintBand8x8C, using the *MMXEXT 8x8 functions. The
 * partial block, if any, is still handled by the C-only XDeintNxN. This
 * function does not call emms() itself.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;
    int i_block = 0;

    while( i_block < i_mbx )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Even input lines are kept, odd input lines are filtered */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  src,         2*i_src,
                                  src + i_src, 2*i_src );
        }
        else if( i_block != 0 && i_block != i_last )
        {
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
        }
        else
        {
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        }

        dst += 8;
        src += 8;
        i_block++;
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1349 #endif
1350
1351 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1352 {
1353     int i_plane;
1354
1355     /* Copy image and skip lines */
1356     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1357     {
1358         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1359         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1360
1361         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1362         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1363
1364         const int i_dst = p_outpic->p[i_plane].i_pitch;
1365         const int i_src = p_pic->p[i_plane].i_pitch;
1366
1367         int y, x;
1368
1369         for( y = 0; y < i_mby; y++ )
1370         {
1371             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1372             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1373
1374 #ifdef CAN_COMPILE_MMXEXT
1375             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1376                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1377             else
1378 #endif
1379                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1380         }
1381
1382         /* Last line (C only)*/
1383         if( i_mody )
1384         {
1385             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1386             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1387
1388             for( x = 0; x < i_mbx; x++ )
1389             {
1390                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1391
1392                 dst += 8;
1393                 src += 8;
1394             }
1395
1396             if( i_modx )
1397                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1398         }
1399     }
1400
1401 #ifdef CAN_COMPILE_MMXEXT
1402     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1403         emms();
1404 #endif
1405 }
1406
1407 /*****************************************************************************
1408  * Yadif (Yet Another DeInterlacing Filter).
1409  *****************************************************************************/
1410 /* */
/* Per-call configuration handed to the yadif line filters (it is the first
 * argument of the filter function pointer used by RenderYadif). */
struct vf_priv_s {
    /*
     * Yadif operating mode:
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only bit 0x02 has a meaning, as the 0x01 part (one output per
     * field) is handled by the caller itself.
     */
    int mode;
};
1422
/* Pointer-sized signed integer type, declared under the name x86_reg just
 * before yadif.h is included.
 * NOTE(review): presumably yadif.h expects this name for register-sized
 * operands; the original author was unsure intptr_t is the right type. */
typedef intptr_t x86_reg;

/* Helper macros defined for the code pulled in by yadif.h below (presumably
 * the names that code expects - confirm against yadif.h).
 * FFABS evaluates its argument more than once: do not pass expressions with
 * side effects. FFMAX/FFMIN simply forward to __MAX/__MIN. */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)

/* yadif.h comes from vf_yadif.c of the mplayer project */
#include "yadif.h"
1434
1435 static void RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1436 {
1437     filter_sys_t *p_sys = p_filter->p_sys;
1438
1439     /* */
1440     assert( i_order == 0 || i_order == 1 );
1441     assert( i_field == 0 || i_field == 1 );
1442
1443     if( i_order == 0 )
1444     {
1445         /* Duplicate the picture
1446          * TODO when the vout rework is finished, picture_Hold() might be enough
1447          * but becarefull, the pitches must match */
1448         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1449         if( p_dup )
1450             picture_Copy( p_dup, p_src );
1451
1452         /* Slide the history */
1453         if( p_sys->pp_history[0] )
1454             picture_Release( p_sys->pp_history[0]  );
1455         for( int i = 1; i < HISTORY_SIZE; i++ )
1456             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1457         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1458     }
1459
1460     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1461     picture_t *p_prev = p_sys->pp_history[0];
1462     picture_t *p_cur  = p_sys->pp_history[1];
1463     picture_t *p_next = p_sys->pp_history[2];
1464
1465     /* Filter if we have all the pictures we need */
1466     if( p_prev && p_cur && p_next )
1467     {
1468         /* */
1469         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1470 #if defined(HAVE_YADIF_SSE2)
1471         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1472             filter = yadif_filter_line_mmx2;
1473         else
1474 #endif
1475             filter = yadif_filter_line_c;
1476
1477         for( int n = 0; n < p_dst->i_planes; n++ )
1478         {
1479             const plane_t *prevp = &p_prev->p[n];
1480             const plane_t *curp  = &p_cur->p[n];
1481             const plane_t *nextp = &p_next->p[n];
1482             plane_t *dstp        = &p_dst->p[n];
1483
1484             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1485             {
1486                 if( (y % 2) == i_field )
1487                 {
1488                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1489                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1490                 }
1491                 else
1492                 {
1493                     struct vf_priv_s cfg;
1494                     /* Spatial checks only when enough data */
1495                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1496
1497                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1498                     filter( &cfg,
1499                             &dstp->p_pixels[y * dstp->i_pitch],
1500                             &prevp->p_pixels[y * prevp->i_pitch],
1501                             &curp->p_pixels[y * curp->i_pitch],
1502                             &nextp->p_pixels[y * nextp->i_pitch],
1503                             dstp->i_visible_pitch,
1504                             curp->i_pitch,
1505                             (i_field ^ (i_order == i_field)) & 1 );
1506                 }
1507
1508                 /* We duplicate the first and last lines */
1509                 if( y == 1 )
1510                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1511                 else if( y == dstp->i_visible_lines - 2 )
1512                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1513             }
1514         }
1515
1516         /* */
1517         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1518     }
1519     else
1520     {
1521         /* Fallback to something simple
1522          * XXX it is wrong when we have 2 pictures, we should not output a picture */
1523         RenderX( p_dst, p_src );
1524     }
1525 }
1526
1527 /*****************************************************************************
1528  * video filter2 functions
1529  *****************************************************************************/
1530 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1531 {
1532     filter_sys_t *p_sys = p_filter->p_sys;
1533     picture_t *p_pic_dst;
1534
1535     /* Request output picture */
1536     p_pic_dst = filter_NewPicture( p_filter );
1537     if( p_pic_dst == NULL )
1538     {
1539         picture_Release( p_pic );
1540         return NULL;
1541     }
1542
1543     switch( p_sys->i_mode )
1544     {
1545         case DEINTERLACE_DISCARD:
1546             RenderDiscard( p_filter, p_pic_dst, p_pic, 0 );
1547             break;
1548
1549         case DEINTERLACE_BOB:
1550 #if 0
1551             RenderBob( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1552             RenderBob( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1553             break;
1554 #endif
1555
1556         case DEINTERLACE_LINEAR:
1557 #if 0
1558             RenderLinear( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1559             RenderLinear( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1560 #endif
1561             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1562             picture_Release( p_pic_dst );
1563             picture_Release( p_pic );
1564             return NULL;
1565
1566         case DEINTERLACE_MEAN:
1567             RenderMean( p_filter, p_pic_dst, p_pic );
1568             break;
1569
1570         case DEINTERLACE_BLEND:
1571             RenderBlend( p_filter, p_pic_dst, p_pic );
1572             break;
1573
1574         case DEINTERLACE_X:
1575             RenderX( p_pic_dst, p_pic );
1576             break;
1577
1578         case DEINTERLACE_YADIF:
1579             msg_Err( p_filter, "delaying frames is not supported yet" );
1580             //RenderYadif( p_vout, pp_outpic[0], p_pic, 0, 0 );
1581             picture_Release( p_pic_dst );
1582             picture_Release( p_pic );
1583             return NULL;
1584
1585         case DEINTERLACE_YADIF2X:
1586             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1587             //RenderYadif( p_vout, pp_outpic[0], p_pic, 0, !p_pic->b_top_field_first );
1588             //RenderYadif( p_vout, pp_outpic[1], p_pic, 1, p_pic->b_top_field_first );
1589             picture_Release( p_pic_dst );
1590             picture_Release( p_pic );
1591             return NULL;
1592     }
1593
1594     picture_CopyProperties( p_pic_dst, p_pic );
1595     p_pic_dst->b_progressive = true;
1596
1597     picture_Release( p_pic );
1598     return p_pic_dst;
1599 }
1600
1601 static int Mouse( filter_t *p_filter,
1602                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1603 {
1604     *p_mouse = *p_new;
1605     if( p_filter->p_sys->b_half_height )
1606         p_mouse->i_y *= 2;
1607     return VLC_SUCCESS;
1608 }
1609
1610
1611 /*****************************************************************************
1612  * Open
1613  *****************************************************************************/
1614 static int Open( vlc_object_t *p_this )
1615 {
1616     filter_t *p_filter = (filter_t*)p_this;
1617     filter_sys_t *p_sys;
1618
1619     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
1620         return VLC_EGENERIC;
1621
1622     /* */
1623     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
1624     if( !p_sys )
1625         return VLC_ENOMEM;
1626
1627     p_sys->i_mode = DEINTERLACE_BLEND;
1628     p_sys->b_double_rate = false;
1629     p_sys->b_half_height = true;
1630
1631 #if defined(CAN_COMPILE_C_ALTIVEC)
1632     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
1633     {
1634         p_sys->pf_merge = MergeAltivec;
1635         p_sys->pf_end_merge = NULL;
1636     }
1637     else
1638 #endif
1639 #if defined(CAN_COMPILE_SSE)
1640     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1641     {
1642         p_sys->pf_merge = MergeSSE2;
1643         p_sys->pf_end_merge = EndMMX;
1644     }
1645     else
1646 #endif
1647 #if defined(CAN_COMPILE_MMXEXT)
1648     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1649     {
1650         p_sys->pf_merge = MergeMMXEXT;
1651         p_sys->pf_end_merge = EndMMX;
1652     }
1653     else
1654 #endif
1655 #if defined(CAN_COMPILE_3DNOW)
1656     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
1657     {
1658         p_sys->pf_merge = Merge3DNow;
1659         p_sys->pf_end_merge = End3DNow;
1660     }
1661     else
1662 #endif
1663 #if defined __ARM_NEON__
1664     if( vlc_CPU() & CPU_CAPABILITY_NEON )
1665     {
1666         p_sys->pf_merge = MergeNEON;
1667         p_sys->pf_end_merge = NULL;
1668     }
1669     else
1670 #endif
1671     {
1672         p_sys->pf_merge = MergeGeneric;
1673         p_sys->pf_end_merge = NULL;
1674     }
1675
1676     /* */
1677     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
1678                        p_filter->p_cfg );
1679
1680     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
1681     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
1682     free( psz_mode );
1683
1684     /* */
1685     video_format_t fmt;
1686     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
1687     if( !p_filter->b_allow_fmt_out_change &&
1688         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
1689           fmt.i_height != p_filter->fmt_in.video.i_height ) )
1690     {
1691         Close( VLC_OBJECT(p_filter) );
1692         return VLC_EGENERIC;
1693     }
1694     p_filter->fmt_out.video = fmt;
1695     p_filter->fmt_out.i_codec = fmt.i_chroma;
1696     p_filter->pf_video_filter = Deinterlace;
1697     p_filter->pf_video_mouse  = Mouse;
1698
1699     msg_Dbg( p_filter, "deinterlacing" );
1700
1701     return VLC_SUCCESS;
1702 }
1703
1704 /*****************************************************************************
1705  * Close: clean up the filter
1706  *****************************************************************************/
1707 static void Close( vlc_object_t *p_this )
1708 {
1709     filter_t *p_filter = (filter_t*)p_this;
1710
1711     free( p_filter->p_sys );
1712 }
1713