]> git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c
m3u: use the album art provided by jamendo along with the m3u files.
[vlc] / modules / video_filter / deinterlace.c
1 /*****************************************************************************
2  * deinterlace.c : deinterlacer plugin for vlc
3  *****************************************************************************
4  * Copyright (C) 2000-2009 the VideoLAN team
5  * $Id$
6  *
7  * Author: Sam Hocevar <sam@zoy.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23
24 /*****************************************************************************
25  * Preamble
26  *****************************************************************************/
27
28 #ifdef HAVE_CONFIG_H
29 # include "config.h"
30 #endif
31
32 #include <assert.h>
33
34 #ifdef HAVE_ALTIVEC_H
35 #   include <altivec.h>
36 #endif
37
38 #include <vlc_common.h>
39 #include <vlc_plugin.h>
40 #include <vlc_filter.h>
41 #include <vlc_cpu.h>
42
43 #ifdef CAN_COMPILE_MMXEXT
44 #   include "mmx.h"
45 #endif
46
/* Identifiers of the deinterlacing algorithms. SetFilterMethod() stores one
 * of these in filter_sys_t.i_mode and GetOutputFormat() switches on it. */
47 #define DEINTERLACE_DISCARD 1
48 #define DEINTERLACE_MEAN    2
49 #define DEINTERLACE_BLEND   3
50 #define DEINTERLACE_BOB     4
51 #define DEINTERLACE_LINEAR  5
52 #define DEINTERLACE_X       6
53 #define DEINTERLACE_YADIF   7
54 #define DEINTERLACE_YADIF2X 8
55
56 /*****************************************************************************
57  * Module descriptor
58  *****************************************************************************/
/* Module entry points, registered through set_callbacks() in the module
 * descriptor. */
59 static int  Open ( vlc_object_t * );
60 static void Close( vlc_object_t * );
61
/* Short and long help texts for the deinterlace mode options. */
62 #define MODE_TEXT N_("Deinterlace mode")
63 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
64
65 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
66 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
67
/* Prefix prepended to the option names declared in the module descriptor. */
68 #define FILTER_CFG_PREFIX "sout-deinterlace-"
69
/* Accepted values of the "mode" option (mode_list) and their display names
 * (mode_list_text). The two arrays are parallel: entry i of one describes
 * entry i of the other, so they must be kept in the same order. */
70 static const char *const mode_list[] = {
71     "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
72 static const char *const mode_list_text[] = {
73     N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
74
/* Module descriptor: declares the plugin's description and short name, its
 * category/subcategory, a string option FILTER_CFG_PREFIX "mode" (default
 * "blend", choices taken from mode_list/mode_list_text), the "deinterlace"
 * shortcut and the Open/Close callbacks.
 * NOTE(review): set_capability() is invoked twice ("video filter", then
 * "video filter2"); presumably only one of them is intended - confirm. */
75 vlc_module_begin ()
76     set_description( N_("Deinterlacing video filter") )
77     set_shortname( N_("Deinterlace" ))
78     set_capability( "video filter", 0 )
79     set_category( CAT_VIDEO )
80     set_subcategory( SUBCAT_VIDEO_VFILTER )
81
82     set_capability( "video filter2", 0 )
83     add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT,
84                 SOUT_MODE_LONGTEXT, false )
85         change_string_list( mode_list, mode_list_text, 0 )
86         change_safe ()
87     add_shortcut( "deinterlace" )
88     set_callbacks( Open, Close )
89 vlc_module_end ()
90
91
92 /*****************************************************************************
93  * Local prototypes
94  *****************************************************************************/
/* Per-mode renderers. The picture arguments are (output, input); the int
 * argument of Discard/Bob/Linear is the field index (0 = top, 1 = bottom). */
95 static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
96 static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
97 static void RenderMean   ( filter_t *, picture_t *, picture_t * );
98 static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
99 static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
100 static void RenderX      ( picture_t *, picture_t * );
101 static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );
102
/* Line-merging kernels, all sharing the signature of filter_sys_t.pf_merge:
 * (destination, source 1, source 2, byte count). Each variant is only
 * compiled when the matching CPU feature macro is defined.
 * NOTE(review): presumably one of them is selected into pf_merge by Open(),
 * which is not visible here - confirm. */
103 static void MergeGeneric ( void *, const void *, const void *, size_t );
104 #if defined(CAN_COMPILE_C_ALTIVEC)
105 static void MergeAltivec ( void *, const void *, const void *, size_t );
106 #endif
107 #if defined(CAN_COMPILE_MMXEXT)
108 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
109 #endif
110 #if defined(CAN_COMPILE_3DNOW)
111 static void Merge3DNow   ( void *, const void *, const void *, size_t );
112 #endif
113 #if defined(CAN_COMPILE_SSE)
114 static void MergeSSE2    ( void *, const void *, const void *, size_t );
115 #endif
/* Helpers matching the signature of filter_sys_t.pf_end_merge; they issue the
 * emms / femms instruction respectively. */
116 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
117 static void EndMMX       ( void );
118 #endif
119 #if defined(CAN_COMPILE_3DNOW)
120 static void End3DNow     ( void );
121 #endif
122 #if defined __ARM_NEON__
123 static void MergeNEON (void *, const void *, const void *, size_t);
124 #endif
125
/* NULL-terminated list of option names understood by this filter.
 * NOTE(review): presumably handed to the configuration parser together with
 * FILTER_CFG_PREFIX by Open(), which is not visible here - confirm. */
126 static const char *const ppsz_filter_options[] = {
127     "mode", NULL
128 };
129
/* Number of entries in filter_sys_t.pp_history. */
130 #define HISTORY_SIZE (3)
/* Private state of the deinterlace filter, reached through p_filter->p_sys. */
131 struct filter_sys_t
132 {
133     int  i_mode;        /* Deinterlace mode */
134     bool b_double_rate; /* Shall we double the framerate? */
135     bool b_half_height; /* Shall we divide the height by 2 */
136
    /* Line-merging callback (dest, src1, src2, bytes), invoked through the
     * Merge macro by the Render* functions. */
137     void (*pf_merge) ( void *, const void *, const void *, size_t );
    /* Optional callback run once after a batch of merges; the EndMerge macro
     * skips it when it is NULL. */
138     void (*pf_end_merge) ( void );
139
140     /* Yadif */
141     picture_t *pp_history[HISTORY_SIZE];
142 };
143
144 /*****************************************************************************
145  * SetFilterMethod: setup the deinterlace method to use.
146  *****************************************************************************/
147 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
148 {
149     filter_sys_t *p_sys = p_filter->p_sys;
150
151     if( !psz_method )
152         psz_method = "";
153
154     if( !strcmp( psz_method, "mean" ) )
155     {
156         p_sys->i_mode = DEINTERLACE_MEAN;
157         p_sys->b_double_rate = false;
158         p_sys->b_half_height = true;
159     }
160     else if( !strcmp( psz_method, "bob" )
161              || !strcmp( psz_method, "progressive-scan" ) )
162     {
163         p_sys->i_mode = DEINTERLACE_BOB;
164         p_sys->b_double_rate = true;
165         p_sys->b_half_height = false;
166     }
167     else if( !strcmp( psz_method, "linear" ) )
168     {
169         p_sys->i_mode = DEINTERLACE_LINEAR;
170         p_sys->b_double_rate = true;
171         p_sys->b_half_height = false;
172     }
173     else if( !strcmp( psz_method, "x" ) )
174     {
175         p_sys->i_mode = DEINTERLACE_X;
176         p_sys->b_double_rate = false;
177         p_sys->b_half_height = false;
178     }
179     else if( !strcmp( psz_method, "yadif" ) )
180     {
181         p_sys->i_mode = DEINTERLACE_YADIF;
182         p_sys->b_double_rate = false;
183         p_sys->b_half_height = false;
184     }
185     else if( !strcmp( psz_method, "yadif2x" ) )
186     {
187         p_sys->i_mode = DEINTERLACE_YADIF2X;
188         p_sys->b_double_rate = true;
189         p_sys->b_half_height = false;
190     }
191     else if( !strcmp( psz_method, "discard" ) )
192     {
193         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
194                             i_chroma == VLC_CODEC_J422;
195
196         p_sys->i_mode = DEINTERLACE_DISCARD;
197         p_sys->b_double_rate = false;
198         p_sys->b_half_height = !b_i422;
199     }
200     else
201     {
202         if( strcmp( psz_method, "blend" ) )
203             msg_Err( p_filter,
204                      "no valid deinterlace mode provided, using \"blend\"" );
205
206         p_sys->i_mode = DEINTERLACE_BLEND;
207         p_sys->b_double_rate = false;
208         p_sys->b_half_height = false;
209     }
210
211     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
212 }
213
214 static void GetOutputFormat( filter_t *p_filter,
215                              video_format_t *p_dst, const video_format_t *p_src )
216 {
217     filter_sys_t *p_sys = p_filter->p_sys;
218     *p_dst = *p_src;
219
220     if( p_sys->b_half_height )
221     {
222         p_dst->i_height /= 2;
223         p_dst->i_visible_height /= 2;
224         p_dst->i_y_offset /= 2;
225         p_dst->i_sar_den *= 2;
226     }
227
228     if( p_src->i_chroma == VLC_CODEC_I422 ||
229         p_src->i_chroma == VLC_CODEC_J422 )
230     {
231         switch( p_sys->i_mode )
232         {
233         case DEINTERLACE_MEAN:
234         case DEINTERLACE_LINEAR:
235         case DEINTERLACE_X:
236         case DEINTERLACE_YADIF:
237         case DEINTERLACE_YADIF2X:
238             p_dst->i_chroma = p_src->i_chroma;
239             break;
240         default:
241             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
242                                                                   VLC_CODEC_J420;
243             break;
244         }
245     }
246 }
247
248 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
249 {
250     return i_chroma == VLC_CODEC_I420 ||
251            i_chroma == VLC_CODEC_J420 ||
252            i_chroma == VLC_CODEC_YV12 ||
253            i_chroma == VLC_CODEC_I422 ||
254            i_chroma == VLC_CODEC_J422;
255 }
256
257 /*****************************************************************************
258  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
259  *****************************************************************************/
260 static void RenderDiscard( filter_t *p_filter,
261                            picture_t *p_outpic, picture_t *p_pic, int i_field )
262 {
263     int i_plane;
264
265     /* Copy image and skip lines */
266     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
267     {
268         uint8_t *p_in, *p_out_end, *p_out;
269         int i_increment;
270
271         p_in = p_pic->p[i_plane].p_pixels
272                    + i_field * p_pic->p[i_plane].i_pitch;
273
274         p_out = p_outpic->p[i_plane].p_pixels;
275         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
276                              * p_outpic->p[i_plane].i_visible_lines;
277
278         switch( p_filter->fmt_in.video.i_chroma )
279         {
280         case VLC_CODEC_I420:
281         case VLC_CODEC_J420:
282         case VLC_CODEC_YV12:
283
284             for( ; p_out < p_out_end ; )
285             {
286                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
287
288                 p_out += p_outpic->p[i_plane].i_pitch;
289                 p_in += 2 * p_pic->p[i_plane].i_pitch;
290             }
291             break;
292
293         case VLC_CODEC_I422:
294         case VLC_CODEC_J422:
295
296             i_increment = 2 * p_pic->p[i_plane].i_pitch;
297
298             if( i_plane == Y_PLANE )
299             {
300                 for( ; p_out < p_out_end ; )
301                 {
302                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
303                     p_out += p_outpic->p[i_plane].i_pitch;
304                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
305                     p_out += p_outpic->p[i_plane].i_pitch;
306                     p_in += i_increment;
307                 }
308             }
309             else
310             {
311                 for( ; p_out < p_out_end ; )
312                 {
313                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
314                     p_out += p_outpic->p[i_plane].i_pitch;
315                     p_in += i_increment;
316                 }
317             }
318             break;
319
320         default:
321             break;
322         }
323     }
324 }
325
326 /*****************************************************************************
327  * RenderBob: renders a BOB picture - simple copy
328  *****************************************************************************/
329 static void RenderBob( filter_t *p_filter,
330                        picture_t *p_outpic, picture_t *p_pic, int i_field )
331 {
332     int i_plane;
333
334     /* Copy image and skip lines */
335     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
336     {
337         uint8_t *p_in, *p_out_end, *p_out;
338
339         p_in = p_pic->p[i_plane].p_pixels;
340         p_out = p_outpic->p[i_plane].p_pixels;
341         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
342                              * p_outpic->p[i_plane].i_visible_lines;
343
344         switch( p_filter->fmt_in.video.i_chroma )
345         {
346             case VLC_CODEC_I420:
347             case VLC_CODEC_J420:
348             case VLC_CODEC_YV12:
349                 /* For BOTTOM field we need to add the first line */
350                 if( i_field == 1 )
351                 {
352                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
353                     p_in += p_pic->p[i_plane].i_pitch;
354                     p_out += p_outpic->p[i_plane].i_pitch;
355                 }
356
357                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
358
359                 for( ; p_out < p_out_end ; )
360                 {
361                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
362
363                     p_out += p_outpic->p[i_plane].i_pitch;
364
365                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
366
367                     p_in += 2 * p_pic->p[i_plane].i_pitch;
368                     p_out += p_outpic->p[i_plane].i_pitch;
369                 }
370
371                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
372
373                 /* For TOP field we need to add the last line */
374                 if( i_field == 0 )
375                 {
376                     p_in += p_pic->p[i_plane].i_pitch;
377                     p_out += p_outpic->p[i_plane].i_pitch;
378                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
379                 }
380                 break;
381
382             case VLC_CODEC_I422:
383             case VLC_CODEC_J422:
384                 /* For BOTTOM field we need to add the first line */
385                 if( i_field == 1 )
386                 {
387                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
388                     p_in += p_pic->p[i_plane].i_pitch;
389                     p_out += p_outpic->p[i_plane].i_pitch;
390                 }
391
392                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
393
394                 if( i_plane == Y_PLANE )
395                 {
396                     for( ; p_out < p_out_end ; )
397                     {
398                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
399
400                         p_out += p_outpic->p[i_plane].i_pitch;
401
402                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
403
404                         p_in += 2 * p_pic->p[i_plane].i_pitch;
405                         p_out += p_outpic->p[i_plane].i_pitch;
406                     }
407                 }
408                 else
409                 {
410                     for( ; p_out < p_out_end ; )
411                     {
412                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
413
414                         p_out += p_outpic->p[i_plane].i_pitch;
415                         p_in += 2 * p_pic->p[i_plane].i_pitch;
416                     }
417                 }
418
419                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
420
421                 /* For TOP field we need to add the last line */
422                 if( i_field == 0 )
423                 {
424                     p_in += p_pic->p[i_plane].i_pitch;
425                     p_out += p_outpic->p[i_plane].i_pitch;
426                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
427                 }
428                 break;
429         }
430     }
431 }
432
/* Shorthands used by the Render* functions; both require a variable named
 * p_filter in scope.
 *  - Merge expands to the pf_merge callback of the filter state.
 *  - EndMerge expands to an unbraced `if( callback ) callback`, to be
 *    completed by the caller as `EndMerge();`. Because of the bare `if`, do
 *    not place it directly before an `else` or as the body of another
 *    unbraced `if`. */
433 #define Merge p_filter->p_sys->pf_merge
434 #define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
435
436 /*****************************************************************************
437  * RenderLinear: BOB with linear interpolation
438  *****************************************************************************/
439 static void RenderLinear( filter_t *p_filter,
440                           picture_t *p_outpic, picture_t *p_pic, int i_field )
441 {
442     int i_plane;
443
444     /* Copy image and skip lines */
445     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
446     {
447         uint8_t *p_in, *p_out_end, *p_out;
448
449         p_in = p_pic->p[i_plane].p_pixels;
450         p_out = p_outpic->p[i_plane].p_pixels;
451         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
452                              * p_outpic->p[i_plane].i_visible_lines;
453
454         /* For BOTTOM field we need to add the first line */
455         if( i_field == 1 )
456         {
457             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
458             p_in += p_pic->p[i_plane].i_pitch;
459             p_out += p_outpic->p[i_plane].i_pitch;
460         }
461
462         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
463
464         for( ; p_out < p_out_end ; )
465         {
466             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
467
468             p_out += p_outpic->p[i_plane].i_pitch;
469
470             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
471                    p_pic->p[i_plane].i_pitch );
472
473             p_in += 2 * p_pic->p[i_plane].i_pitch;
474             p_out += p_outpic->p[i_plane].i_pitch;
475         }
476
477         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
478
479         /* For TOP field we need to add the last line */
480         if( i_field == 0 )
481         {
482             p_in += p_pic->p[i_plane].i_pitch;
483             p_out += p_outpic->p[i_plane].i_pitch;
484             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
485         }
486     }
487     EndMerge();
488 }
489
490 static void RenderMean( filter_t *p_filter,
491                         picture_t *p_outpic, picture_t *p_pic )
492 {
493     int i_plane;
494
495     /* Copy image and skip lines */
496     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
497     {
498         uint8_t *p_in, *p_out_end, *p_out;
499
500         p_in = p_pic->p[i_plane].p_pixels;
501
502         p_out = p_outpic->p[i_plane].p_pixels;
503         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
504                              * p_outpic->p[i_plane].i_visible_lines;
505
506         /* All lines: mean value */
507         for( ; p_out < p_out_end ; )
508         {
509             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
510                    p_pic->p[i_plane].i_pitch );
511
512             p_out += p_outpic->p[i_plane].i_pitch;
513             p_in += 2 * p_pic->p[i_plane].i_pitch;
514         }
515     }
516     EndMerge();
517 }
518
519 static void RenderBlend( filter_t *p_filter,
520                          picture_t *p_outpic, picture_t *p_pic )
521 {
522     int i_plane;
523
524     /* Copy image and skip lines */
525     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
526     {
527         uint8_t *p_in, *p_out_end, *p_out;
528
529         p_in = p_pic->p[i_plane].p_pixels;
530
531         p_out = p_outpic->p[i_plane].p_pixels;
532         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
533                              * p_outpic->p[i_plane].i_visible_lines;
534
535         switch( p_filter->fmt_in.video.i_chroma )
536         {
537             case VLC_CODEC_I420:
538             case VLC_CODEC_J420:
539             case VLC_CODEC_YV12:
540                 /* First line: simple copy */
541                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
542                 p_out += p_outpic->p[i_plane].i_pitch;
543
544                 /* Remaining lines: mean value */
545                 for( ; p_out < p_out_end ; )
546                 {
547                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
548                            p_pic->p[i_plane].i_pitch );
549
550                     p_out += p_outpic->p[i_plane].i_pitch;
551                     p_in += p_pic->p[i_plane].i_pitch;
552                 }
553                 break;
554
555             case VLC_CODEC_I422:
556             case VLC_CODEC_J422:
557                 /* First line: simple copy */
558                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
559                 p_out += p_outpic->p[i_plane].i_pitch;
560
561                 /* Remaining lines: mean value */
562                 if( i_plane == Y_PLANE )
563                 {
564                     for( ; p_out < p_out_end ; )
565                     {
566                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
567                                p_pic->p[i_plane].i_pitch );
568
569                         p_out += p_outpic->p[i_plane].i_pitch;
570                         p_in += p_pic->p[i_plane].i_pitch;
571                     }
572                 }
573
574                 else
575                 {
576                     for( ; p_out < p_out_end ; )
577                     {
578                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
579                                p_pic->p[i_plane].i_pitch );
580
581                         p_out += p_outpic->p[i_plane].i_pitch;
582                         p_in += 2*p_pic->p[i_plane].i_pitch;
583                     }
584                 }
585                 break;
586         }
587     }
588     EndMerge();
589 }
590
591 #undef Merge
592
/**
 * Portable C line merge: writes the byte-wise average of two buffers.
 *
 * \param _p_dest destination buffer, at least i_bytes long
 * \param _p_s1   first source buffer, at least i_bytes long
 * \param _p_s2   second source buffer, at least i_bytes long
 * \param i_bytes number of bytes to process; any value including 0 is valid
 *
 * Each output byte is (s1 + s2) >> 1, i.e. the average rounded down.
 */
static void MergeGeneric( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Count bytes instead of comparing against "end - 8": that pointer lies
     * before the buffer whenever fewer than 8 bytes are requested. */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
620
#if defined(CAN_COMPILE_MMXEXT)
/**
 * MMXEXT line merge: byte-wise average of two buffers, 8 bytes at a time.
 *
 * \param _p_dest destination buffer, at least i_bytes long
 * \param _p_s1   first source buffer, at least i_bytes long
 * \param _p_s2   second source buffer, at least i_bytes long
 * \param i_bytes number of bytes to process; any value including 0 is valid
 *
 * The SIMD part uses pavgb (average rounded up); the scalar tail computes
 * (s1 + s2) >> 1 (rounded down). MMX state is left dirty: the caller has to
 * issue emms afterwards (see EndMMX).
 */
static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Strictly more than 8 bytes left, exactly as before, so the bytes that
     * go through the rounding-up SIMD path are unchanged. Counting bytes
     * avoids forming a pointer before the buffer for short runs. */
    for( ; i_bytes > 8; i_bytes -= 8 )
    {
        /* The operands only name one byte each while 8 are accessed, hence
         * the "memory" clobber; mm1 is used as scratch. */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2)
                                               : "mm1", "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
#endif
649
#if defined(CAN_COMPILE_3DNOW)
/**
 * 3DNow! line merge: byte-wise average of two buffers, 8 bytes at a time.
 *
 * \param _p_dest destination buffer, at least i_bytes long
 * \param _p_s1   first source buffer, at least i_bytes long
 * \param _p_s2   second source buffer, at least i_bytes long
 * \param i_bytes number of bytes to process; any value including 0 is valid
 *
 * The SIMD part uses pavgusb; the scalar tail computes (s1 + s2) >> 1.
 * MMX state is left dirty: the caller has to issue femms afterwards (see
 * End3DNow).
 */
static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                        size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Strictly more than 8 bytes left, exactly as before. Counting bytes
     * avoids forming a pointer before the buffer for short runs. */
    for( ; i_bytes > 8; i_bytes -= 8 )
    {
        /* The operands only name one byte each while 8 are accessed, hence
         * the "memory" clobber; mm1 is used as scratch. */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2)
                                               : "mm1", "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
#endif
678
#if defined(CAN_COMPILE_SSE)
/**
 * SSE2 line merge: byte-wise average of two buffers, 16 bytes at a time.
 *
 * \param _p_dest destination buffer, at least i_bytes long
 * \param _p_s1   first source buffer, at least i_bytes long
 * \param _p_s2   second source buffer, at least i_bytes long
 * \param i_bytes number of bytes to process; any value including 0 is valid
 *
 * Exactly i_bytes bytes are written. The SIMD part uses pavgb (average
 * rounded up); the scalar head and tail compute (s1 + s2) >> 1 (rounded
 * down).
 */
static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                       size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Scalar head until the first source is 16-byte aligned. The bytes
     * handled here are deducted from the budget and the loop stops when the
     * budget is exhausted; otherwise the function writes past the end. */
    for( ; i_bytes > 0 && ( (uintptr_t)p_s1 % 16 ); i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;

    /* Strictly more than 16 bytes left, matching the original loop bound
     * for an aligned source. */
    for( ; i_bytes > 16; i_bytes -= 16 )
    {
        /* The operands only name one byte each while 16 are accessed, hence
         * the "memory" clobber; xmm1 is used as scratch. */
        __asm__  __volatile__( "movdqu %2,%%xmm1;"
                               "pavgb %1, %%xmm1;"
                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2)
                                               : "xmm1", "memory" );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
#endif
712
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/**
 * Issues the emms instruction; has the signature of the pf_end_merge
 * callback.
 */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" : : );
}
#endif
719
#if defined(CAN_COMPILE_3DNOW)
/**
 * Issues the femms instruction; has the signature of the pf_end_merge
 * callback.
 */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" : : );
}
#endif
726
#ifdef CAN_COMPILE_C_ALTIVEC
/**
 * AltiVec line merge: byte-wise average of two buffers, 16 bytes at a time.
 *
 * \param _p_dest destination buffer, at least i_bytes long
 * \param _p_s1   first source buffer, at least i_bytes long
 * \param _p_s2   second source buffer, at least i_bytes long
 * \param i_bytes number of bytes to process; any value including 0 is valid
 *
 * A scalar head runs until the destination is 16-byte aligned, then whole
 * vectors are averaged with vec_avg, then a scalar tail finishes the line.
 * The scalar parts compute (s1 + s2) >> 1.
 */
static void MergeAltivec( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;
    uint8_t *const p_last = p_dest + i_bytes; /* one past the last byte */

    /* Use C until the first 16-bytes aligned destination pixel, but never
     * beyond the requested length. */
    while( p_dest < p_last && ( (uintptr_t)p_dest & 0xF ) )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    /* Vector part, entered only if at least one full vector remains. */
    if( (size_t)( p_last - p_dest ) >= 16 )
    {
        if( ( (uintptr_t)p_s1 | (uintptr_t)p_s2 ) & 0xF )
        {
            /* Unaligned source: rebuild each vector from two aligned loads */
            vector unsigned char s1v, s2v, destv;
            vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
            vector unsigned char perm1v, perm2v;

            perm1v = vec_lvsl( 0, p_s1 );
            perm2v = vec_lvsl( 0, p_s2 );
            s1oldv = vec_ld( 0, p_s1 );
            s2oldv = vec_ld( 0, p_s2 );

            while( (size_t)( p_last - p_dest ) >= 16 )
            {
                s1newv = vec_ld( 16, p_s1 );
                s2newv = vec_ld( 16, p_s2 );
                s1v    = vec_perm( s1oldv, s1newv, perm1v );
                s2v    = vec_perm( s2oldv, s2newv, perm2v );
                s1oldv = s1newv;
                s2oldv = s2newv;
                destv  = vec_avg( s1v, s2v );
                vec_st( destv, 0, p_dest );

                p_s1   += 16;
                p_s2   += 16;
                p_dest += 16;
            }
        }
        else
        {
            /* Aligned source */
            vector unsigned char s1v, s2v, destv;

            while( (size_t)( p_last - p_dest ) >= 16 )
            {
                s1v   = vec_ld( 0, p_s1 );
                s2v   = vec_ld( 0, p_s2 );
                destv = vec_avg( s1v, s2v );
                vec_st( destv, 0, p_dest );

                p_s1   += 16;
                p_s2   += 16;
                p_dest += 16;
            }
        }
    }

    /* Scalar tail */
    while( p_dest < p_last )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
796
#ifdef __ARM_NEON__
/**
 * ARM NEON line merge: byte-wise halving add of two buffers.
 *
 * \param out destination buffer, at least n bytes long
 * \param in1 first source buffer, at least n bytes long
 * \param in2 second source buffer, at least n bytes long
 * \param n   number of bytes to process; any value including 0 is valid
 *
 * A scalar head (MergeGeneric) brings the destination to a 16-byte boundary,
 * the NEON loop then handles 64 bytes per pass, and MergeGeneric finishes
 * the remainder.
 */
static void MergeNEON (void *restrict out, const void *in1,
                       const void *in2, size_t n)
{
    uint8_t *outp = out;
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;
    /* Bytes needed to REACH the next 16-byte boundary (not the offset past
     * the previous one), limited to what was requested. */
    size_t mis = (16 - (((uintptr_t)outp) & 15)) & 15;

    if (mis > n)
        mis = n;
    if (mis)
    {
        MergeGeneric (outp, in1p, in2p, mis);
        outp += mis;
        in1p += mis;
        in2p += mis;
        n -= mis;
    }

    /* Each pass below loads and stores 4 q-registers = 64 bytes, so the
     * vector part may only cover a multiple of 64 bytes. */
    uint8_t *end = outp + (n & ~(size_t)63);

    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        while (outp < end)
            __asm__ volatile (
                "vld1.u8  {q0-q1}, [%[in1]]!\n"
                "vld1.u8  {q2-q3}, [%[in2]]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1]]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2]]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");
    else
        while (outp < end)
            __asm__ volatile (
                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");

    /* Whatever is left (fewer than 64 bytes) goes through the C version. */
    n &= 63;
    if (n)
        MergeGeneric (outp, in1p, in2p, n);
}
#endif
856
857 /*****************************************************************************
858  * RenderX: This algo works on an 8x8 block basis; it copies the top field
859  * and applies a process to recreate the bottom field:
860  *  If a 8x8 block is classified as :
861  *   - progressive: it applies a small blend (1,6,1)
862  *   - interlaced:
863  *    * in the MMX version: we do a ME between the 2 fields, if there is a
864  *    good match we use MC to recreate the bottom field (with a small
865  *    blend (1,6,1) )
866  *    * otherwise: it recreates the bottom field by an edge oriented
867  *    interpolation.
868   *****************************************************************************/
869
/* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
 * XXX: It needs access to an 8x10 area.
 * We use more than 8 lines to help with scrolling (text)
 * (and because XDeint8x8Frame uses line 9).
 * XXX: detection does not work well on smooth/uniform areas with noise,
 * but it's not really a problem because they don't have much detail anyway.
 */
/* Square of a value; callers pass a pixel difference to accumulate a sum of
 * squared differences. */
static inline int ssd( int a )
{
    const int i_square = a * a;
    return i_square;
}
/* Plain C interlacing detector for one 8-pixel-wide block.
 *
 * src points to the top-left pixel of the block and i_src is the line pitch.
 * Four overlapping groups of 4 lines are examined (starting on lines 0, 2, 4
 * and 6), so lines 0..9 are read.
 * Returns 1 if at least one group looks interlaced, 0 otherwise.
 */
static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
{
    int i_hits = 0; /* number of line groups classified as interlaced */

    for( int i_group = 0; i_group < 4; i_group++ )
    {
        const uint8_t *p_l0 = &src[(2 * i_group) * i_src];
        const uint8_t *p_l1 = p_l0 + i_src;
        const uint8_t *p_l2 = p_l1 + i_src;
        const uint8_t *p_l3 = p_l2 + i_src;

        int i_frame = 0; /* SSD between adjacent lines (l0/l1 and l1/l2) */
        int i_field = 0; /* SSD between lines two apart (l0/l2 and l1/l3) */

        for( int x = 0; x < 8; x++ )
        {
            i_frame += ssd( p_l0[x] - p_l1[x] ) + ssd( p_l1[x] - p_l2[x] );
            i_field += ssd( p_l0[x] - p_l2[x] ) + ssd( p_l1[x] - p_l3[x] );
        }

        /* Interlaced when lines two apart match clearly better than adjacent
         * lines, and the adjacent-line difference is not just noise. */
        if( i_frame > 32 && i_field < 6 * i_frame / 8 )
            i_hits++;
    }

    return i_hits >= 1;
}
904 #ifdef CAN_COMPILE_MMXEXT
905 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
906 {
907
908     int y, x;
909     int32_t ff, fr;
910     int fc;
911
912     /* Detect interlacing */
913     fc = 0;
914     pxor_r2r( mm7, mm7 );
915     for( y = 0; y < 9; y += 2 )
916     {
917         ff = fr = 0;
918         pxor_r2r( mm5, mm5 );
919         pxor_r2r( mm6, mm6 );
920         for( x = 0; x < 8; x+=4 )
921         {
922             movd_m2r( src[        x], mm0 );
923             movd_m2r( src[1*i_src+x], mm1 );
924             movd_m2r( src[2*i_src+x], mm2 );
925             movd_m2r( src[3*i_src+x], mm3 );
926
927             punpcklbw_r2r( mm7, mm0 );
928             punpcklbw_r2r( mm7, mm1 );
929             punpcklbw_r2r( mm7, mm2 );
930             punpcklbw_r2r( mm7, mm3 );
931
932             movq_r2r( mm0, mm4 );
933
934             psubw_r2r( mm1, mm0 );
935             psubw_r2r( mm2, mm4 );
936
937             psubw_r2r( mm1, mm2 );
938             psubw_r2r( mm1, mm3 );
939
940             pmaddwd_r2r( mm0, mm0 );
941             pmaddwd_r2r( mm4, mm4 );
942             pmaddwd_r2r( mm2, mm2 );
943             pmaddwd_r2r( mm3, mm3 );
944             paddd_r2r( mm0, mm2 );
945             paddd_r2r( mm4, mm3 );
946             paddd_r2r( mm2, mm5 );
947             paddd_r2r( mm3, mm6 );
948         }
949
950         movq_r2r( mm5, mm0 );
951         psrlq_i2r( 32, mm0 );
952         paddd_r2r( mm0, mm5 );
953         movd_r2m( mm5, fr );
954
955         movq_r2r( mm6, mm0 );
956         psrlq_i2r( 32, mm0 );
957         paddd_r2r( mm0, mm6 );
958         movd_r2m( mm6, ff );
959
960         if( ff < 6*fr/8 && fr > 32 )
961             fc++;
962
963         src += 2*i_src;
964     }
965     return fc;
966 }
967 #endif
968
/* Rebuild an 8x8 progressive block from its two fields.
 *
 * src1/i_src1 walk the lines copied to the even output lines, src2/i_src2
 * walk the lines blended into the odd output lines. Each odd output line is
 * the (1,6,1)/8 blend of the src1 line above, the src2 line and the next
 * src1 line, with rounding. Five src1 lines and four src2 lines are read.
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = &src1[i_pair * i_src1];
        const uint8_t *p_below = p_above + i_src1;
        const uint8_t *p_mid   = &src2[i_pair * i_src2];
        uint8_t *p_even = &dst[(2 * i_pair) * i_dst];
        uint8_t *p_odd  = p_even + i_dst;

        /* Even line: straight copy */
        memcpy( p_even, p_above, 8 );

        /* Odd line: (1,6,1) vertical blend */
        for( int x = 0; x < 8; x++ )
            p_odd[x] = ( p_above[x] + 6 * p_mid[x] + p_below[x] + 4 ) >> 3;
    }
}
989
990 #ifdef CAN_COMPILE_MMXEXT
991 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
992                                          uint8_t *src1, int i_src1,
993                                          uint8_t *src2, int i_src2 )
994 {
995     static const uint64_t m_4 = INT64_C(0x0004000400040004);
996     int y, x;
997
998     /* Progressive */
999     pxor_r2r( mm7, mm7 );
1000     for( y = 0; y < 8; y += 2 )
1001     {
1002         for( x = 0; x < 8; x +=4 )
1003         {
1004             movd_m2r( src1[x], mm0 );
1005             movd_r2m( mm0, dst[x] );
1006
1007             movd_m2r( src2[x], mm1 );
1008             movd_m2r( src1[i_src1+x], mm2 );
1009
1010             punpcklbw_r2r( mm7, mm0 );
1011             punpcklbw_r2r( mm7, mm1 );
1012             punpcklbw_r2r( mm7, mm2 );
1013             paddw_r2r( mm1, mm1 );
1014             movq_r2r( mm1, mm3 );
1015             paddw_r2r( mm3, mm3 );
1016             paddw_r2r( mm2, mm0 );
1017             paddw_r2r( mm3, mm1 );
1018             paddw_m2r( m_4, mm1 );
1019             paddw_r2r( mm1, mm0 );
1020             psraw_i2r( 3, mm0 );
1021             packuswb_r2r( mm7, mm0 );
1022             movd_r2m( mm0, dst[i_dst+x] );
1023         }
1024         dst += 2*i_dst;
1025         src1 += i_src1;
1026         src2 += i_src2;
1027     }
1028 }
1029
1030 #endif
1031
/* Debug helper: fill an 8x8 block (line pitch i_dst) with the value v. */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_line = 0;

    while( i_line < 8 )
    {
        uint8_t *p_line = dst + i_line * i_dst;
        memset( p_line, v, 8 );
        i_line++;
    }
}
1039
/* XDeint8x8FieldE: simple (1,0,1) field interpolation for blocks that are
 * missing a horizontal neighbour.
 *
 * Even output lines are copies of source lines 0, 2, 4, 6; each odd output
 * line is the truncating average of the source lines just above and below it
 * from the same field (so source lines 0..8 of the even field are read:
 * 8x9 pixels).
 * TODO: a better one for the inner part.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = &src[(2 * i_pair) * i_src];
        const uint8_t *p_below = p_above + 2 * i_src;
        uint8_t *p_even = &dst[(2 * i_pair) * i_dst];
        uint8_t *p_odd  = p_even + i_dst;

        /* Kept field line */
        memcpy( p_even, p_above, 8 );

        /* Missing line: mean of the two surrounding kept lines */
        for( int x = 0; x < 8; x++ )
            p_odd[x] = ( p_above[x] + p_below[x] ) >> 1;
    }
}
1062 #ifdef CAN_COMPILE_MMXEXT
1063 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1064                                           uint8_t *src, int i_src )
1065 {
1066     int y;
1067
1068     /* Interlaced */
1069     for( y = 0; y < 8; y += 2 )
1070     {
1071         movq_m2r( src[0], mm0 );
1072         movq_r2m( mm0, dst[0] );
1073         dst += i_dst;
1074
1075         movq_m2r( src[2*i_src], mm1 );
1076         pavgb_r2r( mm1, mm0 );
1077
1078         movq_r2m( mm0, dst[0] );
1079
1080         dst += 1*i_dst;
1081         src += 2*i_src;
1082     }
1083 }
1084 #endif
1085
/* XDeint8x8Field: edge-oriented interpolation of the odd lines of an 8x8
 * block (needs 4 pixels to the left, 5 pixels to the right and 1 extra line).
 *
 * Even output lines are copies of source lines 0, 2, 4, 6. For each pixel of
 * an odd output line, three 8-pixel sums of absolute differences between the
 * kept line above and the kept line below are compared:
 *   c0: line above matched against the line below shifted by +2 pixels,
 *   c1: no shift,
 *   c2: line above matched against the line below shifted by -2 pixels.
 * The pixel is then averaged along the direction that matches best, falling
 * back to the vertical average when no diagonal is clearly better.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = &src[(2 * i_pair) * i_src];
        const uint8_t *p_below = p_above + 2 * i_src;
        uint8_t *p_even = &dst[(2 * i_pair) * i_dst];
        uint8_t *p_odd  = p_even + i_dst;

        /* Kept field line */
        memcpy( p_even, p_above, 8 );

        for( int x = 0; x < 8; x++ )
        {
            /* 8 pixels are used just to match the MMX version, but it is
             * overkill: 5 would be enough (less isn't good) */
            int c0 = 0, c1 = 0, c2 = 0;
            for( int t = 0; t < 8; t++ )
            {
                c0 += abs( p_above[x - 4 + t] - p_below[x - 2 + t] );
                c1 += abs( p_above[x - 3 + t] - p_below[x - 3 + t] );
                c2 += abs( p_above[x - 2 + t] - p_below[x - 4 + t] );
            }

            /* Horizontal offsets of the two pixels to average */
            int i_off_above = 0;
            int i_off_below = 0;
            if( c0 < c1 && c1 <= c2 )
            {
                i_off_above = -1;
                i_off_below = +1;
            }
            else if( c2 < c1 && c1 <= c0 )
            {
                i_off_above = +1;
                i_off_below = -1;
            }

            p_odd[x] = ( p_above[x + i_off_above] +
                         p_below[x + i_off_below] ) >> 1;
        }
    }
}
1132 #ifdef CAN_COMPILE_MMXEXT
1133 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1134                                          uint8_t *src, int i_src )
1135 {
1136     int y, x;
1137
1138     /* Interlaced */
1139     for( y = 0; y < 8; y += 2 )
1140     {
1141         memcpy( dst, src, 8 );
1142         dst += i_dst;
1143
1144         for( x = 0; x < 8; x++ )
1145         {
1146             uint8_t *src2 = &src[2*i_src];
1147             int32_t c0, c1, c2;
1148
1149             movq_m2r( src[x-2], mm0 );
1150             movq_m2r( src[x-3], mm1 );
1151             movq_m2r( src[x-4], mm2 );
1152
1153             psadbw_m2r( src2[x-4], mm0 );
1154             psadbw_m2r( src2[x-3], mm1 );
1155             psadbw_m2r( src2[x-2], mm2 );
1156
1157             movd_r2m( mm0, c2 );
1158             movd_r2m( mm1, c1 );
1159             movd_r2m( mm2, c0 );
1160
1161             if( c0 < c1 && c1 <= c2 )
1162                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1163             else if( c2 < c1 && c1 <= c0 )
1164                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1165             else
1166                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1167         }
1168
1169         dst += 1*i_dst;
1170         src += 2*i_src;
1171     }
1172 }
1173 #endif
1174
/* NxN: arbitrary block size (these functions only use pixels inside the NxN
 * block) */
/* Interlacing detector for a block of arbitrary size.
 *
 * Note the parameter order: i_height comes BEFORE i_width.
 * The squared differences between adjacent lines and between lines two apart
 * are accumulated over the whole block (they are not reset per line pair);
 * after each pair of lines the running totals are compared.
 * Returns 1 if at least two comparisons looked interlaced, 0 otherwise.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    int i_frame = 0;    /* running SSD, adjacent lines */
    int i_field = 0;    /* running SSD, lines two apart */
    int i_hits  = 0;

    for( int y = 0; y + 2 < i_height; y += 2 )
    {
        const uint8_t *p_l0 = &src[y * i_src];
        const uint8_t *p_l1 = p_l0 + i_src;
        const uint8_t *p_l2 = p_l1 + i_src;

        for( int x = 0; x < i_width; x++ )
        {
            i_frame += ssd( p_l0[x] - p_l1[x] );
            i_field += ssd( p_l0[x] - p_l2[x] );
        }

        if( i_frame > i_width / 2 && i_field < i_frame )
            i_hits++;
    }

    return i_hits >= 2;
}
1203
/* Progressive reconstruction of a block of arbitrary size.
 *
 * Even lines are copied; each odd line is the rounded (1,2,1)/4 blend of the
 * source lines above, at and below it. The last odd line of the block, which
 * has no line below it inside the block, is the truncating average of the
 * source line above and the source line itself.
 *
 * When i_height is odd the final even line has no odd partner inside the
 * block: it is copied and nothing else is written, so neither dst nor src is
 * accessed at line index i_height.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Progressive */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );

        /* Odd height: the following line is outside the block */
        if( y + 1 >= i_height )
            break;
        dst += i_dst;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1231
/* Interlaced reconstruction of a block of arbitrary size.
 *
 * Even lines are copied; each odd line is the truncating average of the even
 * source lines above and below it. The last odd line of the block, which has
 * no even line below it inside the block, is the average of the source line
 * above and the source line itself.
 *
 * When i_height is odd the final even line has no odd partner inside the
 * block: it is copied and nothing else is written, so neither dst nor src is
 * accessed at line index i_height.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Interlaced */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );

        /* Odd height: the following line is outside the block */
        if( y + 1 >= i_height )
            break;
        dst += i_dst;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1259
/* Deinterlace a block of arbitrary size (i_width x i_height): run the simple
 * detector, then rebuild the block with the field (interlaced) or frame
 * (progressive) method accordingly.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect() takes (height, width), in that order. Passing them
     * the other way round made it scan a transposed area, reading lines or
     * columns that are not part of the block. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1268
1269
/* Return the median (middle value) of three integers. */
static inline int median( int a, int b, int c )
{
    /* Order a and b first */
    const int i_low  = ( b < a ) ? b : a;
    const int i_high = ( b < a ) ? a : b;

    /* Then clamp c into [i_low, i_high] */
    if( c < i_low )
        return i_low;
    if( c > i_high )
        return i_high;
    return c;
}
1285
1286
/* XDeintBand8x8: deinterlace one horizontal band of 8 lines (C version).
 *
 * The band is made of i_mbx full 8x8 blocks followed by an optional partial
 * block of i_modx columns. Each full block is classified; interlaced blocks
 * are rebuilt with the edge-oriented interpolator, except the first and last
 * full blocks which use the simple (1,0,1) one, and progressive blocks are
 * rebuilt by blending the two fields. The partial block goes through the
 * generic NxN path.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive: even lines from one field, odd from the other */
            XDeint8x8MergeC( dst, i_dst,
                             src,         2*i_src,
                             src + i_src, 2*i_src );
            continue;
        }

        /* Interlaced: border blocks lack the horizontal margin required by
         * the edge-oriented interpolation */
        const bool b_border = ( i_mb == 0 || i_mb == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    /* Remaining columns on the right */
    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1319 #ifdef CAN_COMPILE_MMXEXT
/* MMXEXT version of XDeintBand8x8C: deinterlace one horizontal band of
 * 8 lines made of i_mbx full 8x8 blocks plus an optional partial block of
 * i_modx columns (the partial block is handled by the C NxN path).
 *
 * The caller is responsible for emms().
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive: even lines from one field, odd from the other */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  src,         2*i_src,
                                  src + i_src, 2*i_src );
            continue;
        }

        /* Interlaced: border blocks lack the horizontal margin required by
         * the edge-oriented interpolation */
        const bool b_border = ( i_mb == 0 || i_mb == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    /* Remaining columns on the right */
    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1351 #endif
1352
1353 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1354 {
1355     int i_plane;
1356
1357     /* Copy image and skip lines */
1358     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1359     {
1360         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1361         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1362
1363         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1364         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1365
1366         const int i_dst = p_outpic->p[i_plane].i_pitch;
1367         const int i_src = p_pic->p[i_plane].i_pitch;
1368
1369         int y, x;
1370
1371         for( y = 0; y < i_mby; y++ )
1372         {
1373             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1374             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1375
1376 #ifdef CAN_COMPILE_MMXEXT
1377             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1378                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1379             else
1380 #endif
1381                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1382         }
1383
1384         /* Last line (C only)*/
1385         if( i_mody )
1386         {
1387             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1388             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1389
1390             for( x = 0; x < i_mbx; x++ )
1391             {
1392                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1393
1394                 dst += 8;
1395                 src += 8;
1396             }
1397
1398             if( i_modx )
1399                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1400         }
1401     }
1402
1403 #ifdef CAN_COMPILE_MMXEXT
1404     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1405         emms();
1406 #endif
1407 }
1408
1409 /*****************************************************************************
1410  * Yadif (Yet Another DeInterlacing Filter).
1411  *****************************************************************************/
1412 /* */
/* Settings passed to the yadif line filter (see RenderYadif, which fills
 * one of these for every filtered line). */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only & 0x02 has meaning, as we do the & 0x01 ourselves.
     */
    int mode;
};
1424
/* Integer type named x86_reg, defined just before yadif.h is included
 * (presumably used by that header -- TODO confirm).
 * NOTE(review): the original author was unsure intptr_t is the right type. */
typedef intptr_t x86_reg;

/* FF*-named min/max/abs helpers mapped onto the VLC __MIN/__MAX macros
 * (presumably also for yadif.h -- TODO confirm).
 * Being macros, they may evaluate their arguments more than once. */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)
1433
1434 /* yadif.h comes from vf_yadif.c of mplayer project */
1435 #include "yadif.h"
1436
1437 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1438 {
1439     filter_sys_t *p_sys = p_filter->p_sys;
1440
1441     /* */
1442     assert( i_order == 0 || i_order == 1 );
1443     assert( i_field == 0 || i_field == 1 );
1444
1445     if( i_order == 0 )
1446     {
1447         /* Duplicate the picture
1448          * TODO when the vout rework is finished, picture_Hold() might be enough
1449          * but becarefull, the pitches must match */
1450         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1451         if( p_dup )
1452             picture_Copy( p_dup, p_src );
1453
1454         /* Slide the history */
1455         if( p_sys->pp_history[0] )
1456             picture_Release( p_sys->pp_history[0]  );
1457         for( int i = 1; i < HISTORY_SIZE; i++ )
1458             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1459         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1460     }
1461
1462     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1463     picture_t *p_prev = p_sys->pp_history[0];
1464     picture_t *p_cur  = p_sys->pp_history[1];
1465     picture_t *p_next = p_sys->pp_history[2];
1466
1467     /* Filter if we have all the pictures we need */
1468     if( p_prev && p_cur && p_next )
1469     {
1470         /* */
1471         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1472 #if defined(HAVE_YADIF_SSE2)
1473         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1474             filter = yadif_filter_line_mmx2;
1475         else
1476 #endif
1477             filter = yadif_filter_line_c;
1478
1479         for( int n = 0; n < p_dst->i_planes; n++ )
1480         {
1481             const plane_t *prevp = &p_prev->p[n];
1482             const plane_t *curp  = &p_cur->p[n];
1483             const plane_t *nextp = &p_next->p[n];
1484             plane_t *dstp        = &p_dst->p[n];
1485
1486             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1487             {
1488                 if( (y % 2) == i_field )
1489                 {
1490                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1491                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1492                 }
1493                 else
1494                 {
1495                     struct vf_priv_s cfg;
1496                     /* Spatial checks only when enough data */
1497                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1498
1499                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1500                     filter( &cfg,
1501                             &dstp->p_pixels[y * dstp->i_pitch],
1502                             &prevp->p_pixels[y * prevp->i_pitch],
1503                             &curp->p_pixels[y * curp->i_pitch],
1504                             &nextp->p_pixels[y * nextp->i_pitch],
1505                             dstp->i_visible_pitch,
1506                             curp->i_pitch,
1507                             (i_field ^ (i_order == i_field)) & 1 );
1508                 }
1509
1510                 /* We duplicate the first and last lines */
1511                 if( y == 1 )
1512                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1513                 else if( y == dstp->i_visible_lines - 2 )
1514                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1515             }
1516         }
1517
1518         /* */
1519         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1520         return VLC_SUCCESS;
1521     }
1522     else if( !p_prev && !p_cur && p_next )
1523     {
1524         RenderX( p_dst, p_next );
1525         return VLC_SUCCESS;
1526     }
1527     else
1528     {
1529         return VLC_EGENERIC;
1530     }
1531 }
1532
1533 /*****************************************************************************
1534  * video filter2 functions
1535  *****************************************************************************/
1536 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1537 {
1538     filter_sys_t *p_sys = p_filter->p_sys;
1539     picture_t *p_pic_dst;
1540
1541     /* Request output picture */
1542     p_pic_dst = filter_NewPicture( p_filter );
1543     if( p_pic_dst == NULL )
1544     {
1545         picture_Release( p_pic );
1546         return NULL;
1547     }
1548
1549     picture_CopyProperties( p_pic_dst, p_pic );
1550
1551     switch( p_sys->i_mode )
1552     {
1553         case DEINTERLACE_DISCARD:
1554             RenderDiscard( p_filter, p_pic_dst, p_pic, 0 );
1555             break;
1556
1557         case DEINTERLACE_BOB:
1558 #if 0
1559             RenderBob( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1560             RenderBob( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1561             break;
1562 #endif
1563
1564         case DEINTERLACE_LINEAR:
1565 #if 0
1566             RenderLinear( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1567             RenderLinear( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1568 #endif
1569             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1570             goto drop;
1571
1572         case DEINTERLACE_MEAN:
1573             RenderMean( p_filter, p_pic_dst, p_pic );
1574             break;
1575
1576         case DEINTERLACE_BLEND:
1577             RenderBlend( p_filter, p_pic_dst, p_pic );
1578             break;
1579
1580         case DEINTERLACE_X:
1581             RenderX( p_pic_dst, p_pic );
1582             break;
1583
1584         case DEINTERLACE_YADIF:
1585             if( RenderYadif( p_filter, p_pic_dst, p_pic, 0, 0 ) )
1586                 goto drop;
1587             break;
1588
1589         case DEINTERLACE_YADIF2X:
1590             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1591             //RenderYadif( p_vout, pp_outpic[0], p_pic, 0, !p_pic->b_top_field_first );
1592             //RenderYadif( p_vout, pp_outpic[1], p_pic, 1, p_pic->b_top_field_first );
1593             goto drop;
1594     }
1595
1596     p_pic_dst->b_progressive = true;
1597
1598     picture_Release( p_pic );
1599     return p_pic_dst;
1600
1601 drop:
1602     picture_Release( p_pic_dst );
1603     picture_Release( p_pic );
1604     return NULL;
1605 }
1606
1607 static void Flush( filter_t *p_filter )
1608 {
1609     filter_sys_t *p_sys = p_filter->p_sys;
1610
1611     for( int i = 0; i < HISTORY_SIZE; i++ )
1612     {
1613         if( p_sys->pp_history[i] )
1614             picture_Release( p_sys->pp_history[i] );
1615         p_sys->pp_history[i] = NULL;
1616     }
1617 }
1618
1619 static int Mouse( filter_t *p_filter,
1620                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1621 {
1622     *p_mouse = *p_new;
1623     if( p_filter->p_sys->b_half_height )
1624         p_mouse->i_y *= 2;
1625     return VLC_SUCCESS;
1626 }
1627
1628
1629 /*****************************************************************************
1630  * Open
1631  *****************************************************************************/
/* Module open callback: set up the deinterlace filter.
 *
 * Returns VLC_EGENERIC if the input chroma is not supported or if the output
 * format would differ from the input while the filter is not allowed to
 * change it, VLC_ENOMEM if the private data cannot be allocated, and
 * VLC_SUCCESS otherwise.
 */
static int Open( vlc_object_t *p_this )
{
    filter_t *p_filter = (filter_t*)p_this;
    filter_sys_t *p_sys;

    if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
        return VLC_EGENERIC;

    /* */
    p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
    if( !p_sys )
        return VLC_ENOMEM;

    /* Initial settings; SetFilterMethod() below presumably overrides them
     * according to the configured mode -- TODO confirm */
    p_sys->i_mode = DEINTERLACE_BLEND;
    p_sys->b_double_rate = false;
    p_sys->b_half_height = true;
    for( int i = 0; i < HISTORY_SIZE; i++ )
        p_sys->pp_history[i] = NULL;

    /* Pick the line-merge routine. Each #if section below ends with a
     * dangling "else", so the sections compiled in form one if/else-if
     * chain that ends with the generic C fallback. */
#if defined(CAN_COMPILE_C_ALTIVEC)
    if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
    {
        p_sys->pf_merge = MergeAltivec;
        p_sys->pf_end_merge = NULL;
    }
    else
#endif
#if defined(CAN_COMPILE_SSE)
    if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
    {
        p_sys->pf_merge = MergeSSE2;
        p_sys->pf_end_merge = EndMMX;
    }
    else
#endif
#if defined(CAN_COMPILE_MMXEXT)
    if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
    {
        p_sys->pf_merge = MergeMMXEXT;
        p_sys->pf_end_merge = EndMMX;
    }
    else
#endif
#if defined(CAN_COMPILE_3DNOW)
    if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
    {
        p_sys->pf_merge = Merge3DNow;
        p_sys->pf_end_merge = End3DNow;
    }
    else
#endif
#if defined __ARM_NEON__
    if( vlc_CPU() & CPU_CAPABILITY_NEON )
    {
        p_sys->pf_merge = MergeNEON;
        p_sys->pf_end_merge = NULL;
    }
    else
#endif
    {
        p_sys->pf_merge = MergeGeneric;
        p_sys->pf_end_merge = NULL;
    }

    /* Parse the filter options and apply the requested mode */
    config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
                       p_filter->p_cfg );

    char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
    SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
    free( psz_mode );

    /* Compute the output format; refuse to start if it differs from the
     * input in chroma or height and the filter may not change it */
    video_format_t fmt;
    GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
    if( !p_filter->b_allow_fmt_out_change &&
        ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
          fmt.i_height != p_filter->fmt_in.video.i_height ) )
    {
        /* Close() releases the history and frees p_sys */
        Close( VLC_OBJECT(p_filter) );
        return VLC_EGENERIC;
    }
    p_filter->fmt_out.video = fmt;
    p_filter->fmt_out.i_codec = fmt.i_chroma;
    p_filter->pf_video_filter = Deinterlace;
    p_filter->pf_video_flush  = Flush;
    p_filter->pf_video_mouse  = Mouse;

    msg_Dbg( p_filter, "deinterlacing" );

    return VLC_SUCCESS;
}
1724
1725 /*****************************************************************************
1726  * Close: clean up the filter
1727  *****************************************************************************/
1728 static void Close( vlc_object_t *p_this )
1729 {
1730     filter_t *p_filter = (filter_t*)p_this;
1731
1732     Flush( p_filter );
1733     free( p_filter->p_sys );
1734 }
1735