]> git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c
atmo: fix assertion issue with picture_release
[vlc] / modules / video_filter / deinterlace.c
1 /*****************************************************************************
2  * deinterlace.c : deinterlacer plugin for vlc
3  *****************************************************************************
4  * Copyright (C) 2000-2009 the VideoLAN team
5  * $Id$
6  *
7  * Author: Sam Hocevar <sam@zoy.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23
24 /*****************************************************************************
25  * Preamble
26  *****************************************************************************/
27
28 #ifdef HAVE_CONFIG_H
29 # include "config.h"
30 #endif
31
32 #include <assert.h>
33
34 #ifdef HAVE_ALTIVEC_H
35 #   include <altivec.h>
36 #endif
37
38 #include <vlc_common.h>
39 #include <vlc_plugin.h>
40 #include <vlc_filter.h>
41 #include <vlc_cpu.h>
42
43 #ifdef CAN_COMPILE_MMXEXT
44 #   include "mmx.h"
45 #endif
46
/* Identifiers of the deinterlacing algorithms. SetFilterMethod() stores one
 * of these in filter_sys_t.i_mode, and GetOutputFormat() switches on it. */
#define DEINTERLACE_DISCARD 1
#define DEINTERLACE_MEAN    2
#define DEINTERLACE_BLEND   3
#define DEINTERLACE_BOB     4
#define DEINTERLACE_LINEAR  5
#define DEINTERLACE_X       6
#define DEINTERLACE_YADIF   7
#define DEINTERLACE_YADIF2X 8
55
56 /*****************************************************************************
57  * Module descriptor
58  *****************************************************************************/
/* Module entry points, registered through set_callbacks() below. */
static int  Open ( vlc_object_t * );
static void Close( vlc_object_t * );

/* Option labels. NOTE(review): only the SOUT_* pair is referenced by the
 * descriptor below; MODE_TEXT/MODE_LONGTEXT are not used in the visible
 * part of this file. */
#define MODE_TEXT N_("Deinterlace mode")
#define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")

#define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
#define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")

/* Prefix prepended to every option name of this filter. */
#define FILTER_CFG_PREFIX "sout-deinterlace-"

/* Accepted values of the "mode" option and their display names.
 * The two arrays are parallel: entry i of mode_list_text labels entry i
 * of mode_list, so keep them in the same order. */
static const char *const mode_list[] = {
    "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
static const char *const mode_list_text[] = {
    N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };

vlc_module_begin ()
    set_description( N_("Deinterlacing video filter") )
    set_shortname( N_("Deinterlace" ))
    set_capability( "video filter2", 0 )
    set_category( CAT_VIDEO )
    set_subcategory( SUBCAT_VIDEO_VFILTER )

    /* "sout-deinterlace-mode": string option, defaults to "blend" and is
     * restricted to the values listed in mode_list. */
    add_string( FILTER_CFG_PREFIX "mode", "blend", SOUT_MODE_TEXT,
                SOUT_MODE_LONGTEXT, false )
        change_string_list( mode_list, mode_list_text, 0 )
        change_safe ()
    add_shortcut( "deinterlace" )
    set_callbacks( Open, Close )
vlc_module_end ()
89
90
91 /*****************************************************************************
 * Local prototypes
93  *****************************************************************************/
/* Per-mode renderers. The two picture arguments are, in order, the output
 * picture and the input picture; the trailing int of RenderDiscard,
 * RenderBob and RenderLinear is the field index (0 = top, 1 = bottom). */
static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
static void RenderMean   ( filter_t *, picture_t *, picture_t * );
static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
static void RenderX      ( picture_t *, picture_t * );
static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );

/* Line-merge routines: write the per-byte average of two source lines to
 * the destination. Arguments are (dest, src1, src2, byte count). All
 * variants share this signature so that any of them can be stored in
 * filter_sys_t.pf_merge. */
static void MergeGeneric ( void *, const void *, const void *, size_t );
#if defined(CAN_COMPILE_C_ALTIVEC)
static void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_MMXEXT)
static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void Merge3DNow   ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_SSE)
static void MergeSSE2    ( void *, const void *, const void *, size_t );
#endif
/* Epilogues that leave the MMX / 3DNow! state; their signature matches
 * filter_sys_t.pf_end_merge. */
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
static void EndMMX       ( void );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void End3DNow     ( void );
#endif
#if defined __ARM_NEON__
static void MergeNEON (void *, const void *, const void *, size_t);
#endif
124
/* NULL-terminated list of the option names (without FILTER_CFG_PREFIX)
 * understood by this filter. */
static const char *const ppsz_filter_options[] = {
    "mode", NULL
};
128
/* Number of pictures kept in filter_sys_t.pp_history. */
#define HISTORY_SIZE (3)

/* Private state of one deinterlace filter instance. */
struct filter_sys_t
{
    int  i_mode;        /* Deinterlace mode (one of DEINTERLACE_*) */
    bool b_double_rate; /* Shall we double the framerate? */
    bool b_half_height; /* Shall we divide the height by 2? */

    /* Line-merge routine: (dest, src1, src2, byte count). */
    void (*pf_merge) ( void *, const void *, const void *, size_t );
    /* Optional epilogue run after a batch of merges; may be NULL
     * (the EndMerge macro tests it before calling). */
    void (*pf_end_merge) ( void );

    mtime_t i_last_date;

    /* Yadif */
    picture_t *pp_history[HISTORY_SIZE];
};
144
145 /*****************************************************************************
146  * SetFilterMethod: setup the deinterlace method to use.
147  *****************************************************************************/
148 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
149 {
150     filter_sys_t *p_sys = p_filter->p_sys;
151
152     if( !psz_method )
153         psz_method = "";
154
155     if( !strcmp( psz_method, "mean" ) )
156     {
157         p_sys->i_mode = DEINTERLACE_MEAN;
158         p_sys->b_double_rate = false;
159         p_sys->b_half_height = true;
160     }
161     else if( !strcmp( psz_method, "bob" )
162              || !strcmp( psz_method, "progressive-scan" ) )
163     {
164         p_sys->i_mode = DEINTERLACE_BOB;
165         p_sys->b_double_rate = true;
166         p_sys->b_half_height = false;
167     }
168     else if( !strcmp( psz_method, "linear" ) )
169     {
170         p_sys->i_mode = DEINTERLACE_LINEAR;
171         p_sys->b_double_rate = true;
172         p_sys->b_half_height = false;
173     }
174     else if( !strcmp( psz_method, "x" ) )
175     {
176         p_sys->i_mode = DEINTERLACE_X;
177         p_sys->b_double_rate = false;
178         p_sys->b_half_height = false;
179     }
180     else if( !strcmp( psz_method, "yadif" ) )
181     {
182         p_sys->i_mode = DEINTERLACE_YADIF;
183         p_sys->b_double_rate = false;
184         p_sys->b_half_height = false;
185     }
186     else if( !strcmp( psz_method, "yadif2x" ) )
187     {
188         p_sys->i_mode = DEINTERLACE_YADIF2X;
189         p_sys->b_double_rate = true;
190         p_sys->b_half_height = false;
191     }
192     else if( !strcmp( psz_method, "discard" ) )
193     {
194         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
195                             i_chroma == VLC_CODEC_J422;
196
197         p_sys->i_mode = DEINTERLACE_DISCARD;
198         p_sys->b_double_rate = false;
199         p_sys->b_half_height = !b_i422;
200     }
201     else
202     {
203         if( strcmp( psz_method, "blend" ) )
204             msg_Err( p_filter,
205                      "no valid deinterlace mode provided, using \"blend\"" );
206
207         p_sys->i_mode = DEINTERLACE_BLEND;
208         p_sys->b_double_rate = false;
209         p_sys->b_half_height = false;
210     }
211
212     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
213 }
214
215 static void GetOutputFormat( filter_t *p_filter,
216                              video_format_t *p_dst, const video_format_t *p_src )
217 {
218     filter_sys_t *p_sys = p_filter->p_sys;
219     *p_dst = *p_src;
220
221     if( p_sys->b_half_height )
222     {
223         p_dst->i_height /= 2;
224         p_dst->i_visible_height /= 2;
225         p_dst->i_y_offset /= 2;
226         p_dst->i_sar_den *= 2;
227     }
228
229     if( p_src->i_chroma == VLC_CODEC_I422 ||
230         p_src->i_chroma == VLC_CODEC_J422 )
231     {
232         switch( p_sys->i_mode )
233         {
234         case DEINTERLACE_MEAN:
235         case DEINTERLACE_LINEAR:
236         case DEINTERLACE_X:
237         case DEINTERLACE_YADIF:
238         case DEINTERLACE_YADIF2X:
239             p_dst->i_chroma = p_src->i_chroma;
240             break;
241         default:
242             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
243                                                                   VLC_CODEC_J420;
244             break;
245         }
246     }
247 }
248
249 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
250 {
251     return i_chroma == VLC_CODEC_I420 ||
252            i_chroma == VLC_CODEC_J420 ||
253            i_chroma == VLC_CODEC_YV12 ||
254            i_chroma == VLC_CODEC_I422 ||
255            i_chroma == VLC_CODEC_J422;
256 }
257
258 /*****************************************************************************
259  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
260  *****************************************************************************/
261 static void RenderDiscard( filter_t *p_filter,
262                            picture_t *p_outpic, picture_t *p_pic, int i_field )
263 {
264     int i_plane;
265
266     /* Copy image and skip lines */
267     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
268     {
269         uint8_t *p_in, *p_out_end, *p_out;
270         int i_increment;
271
272         p_in = p_pic->p[i_plane].p_pixels
273                    + i_field * p_pic->p[i_plane].i_pitch;
274
275         p_out = p_outpic->p[i_plane].p_pixels;
276         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
277                              * p_outpic->p[i_plane].i_visible_lines;
278
279         switch( p_filter->fmt_in.video.i_chroma )
280         {
281         case VLC_CODEC_I420:
282         case VLC_CODEC_J420:
283         case VLC_CODEC_YV12:
284
285             for( ; p_out < p_out_end ; )
286             {
287                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
288
289                 p_out += p_outpic->p[i_plane].i_pitch;
290                 p_in += 2 * p_pic->p[i_plane].i_pitch;
291             }
292             break;
293
294         case VLC_CODEC_I422:
295         case VLC_CODEC_J422:
296
297             i_increment = 2 * p_pic->p[i_plane].i_pitch;
298
299             if( i_plane == Y_PLANE )
300             {
301                 for( ; p_out < p_out_end ; )
302                 {
303                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
304                     p_out += p_outpic->p[i_plane].i_pitch;
305                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
306                     p_out += p_outpic->p[i_plane].i_pitch;
307                     p_in += i_increment;
308                 }
309             }
310             else
311             {
312                 for( ; p_out < p_out_end ; )
313                 {
314                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
315                     p_out += p_outpic->p[i_plane].i_pitch;
316                     p_in += i_increment;
317                 }
318             }
319             break;
320
321         default:
322             break;
323         }
324     }
325 }
326
327 /*****************************************************************************
328  * RenderBob: renders a BOB picture - simple copy
329  *****************************************************************************/
330 static void RenderBob( filter_t *p_filter,
331                        picture_t *p_outpic, picture_t *p_pic, int i_field )
332 {
333     int i_plane;
334
335     /* Copy image and skip lines */
336     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
337     {
338         uint8_t *p_in, *p_out_end, *p_out;
339
340         p_in = p_pic->p[i_plane].p_pixels;
341         p_out = p_outpic->p[i_plane].p_pixels;
342         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
343                              * p_outpic->p[i_plane].i_visible_lines;
344
345         switch( p_filter->fmt_in.video.i_chroma )
346         {
347             case VLC_CODEC_I420:
348             case VLC_CODEC_J420:
349             case VLC_CODEC_YV12:
350                 /* For BOTTOM field we need to add the first line */
351                 if( i_field == 1 )
352                 {
353                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
354                     p_in += p_pic->p[i_plane].i_pitch;
355                     p_out += p_outpic->p[i_plane].i_pitch;
356                 }
357
358                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
359
360                 for( ; p_out < p_out_end ; )
361                 {
362                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
363
364                     p_out += p_outpic->p[i_plane].i_pitch;
365
366                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
367
368                     p_in += 2 * p_pic->p[i_plane].i_pitch;
369                     p_out += p_outpic->p[i_plane].i_pitch;
370                 }
371
372                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
373
374                 /* For TOP field we need to add the last line */
375                 if( i_field == 0 )
376                 {
377                     p_in += p_pic->p[i_plane].i_pitch;
378                     p_out += p_outpic->p[i_plane].i_pitch;
379                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
380                 }
381                 break;
382
383             case VLC_CODEC_I422:
384             case VLC_CODEC_J422:
385                 /* For BOTTOM field we need to add the first line */
386                 if( i_field == 1 )
387                 {
388                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
389                     p_in += p_pic->p[i_plane].i_pitch;
390                     p_out += p_outpic->p[i_plane].i_pitch;
391                 }
392
393                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
394
395                 if( i_plane == Y_PLANE )
396                 {
397                     for( ; p_out < p_out_end ; )
398                     {
399                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
400
401                         p_out += p_outpic->p[i_plane].i_pitch;
402
403                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
404
405                         p_in += 2 * p_pic->p[i_plane].i_pitch;
406                         p_out += p_outpic->p[i_plane].i_pitch;
407                     }
408                 }
409                 else
410                 {
411                     for( ; p_out < p_out_end ; )
412                     {
413                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
414
415                         p_out += p_outpic->p[i_plane].i_pitch;
416                         p_in += 2 * p_pic->p[i_plane].i_pitch;
417                     }
418                 }
419
420                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
421
422                 /* For TOP field we need to add the last line */
423                 if( i_field == 0 )
424                 {
425                     p_in += p_pic->p[i_plane].i_pitch;
426                     p_out += p_outpic->p[i_plane].i_pitch;
427                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
428                 }
429                 break;
430         }
431     }
432 }
433
/* Shorthands for the merge routine and its optional epilogue stored in the
 * filter's private data. Both expansions name a variable called p_filter,
 * so they can only be used where one is in scope.
 * NOTE: EndMerge expands to an unbraced `if`; write it only as a standalone
 * `EndMerge();` statement, never as the direct body of an if/else. */
#define Merge p_filter->p_sys->pf_merge
#define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
436
437 /*****************************************************************************
438  * RenderLinear: BOB with linear interpolation
439  *****************************************************************************/
440 static void RenderLinear( filter_t *p_filter,
441                           picture_t *p_outpic, picture_t *p_pic, int i_field )
442 {
443     int i_plane;
444
445     /* Copy image and skip lines */
446     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
447     {
448         uint8_t *p_in, *p_out_end, *p_out;
449
450         p_in = p_pic->p[i_plane].p_pixels;
451         p_out = p_outpic->p[i_plane].p_pixels;
452         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
453                              * p_outpic->p[i_plane].i_visible_lines;
454
455         /* For BOTTOM field we need to add the first line */
456         if( i_field == 1 )
457         {
458             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
459             p_in += p_pic->p[i_plane].i_pitch;
460             p_out += p_outpic->p[i_plane].i_pitch;
461         }
462
463         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
464
465         for( ; p_out < p_out_end ; )
466         {
467             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
468
469             p_out += p_outpic->p[i_plane].i_pitch;
470
471             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
472                    p_pic->p[i_plane].i_pitch );
473
474             p_in += 2 * p_pic->p[i_plane].i_pitch;
475             p_out += p_outpic->p[i_plane].i_pitch;
476         }
477
478         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
479
480         /* For TOP field we need to add the last line */
481         if( i_field == 0 )
482         {
483             p_in += p_pic->p[i_plane].i_pitch;
484             p_out += p_outpic->p[i_plane].i_pitch;
485             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
486         }
487     }
488     EndMerge();
489 }
490
491 static void RenderMean( filter_t *p_filter,
492                         picture_t *p_outpic, picture_t *p_pic )
493 {
494     int i_plane;
495
496     /* Copy image and skip lines */
497     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
498     {
499         uint8_t *p_in, *p_out_end, *p_out;
500
501         p_in = p_pic->p[i_plane].p_pixels;
502
503         p_out = p_outpic->p[i_plane].p_pixels;
504         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
505                              * p_outpic->p[i_plane].i_visible_lines;
506
507         /* All lines: mean value */
508         for( ; p_out < p_out_end ; )
509         {
510             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
511                    p_pic->p[i_plane].i_pitch );
512
513             p_out += p_outpic->p[i_plane].i_pitch;
514             p_in += 2 * p_pic->p[i_plane].i_pitch;
515         }
516     }
517     EndMerge();
518 }
519
520 static void RenderBlend( filter_t *p_filter,
521                          picture_t *p_outpic, picture_t *p_pic )
522 {
523     int i_plane;
524
525     /* Copy image and skip lines */
526     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
527     {
528         uint8_t *p_in, *p_out_end, *p_out;
529
530         p_in = p_pic->p[i_plane].p_pixels;
531
532         p_out = p_outpic->p[i_plane].p_pixels;
533         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
534                              * p_outpic->p[i_plane].i_visible_lines;
535
536         switch( p_filter->fmt_in.video.i_chroma )
537         {
538             case VLC_CODEC_I420:
539             case VLC_CODEC_J420:
540             case VLC_CODEC_YV12:
541                 /* First line: simple copy */
542                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
543                 p_out += p_outpic->p[i_plane].i_pitch;
544
545                 /* Remaining lines: mean value */
546                 for( ; p_out < p_out_end ; )
547                 {
548                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
549                            p_pic->p[i_plane].i_pitch );
550
551                     p_out += p_outpic->p[i_plane].i_pitch;
552                     p_in += p_pic->p[i_plane].i_pitch;
553                 }
554                 break;
555
556             case VLC_CODEC_I422:
557             case VLC_CODEC_J422:
558                 /* First line: simple copy */
559                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
560                 p_out += p_outpic->p[i_plane].i_pitch;
561
562                 /* Remaining lines: mean value */
563                 if( i_plane == Y_PLANE )
564                 {
565                     for( ; p_out < p_out_end ; )
566                     {
567                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
568                                p_pic->p[i_plane].i_pitch );
569
570                         p_out += p_outpic->p[i_plane].i_pitch;
571                         p_in += p_pic->p[i_plane].i_pitch;
572                     }
573                 }
574
575                 else
576                 {
577                     for( ; p_out < p_out_end ; )
578                     {
579                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
580                                p_pic->p[i_plane].i_pitch );
581
582                         p_out += p_outpic->p[i_plane].i_pitch;
583                         p_in += 2*p_pic->p[i_plane].i_pitch;
584                     }
585                 }
586                 break;
587         }
588     }
589     EndMerge();
590 }
591
592 #undef Merge
593
/*
 * MergeGeneric: portable C merge routine.
 *
 * Writes to _p_dest the per-byte truncated average, (a + b) >> 1, of the
 * i_bytes bytes at _p_s1 and _p_s2. Bytes are processed in increasing
 * address order. Any length is valid, including 0 and lengths below 8:
 * the loop is index based, so no pointer outside the buffers is ever formed.
 */
static void MergeGeneric( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;

    for( size_t i = 0; i < i_bytes; i++ )
        p_dest[i] = ( (uint16_t)p_s1[i] + (uint16_t)p_s2[i] ) >> 1;
}
621
622 #if defined(CAN_COMPILE_MMXEXT)
/*
 * MergeMMXEXT: merge routine using the MMXEXT pavgb instruction.
 *
 * Writes to _p_dest the per-byte average of the i_bytes bytes at _p_s1 and
 * _p_s2. While more than 8 bytes remain they are averaged 8 at a time with
 * pavgb; the last 1..8 bytes are averaged in C as (a + b) >> 1.
 * Any length is valid, including lengths below 8. The MMX state is left
 * dirty: the caller is expected to run the matching epilogue (EndMMX).
 */
static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;

    /* Strictly more than 8: a final group of exactly 8 bytes is left to the
     * C loop below, as this routine has always done. */
    for( ; i_bytes > 8; i_bytes -= 8 )
    {
        /* The operands are typed as single bytes but the instructions read
         * and write 8; the "memory" clobber tells the compiler so. */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2)
                                                : "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    for( ; i_bytes > 0; i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
649 #endif
650
651 #if defined(CAN_COMPILE_3DNOW)
/*
 * Merge3DNow: merge routine using the 3DNow! pavgusb instruction.
 *
 * Writes to _p_dest the per-byte average of the i_bytes bytes at _p_s1 and
 * _p_s2. While more than 8 bytes remain they are averaged 8 at a time with
 * pavgusb; the last 1..8 bytes are averaged in C as (a + b) >> 1.
 * Any length is valid, including lengths below 8. The MMX state is left
 * dirty: the caller is expected to run the matching epilogue (End3DNow).
 */
static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                        size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;

    /* Strictly more than 8: a final group of exactly 8 bytes is left to the
     * C loop below, as this routine has always done. */
    for( ; i_bytes > 8; i_bytes -= 8 )
    {
        /* The operands are typed as single bytes but the instructions read
         * and write 8; the "memory" clobber tells the compiler so. */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2)
                                                : "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    for( ; i_bytes > 0; i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
678 #endif
679
680 #if defined(CAN_COMPILE_SSE)
/*
 * MergeSSE2: merge routine using the SSE2 pavgb instruction.
 *
 * Writes to _p_dest the per-byte average of exactly i_bytes bytes taken
 * from _p_s1 and _p_s2:
 *  1. bytes are averaged in C until _p_s1 reaches a 16-byte boundary
 *     (or the data runs out);
 *  2. while more than 16 bytes remain they are averaged 16 at a time with
 *     unaligned loads/stores (movdqu) and pavgb;
 *  3. the last 1..16 bytes are averaged in C.
 * The bytes consumed in step 1 are deducted from the remaining length, so
 * nothing past the end of the buffers is read or written whatever the
 * alignment of _p_s1. The C steps compute (a + b) >> 1.
 */
static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                       size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;

    /* Head: bring p_s1 to a 16-byte boundary, never past the end. */
    for( ; i_bytes > 0 && (uintptr_t)p_s1 % 16; i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    /* Strictly more than 16: a final group of exactly 16 bytes is left to
     * the C loop below, as this routine has always done. */
    for( ; i_bytes > 16; i_bytes -= 16 )
    {
        /* The operands are typed as single bytes but the instructions read
         * and write 16; the "memory" clobber tells the compiler so. */
        __asm__  __volatile__( "movdqu %2,%%xmm1;"
                               "pavgb %1, %%xmm1;"
                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2)
                                                : "memory" );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    /* Tail */
    for( ; i_bytes > 0; i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
712 #endif
713
714 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/* EndMMX: merge epilogue for the MMXEXT/SSE builds. Executes `emms`, which
 * releases the MMX registers so that x87 floating point code can run again.
 * The signature matches filter_sys_t.pf_end_merge. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" );
}
719 #endif
720
721 #if defined(CAN_COMPILE_3DNOW)
/* End3DNow: merge epilogue for the 3DNow! build. Executes `femms`, the
 * 3DNow! instruction that leaves the MMX state.
 * The signature matches filter_sys_t.pf_end_merge. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" );
}
726 #endif
727
728 #ifdef CAN_COMPILE_C_ALTIVEC
/*
 * MergeAltivec: merge routine using AltiVec vec_avg.
 *
 * Writes to _p_dest the per-byte average of exactly i_bytes bytes taken
 * from _p_s1 and _p_s2:
 *  1. bytes are averaged in C until the destination reaches a 16-byte
 *     boundary (or the data runs out);
 *  2. while at least 16 bytes remain they are averaged 16 at a time, with
 *     a permute-based path when either source is not 16-byte aligned;
 *  3. the remaining bytes are averaged in C.
 * The C steps compute (a + b) >> 1. Any length is valid, including lengths
 * shorter than the distance to the first aligned destination byte.
 */
static void MergeAltivec( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t *const p_stop = p_dest + i_bytes;

    /* Use C until the first 16-bytes aligned destination pixel */
    while( p_dest < p_stop && ( (uintptr_t)p_dest & 0xF ) )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    if( ( (uintptr_t)p_s1 | (uintptr_t)p_s2 ) & 0xF )
    {
        /* Unaligned source: each 16-byte group is rebuilt from two aligned
         * loads with a permute. */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        while( p_stop - p_dest >= 16 )
        {
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v    = vec_perm( s1oldv, s1newv, perm1v );
            s2v    = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            destv  = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source */
        vector unsigned char s1v, s2v, destv;

        while( p_stop - p_dest >= 16 )
        {
            s1v   = vec_ld( 0, p_s1 );
            s2v   = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }

    /* Tail */
    while( p_dest < p_stop )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
796 #endif
797
798 #ifdef __ARM_NEON__
799 static void MergeNEON (void *restrict out, const void *in1,
800                        const void *in2, size_t n)
801 {
802     uint8_t *outp = out;
803     const uint8_t *in1p = in1;
804     const uint8_t *in2p = in2;
805     size_t mis = ((uintptr_t)outp) & 15;
806
807     if (mis)
808     {
809         MergeGeneric (outp, in1p, in2p, mis);
810         outp += mis;
811         in1p += mis;
812         in2p += mis;
813         n -= mis;
814     }
815
816     uint8_t *end = outp + (n & ~15);
817
818     if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
819         while (outp < end)
820             asm volatile (
821                 "vld1.u8  {q0-q1}, [%[in1]]!\n"
822                 "vld1.u8  {q2-q3}, [%[in2]]!\n"
823                 "vhadd.u8 q4, q0, q2\n"
824                 "vld1.u8  {q6-q7}, [%[in1]]!\n"
825                 "vhadd.u8 q5, q1, q3\n"
826                 "vld1.u8  {q8-q9}, [%[in2]]!\n"
827                 "vhadd.u8 q10, q6, q8\n"
828                 "vhadd.u8 q11, q7, q9\n"
829                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
830                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
831                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
832                 :
833                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
834                   "q8", "q9", "q10", "q11", "memory");
835     else
836          while (outp < end)
837             asm volatile (
838                 "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
839                 "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
840                 "vhadd.u8 q4, q0, q2\n"
841                 "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
842                 "vhadd.u8 q5, q1, q3\n"
843                 "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
844                 "vhadd.u8 q10, q6, q8\n"
845                 "vhadd.u8 q11, q7, q9\n"
846                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
847                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
848                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
849                 :
850                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
851                   "q8", "q9", "q10", "q11", "memory");
852     n &= 15;
853     if (n)
854         MergeGeneric (outp, in1p, in2p, n);
855 }
856 #endif
857
/*****************************************************************************
 * RenderX: This algorithm works on an 8x8 block basis. It copies the top
 * field and applies a process to recreate the bottom field:
 *  If an 8x8 block is classified as:
 *   - progressive: it applies a small blend (1,6,1)
 *   - interlaced:
 *    * in the MMX version: we do a ME between the 2 fields; if there is a
 *    good match we use MC to recreate the bottom field (with a small
 *    blend (1,6,1) )
 *    * otherwise: it recreates the bottom field by an edge oriented
 *    interpolation.
 *****************************************************************************/
870
/* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
 * XXX: It needs access to an 8x10 area.
 * We use more than 8 lines to help with scrolling (text)
 * (and because XDeint8x8Frame uses line 9).
 * XXX: detection doesn't work well on smooth/uniform areas with noise,
 * but it's not really a problem because they don't have much detail anyway.
 */
/* ssd: square of a value; callers pass a pixel difference to get a
 * squared-difference term. */
static inline int ssd( int a )
{
    const int i_square = a * a;
    return i_square;
}
/* XDeint8x8DetectC: plain C interlacing detector for one 8-pixel-wide block.
 *
 * src points at the top-left pixel of the block and i_src is the distance in
 * bytes between two lines. Four line pairs are examined; each pair reads the
 * four lines starting at its first line, so lines 0..9 are read in total.
 *
 * Returns 1 when at least one line pair looks interlaced, 0 otherwise.
 */
static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
{
    int i_combed = 0; /* number of line pairs classified as interlaced */

    for( int i_pair = 0; i_pair < 4; i_pair++, src += 2*i_src )
    {
        const uint8_t *p_l0 = src;
        const uint8_t *p_l1 = src + 1*i_src;
        const uint8_t *p_l2 = src + 2*i_src;
        const uint8_t *p_l3 = src + 3*i_src;
        int i_frame = 0; /* squared differences between adjacent lines */
        int i_field = 0; /* squared differences between lines two apart */

        for( int i_col = 0; i_col < 8; i_col++ )
        {
            i_frame += ssd( p_l0[i_col] - p_l1[i_col] ) +
                       ssd( p_l1[i_col] - p_l2[i_col] );
            i_field += ssd( p_l0[i_col] - p_l2[i_col] ) +
                       ssd( p_l1[i_col] - p_l3[i_col] );
        }

        /* Interlaced when lines two apart match clearly better than
         * adjacent lines, and the adjacent-line difference is not tiny. */
        if( i_frame > 32 && i_field < 6*i_frame/8 )
            i_combed++;
    }

    return i_combed >= 1;
}
905 #ifdef CAN_COMPILE_MMXEXT
906 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
907 {
908
909     int y, x;
910     int32_t ff, fr;
911     int fc;
912
913     /* Detect interlacing */
914     fc = 0;
915     pxor_r2r( mm7, mm7 );
916     for( y = 0; y < 9; y += 2 )
917     {
918         ff = fr = 0;
919         pxor_r2r( mm5, mm5 );
920         pxor_r2r( mm6, mm6 );
921         for( x = 0; x < 8; x+=4 )
922         {
923             movd_m2r( src[        x], mm0 );
924             movd_m2r( src[1*i_src+x], mm1 );
925             movd_m2r( src[2*i_src+x], mm2 );
926             movd_m2r( src[3*i_src+x], mm3 );
927
928             punpcklbw_r2r( mm7, mm0 );
929             punpcklbw_r2r( mm7, mm1 );
930             punpcklbw_r2r( mm7, mm2 );
931             punpcklbw_r2r( mm7, mm3 );
932
933             movq_r2r( mm0, mm4 );
934
935             psubw_r2r( mm1, mm0 );
936             psubw_r2r( mm2, mm4 );
937
938             psubw_r2r( mm1, mm2 );
939             psubw_r2r( mm1, mm3 );
940
941             pmaddwd_r2r( mm0, mm0 );
942             pmaddwd_r2r( mm4, mm4 );
943             pmaddwd_r2r( mm2, mm2 );
944             pmaddwd_r2r( mm3, mm3 );
945             paddd_r2r( mm0, mm2 );
946             paddd_r2r( mm4, mm3 );
947             paddd_r2r( mm2, mm5 );
948             paddd_r2r( mm3, mm6 );
949         }
950
951         movq_r2r( mm5, mm0 );
952         psrlq_i2r( 32, mm0 );
953         paddd_r2r( mm0, mm5 );
954         movd_r2m( mm5, fr );
955
956         movq_r2r( mm6, mm0 );
957         psrlq_i2r( 32, mm0 );
958         paddd_r2r( mm0, mm6 );
959         movd_r2m( mm6, ff );
960
961         if( ff < 6*fr/8 && fr > 32 )
962             fc++;
963
964         src += 2*i_src;
965     }
966     return fc;
967 }
968 #endif
969
/* XDeint8x8MergeC: rebuild an 8x8 block treated as progressive.
 *
 * For each of the 4 output line pairs, the first line is a copy of the
 * current src1 line and the second one is the (1,6,1) blend
 * (src1 line + 6 * src2 line + next src1 line + 4) >> 3.
 * i_dst, i_src1 and i_src2 are the byte steps between successive lines of
 * dst, src1 and src2; src1 is read one step past its 4th line.
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_next = src1 + i_src1;
        uint8_t *p_blend = dst + i_dst;

        /* Kept line */
        memcpy( dst, src1, 8 );

        /* Blended line */
        for( int i_col = 0; i_col < 8; i_col++ )
        {
            const int i_sum = src1[i_col] + 6*src2[i_col] + p_next[i_col] + 4;
            p_blend[i_col] = i_sum >> 3;
        }

        dst  += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
990
991 #ifdef CAN_COMPILE_MMXEXT
992 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
993                                          uint8_t *src1, int i_src1,
994                                          uint8_t *src2, int i_src2 )
995 {
996     static const uint64_t m_4 = INT64_C(0x0004000400040004);
997     int y, x;
998
999     /* Progressive */
1000     pxor_r2r( mm7, mm7 );
1001     for( y = 0; y < 8; y += 2 )
1002     {
1003         for( x = 0; x < 8; x +=4 )
1004         {
1005             movd_m2r( src1[x], mm0 );
1006             movd_r2m( mm0, dst[x] );
1007
1008             movd_m2r( src2[x], mm1 );
1009             movd_m2r( src1[i_src1+x], mm2 );
1010
1011             punpcklbw_r2r( mm7, mm0 );
1012             punpcklbw_r2r( mm7, mm1 );
1013             punpcklbw_r2r( mm7, mm2 );
1014             paddw_r2r( mm1, mm1 );
1015             movq_r2r( mm1, mm3 );
1016             paddw_r2r( mm3, mm3 );
1017             paddw_r2r( mm2, mm0 );
1018             paddw_r2r( mm3, mm1 );
1019             paddw_m2r( m_4, mm1 );
1020             paddw_r2r( mm1, mm0 );
1021             psraw_i2r( 3, mm0 );
1022             packuswb_r2r( mm7, mm0 );
1023             movd_r2m( mm0, dst[i_dst+x] );
1024         }
1025         dst += 2*i_dst;
1026         src1 += i_src1;
1027         src2 += i_src2;
1028     }
1029 }
1030
1031 #endif
1032
/* For debug: fill an 8x8 block with the value v (i_dst bytes between lines) */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_row = 0;

    while( i_row < 8 )
    {
        memset( dst + i_row * i_dst, v, 8 );
        i_row++;
    }
}
1040
/* XDeint8x8FieldE: simple (1,0,1) deinterlacing for a block that misses a
 * neighbour.
 * (Uses 8x9 pixels)
 * Every even output line is a copy of the matching source line; every odd
 * output line is the average of the source lines just above and just below.
 * TODO: a better one for the inner part.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_below = src + 2*i_src;
        uint8_t *p_missing = dst + i_dst;

        /* Kept line */
        memcpy( dst, src, 8 );

        /* Rebuilt line */
        for( int i_col = 0; i_col < 8; i_col++ )
            p_missing[i_col] = ( src[i_col] + p_below[i_col] ) >> 1;

        dst += 2*i_dst;
        src += 2*i_src;
    }
}
1063 #ifdef CAN_COMPILE_MMXEXT
1064 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1065                                           uint8_t *src, int i_src )
1066 {
1067     int y;
1068
1069     /* Interlaced */
1070     for( y = 0; y < 8; y += 2 )
1071     {
1072         movq_m2r( src[0], mm0 );
1073         movq_r2m( mm0, dst[0] );
1074         dst += i_dst;
1075
1076         movq_m2r( src[2*i_src], mm1 );
1077         pavgb_r2r( mm1, mm0 );
1078
1079         movq_r2m( mm0, dst[0] );
1080
1081         dst += 1*i_dst;
1082         src += 2*i_src;
1083     }
1084 }
1085 #endif
1086
/* XDeint8x8Field: edge-oriented interpolation
 * (Needs -4 and +5 pixels horizontally, +1 line)
 *
 * Even output lines are copied from the source. For each pixel of an odd
 * output line, three 8-pixel sums of absolute differences between the line
 * above and the line below are compared: one along each diagonal and one
 * vertical. The pixel is the average of the two source pixels along the
 * direction that is strictly better than the vertical one, the vertical
 * direction being the fallback.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_up   = src;
        const uint8_t *p_down = src + 2*i_src;
        uint8_t *p_out = dst + i_dst;

        /* Kept line */
        memcpy( dst, src, 8 );

        for( int i_col = 0; i_col < 8; i_col++ )
        {
            /* 8 pixels are used just to match the MMX version, but it's
             * overkill: 5 would be enough (less isn't good) */
            int i_sad_fwd  = 0; /* up[x-2] against down[x] */
            int i_sad_vert = 0; /* up[x]   against down[x] */
            int i_sad_bwd  = 0; /* up[x+2] against down[x] */
            int i_top, i_bottom;

            for( int k = 0; k < 8; k++ )
            {
                i_sad_fwd  += abs( p_up[i_col-4+k] - p_down[i_col-2+k] );
                i_sad_vert += abs( p_up[i_col-3+k] - p_down[i_col-3+k] );
                i_sad_bwd  += abs( p_up[i_col-2+k] - p_down[i_col-4+k] );
            }

            if( i_sad_fwd < i_sad_vert && i_sad_vert <= i_sad_bwd )
            {
                i_top    = p_up[i_col-1];
                i_bottom = p_down[i_col+1];
            }
            else if( i_sad_bwd < i_sad_vert && i_sad_vert <= i_sad_fwd )
            {
                i_top    = p_up[i_col+1];
                i_bottom = p_down[i_col-1];
            }
            else
            {
                i_top    = p_up[i_col];
                i_bottom = p_down[i_col];
            }
            p_out[i_col] = ( i_top + i_bottom ) >> 1;
        }

        dst += 2*i_dst;
        src += 2*i_src;
    }
}
1133 #ifdef CAN_COMPILE_MMXEXT
1134 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1135                                          uint8_t *src, int i_src )
1136 {
1137     int y, x;
1138
1139     /* Interlaced */
1140     for( y = 0; y < 8; y += 2 )
1141     {
1142         memcpy( dst, src, 8 );
1143         dst += i_dst;
1144
1145         for( x = 0; x < 8; x++ )
1146         {
1147             uint8_t *src2 = &src[2*i_src];
1148             int32_t c0, c1, c2;
1149
1150             movq_m2r( src[x-2], mm0 );
1151             movq_m2r( src[x-3], mm1 );
1152             movq_m2r( src[x-4], mm2 );
1153
1154             psadbw_m2r( src2[x-4], mm0 );
1155             psadbw_m2r( src2[x-3], mm1 );
1156             psadbw_m2r( src2[x-2], mm2 );
1157
1158             movd_r2m( mm0, c2 );
1159             movd_r2m( mm1, c1 );
1160             movd_r2m( mm2, c0 );
1161
1162             if( c0 < c1 && c1 <= c2 )
1163                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1164             else if( c2 < c1 && c1 <= c0 )
1165                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1166             else
1167                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1168         }
1169
1170         dst += 1*i_dst;
1171         src += 2*i_src;
1172     }
1173 }
1174 #endif
1175
/* NxN: arbitrary block size (and then only use pixels in the NxN block)
 */

/* XDeintNxNDetect: tell whether an i_width x i_height block looks interlaced.
 *
 * Note the parameter order: height first, then width.
 * The squared differences between adjacent lines (fr) and between lines two
 * apart (ff) are accumulated over the whole block, without a reset between
 * line pairs. Returns 1 when at least two line pairs satisfy the test,
 * 0 otherwise (so blocks of 4 lines or fewer are never reported).
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    int ff = 0, fr = 0;
    int fc = 0;

    /* Detect interlacing */
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    for( int y = 0; y < i_height - 2; y += 2 )
    {
        const uint8_t *s = &src[y*i_src];
        for( int x = 0; x < i_width; x++ )
        {
            const int i_frame_diff = s[x] - s[1*i_src+x];
            const int i_field_diff = s[x] - s[2*i_src+x];

            fr += i_frame_diff * i_frame_diff;
            ff += i_field_diff * i_field_diff;
        }
        if( ff < fr && fr > i_width / 2 )
            fc++;
    }

    return fc >= 2;
}

/* XDeintNxNFrame: progressive reconstruction of an i_width x i_height block.
 *
 * Even lines are copied. Odd lines are a (1,2,1) blend of the three source
 * lines around them, except the last odd line which averages the last two
 * source lines. With an odd i_height the last (even) line is only copied:
 * nothing is read or written below the block.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Progressive */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: the block has no line left to rebuild */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}

/* XDeintNxNField: interlaced reconstruction of an i_width x i_height block.
 *
 * Even lines are copied. Odd lines are the average of the even lines above
 * and below, except the last odd line which averages the last two source
 * lines. With an odd i_height the last (even) line is only copied: nothing
 * is read or written below the block.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Interlaced */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: the block has no line left to rebuild */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}

/* XDeintNxN: deinterlace an i_width x i_height block, picking the field or
 * the frame reconstruction from the result of XDeintNxNDetect.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect takes the height BEFORE the width */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1269
1270
/* median: return the middle value of a, b and c */
static inline int median( int a, int b, int c )
{
    int i_lo, i_hi;

    /* Order a and b first */
    if( b < a )
    {
        i_lo = b;
        i_hi = a;
    }
    else
    {
        i_lo = a;
        i_hi = b;
    }

    /* Then clamp c between them */
    if( c < i_lo )
        return i_lo;
    if( c > i_hi )
        return i_hi;
    return c;
}
1286
1287
/* XDeintBand8x8: process one band of 8 lines, block by block (C version).
 *
 * i_mbx is the number of complete 8x8 blocks in the band and i_modx the
 * width of the remaining partial block (0 if none).
 * Blocks detected as interlaced go through the field interpolation - the
 * simple one for the first and last complete blocks, the edge-oriented one
 * for the others - and the remaining blocks go through the (1,6,1) merge.
 * The partial block, if any, is handled by XDeintNxN.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_block = 0; i_block < i_mbx; i_block++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive: top field lines as src1, bottom ones as src2 */
            XDeint8x8MergeC( dst, i_dst,
                             &src[0*i_src], 2*i_src,
                             &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced */
        if( i_block == 0 || i_block == i_last )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1320 #ifdef CAN_COMPILE_MMXEXT
/* XDeintBand8x8MMXEXT: process one band of 8 lines, block by block, with the
 * MMXEXT block helpers.
 *
 * i_mbx is the number of complete 8x8 blocks in the band and i_modx the
 * width of the remaining partial block (0 if none), which is handled by the
 * C-only XDeintNxN. The caller is responsible for emms().
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_block = 0; i_block < i_mbx; i_block++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive: top field lines as src1, bottom ones as src2 */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  &src[0*i_src], 2*i_src,
                                  &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced */
        if( i_block == 0 || i_block == i_last )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1352 #endif
1353
1354 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1355 {
1356     int i_plane;
1357
1358     /* Copy image and skip lines */
1359     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1360     {
1361         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1362         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1363
1364         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1365         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1366
1367         const int i_dst = p_outpic->p[i_plane].i_pitch;
1368         const int i_src = p_pic->p[i_plane].i_pitch;
1369
1370         int y, x;
1371
1372         for( y = 0; y < i_mby; y++ )
1373         {
1374             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1375             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1376
1377 #ifdef CAN_COMPILE_MMXEXT
1378             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1379                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1380             else
1381 #endif
1382                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1383         }
1384
1385         /* Last line (C only)*/
1386         if( i_mody )
1387         {
1388             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1389             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1390
1391             for( x = 0; x < i_mbx; x++ )
1392             {
1393                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1394
1395                 dst += 8;
1396                 src += 8;
1397             }
1398
1399             if( i_modx )
1400                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1401         }
1402     }
1403
1404 #ifdef CAN_COMPILE_MMXEXT
1405     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1406         emms();
1407 #endif
1408 }
1409
1410 /*****************************************************************************
1411  * Yadif (Yet Another DeInterlacing Filter).
1412  *****************************************************************************/
1413 /* */
/* Configuration handed to the yadif line filter functions (RenderYadif fills
 * one and passes its address as the first argument of the filter). */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only & 0x02 has meaning, as we do the & 0x01 ourselves.
     */
    int mode;
};
1425
/* I am unsure it is the right one */
/* NOTE(review): presumably the register-sized integer type expected by the
 * code in yadif.h - confirm against that header. */
typedef intptr_t x86_reg;

/* Helper macros under the FF* names; presumably the names used by the code
 * included from yadif.h below - confirm against that header.
 * FFABS evaluates its argument twice, and FFMAX3/FFMIN3 nest FFMAX/FFMIN, so
 * do not pass expressions with side effects. */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)

/* yadif.h comes from vf_yadif.c of mplayer project */
/* It must stay after the definitions above. */
#include "yadif.h"
1437
1438 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1439 {
1440     filter_sys_t *p_sys = p_filter->p_sys;
1441
1442     /* */
1443     assert( i_order == 0 || i_order == 1 );
1444     assert( i_field == 0 || i_field == 1 );
1445
1446     if( i_order == 0 )
1447     {
1448         /* Duplicate the picture
1449          * TODO when the vout rework is finished, picture_Hold() might be enough
1450          * but becarefull, the pitches must match */
1451         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1452         if( p_dup )
1453             picture_Copy( p_dup, p_src );
1454
1455         /* Slide the history */
1456         if( p_sys->pp_history[0] )
1457             picture_Release( p_sys->pp_history[0]  );
1458         for( int i = 1; i < HISTORY_SIZE; i++ )
1459             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1460         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1461     }
1462
1463     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1464     picture_t *p_prev = p_sys->pp_history[0];
1465     picture_t *p_cur  = p_sys->pp_history[1];
1466     picture_t *p_next = p_sys->pp_history[2];
1467
1468     /* Filter if we have all the pictures we need */
1469     if( p_prev && p_cur && p_next )
1470     {
1471         /* */
1472         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1473 #if defined(HAVE_YADIF_SSE2)
1474         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1475             filter = yadif_filter_line_mmx2;
1476         else
1477 #endif
1478             filter = yadif_filter_line_c;
1479
1480         for( int n = 0; n < p_dst->i_planes; n++ )
1481         {
1482             const plane_t *prevp = &p_prev->p[n];
1483             const plane_t *curp  = &p_cur->p[n];
1484             const plane_t *nextp = &p_next->p[n];
1485             plane_t *dstp        = &p_dst->p[n];
1486
1487             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1488             {
1489                 if( (y % 2) == i_field )
1490                 {
1491                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1492                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1493                 }
1494                 else
1495                 {
1496                     struct vf_priv_s cfg;
1497                     /* Spatial checks only when enough data */
1498                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1499
1500                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1501                     filter( &cfg,
1502                             &dstp->p_pixels[y * dstp->i_pitch],
1503                             &prevp->p_pixels[y * prevp->i_pitch],
1504                             &curp->p_pixels[y * curp->i_pitch],
1505                             &nextp->p_pixels[y * nextp->i_pitch],
1506                             dstp->i_visible_pitch,
1507                             curp->i_pitch,
1508                             (i_field ^ (i_order == i_field)) & 1 );
1509                 }
1510
1511                 /* We duplicate the first and last lines */
1512                 if( y == 1 )
1513                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1514                 else if( y == dstp->i_visible_lines - 2 )
1515                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1516             }
1517         }
1518
1519         /* */
1520         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1521         return VLC_SUCCESS;
1522     }
1523     else if( !p_prev && !p_cur && p_next )
1524     {
1525         /* FIXME not good as it does not use i_order/i_field */
1526         RenderX( p_dst, p_next );
1527         return VLC_SUCCESS;
1528     }
1529     else
1530     {
1531         return VLC_EGENERIC;
1532     }
1533 }
1534
1535 /*****************************************************************************
1536  * video filter2 functions
1537  *****************************************************************************/
1538 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1539 {
1540     filter_sys_t *p_sys = p_filter->p_sys;
1541     picture_t *p_dst[2];
1542
1543     /* Request output picture */
1544     p_dst[0] = filter_NewPicture( p_filter );
1545     if( p_dst[0] == NULL )
1546     {
1547         picture_Release( p_pic );
1548         return NULL;
1549     }
1550     picture_CopyProperties( p_dst[0], p_pic );
1551
1552     if( p_sys->b_double_rate )
1553     {
1554         p_dst[0]->p_next =
1555         p_dst[1]         = filter_NewPicture( p_filter );
1556         if( p_dst[1] )
1557         {
1558             picture_CopyProperties( p_dst[1], p_pic );
1559             /* XXX it's not really good especially for the first picture, but
1560              * I don't think that delaying by one frame is worth it */
1561             if( p_sys->i_last_date > VLC_TS_INVALID && p_pic->date > VLC_TS_INVALID )
1562                 p_dst[1]->date = p_pic->date + (p_pic->date - p_sys->i_last_date) / 2;
1563         }
1564         p_sys->i_last_date = p_pic->date;
1565     }
1566     else
1567     {
1568         p_dst[1] = NULL;
1569     }
1570
1571     switch( p_sys->i_mode )
1572     {
1573         case DEINTERLACE_DISCARD:
1574             RenderDiscard( p_filter, p_dst[0], p_pic, 0 );
1575             break;
1576
1577         case DEINTERLACE_BOB:
1578             RenderBob( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1579             if( p_dst[1] )
1580                 RenderBob( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1581             break;;
1582
1583         case DEINTERLACE_LINEAR:
1584             RenderLinear( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1585             if( p_dst[1] )
1586                 RenderLinear( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1587             break;
1588
1589         case DEINTERLACE_MEAN:
1590             RenderMean( p_filter, p_dst[0], p_pic );
1591             break;
1592
1593         case DEINTERLACE_BLEND:
1594             RenderBlend( p_filter, p_dst[0], p_pic );
1595             break;
1596
1597         case DEINTERLACE_X:
1598             RenderX( p_dst[0], p_pic );
1599             break;
1600
1601         case DEINTERLACE_YADIF:
1602             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, 0 ) )
1603                 goto drop;
1604             break;
1605
1606         case DEINTERLACE_YADIF2X:
1607             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, !p_pic->b_top_field_first ) )
1608                 goto drop;
1609             if( p_dst[1] )
1610                 RenderYadif( p_filter, p_dst[1], p_pic, 1, p_pic->b_top_field_first );
1611             break;
1612     }
1613
1614     p_dst[0]->b_progressive = true;
1615     if( p_dst[1] )
1616         p_dst[1]->b_progressive = true;
1617
1618     picture_Release( p_pic );
1619     return p_dst[0];
1620
1621 drop:
1622     picture_Release( p_dst[0] );
1623     if( p_dst[1] )
1624         picture_Release( p_dst[1] );
1625     picture_Release( p_pic );
1626     return NULL;
1627 }
1628
1629 static void Flush( filter_t *p_filter )
1630 {
1631     filter_sys_t *p_sys = p_filter->p_sys;
1632
1633     p_sys->i_last_date = VLC_TS_INVALID;
1634     for( int i = 0; i < HISTORY_SIZE; i++ )
1635     {
1636         if( p_sys->pp_history[i] )
1637             picture_Release( p_sys->pp_history[i] );
1638         p_sys->pp_history[i] = NULL;
1639     }
1640 }
1641
1642 static int Mouse( filter_t *p_filter,
1643                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1644 {
1645     VLC_UNUSED(p_old);
1646     *p_mouse = *p_new;
1647     if( p_filter->p_sys->b_half_height )
1648         p_mouse->i_y *= 2;
1649     return VLC_SUCCESS;
1650 }
1651
1652
1653 /*****************************************************************************
1654  * Open
1655  *****************************************************************************/
1656 static int Open( vlc_object_t *p_this )
1657 {
1658     filter_t *p_filter = (filter_t*)p_this;
1659     filter_sys_t *p_sys;
1660
1661     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
1662         return VLC_EGENERIC;
1663
1664     /* */
1665     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
1666     if( !p_sys )
1667         return VLC_ENOMEM;
1668
1669     p_sys->i_mode = DEINTERLACE_BLEND;
1670     p_sys->b_double_rate = false;
1671     p_sys->b_half_height = true;
1672     p_sys->i_last_date = VLC_TS_INVALID;
1673     for( int i = 0; i < HISTORY_SIZE; i++ )
1674         p_sys->pp_history[i] = NULL;
1675
1676 #if defined(CAN_COMPILE_C_ALTIVEC)
1677     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
1678     {
1679         p_sys->pf_merge = MergeAltivec;
1680         p_sys->pf_end_merge = NULL;
1681     }
1682     else
1683 #endif
1684 #if defined(CAN_COMPILE_SSE)
1685     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1686     {
1687         p_sys->pf_merge = MergeSSE2;
1688         p_sys->pf_end_merge = EndMMX;
1689     }
1690     else
1691 #endif
1692 #if defined(CAN_COMPILE_MMXEXT)
1693     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1694     {
1695         p_sys->pf_merge = MergeMMXEXT;
1696         p_sys->pf_end_merge = EndMMX;
1697     }
1698     else
1699 #endif
1700 #if defined(CAN_COMPILE_3DNOW)
1701     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
1702     {
1703         p_sys->pf_merge = Merge3DNow;
1704         p_sys->pf_end_merge = End3DNow;
1705     }
1706     else
1707 #endif
1708 #if defined __ARM_NEON__
1709     if( vlc_CPU() & CPU_CAPABILITY_NEON )
1710     {
1711         p_sys->pf_merge = MergeNEON;
1712         p_sys->pf_end_merge = NULL;
1713     }
1714     else
1715 #endif
1716     {
1717         p_sys->pf_merge = MergeGeneric;
1718         p_sys->pf_end_merge = NULL;
1719     }
1720
1721     /* */
1722     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
1723                        p_filter->p_cfg );
1724
1725     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
1726     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
1727     free( psz_mode );
1728
1729     /* */
1730     video_format_t fmt;
1731     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
1732     if( !p_filter->b_allow_fmt_out_change &&
1733         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
1734           fmt.i_height != p_filter->fmt_in.video.i_height ) )
1735     {
1736         Close( VLC_OBJECT(p_filter) );
1737         return VLC_EGENERIC;
1738     }
1739     p_filter->fmt_out.video = fmt;
1740     p_filter->fmt_out.i_codec = fmt.i_chroma;
1741     p_filter->pf_video_filter = Deinterlace;
1742     p_filter->pf_video_flush  = Flush;
1743     p_filter->pf_video_mouse  = Mouse;
1744
1745     msg_Dbg( p_filter, "deinterlacing" );
1746
1747     return VLC_SUCCESS;
1748 }
1749
1750 /*****************************************************************************
1751  * Close: clean up the filter
1752  *****************************************************************************/
1753 static void Close( vlc_object_t *p_this )
1754 {
1755     filter_t *p_filter = (filter_t*)p_this;
1756
1757     Flush( p_filter );
1758     free( p_filter->p_sys );
1759 }
1760