git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000-2009 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27
  28 #ifdef HAVE_CONFIG_H
  29 # include "config.h"
  30 #endif
  31
  32 #include <assert.h>
  33
  34 #ifdef HAVE_ALTIVEC_H
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include <vlc_common.h>
  39 #include <vlc_plugin.h>
  40 #include <vlc_filter.h>
  41 #include <vlc_cpu.h>
  42
  43 #ifdef CAN_COMPILE_MMXEXT
  44 #   include "mmx.h"
  45 #endif
  46
/* Identifiers of the deinterlace algorithms. SetFilterMethod() stores one of
 * these in filter_sys_t.i_mode. */
#define DEINTERLACE_DISCARD 1
#define DEINTERLACE_MEAN    2
#define DEINTERLACE_BLEND   3
#define DEINTERLACE_BOB     4
#define DEINTERLACE_LINEAR  5
#define DEINTERLACE_X       6
#define DEINTERLACE_YADIF   7
#define DEINTERLACE_YADIF2X 8
  55
  56 /*****************************************************************************
  57  * Module descriptor
  58  *****************************************************************************/
/* Module entry points, registered with set_callbacks() in the module
 * descriptor. */
static int  Open ( vlc_object_t * );
static void Close( vlc_object_t * );

/* Short and long description strings for the playback mode option.
 * NOTE(review): not referenced in the visible part of this file. */
#define MODE_TEXT N_("Deinterlace mode")
#define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")

/* Short and long description strings for the "mode" option declared in the
 * module descriptor. */
#define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
#define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")

/* Prefix prepended to the option names of this filter. */
#define FILTER_CFG_PREFIX "sout-deinterlace-"

/* Method names as compared by SetFilterMethod(), and their display texts.
 * Both arrays list the methods in the same order. */
static const char *const mode_list[] = {
    "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
static const char *const mode_list_text[] = {
    N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
  74
/* Module descriptor: declares the plugin as a "video filter2" module, its
 * single string option and its Open/Close callbacks. */
vlc_module_begin ()
    set_description( N_("Deinterlacing video filter") )
    set_shortname( N_("Deinterlace" ))
    set_capability( "video filter2", 0 )
    set_category( CAT_VIDEO )
    set_subcategory( SUBCAT_VIDEO_VFILTER )

    /* "sout-deinterlace-mode": defaults to "blend"; the accepted names and
     * their display texts come from mode_list / mode_list_text. */
    add_string( FILTER_CFG_PREFIX "mode", "blend", SOUT_MODE_TEXT,
                SOUT_MODE_LONGTEXT, false )
        change_string_list( mode_list, mode_list_text, 0 )
        change_safe ()
    add_shortcut( "deinterlace" )
    set_callbacks( Open, Close )
vlc_module_end ()
  89
  90
  91 /*****************************************************************************
 * Local prototypes
  93  *****************************************************************************/
/* Renderers, one per deinterlace algorithm. For the ones defined in the
 * visible part of this file the picture arguments are (output, input) and
 * the trailing int of RenderDiscard/RenderBob/RenderLinear is the field to
 * render (0 = top, 1 = bottom). */
static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
static void RenderMean   ( filter_t *, picture_t *, picture_t * );
static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
static void RenderX      ( picture_t *, picture_t * );
static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );

/* Line-merge routines: (dest, src1, src2, byte count). Each writes the
 * byte-wise average of the two source lines to dest. The portable C version
 * is always available; the others depend on what the build can compile. */
static void MergeGeneric ( void *, const void *, const void *, size_t );
#if defined(CAN_COMPILE_C_ALTIVEC)
static void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_MMXEXT)
static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void Merge3DNow   ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_SSE)
static void MergeSSE2    ( void *, const void *, const void *, size_t );
#endif
/* Clean-up hooks to run after a batch of merges (see filter_sys_t.pf_end_merge) */
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
static void EndMMX       ( void );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void End3DNow     ( void );
#endif
#if defined __ARM_NEON__
static void MergeNEON (void *, const void *, const void *, size_t);
#endif
 124
/* NULL-terminated list of the option names this filter understands (without
 * FILTER_CFG_PREFIX).
 * NOTE(review): presumably handed to the configuration chain parser in
 * Open(), which is outside the visible part of the file -- confirm. */
static const char *const ppsz_filter_options[] = {
    "mode", NULL
};
 128
/* Used for framerate doublers */
#define METADATA_SIZE (3)
/* Per-frame metadata of the last METADATA_SIZE input frames: date (PTS),
 * number of fields and top-field-first flag, kept in three parallel arrays. */
typedef struct {
    mtime_t pi_date[METADATA_SIZE];
    int     pi_nb_fields[METADATA_SIZE];
    bool    pb_top_field_first[METADATA_SIZE];
} metadata_history_t;
 136
/* Number of input pictures kept in filter_sys_t.pp_history */
#define HISTORY_SIZE (3)
/* Special i_frame_offset value: the algorithm computes the output PTS itself */
#define CUSTOM_PTS -1
/* Private state of one deinterlace filter instance */
struct filter_sys_t
{
    int  i_mode;              /* Deinterlace mode (one of DEINTERLACE_*) */
    bool b_double_rate;       /* Shall we double the framerate? */
    bool b_half_height;       /* Shall we divide the height by 2 */
    bool b_use_frame_history; /* Does the algorithm need the input frame history buffer? */

    /* Line averaging routine (dest, src1, src2, bytes) and optional clean-up
     * hook; the Merge / EndMerge macros call through these. pf_end_merge may
     * be NULL. */
    void (*pf_merge) ( void *, const void *, const void *, size_t );
    void (*pf_end_merge) ( void );

    /* Metadata history (PTS, nb_fields, TFF). Used for framerate doublers. */
    metadata_history_t meta;

    /* Output frame timing / framerate doubler control (see below) */
    int i_frame_offset;

    /* Input frame history buffer for algorithms that perform temporal filtering. */
    picture_t *pp_history[HISTORY_SIZE];
};
 158
 159 /*  NOTE on i_frame_offset:
 160
 161     This value indicates the offset between input and output frames in the currently active deinterlace algorithm.
 162     See the rationale below for why this is needed and how it is used.
 163
 164     Valid range: 0 <= i_frame_offset < METADATA_SIZE, or i_frame_offset = CUSTOM_PTS.
 165                  The special value CUSTOM_PTS is only allowed if b_double_rate is false.
 166
 167                  If CUSTOM_PTS is used, the algorithm must compute the outgoing PTSs itself,
 168                  and additionally, read the TFF/BFF information itself (if it needs it)
 169                  from the incoming frames.
 170
 171     Meaning of values:
 172     0 = output frame corresponds to the current input frame
 173         (no frame offset; default if not set),
 174     1 = output frame corresponds to the previous input frame
 175         (e.g. Yadif and Yadif2x work like this),
 176     ...
 177
 178     If necessary, i_frame_offset should be updated by the active deinterlace algorithm
 179     to indicate the correct delay for the *next* input frame. It does not matter at which i_order
 180     the algorithm updates this information, but the new value will only take effect upon the
 181     next call to Deinterlace() (i.e. at the next incoming frame).
 182
 183     The first-ever frame that arrives to the filter after Open() is always handled as having
 184     i_frame_offset = 0. For the second and all subsequent frames, each algorithm is responsible
 185     for setting the offset correctly. (The default is 0, so if that is correct, there's no need
 186     to do anything.)
 187
 188     This solution guarantees that i_frame_offset:
 189       1) is up to date at the start of each frame,
 190       2) does not change (as far as Deinterlace() is concerned) during a frame, and
 191       3) does not need a special API for setting the value at the start of each input frame,
 192          before the algorithm starts rendering the (first) output frame for that input frame.
 193
 194     The deinterlace algorithm is allowed to behave differently for different input frames.
 195     This is especially important for startup, when full history (as defined by each algorithm)
 196     is not yet available. During the first-ever input frame, it is clear that it is the
 197     only possible source for information, so i_frame_offset = 0 is necessarily correct.
 198     After that, what to do is up to each algorithm.
 199
 200     Having the correct offset at the start of each input frame is critically important in order to:
 201       1) Allocate the correct number of output frames for framerate doublers, and to
 202       2) Pass correct TFF/BFF information to the algorithm.
 203
 204     These points are important for proper soft field repeat support. This feature is used in some
 205     streams originating from film. In soft NTSC telecine, the number of fields alternates as 3,2,3,2,...
 206     and the video field dominance flips every two frames (after every "3"). Also, some streams
 207     request an occasional field repeat (nb_fields = 3), after which the video field dominance flips.
 208     To render such streams correctly, the nb_fields and TFF/BFF information must be taken from
 209     the specific input frame that the algorithm intends to render.
 210
 211     Additionally, the output PTS is automatically computed by Deinterlace() from i_frame_offset and i_order.
 212
 213     It is possible to use the special value CUSTOM_PTS to indicate that the algorithm computes
 214     the output PTSs itself. In this case, Deinterlace() will pass them through. This special value
 215     is not valid for framerate doublers, as by definition they are field renderers, so they need to
 216     use the original field timings to work correctly. Basically, this special value is only intended
 217     for algorithms that need to perform nontrivial framerate conversions (such as IVTC).
 218 */
 219
 220
 221 /*****************************************************************************
 222  * SetFilterMethod: setup the deinterlace method to use.
 223  *****************************************************************************/
 224 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
 225 {
 226     filter_sys_t *p_sys = p_filter->p_sys;
 227
 228     if( !psz_method )
 229         psz_method = "";
 230
 231     if( !strcmp( psz_method, "mean" ) )
 232     {
 233         p_sys->i_mode = DEINTERLACE_MEAN;
 234         p_sys->b_double_rate = false;
 235         p_sys->b_half_height = true;
 236         p_sys->b_use_frame_history = false;
 237     }
 238     else if( !strcmp( psz_method, "bob" )
 239              || !strcmp( psz_method, "progressive-scan" ) )
 240     {
 241         p_sys->i_mode = DEINTERLACE_BOB;
 242         p_sys->b_double_rate = true;
 243         p_sys->b_half_height = false;
 244         p_sys->b_use_frame_history = false;
 245     }
 246     else if( !strcmp( psz_method, "linear" ) )
 247     {
 248         p_sys->i_mode = DEINTERLACE_LINEAR;
 249         p_sys->b_double_rate = true;
 250         p_sys->b_half_height = false;
 251         p_sys->b_use_frame_history = false;
 252     }
 253     else if( !strcmp( psz_method, "x" ) )
 254     {
 255         p_sys->i_mode = DEINTERLACE_X;
 256         p_sys->b_double_rate = false;
 257         p_sys->b_half_height = false;
 258         p_sys->b_use_frame_history = false;
 259     }
 260     else if( !strcmp( psz_method, "yadif" ) )
 261     {
 262         p_sys->i_mode = DEINTERLACE_YADIF;
 263         p_sys->b_double_rate = false;
 264         p_sys->b_half_height = false;
 265         p_sys->b_use_frame_history = true;
 266     }
 267     else if( !strcmp( psz_method, "yadif2x" ) )
 268     {
 269         p_sys->i_mode = DEINTERLACE_YADIF2X;
 270         p_sys->b_double_rate = true;
 271         p_sys->b_half_height = false;
 272         p_sys->b_use_frame_history = true;
 273     }
 274     else if( !strcmp( psz_method, "discard" ) )
 275     {
 276         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
 277                             i_chroma == VLC_CODEC_J422;
 278
 279         p_sys->i_mode = DEINTERLACE_DISCARD;
 280         p_sys->b_double_rate = false;
 281         p_sys->b_half_height = !b_i422;
 282         p_sys->b_use_frame_history = false;
 283     }
 284     else
 285     {
 286         if( strcmp( psz_method, "blend" ) )
 287             msg_Err( p_filter,
 288                      "no valid deinterlace mode provided, using \"blend\"" );
 289
 290         p_sys->i_mode = DEINTERLACE_BLEND;
 291         p_sys->b_double_rate = false;
 292         p_sys->b_half_height = false;
 293         p_sys->b_use_frame_history = false;
 294     }
 295
 296     p_sys->i_frame_offset = 0; /* reset to default when method changes */
 297
 298     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
 299 }
 300
 301 static void GetOutputFormat( filter_t *p_filter,
 302                              video_format_t *p_dst, const video_format_t *p_src )
 303 {
 304     filter_sys_t *p_sys = p_filter->p_sys;
 305     *p_dst = *p_src;
 306
 307     if( p_sys->b_half_height )
 308     {
 309         p_dst->i_height /= 2;
 310         p_dst->i_visible_height /= 2;
 311         p_dst->i_y_offset /= 2;
 312         p_dst->i_sar_den *= 2;
 313     }
 314
 315     if( p_src->i_chroma == VLC_CODEC_I422 ||
 316         p_src->i_chroma == VLC_CODEC_J422 )
 317     {
 318         switch( p_sys->i_mode )
 319         {
 320         case DEINTERLACE_MEAN:
 321         case DEINTERLACE_LINEAR:
 322         case DEINTERLACE_X:
 323         case DEINTERLACE_YADIF:
 324         case DEINTERLACE_YADIF2X:
 325             p_dst->i_chroma = p_src->i_chroma;
 326             break;
 327         default:
 328             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
 329                                                                   VLC_CODEC_J420;
 330             break;
 331         }
 332     }
 333 }
 334
 335 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
 336 {
 337     return i_chroma == VLC_CODEC_I420 ||
 338            i_chroma == VLC_CODEC_J420 ||
 339            i_chroma == VLC_CODEC_YV12 ||
 340            i_chroma == VLC_CODEC_I422 ||
 341            i_chroma == VLC_CODEC_J422;
 342 }
 343
 344 /*****************************************************************************
 345  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 346  *****************************************************************************/
 347 static void RenderDiscard( filter_t *p_filter,
 348                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 349 {
 350     int i_plane;
 351
 352     /* Copy image and skip lines */
 353     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 354     {
 355         uint8_t *p_in, *p_out_end, *p_out;
 356         int i_increment;
 357
 358         p_in = p_pic->p[i_plane].p_pixels
 359                    + i_field * p_pic->p[i_plane].i_pitch;
 360
 361         p_out = p_outpic->p[i_plane].p_pixels;
 362         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 363                              * p_outpic->p[i_plane].i_visible_lines;
 364
 365         switch( p_filter->fmt_in.video.i_chroma )
 366         {
 367         case VLC_CODEC_I420:
 368         case VLC_CODEC_J420:
 369         case VLC_CODEC_YV12:
 370
 371             for( ; p_out < p_out_end ; )
 372             {
 373                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 374
 375                 p_out += p_outpic->p[i_plane].i_pitch;
 376                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 377             }
 378             break;
 379
 380         case VLC_CODEC_I422:
 381         case VLC_CODEC_J422:
 382
 383             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 384
 385             if( i_plane == Y_PLANE )
 386             {
 387                 for( ; p_out < p_out_end ; )
 388                 {
 389                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 390                     p_out += p_outpic->p[i_plane].i_pitch;
 391                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 392                     p_out += p_outpic->p[i_plane].i_pitch;
 393                     p_in += i_increment;
 394                 }
 395             }
 396             else
 397             {
 398                 for( ; p_out < p_out_end ; )
 399                 {
 400                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 401                     p_out += p_outpic->p[i_plane].i_pitch;
 402                     p_in += i_increment;
 403                 }
 404             }
 405             break;
 406
 407         default:
 408             break;
 409         }
 410     }
 411 }
 412
 413 /*****************************************************************************
 414  * RenderBob: renders a BOB picture - simple copy
 415  *****************************************************************************/
 416 static void RenderBob( filter_t *p_filter,
 417                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 418 {
 419     int i_plane;
 420
 421     /* Copy image and skip lines */
 422     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 423     {
 424         uint8_t *p_in, *p_out_end, *p_out;
 425
 426         p_in = p_pic->p[i_plane].p_pixels;
 427         p_out = p_outpic->p[i_plane].p_pixels;
 428         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 429                              * p_outpic->p[i_plane].i_visible_lines;
 430
 431         switch( p_filter->fmt_in.video.i_chroma )
 432         {
 433             case VLC_CODEC_I420:
 434             case VLC_CODEC_J420:
 435             case VLC_CODEC_YV12:
 436                 /* For BOTTOM field we need to add the first line */
 437                 if( i_field == 1 )
 438                 {
 439                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 440                     p_in += p_pic->p[i_plane].i_pitch;
 441                     p_out += p_outpic->p[i_plane].i_pitch;
 442                 }
 443
 444                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 445
 446                 for( ; p_out < p_out_end ; )
 447                 {
 448                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 449
 450                     p_out += p_outpic->p[i_plane].i_pitch;
 451
 452                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 453
 454                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 455                     p_out += p_outpic->p[i_plane].i_pitch;
 456                 }
 457
 458                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 459
 460                 /* For TOP field we need to add the last line */
 461                 if( i_field == 0 )
 462                 {
 463                     p_in += p_pic->p[i_plane].i_pitch;
 464                     p_out += p_outpic->p[i_plane].i_pitch;
 465                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 466                 }
 467                 break;
 468
 469             case VLC_CODEC_I422:
 470             case VLC_CODEC_J422:
 471                 /* For BOTTOM field we need to add the first line */
 472                 if( i_field == 1 )
 473                 {
 474                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 475                     p_in += p_pic->p[i_plane].i_pitch;
 476                     p_out += p_outpic->p[i_plane].i_pitch;
 477                 }
 478
 479                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 480
 481                 if( i_plane == Y_PLANE )
 482                 {
 483                     for( ; p_out < p_out_end ; )
 484                     {
 485                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 486
 487                         p_out += p_outpic->p[i_plane].i_pitch;
 488
 489                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 490
 491                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 492                         p_out += p_outpic->p[i_plane].i_pitch;
 493                     }
 494                 }
 495                 else
 496                 {
 497                     for( ; p_out < p_out_end ; )
 498                     {
 499                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 500
 501                         p_out += p_outpic->p[i_plane].i_pitch;
 502                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 503                     }
 504                 }
 505
 506                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 507
 508                 /* For TOP field we need to add the last line */
 509                 if( i_field == 0 )
 510                 {
 511                     p_in += p_pic->p[i_plane].i_pitch;
 512                     p_out += p_outpic->p[i_plane].i_pitch;
 513                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 514                 }
 515                 break;
 516         }
 517     }
 518 }
 519
/* Shorthands for the merge routine and clean-up hook stored in filter_sys_t.
 * Both expect a variable named p_filter to be in scope.
 * NOTE: EndMerge expands to an unbraced `if`, so it must only be used as a
 * complete statement of the form `EndMerge();`. */
#define Merge p_filter->p_sys->pf_merge
#define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
 522
 523 /*****************************************************************************
 524  * RenderLinear: BOB with linear interpolation
 525  *****************************************************************************/
 526 static void RenderLinear( filter_t *p_filter,
 527                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 528 {
 529     int i_plane;
 530
 531     /* Copy image and skip lines */
 532     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 533     {
 534         uint8_t *p_in, *p_out_end, *p_out;
 535
 536         p_in = p_pic->p[i_plane].p_pixels;
 537         p_out = p_outpic->p[i_plane].p_pixels;
 538         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 539                              * p_outpic->p[i_plane].i_visible_lines;
 540
 541         /* For BOTTOM field we need to add the first line */
 542         if( i_field == 1 )
 543         {
 544             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 545             p_in += p_pic->p[i_plane].i_pitch;
 546             p_out += p_outpic->p[i_plane].i_pitch;
 547         }
 548
 549         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 550
 551         for( ; p_out < p_out_end ; )
 552         {
 553             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 554
 555             p_out += p_outpic->p[i_plane].i_pitch;
 556
 557             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 558                    p_pic->p[i_plane].i_pitch );
 559
 560             p_in += 2 * p_pic->p[i_plane].i_pitch;
 561             p_out += p_outpic->p[i_plane].i_pitch;
 562         }
 563
 564         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 565
 566         /* For TOP field we need to add the last line */
 567         if( i_field == 0 )
 568         {
 569             p_in += p_pic->p[i_plane].i_pitch;
 570             p_out += p_outpic->p[i_plane].i_pitch;
 571             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 572         }
 573     }
 574     EndMerge();
 575 }
 576
 577 static void RenderMean( filter_t *p_filter,
 578                         picture_t *p_outpic, picture_t *p_pic )
 579 {
 580     int i_plane;
 581
 582     /* Copy image and skip lines */
 583     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 584     {
 585         uint8_t *p_in, *p_out_end, *p_out;
 586
 587         p_in = p_pic->p[i_plane].p_pixels;
 588
 589         p_out = p_outpic->p[i_plane].p_pixels;
 590         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 591                              * p_outpic->p[i_plane].i_visible_lines;
 592
 593         /* All lines: mean value */
 594         for( ; p_out < p_out_end ; )
 595         {
 596             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 597                    p_pic->p[i_plane].i_pitch );
 598
 599             p_out += p_outpic->p[i_plane].i_pitch;
 600             p_in += 2 * p_pic->p[i_plane].i_pitch;
 601         }
 602     }
 603     EndMerge();
 604 }
 605
 606 static void RenderBlend( filter_t *p_filter,
 607                          picture_t *p_outpic, picture_t *p_pic )
 608 {
 609     int i_plane;
 610
 611     /* Copy image and skip lines */
 612     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 613     {
 614         uint8_t *p_in, *p_out_end, *p_out;
 615
 616         p_in = p_pic->p[i_plane].p_pixels;
 617
 618         p_out = p_outpic->p[i_plane].p_pixels;
 619         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 620                              * p_outpic->p[i_plane].i_visible_lines;
 621
 622         switch( p_filter->fmt_in.video.i_chroma )
 623         {
 624             case VLC_CODEC_I420:
 625             case VLC_CODEC_J420:
 626             case VLC_CODEC_YV12:
 627                 /* First line: simple copy */
 628                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 629                 p_out += p_outpic->p[i_plane].i_pitch;
 630
 631                 /* Remaining lines: mean value */
 632                 for( ; p_out < p_out_end ; )
 633                 {
 634                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 635                            p_pic->p[i_plane].i_pitch );
 636
 637                     p_out += p_outpic->p[i_plane].i_pitch;
 638                     p_in += p_pic->p[i_plane].i_pitch;
 639                 }
 640                 break;
 641
 642             case VLC_CODEC_I422:
 643             case VLC_CODEC_J422:
 644                 /* First line: simple copy */
 645                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 646                 p_out += p_outpic->p[i_plane].i_pitch;
 647
 648                 /* Remaining lines: mean value */
 649                 if( i_plane == Y_PLANE )
 650                 {
 651                     for( ; p_out < p_out_end ; )
 652                     {
 653                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 654                                p_pic->p[i_plane].i_pitch );
 655
 656                         p_out += p_outpic->p[i_plane].i_pitch;
 657                         p_in += p_pic->p[i_plane].i_pitch;
 658                     }
 659                 }
 660
 661                 else
 662                 {
 663                     for( ; p_out < p_out_end ; )
 664                     {
 665                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 666                                p_pic->p[i_plane].i_pitch );
 667
 668                         p_out += p_outpic->p[i_plane].i_pitch;
 669                         p_in += 2*p_pic->p[i_plane].i_pitch;
 670                     }
 671                 }
 672                 break;
 673         }
 674     }
 675     EndMerge();
 676 }
 677
 678 #undef Merge
 679
 680 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 681                           const void *_p_s2, size_t i_bytes )
 682 {
 683     uint8_t* p_dest = (uint8_t*)_p_dest;
 684     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 685     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 686     uint8_t* p_end = p_dest + i_bytes - 8;
 687
 688     while( p_dest < p_end )
 689     {
 690         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 691         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 692         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 693         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 694         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 695         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 696         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 697         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 698     }
 699
 700     p_end += 8;
 701
 702     while( p_dest < p_end )
 703     {
 704         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 705     }
 706 }
 707
 708 #if defined(CAN_COMPILE_MMXEXT)
 709 static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
 710                          size_t i_bytes )
 711 {
 712     uint8_t* p_dest = (uint8_t*)_p_dest;
 713     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 714     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 715     uint8_t* p_end = p_dest + i_bytes - 8;
 716     while( p_dest < p_end )
 717     {
 718         __asm__  __volatile__( "movq %2,%%mm1;"
 719                                "pavgb %1, %%mm1;"
 720                                "movq %%mm1, %0" :"=m" (*p_dest):
 721                                                  "m" (*p_s1),
 722                                                  "m" (*p_s2) );
 723         p_dest += 8;
 724         p_s1 += 8;
 725         p_s2 += 8;
 726     }
 727
 728     p_end += 8;
 729
 730     while( p_dest < p_end )
 731     {
 732         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 733     }
 734 }
 735 #endif
 736
 737 #if defined(CAN_COMPILE_3DNOW)
 738 static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
 739                         size_t i_bytes )
 740 {
 741     uint8_t* p_dest = (uint8_t*)_p_dest;
 742     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 743     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 744     uint8_t* p_end = p_dest + i_bytes - 8;
 745     while( p_dest < p_end )
 746     {
 747         __asm__  __volatile__( "movq %2,%%mm1;"
 748                                "pavgusb %1, %%mm1;"
 749                                "movq %%mm1, %0" :"=m" (*p_dest):
 750                                                  "m" (*p_s1),
 751                                                  "m" (*p_s2) );
 752         p_dest += 8;
 753         p_s1 += 8;
 754         p_s2 += 8;
 755     }
 756
 757     p_end += 8;
 758
 759     while( p_dest < p_end )
 760     {
 761         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 762     }
 763 }
 764 #endif
 765
 766 #if defined(CAN_COMPILE_SSE)
 767 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 768                        size_t i_bytes )
 769 {
 770     uint8_t* p_dest = (uint8_t*)_p_dest;
 771     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 772     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 773     uint8_t* p_end;
 774     while( (uintptr_t)p_s1 % 16 )
 775     {
 776         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 777     }
 778     p_end = p_dest + i_bytes - 16;
 779     while( p_dest < p_end )
 780     {
 781         __asm__  __volatile__( "movdqu %2,%%xmm1;"
 782                                "pavgb %1, %%xmm1;"
 783                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
 784                                                  "m" (*p_s1),
 785                                                  "m" (*p_s2) );
 786         p_dest += 16;
 787         p_s1 += 16;
 788         p_s2 += 16;
 789     }
 790
 791     p_end += 16;
 792
 793     while( p_dest < p_end )
 794     {
 795         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 796     }
 797 }
 798 #endif
 799
 800 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
 801 static void EndMMX( void )
 802 {
 803     __asm__ __volatile__( "emms" :: );
 804 }
 805 #endif
 806
 807 #if defined(CAN_COMPILE_3DNOW)
 808 static void End3DNow( void )
 809 {
 810     __asm__ __volatile__( "femms" :: );
 811 }
 812 #endif
 813
 814 #ifdef CAN_COMPILE_C_ALTIVEC
 815 static void MergeAltivec( void *_p_dest, const void *_p_s1,
 816                           const void *_p_s2, size_t i_bytes )
 817 {
 818     uint8_t *p_dest = (uint8_t *)_p_dest;
 819     uint8_t *p_s1   = (uint8_t *)_p_s1;
 820     uint8_t *p_s2   = (uint8_t *)_p_s2;
 821     uint8_t *p_end  = p_dest + i_bytes - 15;
 822
 823     /* Use C until the first 16-bytes aligned destination pixel */
 824     while( (uintptr_t)p_dest & 0xF )
 825     {
 826         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 827     }
 828
 829     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
 830     {
 831         /* Unaligned source */
 832         vector unsigned char s1v, s2v, destv;
 833         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
 834         vector unsigned char perm1v, perm2v;
 835
 836         perm1v = vec_lvsl( 0, p_s1 );
 837         perm2v = vec_lvsl( 0, p_s2 );
 838         s1oldv = vec_ld( 0, p_s1 );
 839         s2oldv = vec_ld( 0, p_s2 );
 840
 841         while( p_dest < p_end )
 842         {
 843             s1newv = vec_ld( 16, p_s1 );
 844             s2newv = vec_ld( 16, p_s2 );
 845             s1v    = vec_perm( s1oldv, s1newv, perm1v );
 846             s2v    = vec_perm( s2oldv, s2newv, perm2v );
 847             s1oldv = s1newv;
 848             s2oldv = s2newv;
 849             destv  = vec_avg( s1v, s2v );
 850             vec_st( destv, 0, p_dest );
 851
 852             p_s1   += 16;
 853             p_s2   += 16;
 854             p_dest += 16;
 855         }
 856     }
 857     else
 858     {
 859         /* Aligned source */
 860         vector unsigned char s1v, s2v, destv;
 861
 862         while( p_dest < p_end )
 863         {
 864             s1v   = vec_ld( 0, p_s1 );
 865             s2v   = vec_ld( 0, p_s2 );
 866             destv = vec_avg( s1v, s2v );
 867             vec_st( destv, 0, p_dest );
 868
 869             p_s1   += 16;
 870             p_s2   += 16;
 871             p_dest += 16;
 872         }
 873     }
 874
 875     p_end += 15;
 876
 877     while( p_dest < p_end )
 878     {
 879         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 880     }
 881 }
 882 #endif
 883
 884 #ifdef __ARM_NEON__
 885 static void MergeNEON (void *restrict out, const void *in1,
 886                        const void *in2, size_t n)
 887 {
 888     uint8_t *outp = out;
 889     const uint8_t *in1p = in1;
 890     const uint8_t *in2p = in2;
 891     size_t mis = ((uintptr_t)outp) & 15;
 892
 893     if (mis)
 894     {
 895         MergeGeneric (outp, in1p, in2p, mis);
 896         outp += mis;
 897         in1p += mis;
 898         in2p += mis;
 899         n -= mis;
 900     }
 901
 902     uint8_t *end = outp + (n & ~15);
 903
 904     if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
 905         while (outp < end)
 906             asm volatile (
 907                 "vld1.u8  {q0-q1}, [%[in1]]!\n"
 908                 "vld1.u8  {q2-q3}, [%[in2]]!\n"
 909                 "vhadd.u8 q4, q0, q2\n"
 910                 "vld1.u8  {q6-q7}, [%[in1]]!\n"
 911                 "vhadd.u8 q5, q1, q3\n"
 912                 "vld1.u8  {q8-q9}, [%[in2]]!\n"
 913                 "vhadd.u8 q10, q6, q8\n"
 914                 "vhadd.u8 q11, q7, q9\n"
 915                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 916                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 917                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 918                 :
 919                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 920                   "q8", "q9", "q10", "q11", "memory");
 921     else
 922          while (outp < end)
 923             asm volatile (
 924                 "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
 925                 "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
 926                 "vhadd.u8 q4, q0, q2\n"
 927                 "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
 928                 "vhadd.u8 q5, q1, q3\n"
 929                 "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
 930                 "vhadd.u8 q10, q6, q8\n"
 931                 "vhadd.u8 q11, q7, q9\n"
 932                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 933                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 934                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 935                 :
 936                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 937                   "q8", "q9", "q10", "q11", "memory");
 938     n &= 15;
 939     if (n)
 940         MergeGeneric (outp, in1p, in2p, n);
 941 }
 942 #endif
 943
 944 /*****************************************************************************
 945  * RenderX: This algorithm works on an 8x8 block basis: it copies the top
 946  * field and applies a process to recreate the bottom field:
 947  *  If an 8x8 block is classified as:
 948  *   - progressive: it applies a small blend (1,6,1)
 949  *   - interlaced:
 950  *    * in the MMX version: we do a ME between the 2 fields, if there is a
 951  *    good match we use MC to recreate the bottom field (with a small
 952  *    blend (1,6,1) )
 953  *    * otherwise: it recreates the bottom field by an edge oriented
 954  *    interpolation.
 955  *****************************************************************************/
 956
 957 /* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
 958  * XXX: It needs access to 8x10 pixels.
 959  * We use more than 8 lines to help with scrolling (text)
 960  * (and because XDeint8x8Frame uses line 9).
 961  * XXX: noise detection in smooth/uniform areas doesn't work well,
 962  * but it's not really a problem because they don't have much detail anyway.
 963  */
 964 static inline int ssd( int a ) { return a*a; }
 965 static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 966 {
 967     int y, x;
 968     int ff, fr;
 969     int fc;
 970
 971     /* Detect interlacing */
 972     fc = 0;
 973     for( y = 0; y < 7; y += 2 )
 974     {
 975         ff = fr = 0;
 976         for( x = 0; x < 8; x++ )
 977         {
 978             fr += ssd(src[      x] - src[1*i_src+x]) +
 979                   ssd(src[i_src+x] - src[2*i_src+x]);
 980             ff += ssd(src[      x] - src[2*i_src+x]) +
 981                   ssd(src[i_src+x] - src[3*i_src+x]);
 982         }
 983         if( ff < 6*fr/8 && fr > 32 )
 984             fc++;
 985
 986         src += 2*i_src;
 987     }
 988
 989     return fc < 1 ? false : true;
 990 }
 991 #ifdef CAN_COMPILE_MMXEXT
 992 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
 993 {
 994
 995     int y, x;
 996     int32_t ff, fr;
 997     int fc;
 998
 999     /* Detect interlacing */
1000     fc = 0;
1001     pxor_r2r( mm7, mm7 );
1002     for( y = 0; y < 9; y += 2 )
1003     {
1004         ff = fr = 0;
1005         pxor_r2r( mm5, mm5 );
1006         pxor_r2r( mm6, mm6 );
1007         for( x = 0; x < 8; x+=4 )
1008         {
1009             movd_m2r( src[        x], mm0 );
1010             movd_m2r( src[1*i_src+x], mm1 );
1011             movd_m2r( src[2*i_src+x], mm2 );
1012             movd_m2r( src[3*i_src+x], mm3 );
1013
1014             punpcklbw_r2r( mm7, mm0 );
1015             punpcklbw_r2r( mm7, mm1 );
1016             punpcklbw_r2r( mm7, mm2 );
1017             punpcklbw_r2r( mm7, mm3 );
1018
1019             movq_r2r( mm0, mm4 );
1020
1021             psubw_r2r( mm1, mm0 );
1022             psubw_r2r( mm2, mm4 );
1023
1024             psubw_r2r( mm1, mm2 );
1025             psubw_r2r( mm1, mm3 );
1026
1027             pmaddwd_r2r( mm0, mm0 );
1028             pmaddwd_r2r( mm4, mm4 );
1029             pmaddwd_r2r( mm2, mm2 );
1030             pmaddwd_r2r( mm3, mm3 );
1031             paddd_r2r( mm0, mm2 );
1032             paddd_r2r( mm4, mm3 );
1033             paddd_r2r( mm2, mm5 );
1034             paddd_r2r( mm3, mm6 );
1035         }
1036
1037         movq_r2r( mm5, mm0 );
1038         psrlq_i2r( 32, mm0 );
1039         paddd_r2r( mm0, mm5 );
1040         movd_r2m( mm5, fr );
1041
1042         movq_r2r( mm6, mm0 );
1043         psrlq_i2r( 32, mm0 );
1044         paddd_r2r( mm0, mm6 );
1045         movd_r2m( mm6, ff );
1046
1047         if( ff < 6*fr/8 && fr > 32 )
1048             fc++;
1049
1050         src += 2*i_src;
1051     }
1052     return fc;
1053 }
1054 #endif
1055
/**
 * Progressive 8x8 block: keeps the lines of src1 and rebuilds the lines in
 * between with a (1,6,1) blend.
 *
 * @param dst    output block (8 lines are written)
 * @param i_dst  output pitch, in bytes
 * @param src1   lines that are copied as-is (5 lines are read)
 * @param i_src1 distance between two src1 lines, in bytes
 * @param src2   lines that get blended (4 lines are read)
 * @param i_src2 distance between two src2 lines, in bytes
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *p_blend = dst + i_dst;
        uint8_t *p_below = src1 + i_src1;

        /* Kept line */
        memcpy( dst, src1, 8 );

        /* Rebuilt line: (above + 6 * current + below + 4) / 8 */
        for( int i = 0; i < 8; i++ )
            p_blend[i] = ( src1[i] + 6*src2[i] + p_below[i] + 4 ) >> 3;

        dst   = p_blend + i_dst;
        src1  = p_below;
        src2 += i_src2;
    }
}
1076
1077 #ifdef CAN_COMPILE_MMXEXT
1078 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
1079                                          uint8_t *src1, int i_src1,
1080                                          uint8_t *src2, int i_src2 )
1081 {
1082     static const uint64_t m_4 = INT64_C(0x0004000400040004);
1083     int y, x;
1084
1085     /* Progressive */
1086     pxor_r2r( mm7, mm7 );
1087     for( y = 0; y < 8; y += 2 )
1088     {
1089         for( x = 0; x < 8; x +=4 )
1090         {
1091             movd_m2r( src1[x], mm0 );
1092             movd_r2m( mm0, dst[x] );
1093
1094             movd_m2r( src2[x], mm1 );
1095             movd_m2r( src1[i_src1+x], mm2 );
1096
1097             punpcklbw_r2r( mm7, mm0 );
1098             punpcklbw_r2r( mm7, mm1 );
1099             punpcklbw_r2r( mm7, mm2 );
1100             paddw_r2r( mm1, mm1 );
1101             movq_r2r( mm1, mm3 );
1102             paddw_r2r( mm3, mm3 );
1103             paddw_r2r( mm2, mm0 );
1104             paddw_r2r( mm3, mm1 );
1105             paddw_m2r( m_4, mm1 );
1106             paddw_r2r( mm1, mm0 );
1107             psraw_i2r( 3, mm0 );
1108             packuswb_r2r( mm7, mm0 );
1109             movd_r2m( mm0, dst[i_dst+x] );
1110         }
1111         dst += 2*i_dst;
1112         src1 += i_src1;
1113         src2 += i_src2;
1114     }
1115 }
1116
1117 #endif
1118
1119 /* For debug */
/* Debug helper: paints an 8x8 block of dst (pitch i_dst) with the value v. */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_line = 0;

    while( i_line < 8 )
    {
        memset( dst + i_line * i_dst, v, 8 );
        i_line++;
    }
}
1126
1127 /* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for block that miss a
1128  * neighbour
1129  * (Use 8x9 pixels)
1130  * TODO: a better one for the inner part.
1131  */
/**
 * Simple field interpolation for an 8x8 block: even lines are copied, odd
 * lines are the average of the source lines above and below.
 *
 * @param dst   output block (8 lines are written)
 * @param i_dst output pitch, in bytes
 * @param src   source block (lines 0, 2, 4, 6 and 8 are read)
 * @param i_src source pitch, in bytes
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *p_next = src + 2*i_src;   /* next line of the same field */
        uint8_t *p_gap  = dst + i_dst;     /* line to rebuild */

        memcpy( dst, src, 8 );

        for( int i = 0; i < 8; i++ )
            p_gap[i] = ( src[i] + p_next[i] ) >> 1;

        dst = p_gap + 1*i_dst;
        src = p_next;
    }
}
1149 #ifdef CAN_COMPILE_MMXEXT
1150 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1151                                           uint8_t *src, int i_src )
1152 {
1153     int y;
1154
1155     /* Interlaced */
1156     for( y = 0; y < 8; y += 2 )
1157     {
1158         movq_m2r( src[0], mm0 );
1159         movq_r2m( mm0, dst[0] );
1160         dst += i_dst;
1161
1162         movq_m2r( src[2*i_src], mm1 );
1163         pavgb_r2r( mm1, mm0 );
1164
1165         movq_r2m( mm0, dst[0] );
1166
1167         dst += 1*i_dst;
1168         src += 2*i_src;
1169     }
1170 }
1171 #endif
1172
1173 /* XDeint8x8Field: Edge oriented interpolation
1174  * (Need -4 and +5 pixels H, +1 line)
1175  */
/**
 * Edge oriented field interpolation for an 8x8 block.
 *
 * @param dst   output block (8 lines are written)
 * @param i_dst output pitch, in bytes
 * @param src   source block; pixels x-4 .. x+5 of lines 0, 2, 4, 6 and 8
 *              are read, so the block needs neighbours on both sides
 * @param i_src source pitch, in bytes
 *
 * Even lines are copied. For each pixel of an odd line, three 8-pixel sums
 * of absolute differences between the line above and the line below are
 * computed (one straight, two along opposite diagonals); the pixel is then
 * the average of the two source pixels along the selected direction.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_up   = src;
        const uint8_t *p_down = src + 2*i_src;

        memcpy( dst, p_up, 8 );
        dst += i_dst;

        for( int x = 0; x < 8; x++ )
        {
            /* 8 taps to match the MMX version (5 would be enough) */
            int i_diag_a = 0;   /* up shifted by -2 against down */
            int i_vert   = 0;   /* same column */
            int i_diag_b = 0;   /* up shifted by +2 against down */

            for( int k = 0; k < 8; k++ )
            {
                i_diag_a += abs( p_up[x-4+k] - p_down[x-2+k] );
                i_vert   += abs( p_up[x-3+k] - p_down[x-3+k] );
                i_diag_b += abs( p_up[x-2+k] - p_down[x-4+k] );
            }

            int i_sum;
            if( i_diag_a < i_vert && i_vert <= i_diag_b )
                i_sum = p_up[x-1] + p_down[x+1];
            else if( i_diag_b < i_vert && i_vert <= i_diag_a )
                i_sum = p_up[x+1] + p_down[x-1];
            else
                i_sum = p_up[x+0] + p_down[x+0];

            dst[x] = i_sum >> 1;
        }

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1219 #ifdef CAN_COMPILE_MMXEXT
1220 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1221                                          uint8_t *src, int i_src )
1222 {
1223     int y, x;
1224
1225     /* Interlaced */
1226     for( y = 0; y < 8; y += 2 )
1227     {
1228         memcpy( dst, src, 8 );
1229         dst += i_dst;
1230
1231         for( x = 0; x < 8; x++ )
1232         {
1233             uint8_t *src2 = &src[2*i_src];
1234             int32_t c0, c1, c2;
1235
1236             movq_m2r( src[x-2], mm0 );
1237             movq_m2r( src[x-3], mm1 );
1238             movq_m2r( src[x-4], mm2 );
1239
1240             psadbw_m2r( src2[x-4], mm0 );
1241             psadbw_m2r( src2[x-3], mm1 );
1242             psadbw_m2r( src2[x-2], mm2 );
1243
1244             movd_r2m( mm0, c2 );
1245             movd_r2m( mm1, c1 );
1246             movd_r2m( mm2, c0 );
1247
1248             if( c0 < c1 && c1 <= c2 )
1249                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1250             else if( c2 < c1 && c1 <= c0 )
1251                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1252             else
1253                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1254         }
1255
1256         dst += 1*i_dst;
1257         src += 2*i_src;
1258     }
1259 }
1260 #endif
1261
1262 /* NxN arbitray size (and then only use pixel in the NxN block)
1263  */
/**
 * Interlacing detector for a block of arbitrary size.
 *
 * @param src      top-left pixel of the block
 * @param i_src    source pitch, in bytes
 * @param i_height block height, in lines (note: height comes BEFORE width)
 * @param i_width  block width, in pixels
 * @return 1 if at least two line pairs look interlaced, 0 otherwise
 *
 * FIXME way too simple, need to be more like XDeint8x8Detect.
 * The two sums are never reset, so each test sees the totals accumulated
 * since the first line of the block.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    int i_frame = 0;    /* squared differences between adjacent lines */
    int i_field = 0;    /* squared differences between lines two apart */
    int i_hits  = 0;

    for( int y = 0; y < i_height - 2; y += 2 )
    {
        const uint8_t *p_line = &src[y*i_src];

        for( int x = 0; x < i_width; x++ )
        {
            const int i_cur = p_line[x];

            i_frame += ssd( i_cur - p_line[1*i_src+x] );
            i_field += ssd( i_cur - p_line[2*i_src+x] );
        }

        if( i_field < i_frame && i_frame > i_width / 2 )
            i_hits++;
    }

    return ( i_hits >= 2 ) ? true : false;
}
1290
/**
 * Progressive rendering of a block of arbitrary size: even lines are
 * copied, odd lines are a (1,2,1) blend of the three surrounding source
 * lines. The last line pair has no line below and uses a plain average.
 *
 * @param dst      output block
 * @param i_dst    output pitch, in bytes
 * @param src      source block
 * @param i_src    source pitch, in bytes
 * @param i_width  block width, in pixels
 * @param i_height block height, in lines
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    for( int y = 0; y < i_height; y += 2 )
    {
        uint8_t *p_odd = dst + i_dst;

        memcpy( dst, src, i_width );

        if( y >= i_height - 2 )
        {
            /* Blend last line */
            for( int i = 0; i < i_width; i++ )
                p_odd[i] = ( src[i] + src[1*i_src+i] ) >> 1;
        }
        else
        {
            for( int i = 0; i < i_width; i++ )
                p_odd[i] = ( src[i] + 2*src[1*i_src+i] + src[2*i_src+i] + 2 ) >> 2;
        }

        dst  = p_odd + 1*i_dst;
        src += 2*i_src;
    }
}
1318
/**
 * Interlaced rendering of a block of arbitrary size: even lines are copied,
 * odd lines are the average of the source lines above and below (same
 * field). The last line pair has no line two below and averages with the
 * next source line instead.
 *
 * @param dst      output block
 * @param i_dst    output pitch, in bytes
 * @param src      source block
 * @param i_src    source pitch, in bytes
 * @param i_width  block width, in pixels
 * @param i_height block height, in lines
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    for( int y = 0; y < i_height; y += 2 )
    {
        uint8_t *p_odd = dst + i_dst;

        memcpy( dst, src, i_width );

        if( y >= i_height - 2 )
        {
            /* Blend last line */
            for( int i = 0; i < i_width; i++ )
                p_odd[i] = ( src[i] + src[i_src+i] ) >> 1;
        }
        else
        {
            for( int i = 0; i < i_width; i++ )
                p_odd[i] = ( src[i] + src[2*i_src+i] ) >> 1;
        }

        dst  = p_odd + 1*i_dst;
        src += 2*i_src;
    }
}
1346
/**
 * Deinterlaces a block of arbitrary size: runs the NxN detector and renders
 * the block as interlaced (field) or progressive (frame) accordingly.
 *
 * @param dst      output block
 * @param i_dst    output pitch, in bytes
 * @param src      source block
 * @param i_src    source pitch, in bytes
 * @param i_width  block width, in pixels
 * @param i_height block height, in lines
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect() takes (height, width), in that order. The previous
     * call passed them swapped, so the detector scanned a transposed area
     * and could read pixels outside of the block. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1355
1356
/* Returns the median (middle value) of three integers. */
static inline int median( int a, int b, int c )
{
    /* Order a and b first ... */
    int i_low  = ( b < a ) ? b : a;
    int i_high = ( b < a ) ? a : b;

    /* ... then widen the range with c if needed */
    if( c < i_low )
        i_low = c;
    else if( c > i_high )
        i_high = c;

    /* Removing both extremes from the total leaves the middle value */
    return a + b + c - i_low - i_high;
}
1372
1373
1374 /* XDeintBand8x8:
1375  */
/**
 * Deinterlaces one band of 8 lines with the C routines.
 *
 * @param dst    first output pixel of the band
 * @param i_dst  output pitch, in bytes
 * @param src    first source pixel of the band
 * @param i_src  source pitch, in bytes
 * @param i_mbx  number of complete 8-pixel wide blocks in the band
 * @param i_modx width of the leftover block on the right (0 if none)
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive block: even lines kept, odd lines blended */
            XDeint8x8MergeC( dst, i_dst,
                             &src[0*i_src], 2*i_src,
                             &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced block: the first and last blocks of the band miss a
         * horizontal neighbour, so they get the simple interpolation */
        const int b_border = ( i_mb == 0 ) || ( i_mb == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    /* Leftover columns */
    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1406 #ifdef CAN_COMPILE_MMXEXT
/**
 * Deinterlaces one band of 8 lines with the MMXEXT routines.
 *
 * @param dst    first output pixel of the band
 * @param i_dst  output pitch, in bytes
 * @param src    first source pixel of the band
 * @param i_src  source pitch, in bytes
 * @param i_mbx  number of complete 8-pixel wide blocks in the band
 * @param i_modx width of the leftover block on the right (0 if none);
 *               it is handled by the C routine
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive block: even lines kept, odd lines blended */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  &src[0*i_src], 2*i_src,
                                  &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced block: the first and last blocks of the band miss a
         * horizontal neighbour, so they get the simple interpolation */
        const int b_border = ( i_mb == 0 ) || ( i_mb == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    /* Leftover columns */
    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1438 #endif
1439
1440 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1441 {
1442     int i_plane;
1443
1444     /* Copy image and skip lines */
1445     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1446     {
1447         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1448         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1449
1450         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1451         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1452
1453         const int i_dst = p_outpic->p[i_plane].i_pitch;
1454         const int i_src = p_pic->p[i_plane].i_pitch;
1455
1456         int y, x;
1457
1458         for( y = 0; y < i_mby; y++ )
1459         {
1460             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1461             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1462
1463 #ifdef CAN_COMPILE_MMXEXT
1464             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1465                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1466             else
1467 #endif
1468                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1469         }
1470
1471         /* Last line (C only)*/
1472         if( i_mody )
1473         {
1474             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1475             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1476
1477             for( x = 0; x < i_mbx; x++ )
1478             {
1479                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1480
1481                 dst += 8;
1482                 src += 8;
1483             }
1484
1485             if( i_modx )
1486                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1487         }
1488     }
1489
1490 #ifdef CAN_COMPILE_MMXEXT
1491     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1492         emms();
1493 #endif
1494 }
1495
1496 /*****************************************************************************
1497  * Yadif (Yet Another DeInterlacing Filter).
1498  *****************************************************************************/
1499 /* */
/* Settings handed to the yadif line filters. */
struct vf_priv_s {
    /*
     * Yadif mode:
     *   0: one output frame per input frame
     *   1: one output frame per input field
     *   2: same as 0, without the spatial interlacing check
     *   3: same as 1, without the spatial interlacing check
     *
     * Only bit 0x02 matters in vlc: the per-frame/per-field choice
     * (bit 0x01) is handled by the deinterlacer itself.
     */
    int mode;
};
1511
/* Compatibility definitions expected by yadif.h */

/* Pointer-sized integer type (I am unsure it is the right one) */
typedef intptr_t x86_reg;

/* Absolute value; the argument is evaluated twice */
#define FFABS(a)        ((a) >= 0 ? (a) : (-(a)))

/* Minimum / maximum of two and three values */
#define FFMIN(a,b)      __MIN(a,b)
#define FFMAX(a,b)      __MAX(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)

1520
1521 /* yadif.h comes from vf_yadif.c of mplayer project */
1522 #include "yadif.h"
1523
1524 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1525 {
1526     VLC_UNUSED(p_src);
1527
1528     filter_sys_t *p_sys = p_filter->p_sys;
1529
1530     /* */
1531     assert( i_order >= 0 && i_order <= 2 ); /* 2 = soft field repeat */
1532     assert( i_field == 0 || i_field == 1 );
1533
1534     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1535     picture_t *p_prev = p_sys->pp_history[0];
1536     picture_t *p_cur  = p_sys->pp_history[1];
1537     picture_t *p_next = p_sys->pp_history[2];
1538
1539     /* Account for soft field repeat.
1540
1541        The "parity" parameter affects the algorithm like this (from yadif.h):
1542        uint8_t *prev2= parity ? prev : cur ;
1543        uint8_t *next2= parity ? cur  : next;
1544
1545        The original parity expression that was used here is:
1546        (i_field ^ (i_order == i_field)) & 1
1547
1548        Truth table:
1549        i_field = 0, i_order = 0  => 1
1550        i_field = 1, i_order = 1  => 0
1551        i_field = 1, i_order = 0  => 1
1552        i_field = 0, i_order = 1  => 0
1553
1554        => equivalent with e.g.  (1 - i_order)  or  (i_order + 1) % 2
1555
1556        Thus, in a normal two-field frame,
1557              parity 1 = first field  (i_order == 0)
1558              parity 0 = second field (i_order == 1)
1559
1560        Now, with three fields, where the third is a copy of the first,
1561              i_order = 0  =>  parity 1 (as usual)
1562              i_order = 1  =>  due to the repeat, prev = cur, but also next = cur.
1563                               Because in such a case there is no motion (otherwise field repeat makes no sense),
1564                               we don't actually need to invoke Yadif's filter(). Thus, set "parity" to 2,
1565                               and use this to bypass the filter.
1566              i_order = 2  =>  parity 0 (as usual)
1567     */
1568     int yadif_parity;
1569     if( p_cur  &&  p_cur->i_nb_fields > 2 )
1570         yadif_parity = (i_order + 1) % 3; /* 1, *2*, 0; where 2 is a special value meaning "bypass filter". */
1571     else
1572         yadif_parity = (i_order + 1) % 2; /* 1, 0 */
1573
1574     /* Filter if we have all the pictures we need */
1575     if( p_prev && p_cur && p_next )
1576     {
1577         /* */
1578         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1579 #if defined(HAVE_YADIF_SSE2)
1580         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1581             filter = yadif_filter_line_mmx2;
1582         else
1583 #endif
1584             filter = yadif_filter_line_c;
1585
1586         for( int n = 0; n < p_dst->i_planes; n++ )
1587         {
1588             const plane_t *prevp = &p_prev->p[n];
1589             const plane_t *curp  = &p_cur->p[n];
1590             const plane_t *nextp = &p_next->p[n];
1591             plane_t *dstp        = &p_dst->p[n];
1592
1593             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1594             {
1595                 if( (y % 2) == i_field  ||  yadif_parity == 2 )
1596                 {
1597                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1598                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1599                 }
1600                 else
1601                 {
1602                     struct vf_priv_s cfg;
1603                     /* Spatial checks only when enough data */
1604                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1605
1606                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1607                     filter( &cfg,
1608                             &dstp->p_pixels[y * dstp->i_pitch],
1609                             &prevp->p_pixels[y * prevp->i_pitch],
1610                             &curp->p_pixels[y * curp->i_pitch],
1611                             &nextp->p_pixels[y * nextp->i_pitch],
1612                             dstp->i_visible_pitch,
1613                             curp->i_pitch,
1614                             yadif_parity );
1615                 }
1616
1617                 /* We duplicate the first and last lines */
1618                 if( y == 1 )
1619                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1620                 else if( y == dstp->i_visible_lines - 2 )
1621                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1622             }
1623         }
1624
1625         p_sys->i_frame_offset = 1; /* p_curr will be rendered at next frame, too */
1626
1627         return VLC_SUCCESS;
1628     }
1629     else if( !p_prev && !p_cur && p_next )
1630     {
1631         /* NOTE: For the first frame, we use the default frame offset
1632                  as set by Open() or SetFilterMethod(). It is always 0. */
1633
1634         /* FIXME not good as it does not use i_order/i_field */
1635         RenderX( p_dst, p_next );
1636         return VLC_SUCCESS;
1637     }
1638     else
1639     {
1640         p_sys->i_frame_offset = 1; /* p_curr will be rendered at next frame */
1641
1642         return VLC_EGENERIC;
1643     }
1644 }
1645
1646 /*****************************************************************************
1647  * video filter2 functions
1648  *****************************************************************************/
1649 #define DEINTERLACE_DST_SIZE 3
1650 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1651 {
1652     filter_sys_t *p_sys = p_filter->p_sys;
1653     picture_t *p_dst[DEINTERLACE_DST_SIZE];
1654
1655     /* Request output picture */
1656     p_dst[0] = filter_NewPicture( p_filter );
1657     if( p_dst[0] == NULL )
1658     {
1659         picture_Release( p_pic );
1660         return NULL;
1661     }
1662     picture_CopyProperties( p_dst[0], p_pic );
1663
1664     /* Any unused p_dst pointers must be NULL, because they are used to check how many output frames we have. */
1665     for( int i = 1; i < DEINTERLACE_DST_SIZE; ++i )
1666         p_dst[i] = NULL;
1667
1668     /* Update the input frame history, if the currently active algorithm needs it. */
1669     if( p_sys->b_use_frame_history )
1670     {
1671         /* Duplicate the picture
1672          * TODO when the vout rework is finished, picture_Hold() might be enough
1673          * but becarefull, the pitches must match */
1674         picture_t *p_dup = picture_NewFromFormat( &p_pic->format );
1675         if( p_dup )
1676             picture_Copy( p_dup, p_pic );
1677
1678         /* Slide the history */
1679         if( p_sys->pp_history[0] )
1680             picture_Release( p_sys->pp_history[0] );
1681         for( int i = 1; i < HISTORY_SIZE; i++ )
1682             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1683         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1684     }
1685
1686     /* Slide the metadata history. */
1687     for( int i = 1; i < METADATA_SIZE; i++ )
1688     {
1689         p_sys->meta.pi_date[i-1]            = p_sys->meta.pi_date[i];
1690         p_sys->meta.pi_nb_fields[i-1]       = p_sys->meta.pi_nb_fields[i];
1691         p_sys->meta.pb_top_field_first[i-1] = p_sys->meta.pb_top_field_first[i];
1692     }
1693     /* The last element corresponds to the current input frame. */
1694     p_sys->meta.pi_date[METADATA_SIZE-1]            = p_pic->date;
1695     p_sys->meta.pi_nb_fields[METADATA_SIZE-1]       = p_pic->i_nb_fields;
1696     p_sys->meta.pb_top_field_first[METADATA_SIZE-1] = p_pic->b_top_field_first;
1697
1698     /* Remember the frame offset that we should use for this frame.
1699        The value in p_sys will be updated to reflect the correct value
1700        for the *next* frame when we call the renderer. */
1701     int i_frame_offset = p_sys->i_frame_offset;
1702     int i_meta_idx     = (METADATA_SIZE-1) - i_frame_offset;
1703
1704     /* These correspond to the current *outgoing* frame. */
1705     bool b_top_field_first;
1706     int i_nb_fields;
1707     if( i_frame_offset != CUSTOM_PTS )
1708     {
1709         /* Pick the correct values from the history. */
1710         b_top_field_first = p_sys->meta.pb_top_field_first[i_meta_idx];
1711         i_nb_fields       = p_sys->meta.pi_nb_fields[i_meta_idx];
1712     }
1713     else
1714     {
1715         /* Framerate doublers must not request CUSTOM_PTS, as they need the original field timings,
1716            and need Deinterlace() to allocate the correct number of output frames. */
1717         assert( !p_sys->b_double_rate );
1718
1719         /* NOTE: i_nb_fields is only used for framerate doublers, so it is unused in this case.
1720                  b_top_field_first is only passed to the algorithm. We assume that algorithms that
1721                  request CUSTOM_PTS will, if necessary, extract the TFF/BFF information themselves.
1722         */
1723         b_top_field_first = p_pic->b_top_field_first; /* this is not guaranteed to be meaningful */
1724         i_nb_fields       = p_pic->i_nb_fields;       /* unused */
1725     }
1726
1727     /* For framerate doublers, determine field duration and allocate output frames. */
1728     mtime_t i_field_dur = 0;
1729     int i_double_rate_alloc_end = 0; /* One past last for allocated output frames in p_dst[].
1730                                         Used only for framerate doublers. Will be inited below.
1731                                         Declared here because the PTS logic needs the result. */
1732     if( p_sys->b_double_rate )
1733     {
1734         /* Calculate one field duration. */
1735         int i = 0;
1736         int iend = METADATA_SIZE-1;
1737         /* Find oldest valid logged date. Note: the current input frame doesn't count. */
1738         for( ; i < iend; i++ )
1739             if( p_sys->meta.pi_date[i] > VLC_TS_INVALID )
1740                 break;
1741         if( i < iend )
1742         {
1743             /* Count how many fields the valid history entries (except the new frame) represent. */
1744             int i_fields_total = 0;
1745             for( int j = i ; j < iend; j++ )
1746                 i_fields_total += p_sys->meta.pi_nb_fields[j];
1747             /* One field took this long. */
1748             i_field_dur = (p_pic->date - p_sys->meta.pi_date[i]) / i_fields_total;
1749         }
1750         /* Note that we default to field duration 0 if it could not be determined.
1751            This behaves the same as the old code - leaving the extra output frame
1752            dates the same as p_pic->date if the last cached date was not valid.
1753         */
1754
1755         i_double_rate_alloc_end = i_nb_fields;
1756         if( i_nb_fields > DEINTERLACE_DST_SIZE )
1757         {
1758             /* Note that the effective buffer size depends also on the constant private_picture in vout_wrapper.c,
1759                since that determines the maximum number of output pictures filter_NewPicture() will successfully
1760                allocate for one input frame.
1761             */
1762             msg_Err( p_filter, "Framerate doubler: output buffer too small; fields = %d, buffer size = %d. Dropping the remaining fields.", i_nb_fields, DEINTERLACE_DST_SIZE );
1763             i_double_rate_alloc_end = DEINTERLACE_DST_SIZE;
1764         }
1765
1766         /* Allocate output frames. */
1767         for( int i = 1; i < i_double_rate_alloc_end ; ++i )
1768         {
1769             p_dst[i-1]->p_next =
1770             p_dst[i]           = filter_NewPicture( p_filter );
1771             if( p_dst[i] )
1772             {
1773                 picture_CopyProperties( p_dst[i], p_pic );
1774             }
1775             else
1776             {
1777                 msg_Err( p_filter, "Framerate doubler: could not allocate output frame %d", i+1 );
1778                 i_double_rate_alloc_end = i; /* Inform the PTS logic about the correct end position. */
1779                 break; /* If this happens, the rest of the allocations aren't likely to work, either... */
1780             }
1781         }
1782         /* Now we have allocated *up to* the correct number of frames; normally, exactly the correct number.
1783            Upon alloc failure, we may have succeeded in allocating *some* output frames, but fewer than
1784            were desired. In such a case, as many will be rendered as were successfully allocated.
1785
1786            Note that now p_dst[i] != NULL for 0 <= i < i_double_rate_alloc_end. */
1787     }
1788     assert( p_sys->b_double_rate == true  ||  p_dst[1] == NULL );
1789     assert( i_nb_fields > 2  ||  p_dst[2] == NULL );
1790
1791     /* Render */
1792     switch( p_sys->i_mode )
1793     {
1794         case DEINTERLACE_DISCARD:
1795             RenderDiscard( p_filter, p_dst[0], p_pic, 0 );
1796             break;
1797
1798         case DEINTERLACE_BOB:
1799             RenderBob( p_filter, p_dst[0], p_pic, !b_top_field_first );
1800             if( p_dst[1] )
1801                 RenderBob( p_filter, p_dst[1], p_pic, b_top_field_first );
1802             if( p_dst[2] )
1803                 RenderBob( p_filter, p_dst[2], p_pic, !b_top_field_first );
1804             break;;
1805
1806         case DEINTERLACE_LINEAR:
1807             RenderLinear( p_filter, p_dst[0], p_pic, !b_top_field_first );
1808             if( p_dst[1] )
1809                 RenderLinear( p_filter, p_dst[1], p_pic, b_top_field_first );
1810             if( p_dst[2] )
1811                 RenderLinear( p_filter, p_dst[2], p_pic, !b_top_field_first );
1812             break;
1813
1814         case DEINTERLACE_MEAN:
1815             RenderMean( p_filter, p_dst[0], p_pic );
1816             break;
1817
1818         case DEINTERLACE_BLEND:
1819             RenderBlend( p_filter, p_dst[0], p_pic );
1820             break;
1821
1822         case DEINTERLACE_X:
1823             RenderX( p_dst[0], p_pic );
1824             break;
1825
1826         case DEINTERLACE_YADIF:
1827             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, 0 ) )
1828                 goto drop;
1829             break;
1830
1831         case DEINTERLACE_YADIF2X:
1832             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, !b_top_field_first ) )
1833                 goto drop;
1834             if( p_dst[1] )
1835                 RenderYadif( p_filter, p_dst[1], p_pic, 1, b_top_field_first );
1836             if( p_dst[2] )
1837                 RenderYadif( p_filter, p_dst[2], p_pic, 2, !b_top_field_first );
1838             break;
1839     }
1840
1841     /* Set output timestamps, if the algorithm didn't request CUSTOM_PTS for this frame. */
1842     assert( i_frame_offset <= METADATA_SIZE  ||  i_frame_offset == CUSTOM_PTS );
1843     if( i_frame_offset != CUSTOM_PTS )
1844     {
1845         mtime_t i_base_pts = p_sys->meta.pi_date[i_meta_idx];
1846
1847         /* Note: in the usual case (i_frame_offset = 0  and  b_double_rate = false),
1848                  this effectively does nothing. This is needed to correct the timestamp
1849                  when i_frame_offset > 0. */
1850         p_dst[0]->date = i_base_pts;
1851
1852         if( p_sys->b_double_rate )
1853         {
1854             /* Processing all actually allocated output frames. */
1855             for( int i = 1; i < i_double_rate_alloc_end; ++i )
1856             {
1857                 /* XXX it's not really good especially for the first picture, but
1858                  * I don't think that delaying by one frame is worth it */
1859                 if( i_base_pts > VLC_TS_INVALID )
1860                     p_dst[i]->date = i_base_pts + i * i_field_dur;
1861                 else
1862                     p_dst[i]->date = VLC_TS_INVALID;
1863             }
1864         }
1865     }
1866
1867     p_dst[0]->b_progressive = true;
1868     for( int i = 1; i < DEINTERLACE_DST_SIZE; ++i )
1869     {
1870         if( p_dst[i] )
1871         {
1872             p_dst[i]->b_progressive = true;
1873             p_dst[i]->i_nb_fields = 2;
1874         }
1875     }
1876
1877     picture_Release( p_pic );
1878     return p_dst[0];
1879
1880 drop:
1881     picture_Release( p_dst[0] );
1882     for( int i = 1; i < DEINTERLACE_DST_SIZE; ++i )
1883     {
1884         if( p_dst[i] )
1885             picture_Release( p_dst[i] );
1886     }
1887     picture_Release( p_pic );
1888     return NULL;
1889 }
1890
1891 static void Flush( filter_t *p_filter )
1892 {
1893     filter_sys_t *p_sys = p_filter->p_sys;
1894
1895     for( int i = 0; i < METADATA_SIZE; i++ )
1896     {
1897         p_sys->meta.pi_date[i] = VLC_TS_INVALID;
1898         p_sys->meta.pi_nb_fields[i] = 2;
1899         p_sys->meta.pb_top_field_first[i] = true;
1900     }
1901     p_sys->i_frame_offset = 0; /* reset to default value (first frame after flush cannot have offset) */
1902     for( int i = 0; i < HISTORY_SIZE; i++ )
1903     {
1904         if( p_sys->pp_history[i] )
1905             picture_Release( p_sys->pp_history[i] );
1906         p_sys->pp_history[i] = NULL;
1907     }
1908 }
1909
1910 static int Mouse( filter_t *p_filter,
1911                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1912 {
1913     VLC_UNUSED(p_old);
1914     *p_mouse = *p_new;
1915     if( p_filter->p_sys->b_half_height )
1916         p_mouse->i_y *= 2;
1917     return VLC_SUCCESS;
1918 }
1919
1920
1921 /*****************************************************************************
1922  * Open
1923  *****************************************************************************/
1924 static int Open( vlc_object_t *p_this )
1925 {
1926     filter_t *p_filter = (filter_t*)p_this;
1927     filter_sys_t *p_sys;
1928
1929     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
1930         return VLC_EGENERIC;
1931
1932     /* */
1933     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
1934     if( !p_sys )
1935         return VLC_ENOMEM;
1936
1937     p_sys->i_mode = DEINTERLACE_BLEND;
1938     p_sys->b_double_rate = false;
1939     p_sys->b_half_height = true;
1940     p_sys->b_use_frame_history = false;
1941     for( int i = 0; i < METADATA_SIZE; i++ )
1942     {
1943         p_sys->meta.pi_date[i] = VLC_TS_INVALID;
1944         p_sys->meta.pi_nb_fields[i] = 2;
1945         p_sys->meta.pb_top_field_first[i] = true;
1946     }
1947     p_sys->i_frame_offset = 0; /* start with default value (first-ever frame cannot have offset) */
1948     for( int i = 0; i < HISTORY_SIZE; i++ )
1949         p_sys->pp_history[i] = NULL;
1950
1951 #if defined(CAN_COMPILE_C_ALTIVEC)
1952     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
1953     {
1954         p_sys->pf_merge = MergeAltivec;
1955         p_sys->pf_end_merge = NULL;
1956     }
1957     else
1958 #endif
1959 #if defined(CAN_COMPILE_SSE)
1960     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1961     {
1962         p_sys->pf_merge = MergeSSE2;
1963         p_sys->pf_end_merge = EndMMX;
1964     }
1965     else
1966 #endif
1967 #if defined(CAN_COMPILE_MMXEXT)
1968     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1969     {
1970         p_sys->pf_merge = MergeMMXEXT;
1971         p_sys->pf_end_merge = EndMMX;
1972     }
1973     else
1974 #endif
1975 #if defined(CAN_COMPILE_3DNOW)
1976     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
1977     {
1978         p_sys->pf_merge = Merge3DNow;
1979         p_sys->pf_end_merge = End3DNow;
1980     }
1981     else
1982 #endif
1983 #if defined __ARM_NEON__
1984     if( vlc_CPU() & CPU_CAPABILITY_NEON )
1985     {
1986         p_sys->pf_merge = MergeNEON;
1987         p_sys->pf_end_merge = NULL;
1988     }
1989     else
1990 #endif
1991     {
1992         p_sys->pf_merge = MergeGeneric;
1993         p_sys->pf_end_merge = NULL;
1994     }
1995
1996     /* */
1997     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
1998                        p_filter->p_cfg );
1999
2000     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
2001     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
2002     free( psz_mode );
2003
2004     /* */
2005     video_format_t fmt;
2006     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
2007     if( !p_filter->b_allow_fmt_out_change &&
2008         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
2009           fmt.i_height != p_filter->fmt_in.video.i_height ) )
2010     {
2011         Close( VLC_OBJECT(p_filter) );
2012         return VLC_EGENERIC;
2013     }
2014     p_filter->fmt_out.video = fmt;
2015     p_filter->fmt_out.i_codec = fmt.i_chroma;
2016     p_filter->pf_video_filter = Deinterlace;
2017     p_filter->pf_video_flush  = Flush;
2018     p_filter->pf_video_mouse  = Mouse;
2019
2020     msg_Dbg( p_filter, "deinterlacing" );
2021
2022     return VLC_SUCCESS;
2023 }
2024
2025 /*****************************************************************************
2026  * Close: clean up the filter
2027  *****************************************************************************/
2028 static void Close( vlc_object_t *p_this )
2029 {
2030     filter_t *p_filter = (filter_t*)p_this;
2031
2032     Flush( p_filter );
2033     free( p_filter->p_sys );
2034 }
2035