git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000-2011 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *         Juha Jeronen <juha.jeronen@jyu.fi> (Phosphor and IVTC modes)
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <assert.h>
  34 #include <stdint.h> /* int_fast32_t */
  35
  36 #ifdef HAVE_ALTIVEC_H
  37 #   include <altivec.h>
  38 #endif
  39
  40 #include <vlc_common.h>
  41 #include <vlc_plugin.h>
  42 #include <vlc_filter.h>
  43 #include <vlc_cpu.h>
  44
  45 #ifdef CAN_COMPILE_MMXEXT
  46 #   include "mmx.h"
  47 #endif
  48
  49 #define DEINTERLACE_DISCARD  1
  50 #define DEINTERLACE_MEAN     2
  51 #define DEINTERLACE_BLEND    3
  52 #define DEINTERLACE_BOB      4
  53 #define DEINTERLACE_LINEAR   5
  54 #define DEINTERLACE_X        6
  55 #define DEINTERLACE_YADIF    7
  56 #define DEINTERLACE_YADIF2X  8
  57 #define DEINTERLACE_PHOSPHOR 9
  58 #define DEINTERLACE_IVTC     10
  59
  60 /*****************************************************************************
  61  * Module descriptor
  62  *****************************************************************************/
  63 static int  Open ( vlc_object_t * );
  64 static void Close( vlc_object_t * );
  65
  66 #define MODE_TEXT N_("Deinterlace mode")
  67 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
  68
  69 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
  70 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
  71
  72 #define FILTER_CFG_PREFIX "sout-deinterlace-"
  73
  74 static const char *const mode_list[] = {
  75     "discard", "blend", "mean", "bob", "linear", "x",
  76     "yadif", "yadif2x", "phosphor", "ivtc" };
  77 static const char *const mode_list_text[] = {
  78     N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X",
  79     "Yadif", "Yadif (2x)", N_("Phosphor"), N_("Film NTSC (IVTC)") };
  80
  81 /* Tooltips drop linefeeds (at least in the Qt GUI);
  82    thus the space before each set of consecutive \n. */
  83 #define PHOSPHOR_CHROMA_TEXT N_("Phosphor chroma mode for 4:2:0 input")
  84 #define PHOSPHOR_CHROMA_LONGTEXT N_("Choose handling for colours in those "\
  85                                     "output frames that fall across input "\
  86                                     "frame boundaries. \n"\
  87                                     "\n"\
  88                                     "Latest: take chroma from new (bright) "\
  89                                     "field only. Good for interlaced input, "\
  90                                     "such as videos from a camcorder. \n"\
  91                                     "\n"\
  92                                     "AltLine: take chroma line 1 from top "\
  93                                     "field, line 2 from bottom field, etc. \n"\
  94                                     "Default, good for NTSC telecined input "\
  95                                     "(anime DVDs, etc.). \n"\
  96                                     "\n"\
  97                                     "Blend: average input field chromas. "\
  98                                     "May distort the colours of the new "\
  99                                     "(bright) field, too. \n"\
 100                                     "\n"\
 101                                     "Upconvert: output in 4:2:2 format "\
 102                                     "(independent chroma for each field). "\
 103                                     "Best simulation, but requires more CPU "\
 104                                     "and memory bandwidth.")
 105
 106 #define PHOSPHOR_DIMMER_TEXT N_("Phosphor old field dimmer strength")
 107 #define PHOSPHOR_DIMMER_LONGTEXT N_("This controls the strength of the "\
 108                                     "darkening filter that simulates CRT TV "\
 109                                     "phosphor light decay for the old field "\
 110                                     "in the Phosphor framerate doubler. "\
 111                                     "Default: Low.")
 112
 113 /* These numbers, and phosphor_chroma_list[], should be in the same order
 114    as phosphor_chroma_list_text[]. The value 0 is reserved, because
 115    var_GetInteger() returns 0 in case of error. */
 116 typedef enum { PC_LATEST = 1, PC_ALTLINE   = 2,
 117                PC_BLEND  = 3, PC_UPCONVERT = 4 } phosphor_chroma_t;
 118 static const int phosphor_chroma_list[] = { PC_LATEST, PC_ALTLINE,
 119                                             PC_BLEND,  PC_UPCONVERT };
 120 static const char *const phosphor_chroma_list_text[] = { N_("Latest"),
 121                                                          N_("AltLine"),
 122                                                          N_("Blend"),
 123                                                          N_("Upconvert") };
 124
 125 /* Same here. Same order as in phosphor_dimmer_list_text[],
 126    and the value 0 is reserved for config error. */
 127 static const int phosphor_dimmer_list[] = { 1, 2, 3, 4 };
 128 static const char *const phosphor_dimmer_list_text[] = { N_("Off"),
 129                                                          N_("Low"),
 130                                                          N_("Medium"),
 131                                                          N_("High") };
 132
 133 vlc_module_begin ()
 134     set_description( N_("Deinterlacing video filter") )
 135     set_shortname( N_("Deinterlace" ))
 136     set_capability( "video filter2", 0 )
 137     set_category( CAT_VIDEO )
 138     set_subcategory( SUBCAT_VIDEO_VFILTER )
 139
 140     add_string( FILTER_CFG_PREFIX "mode", "blend", SOUT_MODE_TEXT,
 141                 SOUT_MODE_LONGTEXT, false )
 142         change_string_list( mode_list, mode_list_text, 0 )
 143         change_safe ()
 144     add_integer( FILTER_CFG_PREFIX "phosphor-chroma", 2, PHOSPHOR_CHROMA_TEXT,
 145                 PHOSPHOR_CHROMA_LONGTEXT, true )
 146         change_integer_list( phosphor_chroma_list, phosphor_chroma_list_text )
 147         change_safe ()
 148     add_integer( FILTER_CFG_PREFIX "phosphor-dimmer", 2, PHOSPHOR_DIMMER_TEXT,
 149                 PHOSPHOR_DIMMER_LONGTEXT, true )
 150         change_integer_list( phosphor_dimmer_list, phosphor_dimmer_list_text )
 151         change_safe ()
 152     add_shortcut( "deinterlace" )
 153     set_callbacks( Open, Close )
 154 vlc_module_end ()
 155
 156
 157 /*****************************************************************************
  158  * Local prototypes
 159  *****************************************************************************/
 160 static void RenderDiscard ( filter_t *, picture_t *, picture_t *, int );
 161 static void RenderBob     ( filter_t *, picture_t *, picture_t *, int );
 162 static void RenderMean    ( filter_t *, picture_t *, picture_t * );
 163 static void RenderBlend   ( filter_t *, picture_t *, picture_t * );
 164 static void RenderLinear  ( filter_t *, picture_t *, picture_t *, int );
 165 static void RenderX       ( picture_t *, picture_t * );
 166 static int  RenderYadif   ( filter_t *, picture_t *, picture_t *, int, int );
 167 static int  RenderPhosphor( filter_t *, picture_t *, picture_t *, int, int );
 168 static int  RenderIVTC    ( filter_t *, picture_t *, picture_t * );
 169
 170 static void MergeGeneric ( void *, const void *, const void *, size_t );
 171 #if defined(CAN_COMPILE_C_ALTIVEC)
 172 static void MergeAltivec ( void *, const void *, const void *, size_t );
 173 #endif
 174 #if defined(CAN_COMPILE_MMXEXT)
 175 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
 176 #endif
 177 #if defined(CAN_COMPILE_3DNOW)
 178 static void Merge3DNow   ( void *, const void *, const void *, size_t );
 179 #endif
 180 #if defined(CAN_COMPILE_SSE)
 181 static void MergeSSE2    ( void *, const void *, const void *, size_t );
 182 #endif
 183 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
 184 static void EndMMX       ( void );
 185 #endif
 186 #if defined(CAN_COMPILE_3DNOW)
 187 static void End3DNow     ( void );
 188 #endif
 189 #if defined __ARM_NEON__
 190 static void MergeNEON (void *, const void *, const void *, size_t);
 191 #endif
 192
 193 /* Converts a full-frame plane_t to a field plane_t */
 194 static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src,
 195                             int i_field );
 196
 197 /* Composes a frame from the given field pair */
 198 typedef enum { CC_ALTLINE, CC_UPCONVERT, CC_SOURCE_TOP, CC_SOURCE_BOTTOM,
 199                CC_MERGE } compose_chroma_t;
 200 static void ComposeFrame( filter_t *, picture_t *, picture_t *, picture_t *,
 201                           compose_chroma_t );
 202
 203 static const char *const ppsz_filter_options[] = {
 204     "mode", "phosphor-chroma", "phosphor-dimmer",
 205     NULL
 206 };
 207
 208 /* Used for framerate doublers */
 209 #define METADATA_SIZE (3)
 210 typedef struct {
 211     mtime_t pi_date[METADATA_SIZE];
 212     int     pi_nb_fields[METADATA_SIZE];
 213     bool    pb_top_field_first[METADATA_SIZE];
 214 } metadata_history_t;
 215
 216 /* Algorithm-specific state */
 217 typedef struct
 218 {
 219     phosphor_chroma_t i_chroma_for_420;
 220     int i_dimmer_strength;
 221 } phosphor_sys_t;
 222
 223 /**
 224  * Inverse telecine subsystem state.
 225  * @see RenderIVTC()
 226  */
 227 #define IVTC_NUM_FIELD_PAIRS 7
 228 #define IVTC_DETECTION_HISTORY_SIZE 3
 229 #define IVTC_LATEST (IVTC_DETECTION_HISTORY_SIZE-1)
 230 typedef struct
 231 {
 232     int i_mode; /**< Detecting, hard TC, or soft TC. @see ivtc_mode */
 233     int i_old_mode; /**< @see IVTCSoftTelecineDetect() */
 234
 235     int i_cadence_pos; /**< Cadence counter, 0..4. Runs when locked on. */
 236     int i_tfd; /**< TFF or BFF telecine. Detected from the video. */
 237
 238     /** Raw low-level detector output.
 239      *
 240      *  @see IVTCLowLevelDetect()
 241      */
 242     int pi_scores[IVTC_NUM_FIELD_PAIRS]; /**< Interlace scores. */
 243     int pi_motion[IVTC_DETECTION_HISTORY_SIZE]; /**< 8x8 blocks with motion. */
 244     int pi_top_rep[IVTC_DETECTION_HISTORY_SIZE]; /**< Hard top field repeat. */
 245     int pi_bot_rep[IVTC_DETECTION_HISTORY_SIZE]; /**< Hard bot field repeat. */
 246
 247     /** Interlace scores of outgoing frames, used for judging IVTC output
 248      *  (detecting cadence breaks).
 249      *
 250      *  @see IVTCOutputOrDropFrame()
 251      */
 252     int pi_final_scores[IVTC_DETECTION_HISTORY_SIZE];
 253
 254     /** Cadence position detection history (in ivtc_cadence_pos format).
 255      *  Contains the detected cadence position and a corresponding
 256      *  reliability flag for each algorithm.
 257      *
 258      *  s = scores, interlace scores based algorithm, original to this filter.
 259      *  v = vektor, hard field repeat based algorithm, inspired by
 260      *              the TVTime/Xine IVTC filter by Billy Biggs (Vektor).
 261      *
 262      *  Each algorithm may also keep internal, opaque data.
 263      *
 264      *  @see ivtc_cadence_pos
 265      *  @see IVTCCadenceDetectAlgoScores()
 266      *  @see IVTCCadenceDetectAlgoVektor()
 267      */
 268     int  pi_s_cadence_pos[IVTC_DETECTION_HISTORY_SIZE];
 269     bool pb_s_reliable[IVTC_DETECTION_HISTORY_SIZE];
 270     int  pi_v_raw[IVTC_DETECTION_HISTORY_SIZE]; /**< "vektor" algo internal */
 271     int  pi_v_cadence_pos[IVTC_DETECTION_HISTORY_SIZE];
 272     bool pb_v_reliable[IVTC_DETECTION_HISTORY_SIZE];
 273
 274     /** Final result, chosen by IVTCCadenceDetectFinalize() from the results
 275      *  given by the different detection algorithms.
 276      *
 277      *  @see IVTCCadenceDetectFinalize()
 278      */
 279     int pi_cadence_pos_history[IVTC_DETECTION_HISTORY_SIZE];
 280
 281     /**
 282      *  Set by cadence analyzer. Whether the sequence of last
 283      *  IVTC_DETECTION_HISTORY_SIZE detected positions, stored in
 284      *  pi_cadence_pos_history, looks like a valid telecine.
 285      *
 286      *  @see IVTCCadenceAnalyze()
 287      */
 288     bool b_sequence_valid;
 289
 290     /**
 291      *  Set by cadence analyzer. True if detected position = "dea".
 292      *  The three entries of this are used for detecting three progressive
 293      *  stencil positions in a row, i.e. five progressive frames in a row;
 294      *  this triggers exit from hard IVTC.
 295      *
 296      *  @see IVTCCadenceAnalyze()
 297      */
 298     bool pb_all_progressives[IVTC_DETECTION_HISTORY_SIZE];
 299 } ivtc_sys_t;
 300
 301 /* Top-level subsystem state */
 302 #define HISTORY_SIZE (3)
 303 #define CUSTOM_PTS -1
 304 struct filter_sys_t
 305 {
 306     int  i_mode;              /* Deinterlace mode */
 307     bool b_double_rate;       /* Shall we double the framerate? */
 308     bool b_half_height;       /* Shall be divide the height by 2 */
 309     bool b_use_frame_history; /* Does the algorithm need the input frame history buffer? */
 310
 311     void (*pf_merge) ( void *, const void *, const void *, size_t );
 312     void (*pf_end_merge) ( void );
 313
 314     /* Metadata history (PTS, nb_fields, TFF). Used for framerate doublers. */
 315     metadata_history_t meta;
 316
 317     /* Output frame timing / framerate doubler control (see below) */
 318     int i_frame_offset;
 319
 320     /* Input frame history buffer for algorithms that perform temporal filtering. */
 321     picture_t *pp_history[HISTORY_SIZE];
 322
 323     /* Algorithm-specific substructures */
 324     phosphor_sys_t phosphor;
 325     ivtc_sys_t ivtc;
 326 };
 327
 328 /*  NOTE on i_frame_offset:
 329
 330     This value indicates the offset between input and output frames in the currently active deinterlace algorithm.
 331     See the rationale below for why this is needed and how it is used.
 332
 333     Valid range: 0 <= i_frame_offset < METADATA_SIZE, or i_frame_offset = CUSTOM_PTS.
 334                  The special value CUSTOM_PTS is only allowed if b_double_rate is false.
 335
 336                  If CUSTOM_PTS is used, the algorithm must compute the outgoing PTSs itself,
 337                  and additionally, read the TFF/BFF information itself (if it needs it)
 338                  from the incoming frames.
 339
 340     Meaning of values:
 341     0 = output frame corresponds to the current input frame
 342         (no frame offset; default if not set),
 343     1 = output frame corresponds to the previous input frame
 344         (e.g. Yadif and Yadif2x work like this),
 345     ...
 346
 347     If necessary, i_frame_offset should be updated by the active deinterlace algorithm
 348     to indicate the correct delay for the *next* input frame. It does not matter at which i_order
 349     the algorithm updates this information, but the new value will only take effect upon the
 350     next call to Deinterlace() (i.e. at the next incoming frame).
 351
 352     The first-ever frame that arrives to the filter after Open() is always handled as having
 353     i_frame_offset = 0. For the second and all subsequent frames, each algorithm is responsible
 354     for setting the offset correctly. (The default is 0, so if that is correct, there's no need
 355     to do anything.)
 356
 357     This solution guarantees that i_frame_offset:
 358       1) is up to date at the start of each frame,
 359       2) does not change (as far as Deinterlace() is concerned) during a frame, and
 360       3) does not need a special API for setting the value at the start of each input frame,
 361          before the algorithm starts rendering the (first) output frame for that input frame.
 362
 363     The deinterlace algorithm is allowed to behave differently for different input frames.
 364     This is especially important for startup, when full history (as defined by each algorithm)
 365     is not yet available. During the first-ever input frame, it is clear that it is the
 366     only possible source for information, so i_frame_offset = 0 is necessarily correct.
 367     After that, what to do is up to each algorithm.
 368
 369     Having the correct offset at the start of each input frame is critically important in order to:
 370       1) Allocate the correct number of output frames for framerate doublers, and to
 371       2) Pass correct TFF/BFF information to the algorithm.
 372
 373     These points are important for proper soft field repeat support. This feature is used in some
 374     streams originating from film. In soft NTSC telecine, the number of fields alternates as 3,2,3,2,...
 375     and the video field dominance flips every two frames (after every "3"). Also, some streams
 376     request an occasional field repeat (nb_fields = 3), after which the video field dominance flips.
 377     To render such streams correctly, the nb_fields and TFF/BFF information must be taken from
 378     the specific input frame that the algorithm intends to render.
 379
 380     Additionally, the output PTS is automatically computed by Deinterlace() from i_frame_offset and i_order.
 381
 382     It is possible to use the special value CUSTOM_PTS to indicate that the algorithm computes
 383     the output PTSs itself. In this case, Deinterlace() will pass them through. This special value
 384     is not valid for framerate doublers, as by definition they are field renderers, so they need to
 385     use the original field timings to work correctly. Basically, this special value is only intended
 386     for algorithms that need to perform nontrivial framerate conversions (such as IVTC).
 387 */
 388
 389
 390 /*****************************************************************************
 391  * SetFilterMethod: setup the deinterlace method to use.
 392  *****************************************************************************/
 393 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
 394 {
 395     filter_sys_t *p_sys = p_filter->p_sys;
 396
 397     if( !psz_method )
 398         psz_method = "";
 399
 400     if( !strcmp( psz_method, "mean" ) )
 401     {
 402         p_sys->i_mode = DEINTERLACE_MEAN;
 403         p_sys->b_double_rate = false;
 404         p_sys->b_half_height = true;
 405         p_sys->b_use_frame_history = false;
 406     }
 407     else if( !strcmp( psz_method, "bob" )
 408              || !strcmp( psz_method, "progressive-scan" ) )
 409     {
 410         p_sys->i_mode = DEINTERLACE_BOB;
 411         p_sys->b_double_rate = true;
 412         p_sys->b_half_height = false;
 413         p_sys->b_use_frame_history = false;
 414     }
 415     else if( !strcmp( psz_method, "linear" ) )
 416     {
 417         p_sys->i_mode = DEINTERLACE_LINEAR;
 418         p_sys->b_double_rate = true;
 419         p_sys->b_half_height = false;
 420         p_sys->b_use_frame_history = false;
 421     }
 422     else if( !strcmp( psz_method, "x" ) )
 423     {
 424         p_sys->i_mode = DEINTERLACE_X;
 425         p_sys->b_double_rate = false;
 426         p_sys->b_half_height = false;
 427         p_sys->b_use_frame_history = false;
 428     }
 429     else if( !strcmp( psz_method, "yadif" ) )
 430     {
 431         p_sys->i_mode = DEINTERLACE_YADIF;
 432         p_sys->b_double_rate = false;
 433         p_sys->b_half_height = false;
 434         p_sys->b_use_frame_history = true;
 435     }
 436     else if( !strcmp( psz_method, "yadif2x" ) )
 437     {
 438         p_sys->i_mode = DEINTERLACE_YADIF2X;
 439         p_sys->b_double_rate = true;
 440         p_sys->b_half_height = false;
 441         p_sys->b_use_frame_history = true;
 442     }
 443     else if( !strcmp( psz_method, "phosphor" ) )
 444     {
 445         p_sys->i_mode = DEINTERLACE_PHOSPHOR;
 446         p_sys->b_double_rate = true;
 447         p_sys->b_half_height = false;
 448         p_sys->b_use_frame_history = true;
 449     }
 450     else if( !strcmp( psz_method, "ivtc" ) )
 451     {
 452         p_sys->i_mode = DEINTERLACE_IVTC;
 453         p_sys->b_double_rate = false;
 454         p_sys->b_half_height = false;
 455         p_sys->b_use_frame_history = true;
 456     }
 457     else if( !strcmp( psz_method, "discard" ) )
 458     {
 459         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
 460                             i_chroma == VLC_CODEC_J422;
 461
 462         p_sys->i_mode = DEINTERLACE_DISCARD;
 463         p_sys->b_double_rate = false;
 464         p_sys->b_half_height = !b_i422;
 465         p_sys->b_use_frame_history = false;
 466     }
 467     else
 468     {
 469         if( strcmp( psz_method, "blend" ) )
 470             msg_Err( p_filter,
 471                      "no valid deinterlace mode provided, using \"blend\"" );
 472
 473         p_sys->i_mode = DEINTERLACE_BLEND;
 474         p_sys->b_double_rate = false;
 475         p_sys->b_half_height = false;
 476         p_sys->b_use_frame_history = false;
 477     }
 478
 479     p_sys->i_frame_offset = 0; /* reset to default when method changes */
 480
 481     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
 482 }
 483
 484 static void GetOutputFormat( filter_t *p_filter,
 485                              video_format_t *p_dst, const video_format_t *p_src )
 486 {
 487     filter_sys_t *p_sys = p_filter->p_sys;
 488     *p_dst = *p_src;
 489
 490     if( p_sys->b_half_height )
 491     {
 492         p_dst->i_height /= 2;
 493         p_dst->i_visible_height /= 2;
 494         p_dst->i_y_offset /= 2;
 495         p_dst->i_sar_den *= 2;
 496     }
 497
 498     if( p_src->i_chroma == VLC_CODEC_I422 ||
 499         p_src->i_chroma == VLC_CODEC_J422 )
 500     {
 501         switch( p_sys->i_mode )
 502         {
 503         case DEINTERLACE_MEAN:
 504         case DEINTERLACE_LINEAR:
 505         case DEINTERLACE_X:
 506         case DEINTERLACE_YADIF:
 507         case DEINTERLACE_YADIF2X:
 508         case DEINTERLACE_PHOSPHOR:
 509         case DEINTERLACE_IVTC:
 510             p_dst->i_chroma = p_src->i_chroma;
 511             break;
 512         default:
 513             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
 514                                                                   VLC_CODEC_J420;
 515             break;
 516         }
 517     }
 518     else if( p_sys->i_mode == DEINTERLACE_PHOSPHOR  &&
 519              p_sys->phosphor.i_chroma_for_420 == PC_UPCONVERT )
 520     {
 521         p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_J420 ? VLC_CODEC_J422 :
 522                                                               VLC_CODEC_I422;
 523     }
 524 }
 525
 526 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
 527 {
 528     return i_chroma == VLC_CODEC_I420 ||
 529            i_chroma == VLC_CODEC_J420 ||
 530            i_chroma == VLC_CODEC_YV12 ||
 531            i_chroma == VLC_CODEC_I422 ||
 532            i_chroma == VLC_CODEC_J422;
 533 }
 534
 535 /*****************************************************************************
 536  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 537  *****************************************************************************/
 538 static void RenderDiscard( filter_t *p_filter,
 539                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 540 {
 541     int i_plane;
 542
 543     /* Copy image and skip lines */
 544     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 545     {
 546         uint8_t *p_in, *p_out_end, *p_out;
 547         int i_increment;
 548
 549         p_in = p_pic->p[i_plane].p_pixels
 550                    + i_field * p_pic->p[i_plane].i_pitch;
 551
 552         p_out = p_outpic->p[i_plane].p_pixels;
 553         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 554                              * p_outpic->p[i_plane].i_visible_lines;
 555
 556         switch( p_filter->fmt_in.video.i_chroma )
 557         {
 558         case VLC_CODEC_I420:
 559         case VLC_CODEC_J420:
 560         case VLC_CODEC_YV12:
 561
 562             for( ; p_out < p_out_end ; )
 563             {
 564                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 565
 566                 p_out += p_outpic->p[i_plane].i_pitch;
 567                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 568             }
 569             break;
 570
 571         case VLC_CODEC_I422:
 572         case VLC_CODEC_J422:
 573
 574             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 575
 576             if( i_plane == Y_PLANE )
 577             {
 578                 for( ; p_out < p_out_end ; )
 579                 {
 580                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 581                     p_out += p_outpic->p[i_plane].i_pitch;
 582                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 583                     p_out += p_outpic->p[i_plane].i_pitch;
 584                     p_in += i_increment;
 585                 }
 586             }
 587             else
 588             {
 589                 for( ; p_out < p_out_end ; )
 590                 {
 591                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 592                     p_out += p_outpic->p[i_plane].i_pitch;
 593                     p_in += i_increment;
 594                 }
 595             }
 596             break;
 597
 598         default:
 599             break;
 600         }
 601     }
 602 }
 603
 604 /*****************************************************************************
 605  * RenderBob: renders a BOB picture - simple copy
 606  *****************************************************************************/
 607 static void RenderBob( filter_t *p_filter,
 608                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 609 {
 610     int i_plane;
 611
 612     /* Copy image and skip lines */
 613     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 614     {
 615         uint8_t *p_in, *p_out_end, *p_out;
 616
 617         p_in = p_pic->p[i_plane].p_pixels;
 618         p_out = p_outpic->p[i_plane].p_pixels;
 619         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 620                              * p_outpic->p[i_plane].i_visible_lines;
 621
 622         switch( p_filter->fmt_in.video.i_chroma )
 623         {
 624             case VLC_CODEC_I420:
 625             case VLC_CODEC_J420:
 626             case VLC_CODEC_YV12:
 627                 /* For BOTTOM field we need to add the first line */
 628                 if( i_field == 1 )
 629                 {
 630                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 631                     p_in += p_pic->p[i_plane].i_pitch;
 632                     p_out += p_outpic->p[i_plane].i_pitch;
 633                 }
 634
 635                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 636
 637                 for( ; p_out < p_out_end ; )
 638                 {
 639                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 640
 641                     p_out += p_outpic->p[i_plane].i_pitch;
 642
 643                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 644
 645                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 646                     p_out += p_outpic->p[i_plane].i_pitch;
 647                 }
 648
 649                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 650
 651                 /* For TOP field we need to add the last line */
 652                 if( i_field == 0 )
 653                 {
 654                     p_in += p_pic->p[i_plane].i_pitch;
 655                     p_out += p_outpic->p[i_plane].i_pitch;
 656                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 657                 }
 658                 break;
 659
 660             case VLC_CODEC_I422:
 661             case VLC_CODEC_J422:
 662                 /* For BOTTOM field we need to add the first line */
 663                 if( i_field == 1 )
 664                 {
 665                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 666                     p_in += p_pic->p[i_plane].i_pitch;
 667                     p_out += p_outpic->p[i_plane].i_pitch;
 668                 }
 669
 670                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 671
 672                 if( i_plane == Y_PLANE )
 673                 {
 674                     for( ; p_out < p_out_end ; )
 675                     {
 676                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 677
 678                         p_out += p_outpic->p[i_plane].i_pitch;
 679
 680                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 681
 682                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 683                         p_out += p_outpic->p[i_plane].i_pitch;
 684                     }
 685                 }
 686                 else
 687                 {
 688                     for( ; p_out < p_out_end ; )
 689                     {
 690                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 691
 692                         p_out += p_outpic->p[i_plane].i_pitch;
 693                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 694                     }
 695                 }
 696
 697                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 698
 699                 /* For TOP field we need to add the last line */
 700                 if( i_field == 0 )
 701                 {
 702                     p_in += p_pic->p[i_plane].i_pitch;
 703                     p_out += p_outpic->p[i_plane].i_pitch;
 704                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 705                 }
 706                 break;
 707         }
 708     }
 709 }
 710
 711 #define Merge p_filter->p_sys->pf_merge
 712 #define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
 713
 714 /*****************************************************************************
 715  * RenderLinear: BOB with linear interpolation
 716  *****************************************************************************/
 717 static void RenderLinear( filter_t *p_filter,
 718                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 719 {
 720     int i_plane;
 721
 722     /* Copy image and skip lines */
 723     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 724     {
 725         uint8_t *p_in, *p_out_end, *p_out;
 726
 727         p_in = p_pic->p[i_plane].p_pixels;
 728         p_out = p_outpic->p[i_plane].p_pixels;
 729         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 730                              * p_outpic->p[i_plane].i_visible_lines;
 731
 732         /* For BOTTOM field we need to add the first line */
 733         if( i_field == 1 )
 734         {
 735             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 736             p_in += p_pic->p[i_plane].i_pitch;
 737             p_out += p_outpic->p[i_plane].i_pitch;
 738         }
 739
 740         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 741
 742         for( ; p_out < p_out_end ; )
 743         {
 744             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 745
 746             p_out += p_outpic->p[i_plane].i_pitch;
 747
 748             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 749                    p_pic->p[i_plane].i_pitch );
 750
 751             p_in += 2 * p_pic->p[i_plane].i_pitch;
 752             p_out += p_outpic->p[i_plane].i_pitch;
 753         }
 754
 755         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 756
 757         /* For TOP field we need to add the last line */
 758         if( i_field == 0 )
 759         {
 760             p_in += p_pic->p[i_plane].i_pitch;
 761             p_out += p_outpic->p[i_plane].i_pitch;
 762             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 763         }
 764     }
 765     EndMerge();
 766 }
 767
 768 static void RenderMean( filter_t *p_filter,
 769                         picture_t *p_outpic, picture_t *p_pic )
 770 {
 771     int i_plane;
 772
 773     /* Copy image and skip lines */
 774     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 775     {
 776         uint8_t *p_in, *p_out_end, *p_out;
 777
 778         p_in = p_pic->p[i_plane].p_pixels;
 779
 780         p_out = p_outpic->p[i_plane].p_pixels;
 781         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 782                              * p_outpic->p[i_plane].i_visible_lines;
 783
 784         /* All lines: mean value */
 785         for( ; p_out < p_out_end ; )
 786         {
 787             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 788                    p_pic->p[i_plane].i_pitch );
 789
 790             p_out += p_outpic->p[i_plane].i_pitch;
 791             p_in += 2 * p_pic->p[i_plane].i_pitch;
 792         }
 793     }
 794     EndMerge();
 795 }
 796
 797 static void RenderBlend( filter_t *p_filter,
 798                          picture_t *p_outpic, picture_t *p_pic )
 799 {
 800     int i_plane;
 801
 802     /* Copy image and skip lines */
 803     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 804     {
 805         uint8_t *p_in, *p_out_end, *p_out;
 806
 807         p_in = p_pic->p[i_plane].p_pixels;
 808
 809         p_out = p_outpic->p[i_plane].p_pixels;
 810         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 811                              * p_outpic->p[i_plane].i_visible_lines;
 812
 813         switch( p_filter->fmt_in.video.i_chroma )
 814         {
 815             case VLC_CODEC_I420:
 816             case VLC_CODEC_J420:
 817             case VLC_CODEC_YV12:
 818                 /* First line: simple copy */
 819                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 820                 p_out += p_outpic->p[i_plane].i_pitch;
 821
 822                 /* Remaining lines: mean value */
 823                 for( ; p_out < p_out_end ; )
 824                 {
 825                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 826                            p_pic->p[i_plane].i_pitch );
 827
 828                     p_out += p_outpic->p[i_plane].i_pitch;
 829                     p_in += p_pic->p[i_plane].i_pitch;
 830                 }
 831                 break;
 832
 833             case VLC_CODEC_I422:
 834             case VLC_CODEC_J422:
 835                 /* First line: simple copy */
 836                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 837                 p_out += p_outpic->p[i_plane].i_pitch;
 838
 839                 /* Remaining lines: mean value */
 840                 if( i_plane == Y_PLANE )
 841                 {
 842                     for( ; p_out < p_out_end ; )
 843                     {
 844                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 845                                p_pic->p[i_plane].i_pitch );
 846
 847                         p_out += p_outpic->p[i_plane].i_pitch;
 848                         p_in += p_pic->p[i_plane].i_pitch;
 849                     }
 850                 }
 851
 852                 else
 853                 {
 854                     for( ; p_out < p_out_end ; )
 855                     {
 856                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 857                                p_pic->p[i_plane].i_pitch );
 858
 859                         p_out += p_outpic->p[i_plane].i_pitch;
 860                         p_in += 2*p_pic->p[i_plane].i_pitch;
 861                     }
 862                 }
 863                 break;
 864         }
 865     }
 866     EndMerge();
 867 }
 868
 869 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 870                           const void *_p_s2, size_t i_bytes )
 871 {
 872     uint8_t* p_dest = (uint8_t*)_p_dest;
 873     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 874     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 875     uint8_t* p_end = p_dest + i_bytes - 8;
 876
 877     while( p_dest < p_end )
 878     {
 879         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 880         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 881         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 882         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 883         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 884         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 885         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 886         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 887     }
 888
 889     p_end += 8;
 890
 891     while( p_dest < p_end )
 892     {
 893         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 894     }
 895 }
 896
#if defined(CAN_COMPILE_MMXEXT)
/*****************************************************************************
 * MergeMMXEXT: byte-wise average of two buffers, 8 bytes at a time (pavgb)
 *****************************************************************************
 * Same interface as MergeGeneric: dest[i] receives the average of s1[i] and
 * s2[i] for i in [0, i_bytes).  Blocks of 8 bytes go through the MMX
 * registers; the final bytes (up to 8) are averaged in plain C.
 *
 * The routine uses %mm1 and does not issue emms itself.
 *
 * NOTE(review): per the x86 ISA reference pavgb rounds the average up, while
 * the C tail rounds down, so the two paths can differ by one — confirm
 * whether that matters to callers.
 * NOTE(review): "p_dest + i_bytes - 8" points before the buffer when
 * i_bytes < 8; the asm statement also does not declare %mm1 as clobbered.
 *****************************************************************************/
static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes )
{
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    /* End of the region handled 8 bytes at a time */
    uint8_t* p_end = p_dest + i_bytes - 8;
    while( p_dest < p_end )
    {
        /* mm1 = 8 bytes of s2; mm1 = avg(mm1, 8 bytes of s1); store to dest */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Restore the real end of the buffer for the scalar tail */
    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
 925
#if defined(CAN_COMPILE_3DNOW)
/*****************************************************************************
 * Merge3DNow: byte-wise average of two buffers, 8 bytes at a time (pavgusb)
 *****************************************************************************
 * Same interface as MergeGeneric: dest[i] receives the average of s1[i] and
 * s2[i] for i in [0, i_bytes).  Blocks of 8 bytes go through the MMX
 * registers using the 3DNow! pavgusb instruction; the final bytes (up to 8)
 * are averaged in plain C.
 *
 * The routine uses %mm1 and does not issue femms itself.
 *
 * NOTE(review): "p_dest + i_bytes - 8" points before the buffer when
 * i_bytes < 8; the asm statement also does not declare %mm1 as clobbered.
 *****************************************************************************/
static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                        size_t i_bytes )
{
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    /* End of the region handled 8 bytes at a time */
    uint8_t* p_end = p_dest + i_bytes - 8;
    while( p_dest < p_end )
    {
        /* mm1 = 8 bytes of s2; mm1 = avg(mm1, 8 bytes of s1); store to dest */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Restore the real end of the buffer for the scalar tail */
    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
 954
 955 #if defined(CAN_COMPILE_SSE)
 956 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 957                        size_t i_bytes )
 958 {
 959     uint8_t* p_dest = (uint8_t*)_p_dest;
 960     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 961     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 962     uint8_t* p_end;
 963     while( (uintptr_t)p_s1 % 16 )
 964     {
 965         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 966     }
 967     p_end = p_dest + i_bytes - 16;
 968     while( p_dest < p_end )
 969     {
 970         __asm__  __volatile__( "movdqu %2,%%xmm1;"
 971                                "pavgb %1, %%xmm1;"
 972                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
 973                                                  "m" (*p_s1),
 974                                                  "m" (*p_s2) );
 975         p_dest += 16;
 976         p_s1 += 16;
 977         p_s2 += 16;
 978     }
 979
 980     p_end += 16;
 981
 982     while( p_dest < p_end )
 983     {
 984         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 985     }
 986 }
 987 #endif
 988
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/* EndMMX: executes the x86 "emms" instruction.
 * Presumably installed as the pf_end_merge hook so that it runs (through
 * EndMerge) once a picture has been merged with the MMX routine — the
 * assignment is not visible in this part of the file. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
#endif
 995
#if defined(CAN_COMPILE_3DNOW)
/* End3DNow: executes the 3DNow! "femms" instruction.
 * Presumably installed as the pf_end_merge hook paired with Merge3DNow — the
 * assignment is not visible in this part of the file. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
#endif
1002
#ifdef CAN_COMPILE_C_ALTIVEC
/*****************************************************************************
 * MergeAltivec: byte-wise average of two buffers using AltiVec (vec_avg)
 *****************************************************************************
 * Same interface as MergeGeneric: dest[i] receives the average of s1[i] and
 * s2[i] for i in [0, i_bytes).
 *
 *  1. Scalar prologue until p_dest is 16-byte aligned.
 *  2. 16 bytes per iteration with vec_avg, with a separate loop for sources
 *     that are not 16-byte aligned (vec_lvsl + vec_perm realignment).
 *  3. Scalar tail for the remaining bytes.
 *
 * NOTE(review): the prologue is not bounded by i_bytes, so a buffer shorter
 * than the distance to the next 16-byte boundary would be overrun.
 * NOTE(review): the unaligned loop loads 16 bytes ahead of the data it is
 * averaging (vec_ld( 16, ... )); confirm that reading past the last block is
 * acceptable for the buffers passed in.
 * NOTE(review): the (int) pointer casts below only keep the low bits, which
 * is all the test needs, but (uintptr_t) would avoid truncation warnings.
 *****************************************************************************/
static void MergeAltivec( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    uint8_t *p_s1   = (uint8_t *)_p_s1;
    uint8_t *p_s2   = (uint8_t *)_p_s2;
    /* Last position from which a full 16-byte store still fits */
    uint8_t *p_end  = p_dest + i_bytes - 15;

    /* Use C until the first 16-bytes aligned destination pixel */
    while( (uintptr_t)p_dest & 0xF )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
    {
        /* Unaligned source */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        /* Permutation vectors derived from each source address, plus the
         * first block of each source. */
        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        while( p_dest < p_end )
        {
            /* Combine the previous and the next block of each source into
             * the 16 bytes that start at the current source position. */
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v    = vec_perm( s1oldv, s1newv, perm1v );
            s2v    = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            destv  = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source */
        vector unsigned char s1v, s2v, destv;

        while( p_dest < p_end )
        {
            s1v   = vec_ld( 0, p_s1 );
            s2v   = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }

    /* Restore the real end of the buffer for the scalar tail */
    p_end += 15;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
1072
#ifdef __ARM_NEON__
/*****************************************************************************
 * MergeNEON: byte-wise average of two buffers using ARM NEON (vhadd.u8)
 *****************************************************************************
 * Same interface as MergeGeneric: out[i] receives the average of in1[i] and
 * in2[i] for i in [0, n).
 *
 *  1. Head: MergeGeneric handles the bytes up to the next 16-byte boundary
 *     of the output (the NEON stores use a :128 alignment hint).
 *  2. Body: each asm iteration loads 2 x 32 bytes from each input and stores
 *     2 x 32 bytes, i.e. 64 bytes per iteration.  One variant is used when
 *     both inputs are 16-byte aligned, another when they are not.
 *  3. Tail: MergeGeneric handles the remaining bytes (fewer than 64).
 *
 * Fixes relative to the previous version:
 *  - the head processed "mis" bytes instead of "16 - mis", which generally
 *    left the output unaligned for the :128 stores;
 *  - the head length is now capped at n, so n can no longer wrap around;
 *  - the body was bounded by a multiple of 16 although it consumes 64 bytes
 *    per iteration, so it could run up to 48 bytes past the end.
 *****************************************************************************/
static void MergeNEON (void *restrict out, const void *in1,
                       const void *in2, size_t n)
{
    uint8_t *outp = out;
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;
    size_t mis = ((uintptr_t)outp) & 15;

    if (mis)
    {
        /* Bytes needed to reach the next 16-byte boundary, capped at n */
        size_t head = 16 - mis;

        if (head > n)
            head = n;

        MergeGeneric (outp, in1p, in2p, head);
        outp += head;
        in1p += head;
        in2p += head;
        n -= head;
    }

    /* Only whole 64-byte blocks go through the NEON loops */
    uint8_t *end = outp + (n & ~(size_t)63);

    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        while (outp < end)
            asm volatile (
                "vld1.u8  {q0-q1}, [%[in1]]!\n"
                "vld1.u8  {q2-q3}, [%[in2]]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1]]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2]]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");
    else
         while (outp < end)
            asm volatile (
                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");

    /* Whatever did not fill a whole 64-byte block */
    n &= 63;
    if (n)
        MergeGeneric (outp, in1p, in2p, n);
}
#endif
1132
1133 /*****************************************************************************
 * RenderX: This algorithm works on an 8x8 block basis; it copies the top field
 * and applies a process to recreate the bottom field:
1136  *  If a 8x8 block is classified as :
1137  *   - progressive: it applies a small blend (1,6,1)
1138  *   - interlaced:
1139  *    * in the MMX version: we do a ME between the 2 fields, if there is a
1140  *    good match we use MC to recreate the bottom field (with a small
1141  *    blend (1,6,1) )
1142  *    * otherwise: it recreates the bottom field by an edge oriented
1143  *    interpolation.
1144   *****************************************************************************/
1145
/* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
 * XXX: it needs access to an 8x10 area (10 lines of 8 pixels).
 * More than 8 lines are used to help with scrolling (text)
 * (and because XDeint8x8Frame uses line 9).
 * XXX: detection in smooth/uniform areas with noise does not work well,
 * but that is not really a problem because such areas do not carry much
 * detail anyway.
 */

/* Square of a; used to build sums of squared differences. */
static inline int ssd( int a )
{
    return a * a;
}

/* Returns 1 when at least one of the four examined line groups looks
 * interlaced, 0 otherwise.
 *
 * Each group covers 4 consecutive lines starting at an even line.  For a
 * group, "fr" sums the squared differences between adjacent lines and "ff"
 * those between lines two apart (same field).  The group counts as
 * interlaced when ff < 6*fr/8 and fr > 32.
 *
 * src: top-left pixel of the block; i_src: line stride in bytes. */
static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
{
    int i_hits = 0;

    for( int y = 0; y < 7; y += 2, src += 2*i_src )
    {
        const uint8_t *p_l0 = src;
        const uint8_t *p_l1 = src + i_src;
        const uint8_t *p_l2 = src + 2*i_src;
        const uint8_t *p_l3 = src + 3*i_src;
        int i_frame = 0; /* adjacent-line differences */
        int i_field = 0; /* same-field differences */

        for( int x = 0; x < 8; x++ )
        {
            i_frame += ssd( p_l0[x] - p_l1[x] ) + ssd( p_l1[x] - p_l2[x] );
            i_field += ssd( p_l0[x] - p_l2[x] ) + ssd( p_l1[x] - p_l3[x] );
        }

        if( i_field < 6*i_frame/8 && i_frame > 32 )
            i_hits++;
    }

    return i_hits >= 1;
}
#ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8DetectC.
 *
 * The *_r2r / *_m2r / *_r2m macros come from mmx.h and are presumably thin
 * wrappers around the MMX instructions of the same name.
 *
 * For each line group, mm5 accumulates the squared differences that end up
 * in "fr" and mm6 those that end up in "ff"; the group is counted when
 * ff < 6*fr/8 and fr > 32.
 *
 * Returns the number of groups counted (non-zero means interlaced), whereas
 * the C version returns 0 or 1.
 *
 * NOTE(review): this loop runs for y = 0,2,4,6,8 (five groups, reading up to
 * line 11) while the C version stops at y = 6 (four groups, up to line 9) —
 * confirm which bound is intended.
 */
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
{

    int y, x;
    int32_t ff, fr;
    int fc;

    /* Detect interlacing */
    fc = 0;
    /* mm7 = 0, used to widen bytes to 16-bit words */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 9; y += 2 )
    {
        ff = fr = 0;
        /* Clear the two accumulators */
        pxor_r2r( mm5, mm5 );
        pxor_r2r( mm6, mm6 );
        for( x = 0; x < 8; x+=4 )
        {
            /* Load 4 pixels from each of lines 0..3 */
            movd_m2r( src[        x], mm0 );
            movd_m2r( src[1*i_src+x], mm1 );
            movd_m2r( src[2*i_src+x], mm2 );
            movd_m2r( src[3*i_src+x], mm3 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            punpcklbw_r2r( mm7, mm3 );

            movq_r2r( mm0, mm4 );

            /* mm0 = l0 - l1, mm4 = l0 - l2 */
            psubw_r2r( mm1, mm0 );
            psubw_r2r( mm2, mm4 );

            /* mm2 = l2 - l1, mm3 = l3 - l1 */
            psubw_r2r( mm1, mm2 );
            psubw_r2r( mm1, mm3 );

            /* Square each difference and add neighbouring pairs */
            pmaddwd_r2r( mm0, mm0 );
            pmaddwd_r2r( mm4, mm4 );
            pmaddwd_r2r( mm2, mm2 );
            pmaddwd_r2r( mm3, mm3 );
            /* mm5 += (l0-l1)^2 + (l2-l1)^2 ; mm6 += (l0-l2)^2 + (l3-l1)^2 */
            paddd_r2r( mm0, mm2 );
            paddd_r2r( mm4, mm3 );
            paddd_r2r( mm2, mm5 );
            paddd_r2r( mm3, mm6 );
        }

        /* Fold the two 32-bit halves of mm5 into fr */
        movq_r2r( mm5, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm5 );
        movd_r2m( mm5, fr );

        /* Fold the two 32-bit halves of mm6 into ff */
        movq_r2r( mm6, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm6 );
        movd_r2m( mm6, ff );

        if( ff < 6*fr/8 && fr > 32 )
            fc++;

        src += 2*i_src;
    }
    return fc;
}
#endif
1244
/* XDeint8x8MergeC: fill an 8x8 block treated as progressive.
 *
 * Four pairs of output lines are written.  In each pair the first line is a
 * copy of the current src1 line, the second is the (1,6,1)/8 blend of the
 * current src1 line, the current src2 line and the next src1 line.
 *
 * dst/i_dst: output block and its stride.
 * src1/i_src1, src2/i_src2: the two input line sequences and their strides;
 * five lines of src1 and four lines of src2 are read.
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *p_blend = dst + i_dst;
        const uint8_t *p_next = src1 + i_src1;

        /* Kept line */
        memcpy( dst, src1, 8 );

        /* Blended line, rounded to nearest */
        for( int x = 0; x < 8; x++ )
            p_blend[x] = ( src1[x] + 6*src2[x] + p_next[x] + 4 ) >> 3;

        dst = p_blend + i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
1265
#ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8MergeC: for each of four line pairs, copy the
 * current src1 line and write the (1,6,1)/8 blend of src1, src2 and the next
 * src1 line below it.  Pixels are processed 4 at a time.
 *
 * The *_r2r / *_m2r / *_r2m macros come from mmx.h and are presumably thin
 * wrappers around the MMX instructions of the same name.
 */
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    /* Rounding term: four 16-bit words holding 4 */
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 8; y += 2 )
    {
        for( x = 0; x < 8; x +=4 )
        {
            /* Kept line: copy 4 pixels of src1 straight to dst */
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            /* Widen to 16-bit words */
            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            /* mm1 = 2*src2, mm3 = 4*src2, mm0 = src1 + next src1 */
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            paddw_r2r( mm2, mm0 );
            /* mm1 = 6*src2 + 4, mm0 = (src1 + 6*src2 + next + 4) >> 3 */
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            /* Back to bytes, stored on the line below the kept one */
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}

#endif
1307
/* For debug: fill the 8x8 block at dst (stride i_dst) with the value v. */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_row = 0;

    while( i_row < 8 )
    {
        memset( dst + i_row * i_dst, v, 8 );
        i_row++;
    }
}
1315
/* XDeint8x8FieldE: simple (1,0,1) deinterlacing for blocks that miss a
 * horizontal neighbour.
 * (Uses 8x9 pixels: lines 0, 2, 4, 6 and 8 of the source are read.)
 * TODO: a better one for the inner part.
 *
 * Even output lines are copies of the even source lines; each odd output
 * line is the average (rounded down) of the even source lines above and
 * below it.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        /* Kept field line */
        memcpy( dst, src, 8 );

        /* Interpolated line */
        for( int x = 0; x < 8; x++ )
            dst[i_dst + x] = ( src[x] + src[2*i_src + x] ) >> 1;

        dst += 2*i_dst;
        src += 2*i_src;
    }
}
#ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8FieldEC: even output lines are copies of the
 * even source lines, odd output lines are the pavgb average of the even
 * source lines above and below.
 *
 * The *_r2r / *_m2r / *_r2m macros come from mmx.h and are presumably thin
 * wrappers around the MMX instructions of the same name.
 * NOTE(review): per the x86 ISA reference pavgb rounds up whereas the C
 * version rounds down, so results may differ by one.
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Kept field line: 8 pixels copied through mm0 */
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );
        dst += i_dst;

        /* Interpolated line: average with the field line two rows below */
        movq_m2r( src[2*i_src], mm1 );
        pavgb_r2r( mm1, mm0 );

        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
#endif
1361
/* XDeint8x8Field: edge-oriented interpolation
 * (needs -4 and +5 pixels horizontally, and one extra line below the block).
 *
 * Even output lines are copies of the even source lines.  For each pixel of
 * an odd output line, three 8-pixel sums of absolute differences between the
 * field lines above and below are compared:
 *   c0: lower line shifted two pixels to the right of the upper one,
 *   c1: no shift,
 *   c2: lower line shifted two pixels to the left.
 * The pixel is then averaged along the direction that matches best, and
 * vertically when neither diagonal wins.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        /* Kept field line */
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( int x = 0; x < 8; x++ )
        {
            const uint8_t *p_up = src;
            const uint8_t *p_down = &src[2*i_src];
            int c0 = 0, c1 = 0, c2 = 0;

            /* 8 pixels are used to match the MMX version; 5 would be
             * enough (fewer is not good). */
            for( int k = 0; k < 8; k++ )
            {
                c0 += abs( p_up[x - 4 + k] - p_down[x - 2 + k] );
                c1 += abs( p_up[x - 3 + k] - p_down[x - 3 + k] );
                c2 += abs( p_up[x - 2 + k] - p_down[x - 4 + k] );
            }

            int i_sum;
            if( c0 < c1 && c1 <= c2 )
                i_sum = p_up[x - 1] + p_down[x + 1];
            else if( c2 < c1 && c1 <= c0 )
                i_sum = p_up[x + 1] + p_down[x - 1];
            else
                i_sum = p_up[x] + p_down[x];

            dst[x] = i_sum >> 1;
        }

        dst += i_dst;
        src += 2*i_src;
    }
}
#ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8FieldC (edge-oriented interpolation).
 *
 * The three direction costs c0/c1/c2 are obtained with psadbw over 8 pixels
 * instead of eight explicit abs() terms; the selection and averaging of the
 * output pixel are done in C exactly as in the C version.
 *
 * The *_m2r / *_r2m macros come from mmx.h and are presumably thin wrappers
 * around the MMX instructions of the same name.
 */
static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src, int i_src )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Kept field line */
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( x = 0; x < 8; x++ )
        {
            /* Field line below the one being interpolated */
            uint8_t *src2 = &src[2*i_src];
            int32_t c0, c1, c2;

            /* 8 pixels of the upper line at offsets -2, -3 and -4 */
            movq_m2r( src[x-2], mm0 );
            movq_m2r( src[x-3], mm1 );
            movq_m2r( src[x-4], mm2 );

            /* Against 8 pixels of the lower line at offsets -4, -3 and -2 */
            psadbw_m2r( src2[x-4], mm0 );
            psadbw_m2r( src2[x-3], mm1 );
            psadbw_m2r( src2[x-2], mm2 );

            movd_r2m( mm0, c2 );
            movd_r2m( mm1, c1 );
            movd_r2m( mm2, c0 );

            /* Average along the best-matching direction */
            if( c0 < c1 && c1 <= c2 )
                dst[x] = (src[x-1] + src2[x+1]) >> 1;
            else if( c2 < c1 && c1 <= c0 )
                dst[x] = (src[x+1] + src2[x-1]) >> 1;
            else
                dst[x] = (src[x+0] + src2[x+0]) >> 1;
        }

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
#endif
1450
/* NxN: arbitrary block size (only pixels inside the NxN block are used).
 *
 * XDeintNxNDetect returns 1 when the block looks interlaced, 0 otherwise.
 * Note the parameter order: height comes before width.
 *
 * FIXME: way too simple, needs to be more like XDeint8x8Detect.
 * The two sums are not reset between line groups: each group is judged on
 * the totals accumulated so far.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    int i_frame = 0; /* squared differences between adjacent lines */
    int i_field = 0; /* squared differences between lines two apart */
    int i_hits = 0;

    for( int y = 0; y < i_height - 2; y += 2 )
    {
        const uint8_t *p_line = &src[y*i_src];

        for( int x = 0; x < i_width; x++ )
        {
            i_frame += ssd( p_line[x] - p_line[1*i_src + x] );
            i_field += ssd( p_line[x] - p_line[2*i_src + x] );
        }

        if( i_field < i_frame && i_frame > i_width / 2 )
            i_hits++;
    }

    return i_hits >= 2;
}
1479
/* XDeintNxNFrame: fill an i_width x i_height block treated as progressive.
 *
 * Output lines are produced in pairs.  The first line of a pair is a copy of
 * the current source line.  The second is the (1,2,1)/4 blend of the current
 * source line and the two below it, except for the last pair, where it is
 * the average of the current source line and the next one.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    for( int y = 0; y < i_height; y += 2 )
    {
        const int b_last_pair = ( y >= i_height - 2 );

        memcpy( dst, src, i_width );
        dst += i_dst;

        if( b_last_pair )
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = ( src[x] + src[i_src + x] ) >> 1;
        }
        else
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = ( src[x] + 2*src[i_src + x]
                           + src[2*i_src + x] + 2 ) >> 2;
        }

        dst += i_dst;
        src += 2*i_src;
    }
}
1507
/* XDeintNxNField: fill an i_width x i_height block treated as interlaced.
 *
 * Output lines are produced in pairs.  The first line of a pair is a copy of
 * the current source line.  The second is the average of the current source
 * line and the one two lines below, except for the last pair, where it is
 * the average of the current source line and the next one.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    for( int y = 0; y < i_height; y += 2 )
    {
        /* Offset of the second line entering the average */
        const int i_other = ( y < i_height - 2 ) ? 2*i_src : i_src;

        memcpy( dst, src, i_width );
        dst += i_dst;

        for( int x = 0; x < i_width; x++ )
            dst[x] = ( src[x] + src[i_other + x] ) >> 1;

        dst += i_dst;
        src += 2*i_src;
    }
}
1535
/* XDeintNxN: deinterlace an arbitrary-size block of i_width x i_height.
 *
 * Runs the interlacing detection on the block, then fills dst with
 * XDeintNxNField when the block looks interlaced and with XDeintNxNFrame
 * otherwise.
 *
 * XDeintNxNDetect takes its dimensions as (i_height, i_width).  They used to
 * be passed here in (width, height) order, which made the detection scan the
 * transposed area and read pixels outside a non-square block.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1544
1545
/* median: returns the middle value of a, b and c.
 * The smallest and largest values are found first and removed from the sum
 * of the three. */
static inline int median( int a, int b, int c )
{
    int lo, hi;

    if( b < a )
    {
        lo = b;
        hi = a;
    }
    else
    {
        lo = a;
        hi = b;
    }

    if( c < lo )
        lo = c;
    else if( c > hi )
        hi = c;

    return a + b + c - lo - hi;
}
1561
1562
/* XDeintBand8x8: deinterlace one band of 8 lines (C version).
 *
 * i_mbx: number of whole 8-pixel-wide blocks in the band.
 * i_modx: width of the partial block that follows them (0 if none).
 *
 * Blocks detected as interlaced are interpolated: the first and last whole
 * blocks use the simple XDeint8x8FieldEC, the others the edge-oriented
 * XDeint8x8FieldC.  Other blocks are blended with XDeint8x8MergeC, using the
 * even lines as first input and the odd lines as second input.  A trailing
 * partial block goes through XDeintNxN.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    for( int x = 0; x < i_mbx; x++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            XDeint8x8MergeC( dst, i_dst,
                             src, 2*i_src,
                             src + i_src, 2*i_src );
            continue;
        }

        /* Blocks on the band's border lack a horizontal neighbour */
        const int b_border = ( x == 0 ) || ( x == i_mbx - 1 );

        if( b_border )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
#ifdef CAN_COMPILE_MMXEXT
/* XDeintBand8x8: deinterlace one band of 8 lines (MMXEXT version).
 *
 * Same structure as XDeintBand8x8C, with the detection, interpolation and
 * blending of whole 8x8 blocks done by the MMXEXT helpers.  A trailing
 * partial block (i_modx pixels wide) still goes through the C XDeintNxN.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    for( int x = 0; x < i_mbx; x++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  src, 2*i_src,
                                  src + i_src, 2*i_src );
            continue;
        }

        /* Blocks on the band's border lack a horizontal neighbour */
        const int b_border = ( x == 0 ) || ( x == i_mbx - 1 );

        if( b_border )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
#endif
1628
1629 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1630 {
1631     int i_plane;
1632     unsigned u_cpu = vlc_CPU();
1633
1634     /* Copy image and skip lines */
1635     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1636     {
1637         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1638         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1639
1640         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1641         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1642
1643         const int i_dst = p_outpic->p[i_plane].i_pitch;
1644         const int i_src = p_pic->p[i_plane].i_pitch;
1645
1646         int y, x;
1647
1648         for( y = 0; y < i_mby; y++ )
1649         {
1650             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1651             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1652
1653 #ifdef CAN_COMPILE_MMXEXT
1654             if( u_cpu & CPU_CAPABILITY_MMXEXT )
1655                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1656             else
1657 #endif
1658                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1659         }
1660
1661         /* Last line (C only)*/
1662         if( i_mody )
1663         {
1664             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1665             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1666
1667             for( x = 0; x < i_mbx; x++ )
1668             {
1669                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1670
1671                 dst += 8;
1672                 src += 8;
1673             }
1674
1675             if( i_modx )
1676                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1677         }
1678     }
1679
1680 #ifdef CAN_COMPILE_MMXEXT
1681     if( u_cpu & CPU_CAPABILITY_MMXEXT )
1682         emms();
1683 #endif
1684 }
1685
1686 /*****************************************************************************
1687  * Yadif (Yet Another DeInterlacing Filter).
1688  *****************************************************************************/
1689 /* */
/**
 * Per-call configuration handed to the Yadif line filter as its first
 * argument. The struct tag and member name are kept as-is because the
 * included yadif.h (taken from MPlayer's vf_yadif.c) refers to them.
 */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In VLC, only bit 0x02 has meaning, as the 0x01 part (one output per
     * field) is handled by the caller. RenderYadif() only ever sets 0 or 2.
     */
    int mode;
};
1701
/* Integer type with the width of a pointer, provided for yadif.h.
   NOTE(review): the original author was unsure that intptr_t is the right
   underlying type here - confirm against the inline assembly in yadif.h. */
typedef intptr_t x86_reg;

/* FFmpeg-style helper macros, presumably required by yadif.h (FFMIN3 is also
   used by ComposeFrame() in this file). They are plain textual macros:
   FFABS evaluates its argument twice, and the others expand to VLC's
   __MAX/__MIN, so do not pass expressions with side effects. */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)

/* yadif.h comes from vf_yadif.c of the MPlayer project. It must be included
   after the definitions above. */
#include "yadif.h"
1713
1714 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1715 {
1716     VLC_UNUSED(p_src);
1717
1718     filter_sys_t *p_sys = p_filter->p_sys;
1719
1720     /* */
1721     assert( i_order >= 0 && i_order <= 2 ); /* 2 = soft field repeat */
1722     assert( i_field == 0 || i_field == 1 );
1723
1724     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1725     picture_t *p_prev = p_sys->pp_history[0];
1726     picture_t *p_cur  = p_sys->pp_history[1];
1727     picture_t *p_next = p_sys->pp_history[2];
1728
1729     /* Account for soft field repeat.
1730
1731        The "parity" parameter affects the algorithm like this (from yadif.h):
1732        uint8_t *prev2= parity ? prev : cur ;
1733        uint8_t *next2= parity ? cur  : next;
1734
1735        The original parity expression that was used here is:
1736        (i_field ^ (i_order == i_field)) & 1
1737
1738        Truth table:
1739        i_field = 0, i_order = 0  => 1
1740        i_field = 1, i_order = 1  => 0
1741        i_field = 1, i_order = 0  => 1
1742        i_field = 0, i_order = 1  => 0
1743
1744        => equivalent with e.g.  (1 - i_order)  or  (i_order + 1) % 2
1745
1746        Thus, in a normal two-field frame,
1747              parity 1 = first field  (i_order == 0)
1748              parity 0 = second field (i_order == 1)
1749
1750        Now, with three fields, where the third is a copy of the first,
1751              i_order = 0  =>  parity 1 (as usual)
1752              i_order = 1  =>  due to the repeat, prev = cur, but also next = cur.
1753                               Because in such a case there is no motion (otherwise field repeat makes no sense),
1754                               we don't actually need to invoke Yadif's filter(). Thus, set "parity" to 2,
1755                               and use this to bypass the filter.
1756              i_order = 2  =>  parity 0 (as usual)
1757     */
1758     int yadif_parity;
1759     if( p_cur  &&  p_cur->i_nb_fields > 2 )
1760         yadif_parity = (i_order + 1) % 3; /* 1, *2*, 0; where 2 is a special value meaning "bypass filter". */
1761     else
1762         yadif_parity = (i_order + 1) % 2; /* 1, 0 */
1763
1764     /* Filter if we have all the pictures we need */
1765     if( p_prev && p_cur && p_next )
1766     {
1767         /* */
1768         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1769 #if defined(HAVE_YADIF_SSE2)
1770         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1771             filter = yadif_filter_line_mmx2;
1772         else
1773 #endif
1774             filter = yadif_filter_line_c;
1775
1776         for( int n = 0; n < p_dst->i_planes; n++ )
1777         {
1778             const plane_t *prevp = &p_prev->p[n];
1779             const plane_t *curp  = &p_cur->p[n];
1780             const plane_t *nextp = &p_next->p[n];
1781             plane_t *dstp        = &p_dst->p[n];
1782
1783             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1784             {
1785                 if( (y % 2) == i_field  ||  yadif_parity == 2 )
1786                 {
1787                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1788                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1789                 }
1790                 else
1791                 {
1792                     struct vf_priv_s cfg;
1793                     /* Spatial checks only when enough data */
1794                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1795
1796                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1797                     filter( &cfg,
1798                             &dstp->p_pixels[y * dstp->i_pitch],
1799                             &prevp->p_pixels[y * prevp->i_pitch],
1800                             &curp->p_pixels[y * curp->i_pitch],
1801                             &nextp->p_pixels[y * nextp->i_pitch],
1802                             dstp->i_visible_pitch,
1803                             curp->i_pitch,
1804                             yadif_parity );
1805                 }
1806
1807                 /* We duplicate the first and last lines */
1808                 if( y == 1 )
1809                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1810                 else if( y == dstp->i_visible_lines - 2 )
1811                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1812             }
1813         }
1814
1815         p_sys->i_frame_offset = 1; /* p_curr will be rendered at next frame, too */
1816
1817         return VLC_SUCCESS;
1818     }
1819     else if( !p_prev && !p_cur && p_next )
1820     {
1821         /* NOTE: For the first frame, we use the default frame offset
1822                  as set by Open() or SetFilterMethod(). It is always 0. */
1823
1824         /* FIXME not good as it does not use i_order/i_field */
1825         RenderX( p_dst, p_next );
1826         return VLC_SUCCESS;
1827     }
1828     else
1829     {
1830         p_sys->i_frame_offset = 1; /* p_curr will be rendered at next frame */
1831
1832         return VLC_EGENERIC;
1833     }
1834 }
1835
1836 /*****************************************************************************
1837 * Phosphor - a framerate doubler that simulates gradual light decay of a CRT.
1838 *****************************************************************************/
1839
1840 /**
1841  * This function converts a normal (full frame) plane_t into a field plane_t.
1842  *
1843  * Field plane_t's can be used e.g. for a weaving copy operation from two
1844  * source frames into one destination frame.
1845  *
1846  * The pixels themselves will not be touched; only the metadata is generated.
1847  * The same pixel data is shared by both the original plane_t and the field
1848  * plane_t. Note, however, that the bottom field's data starts from the
1849  * second line, so for the bottom field, the actual pixel pointer value
1850  * does not exactly match the original plane pixel pointer value. (It points
1851  * one line further down.)
1852  *
1853  * The caller must allocate p_dst (creating a local variable is fine).
1854  *
1855  * @param p_dst Field plane_t is written here. Must be non-NULL.
1856  * @param p_src Original full-frame plane_t. Must be non-NULL.
1857  * @param i_field Extract which field? 0 = top field, 1 = bottom field.
1858  * @see plane_CopyPixels()
1859  * @see ComposeFrame()
1860  * @see RenderPhosphor()
1861  */
1862 static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
1863 {
1864     assert( p_dst != NULL );
1865     assert( p_src != NULL );
1866     assert( i_field == 0  ||  i_field == 1 );
1867
1868     /* Start with a copy of the metadata, and then update it to refer
1869        to one field only.
1870
1871        We utilize the fact that plane_CopyPixels() differentiates between
1872        visible_pitch and pitch.
1873
1874        The other field will be defined as the "margin" by doubling the pitch.
1875        The visible pitch will be left as in the original.
1876     */
1877     (*p_dst) = (*p_src);
1878     p_dst->i_lines /= 2;
1879     p_dst->i_visible_lines /= 2;
1880     p_dst->i_pitch *= 2;
1881     /* For the bottom field, skip the first line in the pixel data. */
1882     if( i_field == 1 )
1883         p_dst->p_pixels += p_src->i_pitch;
1884 }
1885
1886 /**
1887  * Helper function: composes a frame from the given field pair.
1888  *
1889  * Caller must manage allocation/deallocation of p_outpic.
1890  *
1891  * The inputs are full pictures (frames); only one field
1892  * will be used from each.
1893  *
1894  * Chroma formats of the inputs must match. It is also desirable that the
1895  * visible pitches of both inputs are the same, so that this will do something
1896  * sensible. The pitch or visible pitch of the output does not need to match
1897  * with the input; the compatible (smaller) part of the visible pitch will
1898  * be filled.
1899  *
1900  * The i_output_chroma parameter must always be supplied, but it is only used
1901  * when the chroma format of the input is detected as 4:2:0. Available modes:
1902  *   - CC_ALTLINE:       Alternate line copy, like for luma. Chroma line 0
1903  *                       comes from top field picture, chroma line 1 comes
1904  *                       from bottom field picture, chroma line 2 from top
1905  *                       field picture, and so on. This is usually the right
1906  *                       choice for IVTCing NTSC DVD material, but rarely
1907  *                       for any other use cases.
1908  *   - CC_UPCONVERT:     The output will have 4:2:2 chroma. All 4:2:0 chroma
1909  *                       data from both input fields will be used to generate
1910  *                       the 4:2:2 chroma data of the output. Each output line
1911  *                       will thus have independent chroma. This is a good
1912  *                       choice for most purposes except IVTC, if the machine
1913  *                       can handle the increased throughput. (Make sure to
1914  *                       allocate a 4:2:2 output picture first!)
1915  *                       This mode can also be used for converting a 4:2:0
1916  *                       frame to 4:2:2 format (by passing the same input
1917  *                       picture for both input fields).
1918  *                       Conversions: I420, YV12 --> I422
1919  *                                    J420       --> J422
1920  *   - CC_SOURCE_TOP:    Copy chroma of source top field picture.
1921  *                       Ignore chroma of source bottom field picture.
1922  *   - CC_SOURCE_BOTTOM: Copy chroma of source bottom field picture.
1923  *                       Ignore chroma of source top field picture.
1924  *   - CC_MERGE:         Average the chroma of the input field pictures.
1925  *                       (Note that this has no effect if the input fields
1926  *                        come from the same frame.)
1927  *
1928  * @param p_outpic Composed picture is written here. Allocated by caller.
1929  * @param p_inpic_top Picture to extract the top field from.
1930  * @param p_inpic_bottom Picture to extract the bottom field from.
1931  * @param i_output_chroma Chroma operation mode for 4:2:0 (see function doc)
1932  * @see compose_chroma_t
1933  * @see RenderPhosphor()
1934  */
1935 static void ComposeFrame( filter_t *p_filter, picture_t *p_outpic,
1936                           picture_t *p_inpic_top, picture_t *p_inpic_bottom,
1937                           compose_chroma_t i_output_chroma )
1938 {
1939     assert( p_filter != NULL );
1940     assert( p_outpic != NULL );
1941     assert( p_inpic_top != NULL );
1942     assert( p_inpic_bottom != NULL );
1943
1944     /* Valid 4:2:0 chroma handling modes. */
1945     assert( i_output_chroma == CC_ALTLINE       ||
1946             i_output_chroma == CC_UPCONVERT     ||
1947             i_output_chroma == CC_SOURCE_TOP    ||
1948             i_output_chroma == CC_SOURCE_BOTTOM ||
1949             i_output_chroma == CC_MERGE );
1950
1951     const int i_chroma = p_filter->fmt_in.video.i_chroma;
1952     const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
1953                         i_chroma == VLC_CODEC_J422;
1954     const bool b_upconvert_chroma = ( !b_i422  &&
1955                                       i_output_chroma == CC_UPCONVERT );
1956
1957     for( int i_plane = 0 ; i_plane < p_inpic_top->i_planes ; i_plane++ )
1958     {
1959         bool b_is_chroma_plane = ( i_plane == U_PLANE || i_plane == V_PLANE );
1960
1961         /* YV12 is YVU, but I422 is YUV. For such input, swap chroma planes
1962            in output when converting to 4:2:2. */
1963         int i_out_plane;
1964         if( b_is_chroma_plane  &&  b_upconvert_chroma  &&
1965             i_chroma == VLC_CODEC_YV12 )
1966         {
1967             if( i_plane == U_PLANE )
1968                 i_out_plane = V_PLANE;
1969             else /* V_PLANE */
1970                 i_out_plane = U_PLANE;
1971         }
1972         else
1973         {
1974             i_out_plane = i_plane;
1975         }
1976
1977         /* Copy luma or chroma, alternating between input fields. */
1978         if( !b_is_chroma_plane  ||  b_i422  ||  i_output_chroma == CC_ALTLINE )
1979         {
1980             /* Do an alternating line copy. This is always done for luma,
1981                and for 4:2:2 chroma. It can be requested for 4:2:0 chroma
1982                using CC_ALTLINE (see function doc).
1983
1984                Note that when we get here, the number of lines matches
1985                in input and output.
1986             */
1987             plane_t dst_top;
1988             plane_t dst_bottom;
1989             plane_t src_top;
1990             plane_t src_bottom;
1991             FieldFromPlane( &dst_top,    &p_outpic->p[i_out_plane],   0 );
1992             FieldFromPlane( &dst_bottom, &p_outpic->p[i_out_plane],   1 );
1993             FieldFromPlane( &src_top,    &p_inpic_top->p[i_plane],    0 );
1994             FieldFromPlane( &src_bottom, &p_inpic_bottom->p[i_plane], 1 );
1995
1996             /* Copy each field from the corresponding source. */
1997             plane_CopyPixels( &dst_top,    &src_top    );
1998             plane_CopyPixels( &dst_bottom, &src_bottom );
1999         }
2000         else /* Input 4:2:0, on a chroma plane, and not in altline mode. */
2001         {
2002             if( i_output_chroma == CC_UPCONVERT )
2003             {
2004                 /* Upconverting copy - use all data from both input fields.
2005
2006                    This produces an output picture with independent chroma
2007                    for each field. It can be used for general input when
2008                    the two input frames are different.
2009
2010                    The output is 4:2:2, but the input is 4:2:0. Thus the output
2011                    has twice the lines of the input, and each full chroma plane
2012                    in the input corresponds to a field chroma plane in the
2013                    output.
2014                 */
2015                 plane_t dst_top;
2016                 plane_t dst_bottom;
2017                 FieldFromPlane( &dst_top,    &p_outpic->p[i_out_plane], 0 );
2018                 FieldFromPlane( &dst_bottom, &p_outpic->p[i_out_plane], 1 );
2019
2020                 /* Copy each field from the corresponding source. */
2021                 plane_CopyPixels( &dst_top,    &p_inpic_top->p[i_plane]    );
2022                 plane_CopyPixels( &dst_bottom, &p_inpic_bottom->p[i_plane] );
2023             }
2024             else if( i_output_chroma == CC_SOURCE_TOP )
2025             {
2026                 /* Copy chroma of input top field. Ignore chroma of input
2027                    bottom field. Input and output are both 4:2:0, so we just
2028                    copy the whole plane. */
2029                 plane_CopyPixels( &p_outpic->p[i_out_plane],
2030                                   &p_inpic_top->p[i_plane] );
2031             }
2032             else if( i_output_chroma == CC_SOURCE_BOTTOM )
2033             {
2034                 /* Copy chroma of input bottom field. Ignore chroma of input
2035                    top field. Input and output are both 4:2:0, so we just
2036                    copy the whole plane. */
2037                 plane_CopyPixels( &p_outpic->p[i_out_plane],
2038                                   &p_inpic_bottom->p[i_plane] );
2039             }
2040             else /* i_output_chroma == CC_MERGE */
2041             {
2042                 /* Average the chroma of the input fields.
2043                    Input and output are both 4:2:0. */
2044                 uint8_t *p_in_top, *p_in_bottom, *p_out_end, *p_out;
2045                 p_in_top    = p_inpic_top->p[i_plane].p_pixels;
2046                 p_in_bottom = p_inpic_bottom->p[i_plane].p_pixels;
2047                 p_out = p_outpic->p[i_out_plane].p_pixels;
2048                 p_out_end = p_out + p_outpic->p[i_out_plane].i_pitch
2049                                   * p_outpic->p[i_out_plane].i_visible_lines;
2050
2051                 int w = FFMIN3( p_inpic_top->p[i_plane].i_visible_pitch,
2052                                 p_inpic_bottom->p[i_plane].i_visible_pitch,
2053                                 p_outpic->p[i_plane].i_visible_pitch );
2054
2055                 for( ; p_out < p_out_end ; )
2056                 {
2057                     Merge( p_out, p_in_top, p_in_bottom, w );
2058                     p_out       += p_outpic->p[i_out_plane].i_pitch;
2059                     p_in_top    += p_inpic_top->p[i_plane].i_pitch;
2060                     p_in_bottom += p_inpic_bottom->p[i_plane].i_pitch;
2061                 }
2062                 EndMerge();
2063             }
2064         }
2065     }
2066 }
2067
2068 #undef Merge
2069
2070 /**
2071  * Helper function: dims (darkens) the given field of the given picture.
2072  *
2073  * This is used for simulating CRT light output decay in RenderPhosphor().
2074  *
2075  * The strength "1" is recommended. It's a matter of taste,
2076  * so it's parametrized.
2077  *
2078  * Note on chroma formats:
2079  *   - If input is 4:2:2, all planes are processed.
2080  *   - If input is 4:2:0, only the luma plane is processed, because both fields
2081  *     have the same chroma. This will distort colours, especially for high
2082  *     filter strengths, especially for pixels whose U and/or V values are
2083  *     far away from the origin (which is at 128 in uint8 format).
2084  *
2085  * @param p_dst Input/output picture. Will be modified in-place.
2086  * @param i_field Darken which field? 0 = top, 1 = bottom.
2087  * @param i_strength Strength of effect: 1, 2 or 3 (division by 2, 4 or 8).
2088  * @see RenderPhosphor()
2089  * @see ComposeFrame()
2090  */
2091 static void DarkenField( picture_t *p_dst, const int i_field,
2092                                            const int i_strength )
2093 {
2094     assert( p_dst != NULL );
2095     assert( i_field == 0 || i_field == 1 );
2096     assert( i_strength >= 1 && i_strength <= 3 );
2097
2098     unsigned u_cpu = vlc_CPU();
2099
2100     /* Bitwise ANDing with this clears the i_strength highest bits
2101        of each byte */
2102 #ifdef CAN_COMPILE_MMXEXT
2103     uint64_t i_strength_u64 = i_strength; /* for MMX version (needs to know
2104                                              number of bits) */
2105 #endif
2106     const uint8_t  remove_high_u8 = 0xFF >> i_strength;
2107     const uint64_t remove_high_u64 = remove_high_u8 *
2108                                             INT64_C(0x0101010101010101);
2109
2110     /* Process luma.
2111
2112        For luma, the operation is just a shift + bitwise AND, so we vectorize
2113        even in the C version.
2114
2115        There is an MMX version, too, because it performs about twice faster.
2116     */
2117     int i_plane = Y_PLANE;
2118     uint8_t *p_out, *p_out_end;
2119     int w = p_dst->p[i_plane].i_visible_pitch;
2120     p_out = p_dst->p[i_plane].p_pixels;
2121     p_out_end = p_out + p_dst->p[i_plane].i_pitch
2122                       * p_dst->p[i_plane].i_visible_lines;
2123
2124     /* skip first line for bottom field */
2125     if( i_field == 1 )
2126         p_out += p_dst->p[i_plane].i_pitch;
2127
2128     int wm8 = w % 8;   /* remainder */
2129     int w8  = w - wm8; /* part of width that is divisible by 8 */
2130     for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
2131     {
2132         uint64_t *po = (uint64_t *)p_out;
2133 #ifdef CAN_COMPILE_MMXEXT
2134         if( u_cpu & CPU_CAPABILITY_MMXEXT )
2135         {
2136             movq_m2r( i_strength_u64,  mm1 );
2137             movq_m2r( remove_high_u64, mm2 );
2138             for( int x = 0 ; x < w8; x += 8 )
2139             {
2140                 movq_m2r( (*po), mm0 );
2141
2142                 psrlq_r2r( mm1, mm0 );
2143                 pand_r2r(  mm2, mm0 );
2144
2145                 movq_r2m( mm0, (*po++) );
2146             }
2147         }
2148         else
2149         {
2150 #endif
2151             for( int x = 0 ; x < w8; x += 8, ++po )
2152                 (*po) = ( ((*po) >> i_strength) & remove_high_u64 );
2153 #ifdef CAN_COMPILE_MMXEXT
2154         }
2155 #endif
2156         /* handle the width remainder */
2157         if( wm8 )
2158         {
2159             uint8_t *po_temp = (uint8_t *)po;
2160             for( int x = 0 ; x < wm8; ++x, ++po_temp )
2161                 (*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
2162         }
2163     }
2164
2165     /* Process chroma if the field chromas are independent.
2166
2167        The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
2168        The chroma processing is a bit more complicated than luma,
2169        and needs MMX for vectorization.
2170     */
2171     if( p_dst->format.i_chroma == VLC_CODEC_I422  ||
2172         p_dst->format.i_chroma == VLC_CODEC_J422 )
2173     {
2174         for( i_plane = 0 ; i_plane < p_dst->i_planes ; i_plane++ )
2175         {
2176             if( i_plane == Y_PLANE )
2177                 continue; /* luma already handled */
2178
2179             int w = p_dst->p[i_plane].i_visible_pitch;
2180 #ifdef CAN_COMPILE_MMXEXT
2181             int wm8 = w % 8;   /* remainder */
2182             int w8  = w - wm8; /* part of width that is divisible by 8 */
2183 #endif
2184             p_out = p_dst->p[i_plane].p_pixels;
2185             p_out_end = p_out + p_dst->p[i_plane].i_pitch
2186                               * p_dst->p[i_plane].i_visible_lines;
2187
2188             /* skip first line for bottom field */
2189             if( i_field == 1 )
2190                 p_out += p_dst->p[i_plane].i_pitch;
2191
2192             for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
2193             {
2194 #ifdef CAN_COMPILE_MMXEXT
2195                 /* See also easy-to-read C version below. */
2196                 if( u_cpu & CPU_CAPABILITY_MMXEXT )
2197                 {
2198                     static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
2199                     movq_m2r( b128, mm5 );
2200                     movq_m2r( i_strength_u64,  mm6 );
2201                     movq_m2r( remove_high_u64, mm7 );
2202
2203                     uint64_t *po = (uint64_t *)p_out;
2204                     for( int x = 0 ; x < w8; x += 8 )
2205                     {
2206                         movq_m2r( (*po), mm0 );
2207
2208                         movq_r2r( mm5, mm2 ); /* 128 */
2209                         movq_r2r( mm0, mm1 ); /* copy of data */
2210                         psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
2211                         psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
2212
2213                         /* >> i_strength */
2214                         psrlq_r2r( mm6, mm1 );
2215                         psrlq_r2r( mm6, mm2 );
2216                         pand_r2r(  mm7, mm1 );
2217                         pand_r2r(  mm7, mm2 );
2218
2219                         /* collect results from pos./neg. parts */
2220                         psubb_r2r( mm2, mm1 );
2221                         paddb_r2r( mm5, mm1 );
2222
2223                         movq_r2m( mm1, (*po++) );
2224                     }
2225
2226                     /* handle the width remainder */
2227                     if( wm8 )
2228                     {
2229                         /* The output is closer to 128 than the input;
2230                            the result always fits in uint8. */
2231                         uint8_t *po8 = (uint8_t *)po;
2232                         for( int x = 0 ; x < wm8; ++x, ++po8 )
2233                             (*po8) = 128 + ( ((*po8) - 128) /
2234                                                   (1 << i_strength) );
2235                     }
2236                 }
2237                 else
2238                 {
2239 #endif
2240                     /* 4:2:2 chroma handler, C version */
2241                     uint8_t *po = p_out;
2242                     for( int x = 0 ; x < w; ++x, ++po )
2243                         (*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
2244 #ifdef CAN_COMPILE_MMXEXT
2245                 }
2246 #endif
2247             } /* for p_out... */
2248         } /* for i_plane... */
2249     } /* if b_i422 */
2250
2251 #ifdef CAN_COMPILE_MMXEXT
2252     if( u_cpu & CPU_CAPABILITY_MMXEXT )
2253         emms();
2254 #endif
2255 }
2256
2257 /**
2258  * Deinterlace filter. Simulates an interlaced CRT TV (to some extent).
2259  *
2260  * The main use case for this filter is anime for which IVTC is not applicable.
2261  * This is the case, if 24fps telecined material has been mixed with 60fps
2262  * interlaced effects, such as in Sol Bianca or Silent Mobius. It can also
2263  * be used for true interlaced video, such as most camcorder recordings.
2264  *
2265  * The filter has several modes for handling 4:2:0 chroma for those output
2266  * frames that fall across input frame temporal boundaries (i.e. fields come
2267  * from different frames). Upconvert (to 4:2:2) provides the most accurate
2268  * CRT simulation, but requires more CPU and memory bandwidth than the other
2269  * modes. The other modes keep the chroma at 4:2:0.
2270  *
2271  * About these modes: telecined input (such as NTSC anime DVDs) works better
2272  * with AltLine, while true interlaced input works better with Latest.
2273  * Merge is a compromise, which may or may not look acceptable.
2274  * The mode can be set in the VLC advanced configuration,
2275  * All settings > Video > Filters > Deinterlace
2276  *
2277  * Technically speaking, this is an interlaced field renderer targeted for
2278  * progressive displays. It works by framerate doubling, and simulating one
2279  * step of light output decay of the "old" field during the "new" field,
2280  * until the next new field comes in to replace the "old" one.
2281  *
2282  * While playback is running, the simulated light decay gives the picture an
2283  * appearance of visible "scanlines", much like on a real TV. Only when the
2284  * video is paused, it is clearly visible that one of the fields is actually
2285  * brighter than the other.
2286  *
2287  * The main differences to the Bob algorithm are:
2288  *  - in addition to the current field, the previous one (fading out)
2289  *    is also rendered
2290  *  - some horizontal lines don't seem to flicker as much
2291  *  - scanline visual effect (adjustable; the dimmer strength can be set
2292  *    in the VLC advanced configuration)
2293  *  - the picture appears 25%, 38% or 44% darker on average (for dimmer
2294  *    strengths 1, 2 and 3)
2295  *  - if the input has 4:2:0 chroma, the colours may look messed up in some
2296  *    output frames. This is a limitation of the 4:2:0 chroma format, and due
2297  *    to the fact that both fields are present in each output picture. Usually
2298  *    this doesn't matter in practice, but see the 4:2:0 chroma mode setting
2299  *    in the configuration if needed (it may help a bit).
2300  *
2301  * In addition, when this filter is used on an LCD computer monitor,
2302  * the main differences to a real CRT TV are:
2303  *  - Pixel shape and grid layout; CRT TVs were designed for interlaced
2304  *    field rendering, while LCD monitors weren't.
2305  *  - No scan flicker even though the display runs (usually) at 60Hz.
2306  *    (This at least is a good thing.)
2307  *
2308  * The output vertical resolution should be large enough for the scaling
2309  * not to have a too adverse effect on the regular scanline pattern.
2310  * In practice, NTSC video can be acceptably rendered already at 1024x600
2311  * if fullscreen even on an LCD. PAL video requires more.
2312  *
2313  * Just like Bob, this filter works properly only if the input framerate
2314  * is stable. Otherwise the scanline effect breaks down and the picture
2315  * will flicker.
2316  *
2317  * Soft field repeat (repeat_pict) is supported. Note that the generated
2318  * "repeated" output picture is unique because of the simulated light decay.
2319  * Its "old" field comes from the same input frame as the "new" one, unlike
2320  * the first output picture of the same frame.
2321  *
2322  * As many output frames should be requested for each input frame as is
2323  * indicated by p_src->i_nb_fields. This is done by calling this function
2324  * several times, first with i_order = 0, and then with all other parameters
2325  * the same, but a new p_dst, increasing i_order (1 for second field,
2326  * and then if i_nb_fields = 3, also i_order = 2 to get the repeated first
2327  * field), and alternating i_field (starting, at i_order = 0, with the field
2328  * according to p_src->b_top_field_first). See Deinterlace() for an example.
2329  *
2330  * @param p_filter The filter instance. Must be non-NULL.
2331  * @param p_dst Output frame. Must be allocated by caller.
2332  * @param p_src Input frame. Must exist.
2333  * @param i_order Temporal field number: 0 = first, 1 = second, 2 = rep. first.
2334  * @param i_field Render which field? 0 = top field, 1 = bottom field.
2335  * @return VLC error code (int).
2336  * @retval VLC_SUCCESS The requested field was rendered into p_dst.
2337  * @retval VLC_EGENERIC No pictures in history buffer, cannot render.
2338  * @see RenderBob()
2339  * @see RenderLinear()
2340  * @see Deinterlace()
2341  */
2342 static int RenderPhosphor( filter_t *p_filter,
2343                            picture_t *p_dst, picture_t *p_src,
2344                            int i_order, int i_field )
2345 {
2346     assert( p_filter != NULL );
2347     assert( p_dst != NULL );
2348     assert( p_src != NULL );
2349     assert( i_order >= 0 && i_order <= 2 ); /* 2 = soft field repeat */
2350     assert( i_field == 0 || i_field == 1 );
2351
2352     filter_sys_t *p_sys = p_filter->p_sys;
2353
2354     /* Last two input frames */
2355     picture_t *p_in  = p_sys->pp_history[HISTORY_SIZE-1];
2356     picture_t *p_old = p_sys->pp_history[HISTORY_SIZE-2];
2357
2358     /* Use the same input picture as "old" at the first frame after startup */
2359     if( !p_old )
2360         p_old = p_in;
2361
2362     /* If the history mechanism has failed, we can't do anything. */
2363     if( !p_in )
2364         return VLC_EGENERIC;
2365
2366     assert( p_old != NULL );
2367     assert( p_in != NULL );
2368
2369     /* Decide sources for top & bottom fields of output. */
2370     picture_t *p_in_top    = p_in;
2371     picture_t *p_in_bottom = p_in;
2372     /* For the first output field this frame,
2373        grab "old" field from previous frame. */
2374     if( i_order == 0 )
2375     {
2376         if( i_field == 0 ) /* rendering top field */
2377             p_in_bottom = p_old;
2378         else /* i_field == 1, rendering bottom field */
2379             p_in_top = p_old;
2380     }
2381
2382     compose_chroma_t cc;
2383     switch( p_sys->phosphor.i_chroma_for_420 )
2384     {
2385         case PC_BLEND:
2386             cc = CC_MERGE;
2387             break;
2388         case PC_LATEST:
2389             if( i_field == 0 )
2390                 cc = CC_SOURCE_TOP;
2391             else /* i_field == 1 */
2392                 cc = CC_SOURCE_BOTTOM;
2393             break;
2394         case PC_ALTLINE:
2395             cc = CC_ALTLINE;
2396             break;
2397         case PC_UPCONVERT:
2398             cc = CC_UPCONVERT;
2399             break;
2400         default:
2401             /* The above are the only possibilities, if there are no bugs. */
2402             assert(0);
2403             break;
2404     }
2405
2406     ComposeFrame( p_filter, p_dst, p_in_top, p_in_bottom, cc );
2407
2408     /* Simulate phosphor light output decay for the old field.
2409
2410        The dimmer can also be switched off in the configuration, but that is
2411        more of a technical curiosity or an educational toy for advanced users
2412        than a useful deinterlacer mode (although it does make telecined
2413        material look slightly better than without any filtering).
2414
2415        In most use cases the dimmer is used.
2416     */
2417     if( p_sys->phosphor.i_dimmer_strength > 0 )
2418         DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength );
2419
2420     return VLC_SUCCESS;
2421 }
2422
2423 /*****************************************************************************
2424  * Inverse telecine (IVTC) filter (a.k.a. "film mode", "3:2 reverse pulldown")
2425  *****************************************************************************/
2426
2427 /**
2428  * @fn RenderIVTC
2429  * Deinterlace filter. Performs inverse telecine.
2430  *
2431  * Also known as "film mode" or "3:2 reverse pulldown" in some equipment.
2432  *
2433  * This filter attempts to reconstruct the original film frames from an
2434  * NTSC telecined signal. It is intended for 24fps progressive material
2435  * that was telecined to NTSC 60i. For example, most NTSC anime DVDs
2436  * are like this.
2437  *
2438  * @param p_filter The filter instance.
2439  * @param[in] p_src Input frame.
2440  * @param[out] p_dst Output frame. Must be allocated by caller.
2441  * @return VLC error code (int).
2442  * @retval VLC_SUCCESS A film frame was reconstructed to p_dst.
2443  * @retval VLC_EGENERIC Frame dropped as part of normal IVTC operation.
2444  * @see Deinterlace()
2445  * @see ComposeFrame()
2446  * @see CalculateInterlaceScore()
2447  * @see EstimateNumBlocksWithMotion()
2448  *
2449  * Overall explanation:
2450  *
2451  * This filter attempts to do in realtime what Transcode's
2452  * ivtc->decimate->32detect chain does offline. Additionally, it removes
2453  * soft telecine. It is an original design, based on some ideas from
2454  * Transcode, some from TVTime, and some original.
2455  *
2456  * If the input material is pure NTSC telecined film, inverse telecine
2457  * will (ideally) exactly recover the original progressive film frames.
2458  * The output will run at 4/5 of the original framerate with no loss of
2459  * information. Interlacing artifacts are removed, and motion becomes
2460  * as smooth as it was on the original film. For soft-telecined material,
2461  * on the other hand, the progressive frames already exist, so only the
2462  * timings are changed such that the output becomes smooth 24fps (or would,
2463  * if the output device had an infinite framerate).
2464  *
2465  * Put in simple terms, this filter is targeted for NTSC movies and
2466  * especially anime. Virtually all 1990s and early 2000s anime is
2467  * hard-telecined. Because the source material is like that,
2468  * IVTC is also needed for virtually all official R1 (US) anime DVDs.
2469  *
2470  * Note that some anime from the turn of the century (e.g. Silent Mobius
2471  * and Sol Bianca) is a hybrid of telecined film and true interlaced
2472  * computer-generated effects and camera pans. In this case, applying IVTC
2473  * will effectively attempt to reconstruct the frames based on the film
2474  * component, but even if this is successful, the framerate reduction will
2475  * cause the computer-generated effects to stutter. This is mathematically
2476  * unavoidable. Instead of IVTC, a framerate doubling deinterlacer is
2477  * recommended for such material. Try "Phosphor", "Bob", or "Linear".
2478  *
2479  * Fortunately, 30fps true progressive anime is on the rise (e.g. ARIA,
2480  * Black Lagoon, Galaxy Angel, Ghost in the Shell: Solid State Society,
2481  * Mai Otome, Last Exile, and Rocket Girls). This type requires no
2482  * deinterlacer at all.
2483  *
2484  * Another recent trend is using 24fps computer-generated effects and
2485  * telecining them along with the cels (e.g. Kiddy Grade, Str.A.In. and
2486  * The Third: The Girl with the Blue Eye). For this group, IVTC is the
2487  * correct way to deinterlace, and works properly.
2488  *
2489  * Soft telecined anime, while rare, also exists. Stellvia of the Universe
2490  * and Angel Links are examples of this. Stellvia constantly alternates
2491  * between soft and hard telecine - pure CGI sequences are soft-telecined,
2492  * while sequences incorporating cel animation are hard-telecined.
2493  * This makes it very hard for the cadence detector to lock on,
2494  * and indeed Stellvia gives some trouble for the filter.
2495  *
2496  * To finish the list of different material types, Azumanga Daioh deserves
2497  * a special mention. The OP and ED sequences are both 30fps progressive,
2498  * while the episodes themselves are hard-telecined. This filter should
2499  * mostly work correctly with such material, too. (The beginning of the OP
2500  * shows some artifacts, but otherwise both the OP and ED are indeed
2501  * rendered progressive. The technical reason is that the filter has been
2502  * designed to aggressively reconstruct film frames, which helps in many
2503  * cases with hard-telecined material. In very rare cases, this approach may
2504  * go wrong, regardless of whether the input is telecined or progressive.)
2505  *
2506  * Finally, note also that IVTC is the only correct way to deinterlace NTSC
2507  * telecined material. Simply applying an interpolating deinterlacing filter
2508  * (with no framerate doubling) is harmful for two reasons. First, even if
2509  * the filter does not damage already progressive frames, it will lose half
2510  * of the available vertical resolution of those frames that are judged
2511  * interlaced. Some algorithms combining data from multiple frames may be
2512  * able to counter this to an extent, effectively performing something akin
2513  * to the frame reconstruction part of IVTC. A more serious problem is that
2514  * any motion will stutter, because (even in the ideal case) one out of
2515  * every four film frames will be shown twice, while the other three will
2516  * be shown only once. Duplicate removal and framerate reduction - which are
2517  * part of IVTC - are also needed to properly play back telecined material
2518  * on progressive displays at a non-doubled framerate.
2519  *
2520  * So, try this filter on your NTSC anime DVDs. It just might help.
2521  *
2522  *
2523  * Technical details:
2524  *
2525  *
2526  * First, NTSC hard telecine in a nutshell:
2527  *
2528  * Film is commonly captured at 24 fps. The framerate must be raised from
2529  * 24 fps to 59.94 fields per second. This starts by pretending that the
2530  * original framerate is 23.976 fps. When authoring, the audio can be
2531  * slowed down by 0.1% to match. Now 59.94 = 5/4 * (2*23.976), which gives
2532  * a nice ratio made out of small integers.
2533  *
2534  * Thus, each group of four film frames must become five frames in the NTSC
2535  * video stream. One cannot simply repeat one frame of every four, because
2536  * this would result in jerky motion. To slightly soften the jerkiness,
2537  * the extra frame is split into two extra fields, inserted at different
2538  * times. The content of the extra fields is (in classical telecine)
2539  * duplicated as-is from existing fields.
2540  *
2541  * The field duplication technique is called "3:2 pulldown". The pattern
2542  * is called the cadence. The output from 3:2 pulldown looks like this
2543  * (if the telecine is TFF, top field first):
2544  *
2545  * a  b  c  d  e     Telecined frame (actual frames stored on DVD)
2546  * T1 T1 T2 T3 T4    *T*op field content
2547  * B1 B2 B3 B3 B4    *B*ottom field content
2548  *
2549  * Numbers 1-4 denote the original film frames. E.g. T1 = top field of
2550  * original film frame 1. The field Tb, and one of either Bc or Bd, are
2551  * the extra fields inserted in the telecine. With exact duplication, it
2552  * of course doesn't matter whether Bc or Bd is the extra field, but
2553  * with "full field blended" material (see below) this will affect how to
2554  * correctly extract film frame 3.
2555  *
2556  * See the following web pages for illustrations and discussion:
2557  * http://neuron2.net/LVG/telecining1.html
2558  * http://arbor.ee.ntu.edu.tw/~jackeikuo/dvd2avi/ivtc/
2559  *
2560  * Note that film frame 2 has been stored "half and half" into two telecined
2561  * frames (b and c). Note also that telecine produces a sequence of
2562  * 3 progressive frames (d, e and a) followed by 2 interlaced frames
2563  * (b and c).
2564  *
2565  * The output may also look like this (BFF telecine, bottom field first):
2566  *
2567  * a' b' c' d' e'
2568  * T1 T2 T3 T3 T4
2569  * B1 B1 B2 B3 B4
2570  *
2571  * Now field Bb', and one of either Tc' or Td', are the extra fields.
2572  * Again, film frame 2 is stored "half and half" (into b' and c').
2573  *
2574  * Whether the pattern is like abcde or a'b'c'd'e', depends on the telecine
2575  * field dominance (TFF or BFF). This must match the video field dominance,
2576  * but is conceptually different. Importantly, there is no temporal
2577  * difference between those fields that came from the same film frame.
2578  * Also, see the section on soft telecine below.
2579  *
2580  * In a hard telecine, the TFD and VFD must match for field renderers
2581  * (e.g. traditional DVD player + CRT TV) to work correctly; this should be
2582  * fairly obvious by considering the above telecine patterns and how a
2583  * field renderer displays the material (one field at a time, dominant
2584  * field first).
2585  *
2586  * The VFD may, *correctly*, flip mid-stream, if soft field repeats
2587  * (repeat_pict) have been used. They are commonly used in soft telecine
2588  * (see below), but also occasional lone field repeats exist in some streams,
2589  * e.g., Sol Bianca.
2590  *
2591  * See e.g.
2592  * http://www.cambridgeimaging.co.uk/downloads/Telecine%20field%20dominance.pdf
2593  * for discussion. The document discusses mostly PAL, but includes some notes
2594  * on NTSC, too.
2595  *
2596  * The reason for the words "classical telecine" above, when field
2597  * duplication was first mentioned, is that there exists a
2598  * "full field blended" version, where the added fields are not exact
2599  * duplicates, but are blends of the original film frames. This is rare
2600  * in NTSC, but some material like this reportedly exists. See
2601  * http://www.animemusicvideos.org/guides/avtech/videogetb2a.html
2602  * In these cases, the additional fields are a (probably 50%) blend of the
2603  * frames between which they have been inserted. Which one of the two
2604  * possibilities is the extra field then becomes important.
2605  * This filter does NOT support "full field blended" material.
2606  *
2607  * To summarize, the 3:2 pulldown sequence produces a group of ten fields
2608  * out of every four film frames. Only eight of these fields are unique.
2609  * To remove the telecine, the duplicate fields must be removed, and the
2610  * original progressive frames restored. Additionally, the presentation
2611  * timestamps (PTS) must be adjusted, and one frame out of five (containing
2612  * no new information) dropped. The duration of each frame in the output
2613  * becomes 5/4 of that in the input, i.e. 25% longer.
2614  *
2615  * Theoretically, this whole mess could be avoided by soft telecining, if the
2616  * original material is pure 24fps progressive. By using the stream flags
2617  * correctly, the original progressive frames can be stored on the DVD.
2618  * In such cases, the DVD player will apply "soft" 3:2 pulldown. See the
2619  * following section.
2620  *
2621  * Also, the mess with cadence detection for hard telecine (see below) could
2622  * be avoided by using the progressive frame flag and a five-frame future
2623  * buffer, but no one ever sets the flag correctly for hard-telecined
2624  * streams. All frames are marked as interlaced, regardless of their cadence
2625  * position. This is evil, but sort-of-understandable, given that video
2626  * editors often come with "progressive" and "interlaced" editing modes,
2627  * but no separate "telecined" mode that could correctly handle this
2628  * information.
2629  *
2630  * In practice, most material with its origins in Asia (including virtually
2631  * all official US (R1) anime DVDs) is hard-telecined. Combined with the
2632  * turn-of-the-century practice of rendering true interlaced effects
2633  * on top of the hard-telecined stream, we have what can only be described
2634  * as a monstrosity. Fortunately, recent material is much more consistent,
2635  * even though still almost always hard-telecined.
2636  *
2637  * Finally, note that telecined video is often edited directly in interlaced
2638  * form, disregarding safe cut positions as pertains to the telecine sequence
2639  * (there are only two: between "d" and "e", or between "e" and the
2640  * next "a"). Thus, the telecine sequence will in practice jump erratically
2641  * at cuts [**]. An aggressive detection strategy is needed to cope with
2642  * this.
2643  *
2644  * [**] http://users.softlab.ece.ntua.gr/~ttsiod/ivtc.html
2645  *
2646  *
2647  * Note about chroma formats: 4:2:0 is very common at least on anime DVDs.
2648  * In the interlaced frames in a hard telecine, the chroma alternates
2649  * every chroma line, even if the chroma format is 4:2:0! This means that
2650  * if the interlaced picture is viewed as-is, the luma alternates every line,
2651  * while the chroma alternates only every two lines of the picture.
2652  *
2653  * That is, an interlaced frame in a 4:2:0 telecine looks like this
2654  * (numbers indicate which film frame the data comes from):
2655  *
2656  * luma  stored 4:2:0 chroma  displayed chroma
2657  * 1111  1111                 1111
2658  * 2222                       1111
2659  * 1111  2222                 2222
2660  * 2222                       2222
2661  * ...   ...                  ...
2662  *
2663  * The deinterlace filter sees the stored 4:2:0 chroma. The "displayed chroma"
2664  * is only generated later in the filter chain (probably when YUV is converted
2665  * to the display format, if the display does not accept YUV 4:2:0 directly).
2666  *
2667  *
2668  * Next, how NTSC soft telecine works:
2669  *
2670  * a  b  c  d     Frame index (actual frames stored on DVD)
2671  * T1 T2 T3 T4    *T*op field content
2672  * B1 B2 B3 B4    *B*ottom field content
2673  *
2674  * Here the progressive frames are stored as-is. The catch is in the stream
2675  * flags. For hard telecine, which was explained above, we have
2676  * VFD = constant and nb_fields = 2, just like in a true progressive or
2677  * true interlaced stream. Soft telecine, on the other hand, looks like this:
2678  *
2679  * a  b  c  d
2680  * 3  2  3  2     nb_fields
2681  * T  B  B  T     *Video* field dominance (for TFF telecine)
2682  * B  T  T  B     *Video* field dominance (for BFF telecine)
2683  *
2684  * Now the video field dominance flipflops every two frames!
2685  *
2686  * Note that nb_fields = 3 means the frame duration will be 1.5x that of a
2687  * normal frame. Often, soft-telecined frames are correctly flagged as
2688  * progressive.
2689  *
2690  * Here the telecining is expected to be done by the player, utilizing the
2691  * soft field repeat (repeat_pict) feature. This is indeed what a field
2692  * renderer (traditional interlaced equipment, or a framerate doubler)
2693  * should do with such a stream.
2694  *
2695  * In the IVTC filter, our job is to even out the frame durations, but
2696  * disregard video field dominance and just pass the progressive pictures
2697  * through as-is.
2698  *
2699  * Fortunately, for soft telecine to work at all, the stream flags must be
2700  * set correctly. Thus this type can be detected reliably by reading
2701  * nb_fields from three consecutive frames:
2702  *
2703  * Let P = previous, C = current, N = next. If the frame to be rendered is C,
2704  * there are only three relevant nb_fields flag patterns for the three-frame
2705  * stencil concerning soft telecine:
2706  *
2707  * P C N   What is happening:
2708  * 2 3 2   Entering soft telecine at frame C, or running inside it already.
2709  * 3 2 3   Running inside soft telecine.
2710  * 3 2 2   Exiting soft telecine at frame C. C is the last frame that should
2711  *         be handled as soft-telecined. (If we do timing adjustments to the
2712  *         "3"s only, we can already exit soft telecine mode when we see
2713  *         this pattern.)
2714  *
2715  * Note that the same stream may alternate between soft and hard telecine,
2716  * but these cannot occur at the same time. The start and end of the
2717  * soft-telecined parts can be read off the stream flags, and the rest of
2718  * the stream can be handed to the hard IVTC part of the filter for analysis.
2719  *
2720  * Finally, note also that a stream may also request a lone field repeat
2721  * (a sudden "3" surrounded by "2"s). Fortunately, these can be handled as
2722  * a two-frame soft telecine, as they match the first and third
2723  * flag patterns above.
2724  *
2725  * Combinations with several "3"s in a row are not valid for soft or hard
2726  * telecine, so if they occur, the frames can be passed through as-is.
2727  *
2728  *
2729  * Cadence detection for hard telecine:
2730  *
2731  * Consider viewing the TFF and BFF hard telecine sequences through a
2732  * three-frame stencil. Again, let P = previous, C = current, N = next.
2733  * A brief analysis leads to the following cadence tables.
2734  *
2735  * PCN                 = stencil position (Previous Current Next),
2736  * Dups.               = duplicate fields,
2737  * Best field pairs... = combinations of fields which correctly reproduce
2738  *                       the original progressive frames,
2739  * *                   = see timestamp considerations below for why
2740  *                       this particular arrangement.
2741  *
2742  * For TFF:
2743  *
2744  * PCN   Dups.     Best field pairs for progressive (correct, theoretical)
2745  * abc   TP = TC   TPBP = frame 1, TCBP = frame 1, TNBC = frame 2
2746  * bcd   BC = BN   TCBP = frame 2, TNBC = frame 3, TNBN = frame 3
2747  * cde   BP = BC   TCBP = frame 3, TCBC = frame 3, TNBN = frame 4
2748  * dea   none      TPBP = frame 3, TCBC = frame 4, TNBN = frame 1
2749  * eab   TC = TN   TPBP = frame 4, TCBC = frame 1, TNBC = frame 1
2750  *
2751  * (table cont'd)
2752  * PCN   Progressive output*
2753  * abc   frame 2 = TNBC (compose TN+BC)
2754  * bcd   frame 3 = TNBN (copy N)
2755  * cde   frame 4 = TNBN (copy N)
2756  * dea   (drop)
2757  * eab   frame 1 = TCBC (copy C), or TNBC (compose TN+BC)
2758  *
2759  * On the rows "dea" and "eab", frame 1 refers to a frame from the next
2760  * group of 4. "Compose TN+BC" means to construct a frame using the
2761  * top field of N, and the bottom field of C. See ComposeFrame().
2762  *
2763  * For BFF, swap all B and T, and rearrange the symbol pairs to again
2764  * read "TxBx". We have:
2765  *
2766  * PCN   Dups.     Best field pairs for progressive (correct, theoretical)
2767  * abc   BP = BC   TPBP = frame 1, TPBC = frame 1, TCBN = frame 2
2768  * bcd   TC = TN   TPBC = frame 2, TCBN = frame 3, TNBN = frame 3
2769  * cde   TP = TC   TPBC = frame 3, TCBC = frame 3, TNBN = frame 4
2770  * dea   none      TPBP = frame 3, TCBC = frame 4, TNBN = frame 1
2771  * eab   BC = BN   TPBP = frame 4, TCBC = frame 1, TCBN = frame 1
2772  *
2773  * (table cont'd)
2774  * PCN   Progressive output*
2775  * abc   frame 2 = TCBN (compose TC+BN)
2776  * bcd   frame 3 = TNBN (copy N)
2777  * cde   frame 4 = TNBN (copy N)
2778  * dea   (drop)
2779  * eab   frame 1 = TCBC (copy C), or TCBN (compose TC+BN)
2780  *
2781  * From these cadence tables we can extract two strategies for
2782  * cadence detection. We use both.
2783  *
2784  * Strategy 1: duplicated fields ("vektor").
2785  *
2786  * Consider that each stencil position has a unique duplicate field
2787  * condition. In one unique position, "dea", there is no match; in all
2788  * other positions, exactly one. By conservatively filtering the
2789  * possibilities based on detected hard field repeats (identical fields
2790  * in successive input frames), it is possible to gradually lock on
2791  * to the cadence. This kind of strategy is used by the classic IVTC filter
2792  * in TVTime/Xine by Billy Biggs (Vektor), hence the name.
2793  *
2794  * "Conservative" here means that we do not rule anything out, but start at
2795  * each stencil position by suggesting the position "dea", and then only add
2796  * to the list of possibilities based on field repeats that are detected at
2797  * the present stencil position. This estimate is then filtered by ANDing
2798  * against a shifted (time-advanced) version of the estimate from the
2799  * previous stencil position. Once the detected position becomes unique,
2800  * the filter locks on. If the new detection is inconsistent with the
2801  * previous one, the detector resets itself and starts from scratch.
2802  *
2803  * The strategy is very reliable, as it only requires running (fuzzy)
2804  * duplicate field detection against the input. It is very good at staying
2805  * locked on once it acquires the cadence, and it does so correctly very
2806  * often. These are indeed characteristics that can be observed in the
2807  * behaviour of the TVTime/Xine filter.
2808  *
2809  * Note especially that 8fps/12fps animation, common in anime, will cause
2810  * spurious hard-repeated fields. The conservative nature of the method
2811  * makes it very good at dealing with this - any spurious repeats will only
2812  * slow down the lock-on, not completely confuse it. It should also be good
2813  * at detecting the presence of a telecine, as neither true interlaced nor
2814  * true progressive material should contain any hard field repeats.
2815  * (This, however, has not been tested yet.)
2816  *
2817  * The disadvantages are that at times the method may lock on slowly,
2818  * because the detection must be filtered against the history until
2819  * a unique solution is found. Resets, if they happen, will also
2820  * slow down the lock-on.
2821  *
2822  * The hard duplicate detection required by this strategy can be made
2823  * data-adaptive in several ways. TVTime uses a running average of motion
2824  * scores for its history buffer. We utilize a different, original approach.
2825  * It is rare, if not nonexistent, that only one field changes between
2826  * two valid frames. Thus, if one field changes "much more" than the other
2827  * in fieldwise motion detection, the less changed one is probably a
2828  * duplicate. Importantly, this works with telecined input, too - the field
2829  * that changes "much" may be part of another film frame, while the "less"
2830  * changed one is actually a duplicate from the previous film frame.
2831  * If both fields change "about as much", then no hard field repeat
2832  * is detected.
2833  *
2834  *
2835  * Strategy 2: progressive/interlaced field combinations ("scores").
2836  *
2837  * We can also form a second strategy, which is not as reliable in practice,
2838  * but which locks on faster when it does. This is original to this filter.
2839  *
2840  * Consider all possible field pairs from two successive frames: TCBC, TCBN,
2841  * TNBC, TNBN. After one frame, these become TPBP, TPBC, TCBP, TCBC.
2842  * These eight pairs (seven unique, disregarding the duplicate TCBC)
2843  * are the exhaustive list of possible field pairs from two successive
2844  * frames in the three-frame PCN stencil.
2845  *
2846  * The above tables list triplets of field pair combinations for each cadence
2847  * position, which should produce progressive frames. All the given triplets
2848  * are unique in each table alone, although the one at "dea" is
2849  * indistinguishable from the case of pure progressive material. It is also
2850  * the only one which is not unique across both tables.
2851  *
2852  * Thus, all sequences of two neighboring triplets are unique across both
2853  * tables. (For "neighboring", each table is considered to wrap around from
2854  * "eab" back to "abc", i.e. from the last row back to the first row.)
2855  * Furthermore, each sequence of three neighboring triplets is redundantly
2856  * unique (i.e. is unique, and reduces the chance of false positives).
2857  * (In practice, though, we already know which table to consider, from the fact
2858  * that TFD and VFD must match. Checking only the relevant table makes the
2859  * strategy slightly more robust.)
2860  *
2861  * The important idea is: *all other* field pair combinations should produce
2862  * frames that look interlaced. This includes those combinations present in
2863  * the "wrong" (i.e. not current position) rows of the table (insofar as
2864  * those combinations are not also present in the "correct" row; by the
2865  * uniqueness property, *every* "wrong" row will always contain at least one
2866  * combination that differs from those in the "correct" row).
2867  *
2868  * We generate the artificial frames TCBC, TCBN, TNBC and TNBN (virtually;
2869  * no data is actually moved). Two of these are just the frames C and N,
2870  * which already exist; the two others correspond to composing the given
2871  * field pairs. We then compute the interlace score for each of these frames.
2872  * The interlace scores of what are now TPBP, TPBC and TCBP, also needed,
2873  * were computed by this same mechanism during the previous input frame.
2874  * These can be slid in history and reused.
2875  *
2876  * We then check, using the computed interlace scores, and taking into
2877  * account the video field dominance information, which field combination
2878  * triplet given in the appropriate table produces the smallest sum of
2879  * interlace scores. Unless we are at PCN = "dea" (which could also be pure
2880  * progressive!), this immediately gives us the most likely current cadence
2881  * position. Combined with a two-step history, the sequence of three most
2882  * likely positions found this way always allows us to make a more or less
2883  * reliable detection. (That is, when a reliable detection is possible; if the
2884  * video has no motion at all, every detection will report the position "dea".
2885  * In anime, still shots are common. Thus we must augment this with a
2886  * full-frame motion detection that switches the detector off if no motion
2887  * was detected.)
2888  *
2889  * The detection seems to need four full-frame interlace analyses per frame.
2890  * Actually, three are enough, because the previous N is the new C, so we can
2891  * slide the already computed result. Also during initialization, we only
2892  * need to compute TNBN on the first frame; this has become TPBP when the
2893  * third frame is reached. Similarly, we compute TNBN, TNBC and TCBN during
2894  * the second frame (just before the filter starts), and these get slid
2895  * into TCBC, TCBP and TPBC when the third frame is reached. At that point,
2896  * initialization is complete.
2897  *
2898  * Because we only compare interlace scores against each other, no threshold
2899  * is needed in the cadence detector. Thus it, trivially, adapts to the
2900  * material automatically.
2901  *
2902  * The weakness of this approach is that any comb metric detects incorrectly
2903  * every now and then. Especially slow vertical camera pans often get treated
2904  * wrong, because the messed-up field combination looks less interlaced
2905  * according to the comb metric (especially in anime) than the correct one
2906  * (which contains, correctly, one-pixel thick cartoon outlines, parts of
2907  * which are often perfectly horizontal).
2908  *
2909  * The advantage is that this strategy catches horizontal camera pans
2910  * immediately and reliably, while the other strategy may still be trying
2911  * to lock on.
2912  *
2913  *
2914  * Frame reconstruction:
2915  *
2916  * We utilize a hybrid approach. If a valid cadence is locked on, we use the
2917  * operation table to decide what to do. This handles those cases correctly,
2918  * which would be difficult for the interlace detector alone (e.g. vertical
2919  * camera pans). Note that the operations that must be performed for IVTC
2920  * include timestamp mangling and frame dropping, which can only be done
2921  * reliably on a valid cadence.
2922  *
2923  * When the cadence fails (we detect this from a sudden upward jump in the
2924  * interlace scores of the constructed frames), we reset the "vektor"
2925  * detector strategy and fall back to an emergency frame composer, where we
2926  * use ideas from Transcode's IVTC.
2927  *
2928  * In this emergency mode, we simply output the least interlaced frame out of
2929  * the combinations TNBN, TNBC and TCBN (where only one of the last two is
2930  * tested, based on the stream TFF/BFF information). In this mode, we do not
2931  * touch the timestamps, and just pass all five frames from each group right
2932  * through. This introduces some stutter, but in practice it is often not
2933  * noticeable. This is because the kind of material that is likely to trip up
2934  * the cadence detector usually includes irregular 8fps/12fps motion. With
2935  * true 24fps motion, the cadence quickly locks on, and stays locked on.
2936  *
2937  * Once the cadence locks on again, we resume normal operation based on
2938  * the operation table.
2939  *
2940  *
2941  * Timestamp mangling:
2942  *
2943  * To make five into four we need to extend frame durations by 25%.
2944  * Consider the following diagram (times given in 90kHz ticks, rounded to
2945  * integers; this is just for illustration, and for comparison with the
2946  * "scratch paper" comments in pulldown.c of TVTime/Xine):
2947  *
2948  * NTSC input (29.97 fps)
2949  * a       b       c       d        e        a (from next group) ...
2950  * 0    3003    6006    9009    12012    15015
2951  * 0      3754      7508       11261     15015
2952  * 1         2         3           4         1 (from next group) ...
2953  * Film output (23.976 fps)
2954  *
2955  * Three of the film frames have length 3754, and one has 3753
2956  * (it is 1/90000 sec shorter). This rounding was chosen so that the lengths
2957  * of the group of four sum to the original 15015.
2958  *
2959  * From the diagram we get these deltas for presentation timestamp adjustment
2960  * (in 90 kHz ticks, for illustration):
2961  * (1-a)   (2-b)  (3-c)   (4-d)   (skip)   (1-a) ...
2962  *     0   +751   +1502   +2252   (skip)       0 ...
2963  *
2964  * In fractions of (p_next->date - p_cur->date), regardless of actual
2965  * time unit, the deltas are:
2966  * (1-a)   (2-b)  (3-c)   (4-d)   (skip)   (1-a) ...
2967  *     0   +0.25  +0.50   +0.75   (skip)       0 ...
2968  *
2969  * This is what we actually use. In our implementation, the values are stored
2970  * multiplied by 4, as integers.
2971  *
2972  * The "current" frame should be displayed at [original time + delta].
2973  * E.g., when "current" = b (i.e. PCN = abc), start displaying film frame 2
2974  * at time [original time of b + 751 ticks]. So, when we catch the cadence,
2975  * we will start mangling the timestamps according to the cadence position
2976  * of the "current" frame, using the deltas given above. This will cause
2977  * a one-time jerk, most noticeable if the cadence happens to catch at
2978  * position "d". (Alternatively, upon lock-on, we could wait until we are
2979  * at "a" before switching on IVTC, but this makes the maximal delay
2980  * [max. detection + max. wait] = 3 + 4 = 7 input frames, which comes to
2981  * 7/30 ~ 0.23 seconds instead of the 3/30 = 0.10 seconds from purely
2982  * the detection. The one-time jerk is simpler to implement and gives the
2983  * faster lock-on.)
2984  *
2985  * It is clear that "e" is a safe choice for the dropped frame. This can be
2986  * seen from the timings and the cadence tables. First, consider the timings.
2987  * If we have only one future frame, "e" is the only one whose PTS, comparing
2988  * to the film frames, allows dropping it safely. To see this, consider which
2989  * film frame needs to be rendered as each new input frame arrives. Secondly,
2990  * consider the cadence tables. It is ok to drop "e", because the same
2991  * film frame "1" is available also at the next PCN position "eab".
2992  * (As a side note, it is interesting that Vektor's filter drops "b".
2993  * See the TVTime sources.)
2994  *
2995  * When the filter falls out of film mode, the timestamps of the incoming
2996  * frames are left untouched. Thus, the output from this filter has a
2997  * variable framerate: 4/5 of the input framerate when IVTC is active
2998  * (whether hard or soft), and the same framerate as input when it is not
2999  * (or when in emergency mode).
3000  *
3001  *
3002  * For other open-source IVTC codes, which may be a useful source for ideas,
3003  * see the following:
3004  *
3005  * The classic filter by Billy Biggs (Vektor). Written in 2001-2003 for
3006  * TVTime, and adapted into Xine later. In xine-lib 1.1.19, it is at
3007  * src/post/deinterlace/pulldown.*. Also needed are tvtime.*, and speedy.*.
3008  *
3009  * Transcode's ivtc->decimate->32detect chain by Thanassis Tsiodras.
3010  * Written in 2002, added in Transcode 0.6.12. This probably has something
3011  * to do with the same chain in MPlayer, considering that MPlayer acquired
3012  * an IVTC filter around the same time. In Transcode 1.1.5, the IVTC part is
3013  * at filter/filter_ivtc.c. Transcode 1.1.5 sources can be downloaded from
3014  * http://developer.berlios.de/project/showfiles.php?group_id=10094
3015  */
3016
3017 /**
3018  * Helper function: estimates "how much interlaced" the given field pair is.
3019  *
3020  * It is allowed that p_pic_top == p_pic_bottom.
3021  *
3022  * If p_pic_top != p_pic_bot (fields come from different pictures), you can use
3023  * ComposeFrame() to actually construct the picture if needed.
3024  *
3025  * Number of planes, and number of lines in each plane, in p_pic_top and
3026  * p_pic_bot must match. If the visible pitches differ, only the compatible
3027  * (smaller) part will be tested.
3028  *
3029  * Luma and chroma planes are tested in the same way. This is correct for
3030  * telecined input, where in the interlaced frames also chroma alternates
3031  * every chroma line, even if the chroma format is 4:2:0!
3032  *
3033  * This is just a raw detector that produces a score. The overall score
3034  * indicating a progressive or interlaced frame may vary wildly, depending on
3035  * the material, especially in anime. The scores should be compared to
3036  * each other locally (in the temporal sense) to make meaningful decisions
3037  * about progressive or interlaced frames.
3038  *
3039  * @param p_pic_top Picture to take the top field from.
3040  * @param p_pic_bot Picture to take the bottom field from (same or different).
3041  * @return Interlace score, >= 0. Higher values mean more interlaced.
3042  * @retval -1 Error: incompatible input pictures.
3043  * @see RenderIVTC()
3044  * @see ComposeFrame()
3045  */
3046 static int CalculateInterlaceScore( const picture_t* p_pic_top,
3047                                     const picture_t* p_pic_bot )
3048 {
3049     /*
3050         We use the comb metric from the IVTC filter of Transcode 1.1.5.
3051         This was found to work better for the particular purpose of IVTC
3052         than RenderX()'s comb metric.
3053
3054         Note that we *must not* subsample at all in order to catch interlacing
3055         in telecined frames with localized motion (e.g. anime with characters
3056         talking, where only mouths move and everything else stays still.)
3057     */
3058
3059     assert( p_pic_top != NULL );
3060     assert( p_pic_bot != NULL );
3061
3062     if( p_pic_top->i_planes != p_pic_bot->i_planes )
3063         return -1;
3064
3065     unsigned u_cpu = vlc_CPU();
3066
3067     /* Amount of bits must be known for MMX, thus int32_t.
3068        Doesn't hurt the C implementation. */
3069     int32_t i_score = 0;
3070
3071 #ifdef CAN_COMPILE_MMXEXT
3072     if( u_cpu & CPU_CAPABILITY_MMXEXT )
3073         pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
3074 #endif
3075
3076     for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
3077     {
3078         /* Sanity check */
3079         if( p_pic_top->p[i_plane].i_visible_lines !=
3080             p_pic_bot->p[i_plane].i_visible_lines )
3081             return -1;
3082
3083         const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
3084         const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
3085                              p_pic_bot->p[i_plane].i_visible_pitch );
3086         const int wm8 = w % 8;   /* remainder */
3087         const int w8  = w - wm8; /* part of width that is divisible by 8 */
3088
3089         /* Current line / neighbouring lines picture pointers */
3090         const picture_t *cur = p_pic_bot;
3091         const picture_t *ngh = p_pic_top;
3092         int wc = cur->p[i_plane].i_pitch;
3093         int wn = ngh->p[i_plane].i_pitch;
3094
3095         /* Transcode 1.1.5 only checks every other line. Checking every line
3096            works better for anime, which may contain horizontal,
3097            one pixel thick cartoon outlines.
3098         */
3099         for( int y = 1; y < i_lasty; ++y )
3100         {
3101             uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
3102             uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
3103             uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
3104
3105             int x = 0;
3106
3107 /* Threshold (value from Transcode 1.1.5) */
3108 #define T 100
3109 #ifdef CAN_COMPILE_MMXEXT
3110             /* Easy-to-read C version further below.
3111
3112                Assumptions: 0 < T < 127
3113                             # of pixels < (2^32)/255
3114                Note: calculates score * 255
3115             */
3116             if( u_cpu & CPU_CAPABILITY_MMXEXT )
3117             {
3118                 static const mmx_t b0   = { .uq = 0x0000000000000000ULL };
3119                 static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
3120                 static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
3121
3122                 for( ; x < w8; x += 8 )
3123                 {
3124                     movq_m2r( *((int64_t*)p_c), mm0 );
3125                     movq_m2r( *((int64_t*)p_p), mm1 );
3126                     movq_m2r( *((int64_t*)p_n), mm2 );
3127
3128                     psubb_m2r( b128, mm0 );
3129                     psubb_m2r( b128, mm1 );
3130                     psubb_m2r( b128, mm2 );
3131
3132                     psubsb_r2r( mm0, mm1 );
3133                     psubsb_r2r( mm0, mm2 );
3134
3135                     pxor_r2r( mm3, mm3 );
3136                     pxor_r2r( mm4, mm4 );
3137                     pxor_r2r( mm5, mm5 );
3138                     pxor_r2r( mm6, mm6 );
3139
3140                     punpcklbw_r2r( mm1, mm3 );
3141                     punpcklbw_r2r( mm2, mm4 );
3142                     punpckhbw_r2r( mm1, mm5 );
3143                     punpckhbw_r2r( mm2, mm6 );
3144
3145                     pmulhw_r2r( mm3, mm4 );
3146                     pmulhw_r2r( mm5, mm6 );
3147
3148                     packsswb_r2r(mm4, mm6);
3149                     pcmpgtb_m2r( bT, mm6 );
3150                     psadbw_m2r( b0, mm6 );
3151                     paddd_r2r( mm6, mm7 );
3152
3153                     p_c += 8;
3154                     p_p += 8;
3155                     p_n += 8;
3156                 }
3157             }
3158 #endif
3159             for( ; x < w; ++x )
3160             {
3161                 /* Worst case: need 17 bits for "comb". */
3162                 int_fast32_t C = *p_c;
3163                 int_fast32_t P = *p_p;
3164                 int_fast32_t N = *p_n;
3165
3166                 /* Comments in Transcode's filter_ivtc.c attribute this
3167                    combing metric to Gunnar Thalin.
3168
3169                     The idea is that if the picture is interlaced, both
3170                     expressions will have the same sign, and this comes
3171                     up positive. The value T = 100 has been chosen such
3172                     that a pixel difference of 10 (on average) will
3173                     trigger the detector.
3174                 */
3175                 int_fast32_t comb = (P - C) * (N - C);
3176                 if( comb > T )
3177                     ++i_score;
3178
3179                 ++p_c;
3180                 ++p_p;
3181                 ++p_n;
3182             }
3183
3184             /* Now the other field - swap current and neighbour pictures */
3185             const picture_t *tmp = cur;
3186             cur = ngh;
3187             ngh = tmp;
3188             int tmp_pitch = wc;
3189             wc = wn;
3190             wn = tmp_pitch;
3191         }
3192     }
3193
3194 #ifdef CAN_COMPILE_MMXEXT
3195     if( u_cpu & CPU_CAPABILITY_MMXEXT )
3196     {
3197         movd_r2m( mm7, i_score );
3198         emms();
3199         i_score /= 255;
3200     }
3201 #endif
3202
3203     return i_score;
3204 }
3205 #undef T
3206
/**
 * Internal helper function for EstimateNumBlocksWithMotion():
 * decides whether one 8x8 block of a single plane has changed between two
 * pictures. A pixel counts as changed when its absolute difference between
 * the pictures exceeds the pixel threshold. The changed pixels are counted
 * for the whole block and separately for its even and odd lines, and each
 * count is compared against its own block-level threshold.
 *
 * This is a low-level function only used by EstimateNumBlocksWithMotion().
 * There is no need to call this function manually.
 *
 * For interpretation of pi_top and pi_bot, it is assumed that the block
 * starts on an even-numbered line (belonging to the top field).
 *
 * The b_mmx parameter avoids the need to call vlc_CPU() separately
 * for each block.
 *
 * @param[in] p_pix_p Base pointer to the block in previous picture
 * @param[in] p_pix_c Base pointer to the same block in current picture
 * @param i_pitch_prev i_pitch of previous picture
 * @param i_pitch_curr i_pitch of current picture
 * @param b_mmx (vlc_CPU() & CPU_CAPABILITY_MMXEXT) or false.
 * @param[out] pi_top 1 if top field of the block had motion, 0 if no
 * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
 * @return 1 if the block had motion, 0 if no
 * @see EstimateNumBlocksWithMotion()
 */
static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                        int i_pitch_prev, int i_pitch_curr,
                                        bool b_mmx,
                                        int* pi_top, int* pi_bot )
{
/* Pixel luma/chroma difference threshold to detect motion. */
#define T 10

    /* Number of changed pixels: whole block, even lines, odd lines. */
    int32_t i_changed_all = 0;
    int32_t i_changed_top = 0;
    int32_t i_changed_bot = 0;

/* The scalar branch below shows more readably what is being computed. */
#ifdef CAN_COMPILE_MMXEXT
    if( b_mmx )
    {
        static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
        pxor_r2r( mm6, mm6 ); /* all zeros, second operand of psadbw */
        movq_m2r( bT,  mm5 ); /* threshold in every byte */

        pxor_r2r( mm3, mm3 ); /* accumulator: top field */
        pxor_r2r( mm4, mm4 ); /* accumulator: bottom field */

        /* Each iteration handles one even line and the odd line after it.
           The saturating subtractions in both directions give |C - P|
           (one of the two results is zero for every byte); the compare
           turns each byte above T into 0xFF, and psadbw against zero adds
           up those bytes, i.e. 255 per changed pixel.

           NOTE(review): pcmpgtb is a signed byte compare, so differences
           of 128 or more would not register here, unlike in the scalar
           branch - confirm whether that matters in practice. */
        for( int i_pair = 0; i_pair < 4; ++i_pair )
        {
            /* even line -> top field */
            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
            movq_r2r( mm0, mm2 );
            psubusb_r2r( mm1, mm2 );
            psubusb_r2r( mm0, mm1 );

            pcmpgtb_r2r( mm5, mm2 );
            pcmpgtb_r2r( mm5, mm1 );
            psadbw_r2r(  mm6, mm2 );
            psadbw_r2r(  mm6, mm1 );

            paddd_r2r( mm2, mm1 );
            paddd_r2r( mm1, mm3 );

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;

            /* odd line -> bottom field; same steps, different accumulator */
            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
            movq_r2r( mm0, mm2 );
            psubusb_r2r( mm1, mm2 );
            psubusb_r2r( mm0, mm1 );

            pcmpgtb_r2r( mm5, mm2 );
            pcmpgtb_r2r( mm5, mm1 );
            psadbw_r2r(  mm6, mm2 );
            psadbw_r2r(  mm6, mm1 );

            paddd_r2r( mm2, mm1 );
            paddd_r2r( mm1, mm4 );

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;
        }
        movq_r2r(  mm3, mm7 ); /* whole block = top + bottom */
        paddd_r2r( mm4, mm7 );
        movd_r2m( mm3, i_changed_top );
        movd_r2m( mm4, i_changed_bot );
        movd_r2m( mm7, i_changed_all );

        /* The accumulators hold 255 per changed pixel. */
        i_changed_top /= 255;
        i_changed_bot /= 255;
        i_changed_all /= 255;

        emms();
    }
    else
#endif
    {
        for( int i_line = 0; i_line < 8; ++i_line )
        {
            /* Count the pixels of this line that differ by more than T. */
            int i_line_changed = 0;
            for( int i_col = 0; i_col < 8; ++i_col )
            {
                const int i_diff = abs( p_pix_c[i_col] - p_pix_p[i_col] );
                i_line_changed += ( i_diff > T ) ? 1 : 0;
            }

            /* Odd lines feed the bottom field, even lines the top field. */
            if( i_line & 1 )
                i_changed_bot += i_line_changed;
            else
                i_changed_top += i_line_changed;
            i_changed_all += i_line_changed;

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;
        }
    }

    /* Field motion thresholds.

       Empirical value - works better in practice than the "4" that
       would be consistent with the full-block threshold.

       Especially the opening scene of The Third ep. 1 (just after the OP)
       works better with this. It also fixes some talking scenes in
       Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
       leading to more interlacing artifacts than by just using the emergency
       mode frame composer.
    */
    *pi_top = ( i_changed_top >= 8 );
    *pi_bot = ( i_changed_bot >= 8 );

    /* Full-block threshold = (8*8)/8: motion is detected if 1/8 of the block
       changes "enough". */
    return ( i_changed_all >= 8 );
}
#undef T
3353
3354 /**
3355  * Helper function: Estimates the number of 8x8 blocks which have motion
3356  * between the given pictures. Needed for various detectors in RenderIVTC().
3357  *
3358  * Number of planes and visible lines in each plane, in the inputs must match.
3359  * If the visible pitches do not match, only the compatible (smaller)
3360  * part will be tested.
3361  *
3362  * Note that the return value is NOT simply *pi_top + *pi_bot, because
3363  * the fields and the full block use different motion thresholds.
3364  *
3365  * If you do not want the separate field scores, pass NULL for pi_top and
3366  * pi_bot. This does not affect computation speed, and is only provided as
3367  * a syntactic convenience.
3368  *
3369  * Motion in each picture plane (Y, U, V) counts separately.
3370  * The sum of number of blocks with motion across all planes is returned.
3371  *
3372  * For 4:2:0 chroma, even-numbered chroma lines make up the "top field" for
3373  * chroma, and odd-numbered chroma lines the "bottom field" for chroma.
3374  * This is correct for IVTC purposes.
3375  *
3376  * @param[in] p_prev Previous picture
3377  * @param[in] p_curr Current picture
3378  * @param[out] pi_top Number of 8x8 blocks where top field has motion.
3379  * @param[out] pi_bot Number of 8x8 blocks where bottom field has motion.
3380  * @return Number of 8x8 blocks that have motion.
3381  * @retval -1 Error: incompatible input pictures.
3382  * @see TestForMotionInBlock()
3383  * @see RenderIVTC()
3384  */
3385 static int EstimateNumBlocksWithMotion( const picture_t* p_prev,
3386                                         const picture_t* p_curr,
3387                                         int *pi_top, int *pi_bot)
3388 {
3389     assert( p_prev != NULL );
3390     assert( p_curr != NULL );
3391
3392     int i_score_top = 0;
3393     int i_score_bot = 0;
3394
3395     if( p_prev->i_planes != p_curr->i_planes )
3396         return -1;
3397
3398     /* We must tell our inline helper whether to use MMX acceleration. */
3399 #ifdef CAN_COMPILE_MMXEXT
3400     bool b_mmx = ( vlc_CPU() & CPU_CAPABILITY_MMXEXT );
3401 #else
3402     bool b_mmx = false;
3403 #endif
3404
3405     int i_score = 0;
3406     for( int i_plane = 0 ; i_plane < p_prev->i_planes ; i_plane++ )
3407     {
3408         /* Sanity check */
3409         if( p_prev->p[i_plane].i_visible_lines !=
3410             p_curr->p[i_plane].i_visible_lines )
3411             return -1;
3412
3413         const int i_pitch_prev = p_prev->p[i_plane].i_pitch;
3414         const int i_pitch_curr = p_curr->p[i_plane].i_pitch;
3415
3416         /* Last pixels and lines (which do not make whole blocks) are ignored.
3417            Shouldn't really matter for our purposes. */
3418         const int i_mby = p_prev->p[i_plane].i_visible_lines / 8;
3419         const int w = FFMIN( p_prev->p[i_plane].i_visible_pitch,
3420                              p_curr->p[i_plane].i_visible_pitch );
3421         const int i_mbx = w / 8;
3422
3423         for( int by = 0; by < i_mby; ++by )
3424         {
3425             uint8_t *p_pix_p = &p_prev->p[i_plane].p_pixels[i_pitch_prev*8*by];
3426             uint8_t *p_pix_c = &p_curr->p[i_plane].p_pixels[i_pitch_curr*8*by];
3427
3428             for( int bx = 0; bx < i_mbx; ++bx )
3429             {
3430                 int i_top_temp, i_bot_temp;
3431                 i_score += TestForMotionInBlock( p_pix_p, p_pix_c,
3432                                                  i_pitch_prev, i_pitch_curr,
3433                                                  b_mmx,
3434                                                  &i_top_temp, &i_bot_temp );
3435                 i_score_top += i_top_temp;
3436                 i_score_bot += i_bot_temp;
3437
3438                 p_pix_p += 8;
3439                 p_pix_c += 8;
3440             }
3441         }
3442     }
3443
3444     if( pi_top )
3445         (*pi_top) = i_score_top;
3446     if( pi_bot )
3447         (*pi_bot) = i_score_bot;
3448
3449     return i_score;
3450 }
3451
3452 /* Fasten your seatbelt - lots of IVTC constants follow... */
3453
/**
 * Operating modes of the IVTC filter.
 *
 * Hard telecine: burned into video stream.
 * Soft telecine: stream consists of progressive frames;
 *                telecining handled by stream flags.
 *
 * @see ivtc_sys_t
 * @see RenderIVTC()
 */
typedef enum
{
    IVTC_MODE_DETECTING           = 0,
    IVTC_MODE_TELECINED_NTSC_HARD = 1, /* hard telecine */
    IVTC_MODE_TELECINED_NTSC_SOFT = 2  /* soft telecine */
} ivtc_mode;
3467
/**
 *  Field pair combinations from successive frames in the PCN stencil.
 *
 *  Naming: T = top field, B = bottom field, followed by the frame the field
 *  is taken from: P = previous, C = current, N = next.
 *
 *  The values double as array indices, which is why they are numbered
 *  explicitly.
 */
typedef enum
{
    FIELD_PAIR_TPBP = 0,
    FIELD_PAIR_TPBC = 1,
    FIELD_PAIR_TCBP = 2,
    FIELD_PAIR_TCBC = 3,
    FIELD_PAIR_TCBN = 4,
    FIELD_PAIR_TNBC = 5,
    FIELD_PAIR_TNBN = 6
} ivtc_field_pair;
3477
/* Number of valid cadence positions (CADENCE_POS_INVALID is not counted). */
#define NUM_CADENCE_POS 9
/**
 * Cadence positions for the PCN stencil (PCN, Previous Current Next).
 *
 * Note that "dea" in both cadence tables and a pure progressive signal
 * are indistinguishable.
 *
 * All values except CADENCE_POS_INVALID are used as array indices.
 *
 * This is a combined raw position containing both i_cadence_pos
 * and telecine field dominance.
 * @see pi_detected_pos_to_cadence_pos
 * @see pi_detected_pos_to_tfd
 */
typedef enum
{
    CADENCE_POS_INVALID     = -1,

    CADENCE_POS_PROGRESSIVE =  0,

    CADENCE_POS_TFF_ABC     =  1,
    CADENCE_POS_TFF_BCD     =  2,
    CADENCE_POS_TFF_CDE     =  3,
    CADENCE_POS_TFF_EAB     =  4,

    CADENCE_POS_BFF_ABC     =  5,
    CADENCE_POS_BFF_BCD     =  6,
    CADENCE_POS_BFF_CDE     =  7,
    CADENCE_POS_BFF_EAB     =  8
} ivtc_cadence_pos;

/* Half-open ranges [FIRST, END) of the raw positions that are TFF-only
   and BFF-only, respectively. */
#define CADENCE_POS_TFF_FIRST 1
#define CADENCE_POS_TFF_END   5
#define CADENCE_POS_BFF_FIRST 5
#define CADENCE_POS_BFF_END   9
3508
3509 /**
3510  * For the "vektor" cadence detector algorithm.
3511  *
3512  * The algorithm produces a set of possible positions instead of a unique
3513  * position, until it locks on. The set is represented as a bitmask.
3514  *
3515  * The bitmask is stored in a word, and its layout is:
3516  * blank blank BFF_CARRY BFF4 BFF3 BFF2 BFF1 BFF0   (high byte)
3517  * blank blank TFF_CARRY TFF4 TFF3 TFF2 TFF1 TFF0   (low byte)
3518  *
3519  * This allows predicting the next position by left-shifting the previous
3520  * result by one bit, copying the CARRY bits to the respective zeroth position,
3521  * and ANDing with 0x1F1F.
3522  *
3523  * This table is indexed with a valid ivtc_cadence_pos.
3524  * @see ivtc_cadence_pos
3525  */
3526 const int pi_detected_pos_to_bitmask[NUM_CADENCE_POS] = { 0x0808, /* prog. */
3527                                                           0x0001, /* TFF ABC */
3528                                                           0x0002, /* TFF BCD */
3529                                                           0x0004, /* TFF CDE */
3530                                                           0x0010, /* TFF EAB */
3531                                                           0x0100, /* BFF ABC */
3532                                                           0x0200, /* BFF BCD */
3533                                                           0x0400, /* BFF CDE */
3534                                                           0x1000, /* BFF EAB */
3535                                                         };
3536 #define VEKTOR_CADENCE_POS_ALL 0x1F1F
3537 #define VEKTOR_CADENCE_POS_TFF 0x00FF
3538 #define VEKTOR_CADENCE_POS_BFF 0xFF00
3539 #define VEKTOR_CADENCE_POS_TFF_HIGH 0x0010
3540 #define VEKTOR_CADENCE_POS_TFF_LOW  0x0001
3541 #define VEKTOR_CADENCE_POS_BFF_HIGH 0x1000
3542 #define VEKTOR_CADENCE_POS_BFF_LOW  0x0100
3543
/* Telecine field dominance: top field first (TFF) or bottom field first
   (BFF). The two valid values are also used as table indices. */
typedef enum
{
    TFD_INVALID = -1,
    TFD_TFF     =  0,
    TFD_BFF     =  1
} ivtc_tfd;
3546
3547 /**
3548  * Position detection table for the "scores" cadence detector algorithm.
3549  *
3550  * These are the (only) field pair combinations that should give progressive
3551  * frames. There are three for each position.
3552  *
3553  * First index: ivtc_cadence_pos
3554  */
3555 static const ivtc_field_pair pi_best_field_pairs[NUM_CADENCE_POS][3] = {
3556     {FIELD_PAIR_TPBP, FIELD_PAIR_TCBC, FIELD_PAIR_TNBN}, /* prog. */
3557
3558     {FIELD_PAIR_TPBP, FIELD_PAIR_TCBP, FIELD_PAIR_TNBC}, /* TFF ABC */
3559     {FIELD_PAIR_TCBP, FIELD_PAIR_TNBC, FIELD_PAIR_TNBN}, /* TFF BCD */
3560     {FIELD_PAIR_TCBP, FIELD_PAIR_TCBC, FIELD_PAIR_TNBN}, /* TFF CDE */
3561     {FIELD_PAIR_TPBP, FIELD_PAIR_TCBC, FIELD_PAIR_TNBC}, /* TFF EAB */
3562
3563     {FIELD_PAIR_TPBP, FIELD_PAIR_TPBC, FIELD_PAIR_TCBN}, /* BFF ABC */
3564     {FIELD_PAIR_TPBC, FIELD_PAIR_TCBN, FIELD_PAIR_TNBN}, /* BFF BCD */
3565     {FIELD_PAIR_TPBC, FIELD_PAIR_TCBC, FIELD_PAIR_TNBN}, /* BFF CDE */
3566     {FIELD_PAIR_TPBP, FIELD_PAIR_TCBC, FIELD_PAIR_TCBN}, /* BFF EAB */
3567 };
3568
3569 /**
3570  * Alternative position detection table for the "scores" cadence detector
3571  * algorithm.
3572  *
3573  * These field pair combinations should give only interlaced frames.
3574  * There are four for each position.
3575  *
3576  * First index: ivtc_cadence_pos
3577  *
3578  * Currently unused. During development it was tested that whether we detect
3579  * best or worst, the resulting detected cadence positions are identical
3580  * (neither strategy performs any different from the other).
3581  */
3582 static const ivtc_field_pair pi_worst_field_pairs[NUM_CADENCE_POS][4] = {
3583     {FIELD_PAIR_TPBC, FIELD_PAIR_TCBP,
3584         FIELD_PAIR_TCBN, FIELD_PAIR_TNBC}, /* prog. */
3585
3586     {FIELD_PAIR_TPBC, FIELD_PAIR_TCBC,
3587         FIELD_PAIR_TCBN, FIELD_PAIR_TNBN}, /* TFF ABC */
3588     {FIELD_PAIR_TPBP, FIELD_PAIR_TPBC,
3589         FIELD_PAIR_TCBC, FIELD_PAIR_TCBN}, /* TFF BCD */
3590     {FIELD_PAIR_TPBP, FIELD_PAIR_TPBC,
3591         FIELD_PAIR_TCBN, FIELD_PAIR_TNBC}, /* TFF CDE */
3592     {FIELD_PAIR_TPBC, FIELD_PAIR_TCBP,
3593         FIELD_PAIR_TCBN, FIELD_PAIR_TNBN}, /* TFF EAB */
3594
3595     {FIELD_PAIR_TCBP, FIELD_PAIR_TCBC,
3596         FIELD_PAIR_TNBC, FIELD_PAIR_TNBN}, /* BFF ABC */
3597     {FIELD_PAIR_TPBP, FIELD_PAIR_TCBP,
3598         FIELD_PAIR_TCBC, FIELD_PAIR_TNBC}, /* BFF BCD */
3599     {FIELD_PAIR_TPBP, FIELD_PAIR_TCBP,
3600         FIELD_PAIR_TNBC, FIELD_PAIR_TCBN}, /* BFF CDE */
3601     {FIELD_PAIR_TCBP, FIELD_PAIR_TPBC,
3602         FIELD_PAIR_TNBC, FIELD_PAIR_TNBN}, /* BFF EAB */
3603 };
3604
3605 /**
3606  * Table for extracting the i_cadence_pos part of detected cadence position
3607  * (ivtc_cadence_pos).
3608  *
3609  * The counter goes from 0 to 4, where "abc" = 0, "bcd" = 1, ...
3610  *
3611  * @see ivtc_cadence_pos
3612  */
3613 static const int pi_detected_pos_to_cadence_pos[NUM_CADENCE_POS] = {
3614     3, /* prog. */
3615     0, /* TFF ABC */
3616     1, /* TFF BCD */
3617     2, /* TFF CDE */
3618     4, /* TFF EAB */
3619     0, /* BFF ABC */
3620     1, /* BFF BCD */
3621     2, /* BFF CDE */
3622     4, /* BFF EAB */
3623 };
3624
3625 /**
3626  * Table for extracting the telecine field dominance part of detected
3627  * cadence position (ivtc_cadence_pos).
3628  *
3629  * The position "dea" does not provide TFF/BFF information, because it is
3630  * indistinguishable from progressive.
3631  *
3632  * @see ivtc_cadence_pos
3633  */
3634 static const int pi_detected_pos_to_tfd[NUM_CADENCE_POS] = {
3635     TFD_INVALID, /* prog. */
3636     TFD_TFF, /* TFF ABC */
3637     TFD_TFF, /* TFF BCD */
3638     TFD_TFF, /* TFF CDE */
3639     TFD_TFF, /* TFF EAB */
3640     TFD_BFF, /* BFF ABC */
3641     TFD_BFF, /* BFF BCD */
3642     TFD_BFF, /* BFF CDE */
3643     TFD_BFF, /* BFF EAB */
3644 };
3645
3646 /* Valid telecine sequences (TFF and BFF). Indices: [TFD][i_cadence_pos] */
3647 /* Currently unused and left here for documentation only.
3648    There is an easier way - just decode the i_cadence_pos part of the
3649    detected position using the pi_detected_pos_to_cadence_pos table,
3650    and check that it is successive mod 5. See IVTCCadenceAnalyze(). */
3651 /*static const int pi_valid_cadences[2][5] = { {CADENCE_POS_TFF_ABC,
3652                                              CADENCE_POS_TFF_BCD,
3653                                              CADENCE_POS_TFF_CDE,
3654                                              CADENCE_POS_PROGRESSIVE,
3655                                              CADENCE_POS_TFF_EAB},
3656
3657                                              {CADENCE_POS_BFF_ABC,
3658                                              CADENCE_POS_BFF_BCD,
3659                                              CADENCE_POS_BFF_CDE,
3660                                              CADENCE_POS_PROGRESSIVE,
3661                                              CADENCE_POS_BFF_EAB},
3662                                            };
3663 */
3664
/**
 * Operations needed in film frame reconstruction.
 *
 * The suffixes follow the field pair naming: T = top field, B = bottom
 * field, C = current frame, N = next frame.
 */
typedef enum
{
    IVTC_OP_DROP_FRAME,
    IVTC_OP_COPY_N,
    IVTC_OP_COPY_C,
    IVTC_OP_COMPOSE_TNBC,
    IVTC_OP_COMPOSE_TCBN
} ivtc_op;
3673
3674 /* Note: During hard IVTC, we must avoid COPY_C and do a compose instead.
3675    If we COPY_C, some subtitles will flicker badly, even if we use the
3676    cadence-based film frame reconstruction. Try the first scene in
3677    Kanon (2006) vol. 3 to see the problem.
3678
3679    COPY_C can be used without problems when it is used consistently
3680    (not constantly mixed in with COPY_N and compose operations),
3681    for example in soft IVTC.
3682 */
3683 /**
3684  * Operation table for film frame reconstruction depending on cadence position.
3685  * Indices: [TFD][i_cadence_pos]
3686  * @see pi_detected_pos_to_tfd
3687  * @see pi_detected_pos_to_cadence_pos
3688  */
3689 static const ivtc_op pi_reconstruction_ops[2][5] = { /* TFF */
3690                                                      {IVTC_OP_COMPOSE_TNBC,
3691                                                       IVTC_OP_COPY_N,
3692                                                       IVTC_OP_COPY_N,
3693                                                       IVTC_OP_DROP_FRAME,
3694                                                       IVTC_OP_COMPOSE_TNBC},
3695
3696                                                      /* BFF */
3697                                                      {IVTC_OP_COMPOSE_TCBN,
3698                                                       IVTC_OP_COPY_N,
3699                                                       IVTC_OP_COPY_N,
3700                                                       IVTC_OP_DROP_FRAME,
3701                                                       IVTC_OP_COMPOSE_TCBN},
3702                                                    };
3703
/**
 * Timestamp mangling table for the 29.97 -> 23.976 fps conversion.
 *
 * Index: i_cadence_pos, 0..4.
 *
 * Each entry is expressed in units of 1/4 of a frame duration; see the
 * function documentation of RenderIVTC() for an explanation.
 *
 * Only nonnegative entries are valid. The -1 marks the cadence position
 * of the dropped frame; it is never used as an actual delta and only
 * serves a debug assert.
 *
 * @see ivtc_cadence_pos
 * @see pi_detected_pos_to_cadence_pos
 * @see pi_reconstruction_ops
 * @see RenderIVTC()
 */
static const int pi_timestamp_deltas[5] =
{
    [0] =  1,
    [1] =  2,
    [2] =  3,
    [3] = -1, /* dropped frame: not a real delta */
    [4] =  0,
};
3722
3723 /**
3724  * Internal helper function for RenderIVTC(): performs initialization
3725  * at the start of a new frame.
3726  *
3727  * In practice, this slides detector histories.
3728  *
3729  * This function should only perform initialization that does NOT require
3730  * the input frame history buffer. This runs at every frame, including
3731  * the first two.
3732  *
3733  * This is an internal function only used by RenderIVTC().
3734  * There is no need to call this function manually.
3735  *
3736  * @param p_filter The filter instance.
3737  * @see RenderIVTC()
3738  */
3739 static inline void IVTCFrameInit( filter_t *p_filter )
3740 {
3741     assert( p_filter != NULL );
3742
3743     filter_sys_t *p_sys = p_filter->p_sys;
3744     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
3745
3746     /* Slide detector histories */
3747     for( int i = 1; i < IVTC_DETECTION_HISTORY_SIZE; i++ )
3748     {
3749         p_ivtc->pi_top_rep[i-1] = p_ivtc->pi_top_rep[i];
3750         p_ivtc->pi_bot_rep[i-1] = p_ivtc->pi_bot_rep[i];
3751         p_ivtc->pi_motion[i-1]  = p_ivtc->pi_motion[i];
3752
3753         p_ivtc->pi_s_cadence_pos[i-1] = p_ivtc->pi_s_cadence_pos[i];
3754         p_ivtc->pb_s_reliable[i-1]    = p_ivtc->pb_s_reliable[i];
3755         p_ivtc->pi_v_cadence_pos[i-1] = p_ivtc->pi_v_cadence_pos[i];
3756         p_ivtc->pi_v_raw[i-1]         = p_ivtc->pi_v_raw[i];
3757         p_ivtc->pb_v_reliable[i-1]    = p_ivtc->pb_v_reliable[i];
3758
3759         p_ivtc->pi_cadence_pos_history[i-1]
3760                                       = p_ivtc->pi_cadence_pos_history[i];
3761
3762         p_ivtc->pb_all_progressives[i-1] = p_ivtc->pb_all_progressives[i];
3763     }
3764     /* The latest position has not been detected yet. */
3765     p_ivtc->pi_s_cadence_pos[IVTC_LATEST] = CADENCE_POS_INVALID;
3766     p_ivtc->pb_s_reliable[IVTC_LATEST]    = false;
3767     p_ivtc->pi_v_cadence_pos[IVTC_LATEST] = CADENCE_POS_INVALID;
3768     p_ivtc->pi_v_raw[IVTC_LATEST]         = VEKTOR_CADENCE_POS_ALL;
3769     p_ivtc->pb_v_reliable[IVTC_LATEST]    = false;
3770     p_ivtc->pi_cadence_pos_history[IVTC_LATEST] = CADENCE_POS_INVALID;
3771     p_ivtc->pi_top_rep[IVTC_LATEST] =  0;
3772     p_ivtc->pi_bot_rep[IVTC_LATEST] =  0;
3773     p_ivtc->pi_motion[IVTC_LATEST]  = -1;
3774     p_ivtc->pb_all_progressives[IVTC_LATEST] = false;
3775
3776     /* Slide history of field pair interlace scores */
3777     p_ivtc->pi_scores[FIELD_PAIR_TPBP] = p_ivtc->pi_scores[FIELD_PAIR_TCBC];
3778     p_ivtc->pi_scores[FIELD_PAIR_TPBC] = p_ivtc->pi_scores[FIELD_PAIR_TCBN];
3779     p_ivtc->pi_scores[FIELD_PAIR_TCBP] = p_ivtc->pi_scores[FIELD_PAIR_TNBC];
3780     p_ivtc->pi_scores[FIELD_PAIR_TCBC] = p_ivtc->pi_scores[FIELD_PAIR_TNBN];
3781     /* These have not been detected yet */
3782     p_ivtc->pi_scores[FIELD_PAIR_TCBN] = 0;
3783     p_ivtc->pi_scores[FIELD_PAIR_TNBC] = 0;
3784     p_ivtc->pi_scores[FIELD_PAIR_TNBN] = 0;
3785 }
3786
3787 /**
3788  * Internal helper function for RenderIVTC(): computes various raw detector
3789  * data at the start of a new frame.
3790  *
3791  * This function requires the input frame history buffer.
3792  * IVTCFrameInit() must have been called first.
3793  * Last two frames must be available in the history buffer.
3794  *
3795  * This is an internal function only used by RenderIVTC().
3796  * There is no need to call this function manually.
3797  *
3798  * @param p_filter The filter instance.
3799  * @see RenderIVTC()
3800  * @see IVTCFrameInit()
3801  */
3802 static inline void IVTCLowLevelDetect( filter_t *p_filter )
3803 {
3804     assert( p_filter != NULL );
3805
3806     filter_sys_t *p_sys = p_filter->p_sys;
3807     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
3808     picture_t *p_curr = p_sys->pp_history[1];
3809     picture_t *p_next = p_sys->pp_history[2];
3810
3811     assert( p_next != NULL );
3812     assert( p_curr != NULL );
3813
3814     /* Compute interlace scores for TNBN, TNBC and TCBN.
3815         Note that p_next contains TNBN. */
3816     p_ivtc->pi_scores[FIELD_PAIR_TNBN] = CalculateInterlaceScore( p_next,
3817                                                                   p_next );
3818     p_ivtc->pi_scores[FIELD_PAIR_TNBC] = CalculateInterlaceScore( p_next,
3819                                                                   p_curr );
3820     p_ivtc->pi_scores[FIELD_PAIR_TCBN] = CalculateInterlaceScore( p_curr,
3821                                                                   p_next );
3822
3823     int i_top = 0, i_bot = 0;
3824     int i_motion = EstimateNumBlocksWithMotion(p_curr, p_next, &i_top, &i_bot);
3825     p_ivtc->pi_motion[IVTC_LATEST] = i_motion;
3826
3827     /* If one field changes "clearly more" than the other, we know the
3828        less changed one is a likely duplicate.
3829
3830        Threshold 1/2 is too low for some scenes (e.g. pan of the space junk
3831        at beginning of The Third ep. 1, right after the OP). Thus, we use 2/3,
3832        which seems to work.
3833     */
3834     p_ivtc->pi_top_rep[IVTC_LATEST] = (i_top <= 2*i_bot/3);
3835     p_ivtc->pi_bot_rep[IVTC_LATEST] = (i_bot <= 2*i_top/3);
3836 }
3837
3838 /**
3839  * Internal helper function for RenderIVTC(): using raw detector data,
3840  * detect cadence position by an interlace scores based algorithm ("scores").
3841  *
3842  * IVTCFrameInit() and IVTCLowLevelDetect() must have been called first.
3843  * Last frame must be available in the history buffer.
3844  *
3845  * This is an internal function only used by RenderIVTC().
3846  * There is no need to call this function manually.
3847  *
3848  * @param p_filter The filter instance.
3849  * @see RenderIVTC()
3850  * @see IVTCFrameInit()
3851  * @see IVTCLowLevelDetect()
3852  * @see IVTCCadenceDetectFinalize()
3853  */
3854 static inline void IVTCCadenceDetectAlgoScores( filter_t *p_filter )
3855 {
3856     assert( p_filter != NULL );
3857
3858     filter_sys_t *p_sys = p_filter->p_sys;
3859     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
3860     picture_t *p_next = p_sys->pp_history[2];
3861
3862     assert( p_next != NULL );
3863
3864     /* Detect likely cadence position according to the tables,
3865        using the tabulated combinations of all 7 available interlace scores.
3866     */
3867     int pi_ivtc_scores[NUM_CADENCE_POS];
3868     for( int i = 0; i < NUM_CADENCE_POS; i++ )
3869         pi_ivtc_scores[i] = p_ivtc->pi_scores[ pi_best_field_pairs[i][0] ]
3870                           + p_ivtc->pi_scores[ pi_best_field_pairs[i][1] ]
3871                           + p_ivtc->pi_scores[ pi_best_field_pairs[i][2] ];
3872     /* Find minimum */
3873     int j = CADENCE_POS_PROGRESSIVE; /* valid regardless of TFD */
3874     int minscore = pi_ivtc_scores[j];
3875     /* A TFF (respectively BFF) stream may only have TFF (respectively BFF)
3876        telecine. Don't bother looking at the wrong table. */
3877     int imin = CADENCE_POS_TFF_FIRST; /* first TFF-only entry */
3878     int iend = CADENCE_POS_TFF_END;   /* one past last TFF-only entry */
3879     if( !p_next->b_top_field_first )
3880     {
3881         imin = CADENCE_POS_BFF_FIRST; /* first BFF-only entry */
3882         iend = CADENCE_POS_BFF_END;   /* one past last BFF-only entry */
3883     }
3884     for( int i = imin; i < iend; i++ )
3885     {
3886         if( pi_ivtc_scores[i] < minscore )
3887         {
3888             minscore = pi_ivtc_scores[i];
3889             j = i;
3890         }
3891     }
3892
3893     /* Now "j" contains the most likely position according to the tables,
3894        accounting also for video TFF/BFF. */
3895     p_ivtc->pi_s_cadence_pos[IVTC_LATEST] = j;
3896
3897     /* Estimate reliability of detector result.
3898
3899        We do this by checking if the winner is an outlier at least
3900        to some extent. For anyone better versed in statistics,
3901        feel free to improve this.
3902     */
3903
3904     /* Compute sample mean with the winner included and without.
3905
3906        Sample mean is defined as mu = sum( x_i, i ) / N ,
3907        where N is the number of samples.
3908     */
3909     int mean = pi_ivtc_scores[CADENCE_POS_PROGRESSIVE];
3910     int mean_except_min = 0;
3911     if( j != CADENCE_POS_PROGRESSIVE )
3912         mean_except_min = pi_ivtc_scores[CADENCE_POS_PROGRESSIVE];
3913     for( int i = imin; i < iend; i++ )
3914     {
3915         mean += pi_ivtc_scores[i];
3916         if( i != j )
3917             mean_except_min += pi_ivtc_scores[i];
3918     }
3919     /* iend points one past end, but progressive counts as the +1. */
3920     mean /= (iend - imin + 1);
3921     mean_except_min /= (iend - imin);
3922
3923     /* Check how much excluding the winner changes the mean. */
3924     double mean_ratio = (double)mean_except_min / (double)mean;
3925
3926     /* Let's pretend that the detected position is a stochastic variable.
3927        Compute sample variance with the winner included and without.
3928
3929        var = sum( (x_i - mu)^2, i ) / N ,
3930
3931        where mu is the sample mean.
3932
3933        Note that we really need int64_t; the numbers are pretty large.
3934     */
3935     int64_t diff = (int64_t)(pi_ivtc_scores[CADENCE_POS_PROGRESSIVE] - mean);
3936     int64_t var = diff*diff;
3937     int64_t var_except_min = 0;
3938     if( j != CADENCE_POS_PROGRESSIVE )
3939     {
3940         int64_t diff_exm = (int64_t)(pi_ivtc_scores[CADENCE_POS_PROGRESSIVE]
3941                                       - mean_except_min);
3942         var_except_min = diff_exm*diff_exm;
3943     }
3944     for( int i = imin; i < iend; i++ )
3945     {
3946         diff = (int64_t)(pi_ivtc_scores[i] - mean);
3947         var += (diff*diff);
3948         if( i != j )
3949         {
3950             int64_t diff_exm = (int64_t)(pi_ivtc_scores[i] - mean_except_min);
3951             var_except_min += (diff_exm*diff_exm);
3952         }
3953     }
3954     /* iend points one past end, but progressive counts as the +1. */
3955     var /= (uint64_t)(iend - imin + 1);
3956     var_except_min /= (uint64_t)(iend - imin);
3957
3958     /* Extract cadence counter part of detected positions for the
3959        last two frames.
3960
3961        Note that for the previous frame, we use the final detected cadence
3962        position, which was not necessarily produced by this algorithm.
3963        It is the result that was judged the most reliable.
3964     */
3965     int j_curr = p_ivtc->pi_cadence_pos_history[IVTC_LATEST-1];
3966     int pos_next = pi_detected_pos_to_cadence_pos[j];
3967
3968     /* Be optimistic when unsure. We bias the detection toward accepting
3969        the next "correct" position, even if the variance check comes up bad.
3970     */
3971     bool b_expected = false;
3972     if( j_curr != CADENCE_POS_INVALID )
3973     {
3974         int pos_curr = pi_detected_pos_to_cadence_pos[j_curr];
3975         b_expected = (pos_next == (pos_curr + 1) % 5);
3976     }
3977
3978     /* Use motion detect result as a final sanity check.
3979        If no motion, the result from this algorithm cannot be reliable.
3980     */
3981     int i_blocks_with_motion = p_ivtc->pi_motion[IVTC_LATEST];
3982
3983     /* The numbers given here are empirical constants that have been tuned
3984        through trial and error. The test material used was NTSC anime DVDs.
3985
3986         Easy-to-detect parts seem to give variance boosts of 40-70%, but
3987         hard-to-detect parts sometimes only 18%. Anything with a smaller boost
3988         in variance doesn't seem reliable for catching a new lock-on,
3989
3990         Additionally, it seems that if the mean changes by less than 0.5%,
3991         the result is not reliable.
3992
3993         Note that the numbers given are only valid for the pi_best_field_pairs
3994         detector strategy.
3995
3996         For motion detection, the detector seems good enough so that
3997         we can threshold at zero.
3998     */
3999     bool b_result_reliable =
4000       ( i_blocks_with_motion > 0      &&
4001         mean_ratio           > 1.005  &&
4002         ( b_expected || ( (double)var > 1.17*(double)var_except_min ) )
4003       );
4004     p_ivtc->pb_s_reliable[IVTC_LATEST] = b_result_reliable;
4005 }
4006
4007 /**
4008  * Internal helper function for RenderIVTC(): using raw detector data,
4009  * detect cadence position by a hard field repeat based algorithm ("vektor").
4010  *
4011  * This algorithm is inspired by the classic TVTime/Xine IVTC filter
4012  * by Billy Biggs (Vektor); hence the name. There are however some
4013  * differences between this and the TVTime/Xine filter.
4014  *
4015  * IVTCFrameInit() and IVTCLowLevelDetect() must have been called first.
4016  * Last frame must be available in the history buffer.
4017  *
4018  * This is an internal function only used by RenderIVTC().
4019  * There is no need to call this function manually.
4020  *
4021  * @param p_filter The filter instance.
4022  * @see RenderIVTC()
4023  * @see IVTCFrameInit()
4024  * @see IVTCLowLevelDetect()
4025  * @see IVTCCadenceDetectFinalize()
4026  */
4027 static inline void IVTCCadenceDetectAlgoVektor( filter_t *p_filter )
4028 {
4029     assert( p_filter != NULL );
4030
4031     filter_sys_t *p_sys = p_filter->p_sys;
4032     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
4033
4034     picture_t *p_next = p_sys->pp_history[2];
4035
4036     assert( p_next != NULL );
4037
4038     /* This algorithm is based on detecting hard-repeated fields (by motion
4039        detection), and conservatively estimating what the seen repeats could
4040        mean for the cadence position.
4041
4042        "Conservative" means that we do not rule out possibilities if repeats
4043        are *not* seen, but only *add* possibilities based on what repeats
4044        *are* seen. This is important. Otherwise full-frame repeats in the
4045        original film (8fps or 12fps animation is very common in anime),
4046        causing spurious field repeats, would mess up the detection.
4047        With this strategy, spurious repeats will only slow down the lock-on,
4048        and will not break an existing lock-on once acquired.
4049
4050        Several possibilities are kept open until the sequence gives enough
4051        information to make a unique detection. When the sequence becomes
4052        inconsistent (e.g. bad cut), the detector resets itself.
4053
4054        The main ideas taken from the TVTime/Xine algorithm are:
4055         1) Conservatively using information from detected field repeats,
4056         2) Cadence counting the earlier detection results and combining with
4057            the new detection result, and
4058         3) The observation that video TFF/BFF uniquely determines TFD.
4059
4060        The main differences are
4061         1) Different motion detection (see EstimateNumBlocksWithMotion()).
4062            Vektor's original estimates the average top/bottom field diff
4063            over the last 3 frames, while ours uses a block-based approach
4064            for diffing and just compares the field diffs between "curr" and
4065            "next" against each other (see IVTCLowLevelDetect()).
4066            Both approaches are adaptive, but in a different way.
4067         2) The specific detection logic used is a bit different (see both
4068            codes for details; the original is in xine-lib, function
4069            determine_pulldown_offset_short_history_new() in pulldown.c;
4070            ours is obviously given below). I think the one given here
4071            is a bit simpler.
4072
4073        Note that we don't have to worry about getting a detection in all cases.
4074        It's enough if we work reliably, say, 99% of the time, and the other 1%
4075        of the time just admit that we don't know the cadence position.
4076        (This mostly happens after a bad cut, when the new scene has
4077        "difficult" motion characteristics, such as repeated film frames.)
4078        Our frame composer is built to handle also cases where we have no
4079        reliable detection of the cadence position; see IVTCOutputOrDropFrame().
4080        More important is to never lock on incorrectly, as this would both
4081        generate interlacing artifacts where none existed, and cause motion
4082        to stutter (because duplicate frames would be shown and unique ones
4083        dropped).
4084     */
4085
4086     /* Progressive requires no repeats, so it is always a possibility.
4087        Filtering will drop it out if we know that the current position
4088        cannot be "dea".
4089     */
4090     int detected = 0;
4091     detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_PROGRESSIVE ];
4092
4093     /* Add in other possibilities depending on field repeats seen during the
4094        last three input frames (i.e. two transitions between input frames).
4095        See the "Dups." column in the cadence tables.
4096     */
4097     bool b_top_rep     = p_ivtc->pi_top_rep[IVTC_LATEST];
4098     bool b_bot_rep     = p_ivtc->pi_bot_rep[IVTC_LATEST];
4099     bool b_old_top_rep = p_ivtc->pi_top_rep[IVTC_LATEST-1];
4100     bool b_old_bot_rep = p_ivtc->pi_bot_rep[IVTC_LATEST-1];
4101     if( b_top_rep )
4102     {
4103         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_TFF_EAB ];
4104         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_BFF_BCD ];
4105     }
4106     if( b_old_top_rep )
4107     {
4108         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_TFF_ABC ];
4109         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_BFF_CDE ];
4110     }
4111     if( b_bot_rep )
4112     {
4113         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_TFF_BCD ];
4114         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_BFF_EAB ];
4115     }
4116     if( b_old_bot_rep )
4117     {
4118         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_TFF_CDE ];
4119         detected |= pi_detected_pos_to_bitmask[ CADENCE_POS_BFF_ABC ];
4120     }
4121
4122     /* A TFF stream may only have TFF telecine, and similarly for BFF.
4123        Discard the possibility we know to be incorrect for this stream.
4124        (The stream may flipflop between the possibilities if it contains
4125         soft-telecined sequences or lone field repeats, so we must keep
4126         detecting this for each incoming frame.)
4127     */
4128     bool b_tff = p_next->b_top_field_first;
4129     if( b_tff )
4130         detected &= VEKTOR_CADENCE_POS_TFF;
4131     else
4132         detected &= VEKTOR_CADENCE_POS_BFF;
4133
4134     /* Predict possible next positions based on our last detection.
4135        Begin with a shift and carry. */
4136     int predicted = p_ivtc->pi_v_raw[IVTC_LATEST-1];
4137     bool b_wrap_tff = false;
4138     bool b_wrap_bff = false;
4139     if( predicted & VEKTOR_CADENCE_POS_TFF_HIGH )
4140         b_wrap_tff = true;
4141     if( predicted & VEKTOR_CADENCE_POS_BFF_HIGH )
4142         b_wrap_bff = true;
4143     /* bump to next position and keep only valid bits */
4144     predicted = (predicted << 1) & VEKTOR_CADENCE_POS_ALL;
4145     /* carry */
4146     if( b_wrap_tff )
4147         predicted |= VEKTOR_CADENCE_POS_TFF_LOW;
4148     if( b_wrap_bff )
4149         predicted |= VEKTOR_CADENCE_POS_BFF_LOW;
4150
4151     /* Filter: narrow down possibilities based on previous detection,
4152        if consistent. If not consistent, reset the detector.
4153        Reset works better than just using the latest raw detection.
4154     */
4155     if( (detected & predicted) != 0 )
4156         detected = detected & predicted;
4157     else
4158         detected = VEKTOR_CADENCE_POS_ALL;
4159
4160     /* We're done. Save result to our internal storage so we can use it
4161        for prediction at the next frame.
4162
4163        Note that the outgoing frame check in IVTCOutputOrDropFrame()
4164        has a veto right, resetting our state if it determines that
4165        the cadence has become broken.
4166     */
4167     p_ivtc->pi_v_raw[IVTC_LATEST] = detected;
4168
4169     /* See if the position has been detected uniquely.
4170        If so, we have acquired a lock-on. */
4171     ivtc_cadence_pos exact = CADENCE_POS_INVALID;
4172     if( detected != 0 )
4173     {
4174         for( int i = 0; i < NUM_CADENCE_POS; i++ )
4175         {
4176             /* Note that we must use "&" instead of just equality to catch
4177                the progressive case, and also not to trigger on an incomplete
4178                detection. */
4179             if( detected == (detected & pi_detected_pos_to_bitmask[i]) )
4180             {
4181                 exact = i;
4182                 break;
4183             }
4184         }
4185     }
4186
4187     /* If the result was unique, now "exact" contains the detected
4188        cadence position (and otherwise CADENCE_POS_INVALID).
4189
4190        In practice, if the result from this algorithm is unique,
4191        it is always reliable.
4192     */
4193     p_ivtc->pi_v_cadence_pos[IVTC_LATEST] =  exact;
4194     p_ivtc->pb_v_reliable[IVTC_LATEST]    = (exact != CADENCE_POS_INVALID);
4195 }
4196
4197 /**
4198  * Internal helper function for RenderIVTC(): decide the final detected
4199  * cadence position for the current position of the PCN stencil,
4200  * using the results of the different cadence detection algorithms.
4201  *
4202  * Must be called after all IVTCCadenceDetectAlgo*() functions.
4203  *
4204  * This is an internal function only used by RenderIVTC().
4205  * There is no need to call this function manually.
4206  *
4207  * @param p_filter The filter instance.
4208  * @see RenderIVTC()
4209  * @see IVTCCadenceDetectAlgoScores()
4210  * @see IVTCCadenceDetectAlgoVektor()
4211  */
4212 static inline void IVTCCadenceDetectFinalize( filter_t *p_filter )
4213 {
4214     assert( p_filter != NULL );
4215
4216     filter_sys_t *p_sys = p_filter->p_sys;
4217     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
4218
4219     /* In practice "vektor" is more reliable than "scores", but it may
4220        take longer to lock on. Thus, we prefer "vektor" if its reliable bit
4221        is set, then "scores", and finally just give up.
4222
4223        For progressive sequences, "vektor" outputs "3, -, 3, -, ...",
4224        because the repeated progressive position is an inconsistent prediction.
4225        In this case, "scores" fills in the blanks. (This particular task
4226        could also be done without another cadence detector, by just
4227        detecting the alternating pattern of "3" and no result.)
4228     */
4229     int pos = CADENCE_POS_INVALID;
4230     if( p_ivtc->pb_v_reliable[IVTC_LATEST] )
4231         pos = p_ivtc->pi_v_cadence_pos[IVTC_LATEST];
4232     else if( p_ivtc->pb_s_reliable[IVTC_LATEST] )
4233         pos = p_ivtc->pi_s_cadence_pos[IVTC_LATEST];
4234     p_ivtc->pi_cadence_pos_history[IVTC_LATEST] = pos;
4235 }
4236
4237 /**
4238  * Internal helper function for RenderIVTC(): using stream flags,
4239  * detect soft telecine.
4240  *
4241  * This function is different from the other detectors; it may enter or exit
4242  * IVTC_MODE_TELECINED_NTSC_SOFT, if it detects that soft telecine has just
4243  * been entered or exited.
4244  *
4245  * Upon exit from soft telecine, the filter will resume operation in its
4246  * previous mode (which it had when soft telecine was entered).
4247  *
4248  * Last three frames must be available in the history buffer.
4249  *
4250  * This is an internal function only used by RenderIVTC().
4251  * There is no need to call this function manually.
4252  *
4253  * @param p_filter The filter instance.
4254  * @see RenderIVTC()
4255  */
4256 static inline void IVTCSoftTelecineDetect( filter_t *p_filter )
4257 {
4258     assert( p_filter != NULL );
4259
4260     filter_sys_t *p_sys = p_filter->p_sys;
4261     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
4262     picture_t *p_prev = p_sys->pp_history[0];
4263     picture_t *p_curr = p_sys->pp_history[1];
4264     picture_t *p_next = p_sys->pp_history[2];
4265
4266     assert( p_next != NULL );
4267     assert( p_curr != NULL );
4268     assert( p_prev != NULL );
4269
4270     /* Soft telecine can be detected from the flag pattern:
4271        nb_fields = 3,2,3,2,... and *video* TFF = true, false, false, true
4272        (TFF telecine) or false, true, true, false (BFF telecine).
4273
4274        We don't particularly care which field goes first, because in soft TC
4275        we're working with progressive frames. And in any case, the video FDs
4276        of successive frames must match any field repeats in order for field
4277        renderers (such as traditional DVD player + CRT TV) to work correctly.
4278        Thus the video TFF/BFF flag provides no additional useful information
4279        for us on top of checking nb_fields.
4280
4281        The only thing to *do* to soft telecine in an IVTC filter is to even
4282        out the outgoing PTS diffs to 2.5 fields each, so that we get
4283        a steady 24fps output. Thus, we can do this processing even if it turns
4284        out that we saw a lone field repeat (which are also sometimes used,
4285        such as in the Silent Mobius OP and in Sol Bianca). We can be aggressive
4286        and don't need to care about false positives - as long as we are equally
4287        aggressive about dropping out of soft telecine mode the moment a "2" is
4288        followed by another "2" and not a "3" as in soft TC.
4289
4290        Finally, we conclude that the one-frame future buffer is enough for us
4291        to make soft TC decisions just in time for rendering the frame in the
4292        "current" position. The flag patterns given below constitute proof
4293        of this property.
4294
4295        Soft telecine is relatively rare at least in anime, but it exists;
4296        e.g. Angel Links OP, Silent Mobius, and Stellvia of the Universe have
4297        sequences that are soft telecined. Stellvia, especially, alternates
4298        between soft and hard telecine all the time.
4299     */
4300
4301     /* Valid stream flag patterns for soft telecine. There are three: */
4302
4303     /* Entering soft telecine at frame curr, or running inside it already */
4304     bool b_soft_telecine_1 = (p_prev->i_nb_fields == 2) &&
4305                              (p_curr->i_nb_fields == 3) &&
4306                              (p_next->i_nb_fields == 2);
4307     /* Running inside soft telecine */
4308     bool b_soft_telecine_2 = (p_prev->i_nb_fields == 3) &&
4309                              (p_curr->i_nb_fields == 2) &&
4310                              (p_next->i_nb_fields == 3);
4311     /* Exiting soft telecine at frame curr (curr is the last frame
4312        that should be handled as soft TC) */
4313     bool b_soft_telecine_3 = (p_prev->i_nb_fields == 3) &&
4314                              (p_curr->i_nb_fields == 2) &&
4315                              (p_next->i_nb_fields == 2);
4316
4317     /* Soft telecine is very clear-cut - the moment we see or do not see
4318        a valid flag pattern, we can change the filter mode.
4319     */
4320     if( b_soft_telecine_1 || b_soft_telecine_2 || b_soft_telecine_3 )
4321     {
4322         if( p_ivtc->i_mode != IVTC_MODE_TELECINED_NTSC_SOFT )
4323         {
4324             msg_Dbg( p_filter, "IVTC: 3:2 pulldown: NTSC soft telecine "\
4325                                "detected." );
4326             p_ivtc->i_old_mode = p_ivtc->i_mode;
4327         }
4328
4329         /* Valid flag pattern seen, this frame is soft telecined */
4330         p_ivtc->i_mode = IVTC_MODE_TELECINED_NTSC_SOFT;
4331
4332         /* Only used during IVTC'ing hard telecine. */
4333         p_ivtc->i_cadence_pos = CADENCE_POS_INVALID;
4334         p_ivtc->i_tfd         = TFD_INVALID;
4335     }
4336     /* Note: no flag pattern match now */
4337     else if( p_ivtc->i_mode == IVTC_MODE_TELECINED_NTSC_SOFT )
4338     {
4339         msg_Dbg( p_filter, "IVTC: 3:2 pulldown: NTSC soft telecine ended. "\
4340                            "Returning to previous mode." );
4341
4342         /* No longer soft telecined, return filter to the mode it had earlier.
4343            This is needed to fix cases where we came in from hard telecine, and
4344            should go back, but can't catch a cadence in time before telecined
4345            frames slip through. Kickstarting back to hard IVTC, using the
4346            emergency frame composer until the cadence locks on again,
4347            fixes the problem. This happens a lot in Stellvia.
4348         */
4349         p_ivtc->i_mode = p_ivtc->i_old_mode;
4350         p_ivtc->i_cadence_pos = 0; /* Wild guess. The film frame reconstruction
4351                                       will start in emergency mode, and this
4352                                       will be filled in by the detector ASAP.*/
4353         /* I suppose video field dominance no longer flipflops. */
4354         p_ivtc->i_tfd = !p_next->b_top_field_first; /* tff  <=>  TFD == 0 */
4355     }
4356 }
4357
4358 /**
4359  * Internal helper function for RenderIVTC(): using the history of detected
4360  * cadence positions, analyze the cadence and enter or exit
4361  * IVTC_MODE_TELECINED_NTSC_HARD when appropriate.
4362  *
4363  * This also updates b_sequence_valid.
4364  *
4365  * Last three frames must be available in the history buffer.
4366  *
4367  * This is an internal function only used by RenderIVTC().
4368  * There is no need to call this function manually.
4369  *
4370  * @param p_filter The filter instance.
4371  * @see RenderIVTC()
4372  */
4373 static void IVTCCadenceAnalyze( filter_t *p_filter )
4374 {
4375     assert( p_filter != NULL );
4376
4377     filter_sys_t *p_sys = p_filter->p_sys;
4378     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
4379     picture_t *p_prev = p_sys->pp_history[0];
4380     picture_t *p_curr = p_sys->pp_history[1];
4381     picture_t *p_next = p_sys->pp_history[2];
4382
4383     assert( p_next != NULL );
4384     assert( p_curr != NULL );
4385     assert( p_prev != NULL );
4386
4387     /* Determine which frames in the buffer qualify for analysis.
4388
4389        Note that hard telecine always has nb_fields = 2 and
4390        video TFF = constant (i.e. the stream flags look no different from
4391        a true interlaced or true progressive stream). Basically, no one ever
4392        sets the progressive frame flag for the input frames d, e, and a -
4393        in practice they're all flagged as interlaced.
4394
4395        A frame may qualify for hard TC analysis if it has no soft field repeat
4396        (i.e. it cannot be part of a soft telecine). The condition
4397        nb_fields == 2 must always match.
4398
4399        Additionally, curr and next must have had motion with respect to the
4400        previous frame, to ensure that the different field combinations have
4401        produced unique pictures.
4402
4403        Alternatively, if there was no motion, but the cadence position was
4404        reliably detected and it was the expected one, we qualify the frame
4405        for analysis (mainly, for TFD voting).
4406
4407        We only proceed with the cadence analysis if all three frames
4408        in the buffer qualify.
4409     */
4410
4411     /* Note that these are the final detected positions
4412        produced by IVTCCadenceDetectFinalize(). */
4413     int j_next = p_ivtc->pi_cadence_pos_history[IVTC_LATEST];
4414     int j_curr = p_ivtc->pi_cadence_pos_history[IVTC_LATEST-1];
4415     int j_prev = p_ivtc->pi_cadence_pos_history[IVTC_LATEST-2];
4416
4417     bool b_expected = false;
4418     if( j_next != CADENCE_POS_INVALID  &&  j_curr != CADENCE_POS_INVALID )
4419     {
4420         int pos_next = pi_detected_pos_to_cadence_pos[j_next];
4421         int pos_curr = pi_detected_pos_to_cadence_pos[j_curr];
4422         b_expected = (pos_next == (pos_curr + 1) % 5);
4423     }
4424     bool b_old_expected  = false;
4425     if( j_curr != CADENCE_POS_INVALID  &&  j_prev != CADENCE_POS_INVALID )
4426     {
4427         int pos_curr = pi_detected_pos_to_cadence_pos[j_curr];
4428         int pos_prev = pi_detected_pos_to_cadence_pos[j_prev];
4429         b_old_expected = (pos_curr == (pos_prev + 1) % 5);
4430     }
4431
4432     int i_motion     = p_ivtc->pi_motion[IVTC_LATEST];
4433     int i_old_motion = p_ivtc->pi_motion[IVTC_LATEST-1];
4434
4435     bool b_prev_valid  = (p_prev->i_nb_fields == 2);
4436     bool b_curr_valid  = (p_curr->i_nb_fields == 2)  &&
4437                          (i_old_motion > 0  ||  b_old_expected);
4438     bool b_next_valid  = (p_next->i_nb_fields == 2)  &&
4439                          (i_motion > 0      ||  b_expected);
4440     bool b_no_invalids = (b_prev_valid && b_curr_valid && b_next_valid);
4441
4442     /* Final sanity check: see that the detection history has been
4443        completely filled,  i.e. the latest three positions of the stencil
4444        have given a result from the cadence detector.
4445     */
4446     if( b_no_invalids )
4447     {
4448         for( int i = 0; i < IVTC_DETECTION_HISTORY_SIZE; ++i )
4449         {
4450             const int i_detected_pos = p_ivtc->pi_cadence_pos_history[i];
4451             if( i_detected_pos == CADENCE_POS_INVALID )
4452             {
4453                 b_no_invalids = false;
4454                 break;
4455             }
4456         }
4457     }
4458
4459     /* If still ok, do the analysis. */
4460     p_ivtc->b_sequence_valid = false; /* needed in frame reconstruction */
4461     if( b_no_invalids )
4462     {
4463         /* Convert the history elements to cadence position and TFD. */
4464         int pi_tfd[IVTC_DETECTION_HISTORY_SIZE];
4465         int pi_pos[IVTC_DETECTION_HISTORY_SIZE];
4466         for( int i = 0; i < IVTC_DETECTION_HISTORY_SIZE; ++i )
4467         {
4468             const int i_detected_pos = p_ivtc->pi_cadence_pos_history[i];
4469             pi_pos[i] = pi_detected_pos_to_cadence_pos[i_detected_pos];
4470             pi_tfd[i] = pi_detected_pos_to_tfd[i_detected_pos];
4471         }
4472
4473         /* See if the sequence is valid. The cadence positions must be
4474            successive mod 5. We can't say anything about TFF/BFF yet,
4475            because the progressive-looking position "dea" may be there.
4476            If the sequence otherwise looks valid, we handle that last
4477            by voting.
4478
4479            We also test for a progressive signal here, so that we know
4480            when to exit IVTC_MODE_TELECINED_NTSC_HARD.
4481         */
4482         p_ivtc->b_sequence_valid = true;
4483         bool b_all_progressive = (pi_pos[0] == 3);
4484         int j = pi_pos[0];
4485         for( int i = 1; i < IVTC_DETECTION_HISTORY_SIZE; ++i )
4486         {
4487             if( pi_pos[i] != (++j % 5) )
4488                 p_ivtc->b_sequence_valid = false;
4489             if( pi_pos[i] != 3 )
4490                 b_all_progressive = false;
4491         }
4492         p_ivtc->pb_all_progressives[IVTC_LATEST] = b_all_progressive;
4493
4494         if( p_ivtc->b_sequence_valid )
4495         {
4496             /* Determine TFF/BFF. */
4497             int i_vote_invalid = 0;
4498             int i_vote_tff     = 0;
4499             int i_vote_bff     = 0;
4500             for( int i = 0; i < IVTC_DETECTION_HISTORY_SIZE; ++i )
4501             {
4502                 if( pi_tfd[i] == TFD_INVALID )
4503                     i_vote_invalid++;
4504                 else if( pi_tfd[i] == TFD_TFF )
4505                     i_vote_tff++;
4506                 else if( pi_tfd[i] == TFD_BFF )
4507                     i_vote_bff++;
4508             }
4509
4510             /* With three entries, two votes for any one item are enough
4511                to decide this conclusively. */
4512             int i_telecine_field_dominance = TFD_INVALID;
4513             if( i_vote_tff >= 2)
4514                 i_telecine_field_dominance = TFD_TFF;
4515             else if( i_vote_bff >= 2)
4516                 i_telecine_field_dominance = TFD_BFF;
4517             /* In all other cases, "invalid" won or no winner.
4518                This means no NTSC telecine detected. */
4519
4520             /* Lock on to the cadence if it was valid and TFF/BFF was found.
4521
4522                Also, aggressively update the cadence counter from the
4523                lock-on data whenever we can. In practice this has been found
4524                to be a reliable strategy (if the cadence detectors are
4525                good enough).
4526             */
4527             if( i_telecine_field_dominance == TFD_TFF )
4528             {
4529                 if( p_ivtc->i_mode != IVTC_MODE_TELECINED_NTSC_HARD )
4530                     msg_Dbg( p_filter, "IVTC: 3:2 pulldown: NTSC TFF "\
4531                                        "hard telecine detected." );
4532                 p_ivtc->i_mode        = IVTC_MODE_TELECINED_NTSC_HARD;
4533                 p_ivtc->i_cadence_pos = pi_pos[IVTC_LATEST];
4534                 p_ivtc->i_tfd         = TFD_TFF;
4535             }
4536             else if( i_telecine_field_dominance == TFD_BFF )
4537             {
4538                 if( p_ivtc->i_mode != IVTC_MODE_TELECINED_NTSC_HARD )
4539                     msg_Dbg( p_filter, "IVTC: 3:2 pulldown: NTSC BFF "\
4540                                        "hard telecine detected." );
4541                 p_ivtc->i_mode        = IVTC_MODE_TELECINED_NTSC_HARD;
4542                 p_ivtc->i_cadence_pos = pi_pos[IVTC_LATEST];
4543                 p_ivtc->i_tfd         = TFD_BFF;
4544             }
4545         }
4546         /* No telecine... maybe a progressive signal? */
4547         else if( b_all_progressive )
4548         {
4549             /* It seems that in practice, three "3"s in a row can still be
4550                a fluke rather often. Four or five usually are not.
4551                This fixes the Stellvia OP. */
4552
4553             bool b_really_all_progressive = true;
4554             for( int i = 0; i < IVTC_DETECTION_HISTORY_SIZE ; i++ )
4555             {
4556                 if( p_ivtc->pb_all_progressives[i] == false )
4557                 {
4558                     b_really_all_progressive = false;
4559                     break;
4560                 }
4561             }
4562
4563             /* If we still think the signal is progressive... */
4564             if( b_really_all_progressive )
4565             {
4566                 /* ...exit film mode immediately. This does not break
4567                    soft TC handling, because for soft TC at least one
4568                    of the frames will not qualify (due to i_nb_fields == 3),
4569                    and in that case this analysis will not run.
4570                 */
4571                 if( p_ivtc->i_mode == IVTC_MODE_TELECINED_NTSC_HARD )
4572                     msg_Dbg( p_filter, "IVTC: 3:2 pulldown: progressive "\
4573                                        "signal detected." );
4574                 p_ivtc->i_mode        = IVTC_MODE_DETECTING;
4575                 p_ivtc->i_cadence_pos = CADENCE_POS_INVALID;
4576                 p_ivtc->i_tfd         = TFD_INVALID;
4577             }
4578         }
4579         /* Final missing "else": no valid NTSC telecine sequence detected.
4580
4581            Either there is no telecine, or the detector - although it produced
4582            results - had trouble finding it. In this case we do nothing,
4583            as it's not a good idea to act on unreliable data.
4584
4585            Note that if we are already in IVTC_MODE_TELECINED_NTSC_HARD, this
4586            case means that we have lost the lock-on, but are still (probably)
4587            in a hard-telecined stream. This will start the emergency mode
4588            for film frame reconstruction. See IVTCOutputOrDropFrame().
4589         */
4590     }
4591 }
4592
4593 /**
4594  * Internal helper function for RenderIVTC(): render or drop frame,
4595  * whichever needs to be done. This also sets the output frame PTS.
4596  *
4597  * Last two frames must be available in the history buffer.
4598  *
4599  * This is an internal function only used by RenderIVTC().
4600  * There is no need to call this function manually.
4601  *
4602  * @param p_filter The filter instance. Must be non-NULL.
4603  * @param[out] p_dst Frame will be rendered here. Must be non-NULL.
4604  * @return Whether a frame was constructed.
4605  * @retval true Yes, output frame is in p_dst.
4606  * @retval false No, this frame was dropped as part of normal IVTC operation.
4607  * @see RenderIVTC()
4608  */
4609 static bool IVTCOutputOrDropFrame( filter_t *p_filter, picture_t *p_dst )
4610 {
4611     assert( p_filter != NULL );
4612     assert( p_dst != NULL );
4613
4614     filter_sys_t *p_sys = p_filter->p_sys;
4615     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
4616     mtime_t t_final = VLC_TS_INVALID; /* for custom timestamp mangling */
4617
4618     picture_t *p_curr = p_sys->pp_history[1];
4619     picture_t *p_next = p_sys->pp_history[2];
4620
4621     assert( p_next != NULL );
4622     assert( p_curr != NULL );
4623
4624     /* Perform IVTC if we're in film mode (either hard or soft telecine).
4625
4626        Note that we don't necessarily have a lock-on, even if we are in
4627        IVTC_MODE_TELECINED_NTSC_HARD. We *may* be locked on, or alternatively,
4628        we have seen a valid cadence some time in the past, but lock-on has
4629        since been lost, and we have not seen a progressive signal after that.
4630        The latter case usually results from bad cuts, which interrupt
4631        the cadence.
4632
4633        Lock-on state is given by p_ivtc->b_sequence_valid.
4634     */
4635     int i_result_score = -1;
4636     int op;
4637     if( p_ivtc->i_mode == IVTC_MODE_TELECINED_NTSC_HARD )
4638     {
4639         /* Decide what to do. The operation table is only enabled
4640            if the cadence seems reliable. Otherwise we use a backup strategy.
4641         */
4642         if( p_ivtc->b_sequence_valid )
4643         {
4644             assert( p_ivtc->i_cadence_pos != CADENCE_POS_INVALID );
4645             assert( p_ivtc->i_tfd != TFD_INVALID );
4646
4647             /* Pick correct operation from the operation table. */
4648             op = pi_reconstruction_ops[p_ivtc->i_tfd][p_ivtc->i_cadence_pos];
4649
4650             if( op == IVTC_OP_DROP_FRAME )
4651             {
4652                 /* Bump cadence counter into the next expected position */
4653                 p_ivtc->i_cadence_pos = ++p_ivtc->i_cadence_pos % 5;
4654
4655                 /* Drop frame. We're done. */
4656                 return false;
4657             }
4658             else
4659             {
4660                 if( op == IVTC_OP_COPY_N )
4661                     i_result_score = p_ivtc->pi_scores[FIELD_PAIR_TNBN];
4662                 else if( op == IVTC_OP_COPY_C )
4663                     i_result_score = p_ivtc->pi_scores[FIELD_PAIR_TCBC];
4664                 else if( op == IVTC_OP_COMPOSE_TNBC )
4665                     i_result_score = p_ivtc->pi_scores[FIELD_PAIR_TNBC];
4666                 else if( op == IVTC_OP_COMPOSE_TCBN )
4667                     i_result_score = p_ivtc->pi_scores[FIELD_PAIR_TCBN];
4668
4669                 /* Sanity check the result */
4670
4671                 /* Compute running mean of outgoing interlace score.
4672                    See below for history mechanism. */
4673                 int i_avg = 0;
4674                 for( int i = 0; i < IVTC_DETECTION_HISTORY_SIZE; i++)
4675                     i_avg += p_ivtc->pi_final_scores[i];
4676                 i_avg /= IVTC_DETECTION_HISTORY_SIZE;
4677
4678                 /* Check if the score suddenly became "clearly larger".
4679                    Also, filter out spurious peaks at the low end. */
4680                 if( i_result_score > 1000  &&  i_result_score > 2*i_avg )
4681                 {
4682                     /* Sequence wasn't reliable after all; we'll use
4683                        the Transcode strategy for this frame. */
4684                     p_ivtc->b_sequence_valid = false;
4685                     msg_Dbg( p_filter, "Rejected cadence-based frame "\
4686                                        "construction: interlace score %d "\
4687                                        "(running average %d)",
4688                                        i_result_score, i_avg );
4689
4690                     /* We also reset the detector used in the "vektor"
4691                        algorithm, as it depends on having a reliable previous
4692                        position. In practice, we continue using the Transcode
4693                        strategy until the cadence becomes locked on again.
4694                        (At that point, b_sequence_valid will become true again,
4695                        and we continue with this strategy.)
4696                     */
4697                     p_ivtc->pi_v_raw[IVTC_LATEST] = VEKTOR_CADENCE_POS_ALL;
4698                 }
4699             }
4700         }
4701
4702         /* Frame not dropped, and the cadence counter seems unreliable.
4703
4704             Note that this is not an "else" to the previous case. This may
4705             begin with a valid sequence, and then the above logic decides
4706             that it wasn't valid after all.
4707         */
4708         if( !p_ivtc->b_sequence_valid )
4709         {
4710             /* In this case, we must proceed with no cadence information.
4711                 We use a Transcode-like strategy.
4712
4713                 We check which field paired with TN or BN (accounting for
4714                 the field dominance) gives the smallest interlace score,
4715                 and declare that combination the resulting progressive frame.
4716
4717                 This strategy gives good results on average, but often fails
4718                 in talking scenes in anime. Those can be handled more reliably
4719                 with a locked-on cadence produced by the "vektor" algorithm.
4720             */
4721
4722             int tnbn = p_ivtc->pi_scores[FIELD_PAIR_TNBN]; /* TFF and BFF */
4723             int tnbc = p_ivtc->pi_scores[FIELD_PAIR_TNBC]; /* TFF only */
4724             int tcbn = p_ivtc->pi_scores[FIELD_PAIR_TCBN]; /* BFF only */
4725
4726             if( p_next->b_top_field_first )
4727             {
4728                 if( tnbn <= tnbc )
4729                 {
4730                     op = IVTC_OP_COPY_N;
4731                     i_result_score = tnbn;
4732                 }
4733                 else
4734                 {
4735                     op = IVTC_OP_COMPOSE_TNBC;
4736                     i_result_score = tnbc;
4737                 }
4738             }
4739             else
4740             {
4741                 if( tnbn <= tcbn )
4742                 {
4743                     op = IVTC_OP_COPY_N;
4744                     i_result_score = tnbn;
4745                 }
4746                 else
4747                 {
4748                     op = IVTC_OP_COMPOSE_TCBN;
4749                     i_result_score = tcbn;
4750                 }
4751             }
4752         }
4753
4754         /* Mangle timestamps when locked on.
4755
4756            "Current" is the frame that is being extracted now. Use its original
4757            timestamp as the base.
4758
4759            Note that this way there will be no extra delay compared to the
4760            raw stream, even though we look one frame into the future.
4761         */
4762         if( p_ivtc->b_sequence_valid )
4763         {
4764             /* Convert 29.97 -> 23.976 fps. We get to this point only if we
4765                didn't drop the frame, so we always get a valid delta.
4766             */
4767             int i_timestamp_delta = pi_timestamp_deltas[p_ivtc->i_cadence_pos];
4768             assert( i_timestamp_delta >= 0 );
4769
4770             /* FIXME: use field length as measured by Deinterlace()? */
4771             t_final = p_curr->date
4772                     + (p_next->date - p_curr->date)*i_timestamp_delta/4;
4773         }
4774         else /* Do not mangle timestamps (or drop frames, either) if cadence
4775                 is not locked on. This causes one of five output frames - if
4776                 all are reconstructed correctly - to be a duplicate, but in
4777                 practice at least with anime (which is the kind of material
4778                 that tends to have this problem) this is less noticeable than
4779                 a sudden jump in the cadence. Especially, a consistently wrong
4780                 lock-on will cause a very visible stutter, which we wish
4781                 to avoid. */
4782         {
4783             t_final = p_curr->date;
4784         }
4785
4786         /* Bump cadence counter into the next expected position. */
4787         p_ivtc->i_cadence_pos = ++p_ivtc->i_cadence_pos % 5;
4788     }
4789     else if( p_ivtc->i_mode == IVTC_MODE_TELECINED_NTSC_SOFT )
4790     {
4791         /* Soft telecine. We have the progressive frames already;
4792            even out PTS diffs only. */
4793
4794         /* Pass through the "current" frame. We must choose the frame "current"
4795            in order to be able to detect soft telecine before we have to output
4796            the frame. See IVTCSoftTelecineDetect(). Also, this allows
4797            us to peek at the next timestamp to calculate the duration of
4798            "current".
4799         */
4800         op = IVTC_OP_COPY_C;
4801         i_result_score = p_ivtc->pi_scores[FIELD_PAIR_TCBC];
4802
4803         /* Timestamp mangling for soft telecine: bump "threes" forward by
4804            0.5 field durations. This is more forgiving for the renderer
4805            than bumping the "twos" back (which would require to render
4806            them sooner),
4807         */
4808         if( p_curr->i_nb_fields == 3 )
4809         {
4810             /* Approximate field duration from the PTS difference. */
4811             /* FIXME: use field length as measured by Deinterlace()? */
4812             mtime_t i_half_field_dur = ( (p_next->date - p_curr->date)/3 ) / 2;
4813             t_final = p_curr->date + i_half_field_dur;
4814         }
4815         else /* Otherwise, use original PTS of the outgoing frame. */
4816         {
4817             t_final = p_curr->date;
4818         }
4819     }
4820     else /* Not film mode, timestamp mangling bypassed. */
4821     {
4822         op = IVTC_OP_COPY_N;
4823         i_result_score = p_ivtc->pi_scores[FIELD_PAIR_TNBN];
4824
4825         /* Preserve original PTS (note that now, in principle,
4826                                   "next" is the outgoing frame) */
4827         t_final = p_next->date;
4828     }
4829
4830     /* There is only one case where we should drop the frame,
4831        and it was already handled above. */
4832     assert( op != IVTC_OP_DROP_FRAME );
4833
4834     /* Render into p_dst according to the final operation chosen. */
4835     if( op == IVTC_OP_COPY_N )
4836         picture_Copy( p_dst, p_next );
4837     else if( op == IVTC_OP_COPY_C )
4838         picture_Copy( p_dst, p_curr );
4839     else if( op == IVTC_OP_COMPOSE_TNBC )
4840         ComposeFrame( p_filter, p_dst, p_next, p_curr, CC_ALTLINE );
4841     else if( op == IVTC_OP_COMPOSE_TCBN )
4842         ComposeFrame( p_filter, p_dst, p_curr, p_next, CC_ALTLINE );
4843
4844     /* Slide history of outgoing interlace scores. This must be done last,
4845        and only if the frame was not dropped, so we do it here.
4846
4847        This is used during the reconstruction to get an idea of what is
4848        (in the temporally local sense) an acceptable interlace score
4849        for a correctly reconstructed frame. See above.
4850     */
4851     for( int i = 1; i < IVTC_DETECTION_HISTORY_SIZE; i++ )
4852         p_ivtc->pi_final_scores[i-1] = p_ivtc->pi_final_scores[i];
4853     p_ivtc->pi_final_scores[IVTC_LATEST] = i_result_score;
4854
4855     /* Note that picture_Copy() copies the PTS, too. Apply timestamp mangling
4856        now, if any was needed.
4857     */
4858     if( t_final > VLC_TS_INVALID )
4859         p_dst->date = t_final;
4860
4861     return true;
4862 }
4863
4864 /* The top-level routine of the IVTC filter.
4865
4866    See the lengthy comment above for function documentation.
4867 */
4868 static int RenderIVTC( filter_t *p_filter, picture_t *p_dst, picture_t *p_src )
4869 {
4870     assert( p_filter != NULL );
4871     assert( p_src != NULL );
4872     assert( p_dst != NULL );
4873
4874     filter_sys_t *p_sys = p_filter->p_sys;
4875     ivtc_sys_t *p_ivtc  = &p_sys->ivtc;
4876
4877     picture_t *p_prev = p_sys->pp_history[0];
4878     picture_t *p_curr = p_sys->pp_history[1];
4879     picture_t *p_next = p_sys->pp_history[2];
4880
4881     /* If the history mechanism has failed, we have nothing to do. */
4882     if( !p_next )
4883         return VLC_EGENERIC;
4884
4885     /* Slide algorithm-specific histories */
4886     IVTCFrameInit( p_filter );
4887
4888     /* Filter if we have all the pictures we need.
4889        Note that we always have p_next at this point. */
4890     if( p_prev && p_curr )
4891     {
4892         /* Update raw data for motion, field repeats, interlace scores... */
4893         IVTCLowLevelDetect( p_filter );
4894
4895         /* Detect soft telecine.
4896
4897            Enter/exit IVTC_MODE_TELECINED_NTSC_SOFT when needed.
4898         */
4899         IVTCSoftTelecineDetect( p_filter );
4900
4901         /* Detect hard telecine.
4902
4903            Enter/exit IVTC_MODE_TELECINED_NTSC_HARD when needed.
4904
4905            If we happen to be running in IVTC_MODE_TELECINED_NTSC_SOFT,
4906            we nevertheless let the algorithms see for themselves that
4907            the stream is progressive. This doesn't break anything,
4908            and this way the full filter state gets updated at each frame.
4909
4910            See the individual function docs for details.
4911         */
4912         IVTCCadenceDetectAlgoScores( p_filter );
4913         IVTCCadenceDetectAlgoVektor( p_filter );
4914         IVTCCadenceDetectFinalize( p_filter ); /* pick winner */
4915         IVTCCadenceAnalyze( p_filter ); /* update filter state */
4916
4917         /* Now we can... */
4918         bool b_have_output_frame = IVTCOutputOrDropFrame( p_filter, p_dst );
4919
4920         /* The next frame will get a custom timestamp, too. */
4921         p_sys->i_frame_offset = CUSTOM_PTS;
4922
4923         if( b_have_output_frame )
4924             return VLC_SUCCESS;
4925         else
4926             return VLC_EGENERIC; /* Signal the caller not to expect a frame */
4927     }
4928     else if( !p_prev && !p_curr ) /* first frame */
4929     {
4930         /* Render the first frame as-is, so that a picture appears immediately.
4931
4932            We will also do some init for the filter. This score will become
4933            TPBP by the time the actual filter starts. Note that the sliding of
4934            final scores only starts when the filter has started (third frame).
4935         */
4936         int i_score = CalculateInterlaceScore( p_next, p_next );
4937         p_ivtc->pi_scores[FIELD_PAIR_TNBN] = i_score;
4938         p_ivtc->pi_final_scores[0]         = i_score;
4939
4940         picture_Copy( p_dst, p_next );
4941         return VLC_SUCCESS;
4942     }
4943     else /* second frame */
4944     {
4945         /* If the history sliding mechanism works correctly,
4946            the only remaining possibility is that: */
4947         assert( p_curr && !p_prev );
4948
4949         /* We need three frames for the cadence detector to work, so we just
4950            do some init for the detector and pass the frame through.
4951            Passthrough for second frame, too, works better than drop
4952            for some still-image DVD menus.
4953
4954            Now that we have two frames, we can run a full IVTCLowLevelDetect().
4955
4956            The interlace scores from here will become TCBC, TCBP and TPBC
4957            when the filter starts. The score for the current TCBC has already
4958            been computed at the first frame, and slid into place at the start
4959            of this frame (by IVTCFrameInit()).
4960         */
4961         IVTCLowLevelDetect( p_filter );
4962
4963         /* Note that the sliding mechanism for output scores only starts
4964            when the actual filter does.
4965         */
4966         p_ivtc->pi_final_scores[1] = p_ivtc->pi_scores[FIELD_PAIR_TNBN];
4967
4968         /* At the next frame, the filter starts. The next frame will get
4969            a custom timestamp. */
4970         p_sys->i_frame_offset = CUSTOM_PTS;
4971
4972         picture_Copy( p_dst, p_next );
4973         return VLC_SUCCESS;
4974     }
4975 }
4976
4977 /**
4978  * Clears the inverse telecine subsystem state.
4979  *
4980  * Used during initialization and uninitialization.
4981  *
4982  * @param p_filter The filter instance.
4983  * @see RenderIVTC()
4984  * @see Open()
4985  * @see Flush()
4986  */
4987 static void IVTCClearState( filter_t *p_filter )
4988 {
4989     assert( p_filter != NULL );
4990
4991     filter_sys_t *p_sys = p_filter->p_sys;
4992     ivtc_sys_t *p_ivtc = &p_sys->ivtc;
4993
4994     p_ivtc->i_cadence_pos = CADENCE_POS_INVALID;
4995     p_ivtc->i_tfd         = TFD_INVALID;
4996     p_ivtc->b_sequence_valid = false;
4997     p_ivtc->i_mode     = IVTC_MODE_DETECTING;
4998     p_ivtc->i_old_mode = IVTC_MODE_DETECTING;
4999     for( int i = 0; i < IVTC_NUM_FIELD_PAIRS; i++ )
5000         p_ivtc->pi_scores[i] = 0;
5001     for( int i = 0; i < IVTC_DETECTION_HISTORY_SIZE; i++ )
5002     {
5003         p_ivtc->pi_cadence_pos_history[i] = CADENCE_POS_INVALID;
5004
5005         p_ivtc->pi_s_cadence_pos[i] = CADENCE_POS_INVALID;
5006         p_ivtc->pb_s_reliable[i]    = false;
5007         p_ivtc->pi_v_cadence_pos[i] = CADENCE_POS_INVALID;
5008         p_ivtc->pb_v_reliable[i]    = false;
5009
5010         p_ivtc->pi_v_raw[i]         = VEKTOR_CADENCE_POS_ALL;
5011
5012         p_ivtc->pi_top_rep[i] =  0;
5013         p_ivtc->pi_bot_rep[i] =  0;
5014         p_ivtc->pi_motion[i]  = -1;
5015
5016         p_ivtc->pb_all_progressives[i] = false;
5017
5018         p_ivtc->pi_final_scores[i] = 0;
5019     }
5020 }
5021
5022 /*****************************************************************************
5023  * video filter2 functions
5024  *****************************************************************************/
5025 #define DEINTERLACE_DST_SIZE 3
5026 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
5027 {
5028     filter_sys_t *p_sys = p_filter->p_sys;
5029     picture_t *p_dst[DEINTERLACE_DST_SIZE];
5030
5031     /* Request output picture */
5032     p_dst[0] = filter_NewPicture( p_filter );
5033     if( p_dst[0] == NULL )
5034     {
5035         picture_Release( p_pic );
5036         return NULL;
5037     }
5038     picture_CopyProperties( p_dst[0], p_pic );
5039
5040     /* Any unused p_dst pointers must be NULL, because they are used to check how many output frames we have. */
5041     for( int i = 1; i < DEINTERLACE_DST_SIZE; ++i )
5042         p_dst[i] = NULL;
5043
5044     /* Update the input frame history, if the currently active algorithm needs it. */
5045     if( p_sys->b_use_frame_history )
5046     {
5047         /* Duplicate the picture
5048          * TODO when the vout rework is finished, picture_Hold() might be enough
5049          * but becarefull, the pitches must match */
5050         picture_t *p_dup = picture_NewFromFormat( &p_pic->format );
5051         if( p_dup )
5052             picture_Copy( p_dup, p_pic );
5053
5054         /* Slide the history */
5055         if( p_sys->pp_history[0] )
5056             picture_Release( p_sys->pp_history[0] );
5057         for( int i = 1; i < HISTORY_SIZE; i++ )
5058             p_sys->pp_history[i-1] = p_sys->pp_history[i];
5059         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
5060     }
5061
5062     /* Slide the metadata history. */
5063     for( int i = 1; i < METADATA_SIZE; i++ )
5064     {
5065         p_sys->meta.pi_date[i-1]            = p_sys->meta.pi_date[i];
5066         p_sys->meta.pi_nb_fields[i-1]       = p_sys->meta.pi_nb_fields[i];
5067         p_sys->meta.pb_top_field_first[i-1] = p_sys->meta.pb_top_field_first[i];
5068     }
5069     /* The last element corresponds to the current input frame. */
5070     p_sys->meta.pi_date[METADATA_SIZE-1]            = p_pic->date;
5071     p_sys->meta.pi_nb_fields[METADATA_SIZE-1]       = p_pic->i_nb_fields;
5072     p_sys->meta.pb_top_field_first[METADATA_SIZE-1] = p_pic->b_top_field_first;
5073
5074     /* Remember the frame offset that we should use for this frame.
5075        The value in p_sys will be updated to reflect the correct value
5076        for the *next* frame when we call the renderer. */
5077     int i_frame_offset = p_sys->i_frame_offset;
5078     int i_meta_idx     = (METADATA_SIZE-1) - i_frame_offset;
5079
5080     /* These correspond to the current *outgoing* frame. */
5081     bool b_top_field_first;
5082     int i_nb_fields;
5083     if( i_frame_offset != CUSTOM_PTS )
5084     {
5085         /* Pick the correct values from the history. */
5086         b_top_field_first = p_sys->meta.pb_top_field_first[i_meta_idx];
5087         i_nb_fields       = p_sys->meta.pi_nb_fields[i_meta_idx];
5088     }
5089     else
5090     {
5091         /* Framerate doublers must not request CUSTOM_PTS, as they need the original field timings,
5092            and need Deinterlace() to allocate the correct number of output frames. */
5093         assert( !p_sys->b_double_rate );
5094
5095         /* NOTE: i_nb_fields is only used for framerate doublers, so it is unused in this case.
5096                  b_top_field_first is only passed to the algorithm. We assume that algorithms that
5097                  request CUSTOM_PTS will, if necessary, extract the TFF/BFF information themselves.
5098         */
5099         b_top_field_first = p_pic->b_top_field_first; /* this is not guaranteed to be meaningful */
5100         i_nb_fields       = p_pic->i_nb_fields;       /* unused */
5101     }
5102
5103     /* For framerate doublers, determine field duration and allocate output frames. */
5104     mtime_t i_field_dur = 0;
5105     int i_double_rate_alloc_end = 0; /* One past last for allocated output frames in p_dst[].
5106                                         Used only for framerate doublers. Will be inited below.
5107                                         Declared here because the PTS logic needs the result. */
5108     if( p_sys->b_double_rate )
5109     {
5110         /* Calculate one field duration. */
5111         int i = 0;
5112         int iend = METADATA_SIZE-1;
5113         /* Find oldest valid logged date. Note: the current input frame doesn't count. */
5114         for( ; i < iend; i++ )
5115             if( p_sys->meta.pi_date[i] > VLC_TS_INVALID )
5116                 break;
5117         if( i < iend )
5118         {
5119             /* Count how many fields the valid history entries (except the new frame) represent. */
5120             int i_fields_total = 0;
5121             for( int j = i ; j < iend; j++ )
5122                 i_fields_total += p_sys->meta.pi_nb_fields[j];
5123             /* One field took this long. */
5124             i_field_dur = (p_pic->date - p_sys->meta.pi_date[i]) / i_fields_total;
5125         }
5126         /* Note that we default to field duration 0 if it could not be determined.
5127            This behaves the same as the old code - leaving the extra output frame
5128            dates the same as p_pic->date if the last cached date was not valid.
5129         */
5130
5131         i_double_rate_alloc_end = i_nb_fields;
5132         if( i_nb_fields > DEINTERLACE_DST_SIZE )
5133         {
5134             /* Note that the effective buffer size depends also on the constant private_picture in vout_wrapper.c,
5135                since that determines the maximum number of output pictures filter_NewPicture() will successfully
5136                allocate for one input frame.
5137             */
5138             msg_Err( p_filter, "Framerate doubler: output buffer too small; fields = %d, buffer size = %d. Dropping the remaining fields.", i_nb_fields, DEINTERLACE_DST_SIZE );
5139             i_double_rate_alloc_end = DEINTERLACE_DST_SIZE;
5140         }
5141
5142         /* Allocate output frames. */
5143         for( int i = 1; i < i_double_rate_alloc_end ; ++i )
5144         {
5145             p_dst[i-1]->p_next =
5146             p_dst[i]           = filter_NewPicture( p_filter );
5147             if( p_dst[i] )
5148             {
5149                 picture_CopyProperties( p_dst[i], p_pic );
5150             }
5151             else
5152             {
5153                 msg_Err( p_filter, "Framerate doubler: could not allocate output frame %d", i+1 );
5154                 i_double_rate_alloc_end = i; /* Inform the PTS logic about the correct end position. */
5155                 break; /* If this happens, the rest of the allocations aren't likely to work, either... */
5156             }
5157         }
5158         /* Now we have allocated *up to* the correct number of frames; normally, exactly the correct number.
5159            Upon alloc failure, we may have succeeded in allocating *some* output frames, but fewer than
5160            were desired. In such a case, as many will be rendered as were successfully allocated.
5161
5162            Note that now p_dst[i] != NULL for 0 <= i < i_double_rate_alloc_end. */
5163     }
5164     assert( p_sys->b_double_rate  ||  p_dst[1] == NULL );
5165     assert( i_nb_fields > 2  ||  p_dst[2] == NULL );
5166
5167     /* Render */
5168     switch( p_sys->i_mode )
5169     {
5170         case DEINTERLACE_DISCARD:
5171             RenderDiscard( p_filter, p_dst[0], p_pic, 0 );
5172             break;
5173
5174         case DEINTERLACE_BOB:
5175             RenderBob( p_filter, p_dst[0], p_pic, !b_top_field_first );
5176             if( p_dst[1] )
5177                 RenderBob( p_filter, p_dst[1], p_pic, b_top_field_first );
5178             if( p_dst[2] )
5179                 RenderBob( p_filter, p_dst[2], p_pic, !b_top_field_first );
5180             break;;
5181
5182         case DEINTERLACE_LINEAR:
5183             RenderLinear( p_filter, p_dst[0], p_pic, !b_top_field_first );
5184             if( p_dst[1] )
5185                 RenderLinear( p_filter, p_dst[1], p_pic, b_top_field_first );
5186             if( p_dst[2] )
5187                 RenderLinear( p_filter, p_dst[2], p_pic, !b_top_field_first );
5188             break;
5189
5190         case DEINTERLACE_MEAN:
5191             RenderMean( p_filter, p_dst[0], p_pic );
5192             break;
5193
5194         case DEINTERLACE_BLEND:
5195             RenderBlend( p_filter, p_dst[0], p_pic );
5196             break;
5197
5198         case DEINTERLACE_X:
5199             RenderX( p_dst[0], p_pic );
5200             break;
5201
5202         case DEINTERLACE_YADIF:
5203             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, 0 ) )
5204                 goto drop;
5205             break;
5206
5207         case DEINTERLACE_YADIF2X:
5208             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, !b_top_field_first ) )
5209                 goto drop;
5210             if( p_dst[1] )
5211                 RenderYadif( p_filter, p_dst[1], p_pic, 1, b_top_field_first );
5212             if( p_dst[2] )
5213                 RenderYadif( p_filter, p_dst[2], p_pic, 2, !b_top_field_first );
5214             break;
5215
5216         case DEINTERLACE_PHOSPHOR:
5217             if( RenderPhosphor( p_filter, p_dst[0], p_pic, 0,
5218                                 !b_top_field_first ) )
5219                 goto drop;
5220             if( p_dst[1] )
5221                 RenderPhosphor( p_filter, p_dst[1], p_pic, 1,
5222                                 b_top_field_first );
5223             if( p_dst[2] )
5224                 RenderPhosphor( p_filter, p_dst[2], p_pic, 2,
5225                                 !b_top_field_first );
5226             break;
5227
5228         case DEINTERLACE_IVTC:
5229             /* Note: RenderIVTC will automatically drop the duplicate frames
5230                      produced by IVTC. This is part of normal operation. */
5231             if( RenderIVTC( p_filter, p_dst[0], p_pic ) )
5232                 goto drop;
5233             break;
5234     }
5235
5236     /* Set output timestamps, if the algorithm didn't request CUSTOM_PTS for this frame. */
5237     assert( i_frame_offset <= METADATA_SIZE  ||  i_frame_offset == CUSTOM_PTS );
5238     if( i_frame_offset != CUSTOM_PTS )
5239     {
5240         mtime_t i_base_pts = p_sys->meta.pi_date[i_meta_idx];
5241
5242         /* Note: in the usual case (i_frame_offset = 0  and  b_double_rate = false),
5243                  this effectively does nothing. This is needed to correct the timestamp
5244                  when i_frame_offset > 0. */
5245         p_dst[0]->date = i_base_pts;
5246
5247         if( p_sys->b_double_rate )
5248         {
5249             /* Processing all actually allocated output frames. */
5250             for( int i = 1; i < i_double_rate_alloc_end; ++i )
5251             {
5252                 /* XXX it's not really good especially for the first picture, but
5253                  * I don't think that delaying by one frame is worth it */
5254                 if( i_base_pts > VLC_TS_INVALID )
5255                     p_dst[i]->date = i_base_pts + i * i_field_dur;
5256                 else
5257                     p_dst[i]->date = VLC_TS_INVALID;
5258             }
5259         }
5260     }
5261
5262     for( int i = 0; i < DEINTERLACE_DST_SIZE; ++i )
5263     {
5264         if( p_dst[i] )
5265         {
5266             p_dst[i]->b_progressive = true;
5267             p_dst[i]->i_nb_fields = 2;
5268         }
5269     }
5270
5271     picture_Release( p_pic );
5272     return p_dst[0];
5273
5274 drop:
5275     picture_Release( p_dst[0] );
5276     for( int i = 1; i < DEINTERLACE_DST_SIZE; ++i )
5277     {
5278         if( p_dst[i] )
5279             picture_Release( p_dst[i] );
5280     }
5281     picture_Release( p_pic );
5282     return NULL;
5283 }
5284
5285 static void Flush( filter_t *p_filter )
5286 {
5287     filter_sys_t *p_sys = p_filter->p_sys;
5288
5289     for( int i = 0; i < METADATA_SIZE; i++ )
5290     {
5291         p_sys->meta.pi_date[i] = VLC_TS_INVALID;
5292         p_sys->meta.pi_nb_fields[i] = 2;
5293         p_sys->meta.pb_top_field_first[i] = true;
5294     }
5295     p_sys->i_frame_offset = 0; /* reset to default value (first frame after flush cannot have offset) */
5296     for( int i = 0; i < HISTORY_SIZE; i++ )
5297     {
5298         if( p_sys->pp_history[i] )
5299             picture_Release( p_sys->pp_history[i] );
5300         p_sys->pp_history[i] = NULL;
5301     }
5302     IVTCClearState( p_filter );
5303 }
5304
5305 static int Mouse( filter_t *p_filter,
5306                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
5307 {
5308     VLC_UNUSED(p_old);
5309     *p_mouse = *p_new;
5310     if( p_filter->p_sys->b_half_height )
5311         p_mouse->i_y *= 2;
5312     return VLC_SUCCESS;
5313 }
5314
5315
5316 /*****************************************************************************
5317  * Open
5318  *****************************************************************************/
5319 static int Open( vlc_object_t *p_this )
5320 {
5321     filter_t *p_filter = (filter_t*)p_this;
5322     filter_sys_t *p_sys;
5323
5324     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
5325         return VLC_EGENERIC;
5326
5327     /* */
5328     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
5329     if( !p_sys )
5330         return VLC_ENOMEM;
5331
5332     p_sys->i_mode = DEINTERLACE_BLEND;
5333     p_sys->b_double_rate = false;
5334     p_sys->b_half_height = true;
5335     p_sys->b_use_frame_history = false;
5336     for( int i = 0; i < METADATA_SIZE; i++ )
5337     {
5338         p_sys->meta.pi_date[i] = VLC_TS_INVALID;
5339         p_sys->meta.pi_nb_fields[i] = 2;
5340         p_sys->meta.pb_top_field_first[i] = true;
5341     }
5342     p_sys->i_frame_offset = 0; /* start with default value (first-ever frame cannot have offset) */
5343     for( int i = 0; i < HISTORY_SIZE; i++ )
5344         p_sys->pp_history[i] = NULL;
5345
5346     IVTCClearState( p_filter );
5347
5348 #if defined(CAN_COMPILE_C_ALTIVEC)
5349     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
5350     {
5351         p_sys->pf_merge = MergeAltivec;
5352         p_sys->pf_end_merge = NULL;
5353     }
5354     else
5355 #endif
5356 #if defined(CAN_COMPILE_SSE)
5357     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
5358     {
5359         p_sys->pf_merge = MergeSSE2;
5360         p_sys->pf_end_merge = EndMMX;
5361     }
5362     else
5363 #endif
5364 #if defined(CAN_COMPILE_MMXEXT)
5365     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
5366     {
5367         p_sys->pf_merge = MergeMMXEXT;
5368         p_sys->pf_end_merge = EndMMX;
5369     }
5370     else
5371 #endif
5372 #if defined(CAN_COMPILE_3DNOW)
5373     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
5374     {
5375         p_sys->pf_merge = Merge3DNow;
5376         p_sys->pf_end_merge = End3DNow;
5377     }
5378     else
5379 #endif
5380 #if defined __ARM_NEON__
5381     if( vlc_CPU() & CPU_CAPABILITY_NEON )
5382     {
5383         p_sys->pf_merge = MergeNEON;
5384         p_sys->pf_end_merge = NULL;
5385     }
5386     else
5387 #endif
5388     {
5389         p_sys->pf_merge = MergeGeneric;
5390         p_sys->pf_end_merge = NULL;
5391     }
5392
5393     /* */
5394     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
5395                        p_filter->p_cfg );
5396
5397     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
5398     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
5399     free( psz_mode );
5400
5401     if( p_sys->i_mode == DEINTERLACE_PHOSPHOR )
5402     {
5403         int i_c420 = var_GetInteger( p_filter,
5404                                      FILTER_CFG_PREFIX "phosphor-chroma" );
5405         if( i_c420 != PC_LATEST  &&  i_c420 != PC_ALTLINE  &&
5406             i_c420 != PC_BLEND   && i_c420 != PC_UPCONVERT )
5407         {
5408             msg_Dbg( p_filter, "Phosphor 4:2:0 input chroma mode not set"\
5409                                "or out of range (valid: 1, 2, 3 or 4), "\
5410                                "using default" );
5411             i_c420 = PC_ALTLINE;
5412         }
5413         msg_Dbg( p_filter, "using Phosphor 4:2:0 input chroma mode %d",
5414                            i_c420 );
5415         /* This maps directly to the phosphor_chroma_t enum. */
5416         p_sys->phosphor.i_chroma_for_420 = i_c420;
5417
5418         int i_dimmer = var_GetInteger( p_filter,
5419                                        FILTER_CFG_PREFIX "phosphor-dimmer" );
5420         if( i_dimmer < 1  ||  i_dimmer > 4 )
5421         {
5422             msg_Dbg( p_filter, "Phosphor dimmer strength not set "\
5423                                "or out of range (valid: 1, 2, 3 or 4), "\
5424                                "using default" );
5425             i_dimmer = 2; /* low */
5426         }
5427         msg_Dbg( p_filter, "using Phosphor dimmer strength %d", i_dimmer );
5428         /* The internal value ranges from 0 to 3. */
5429         p_sys->phosphor.i_dimmer_strength = i_dimmer - 1;
5430     }
5431     else
5432     {
5433         p_sys->phosphor.i_chroma_for_420 = PC_ALTLINE;
5434         p_sys->phosphor.i_dimmer_strength = 1;
5435     }
5436
5437     /* */
5438     video_format_t fmt;
5439     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
5440     if( !p_filter->b_allow_fmt_out_change &&
5441         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
5442           fmt.i_height != p_filter->fmt_in.video.i_height ) )
5443     {
5444         Close( VLC_OBJECT(p_filter) );
5445         return VLC_EGENERIC;
5446     }
5447     p_filter->fmt_out.video = fmt;
5448     p_filter->fmt_out.i_codec = fmt.i_chroma;
5449     p_filter->pf_video_filter = Deinterlace;
5450     p_filter->pf_video_flush  = Flush;
5451     p_filter->pf_video_mouse  = Mouse;
5452
5453     msg_Dbg( p_filter, "deinterlacing" );
5454
5455     return VLC_SUCCESS;
5456 }
5457
5458 /*****************************************************************************
5459  * Close: clean up the filter
5460  *****************************************************************************/
5461 static void Close( vlc_object_t *p_this )
5462 {
5463     filter_t *p_filter = (filter_t*)p_this;
5464
5465     Flush( p_filter );
5466     free( p_filter->p_sys );
5467 }
5468