git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000-2009 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27
  28 #ifdef HAVE_CONFIG_H
  29 # include "config.h"
  30 #endif
  31
  32 #include <assert.h>
  33
  34 #ifdef HAVE_ALTIVEC_H
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include <vlc_common.h>
  39 #include <vlc_plugin.h>
  40 #include <vlc_filter.h>
  41 #include <vlc_cpu.h>
  42
  43 #ifdef CAN_COMPILE_MMXEXT
  44 #   include "mmx.h"
  45 #endif
  46
  47 #define DEINTERLACE_DISCARD 1
  48 #define DEINTERLACE_MEAN    2
  49 #define DEINTERLACE_BLEND   3
  50 #define DEINTERLACE_BOB     4
  51 #define DEINTERLACE_LINEAR  5
  52 #define DEINTERLACE_X       6
  53 #define DEINTERLACE_YADIF   7
  54 #define DEINTERLACE_YADIF2X 8
  55
  56 /*****************************************************************************
  57  * Module descriptor
  58  *****************************************************************************/
  59 static int  Open ( vlc_object_t * );
  60 static void Close( vlc_object_t * );
  61
  62 #define MODE_TEXT N_("Deinterlace mode")
  63 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
  64
  65 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
  66 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
  67
  68 #define FILTER_CFG_PREFIX "sout-deinterlace-"
  69
  70 static const char *const mode_list[] = {
  71     "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
  72 static const char *const mode_list_text[] = {
  73     N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
  74
  75 vlc_module_begin ()
  76     set_description( N_("Deinterlacing video filter") )
  77     set_shortname( N_("Deinterlace" ))
  78     set_capability( "video filter", 0 )
  79     set_category( CAT_VIDEO )
  80     set_subcategory( SUBCAT_VIDEO_VFILTER )
  81
  82     set_capability( "video filter2", 0 )
  83     add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT,
  84                 SOUT_MODE_LONGTEXT, false )
  85         change_string_list( mode_list, mode_list_text, 0 )
  86         change_safe ()
  87     add_shortcut( "deinterlace" )
  88     set_callbacks( Open, Close )
  89 vlc_module_end ()
  90
  91
  92 /*****************************************************************************
   93  * Local prototypes
  94  *****************************************************************************/
  95 static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
  96 static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
  97 static void RenderMean   ( filter_t *, picture_t *, picture_t * );
  98 static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
  99 static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
 100 static void RenderX      ( picture_t *, picture_t * );
 101 static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );
 102
 103 static void MergeGeneric ( void *, const void *, const void *, size_t );
 104 #if defined(CAN_COMPILE_C_ALTIVEC)
 105 static void MergeAltivec ( void *, const void *, const void *, size_t );
 106 #endif
 107 #if defined(CAN_COMPILE_MMXEXT)
 108 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
 109 #endif
 110 #if defined(CAN_COMPILE_3DNOW)
 111 static void Merge3DNow   ( void *, const void *, const void *, size_t );
 112 #endif
 113 #if defined(CAN_COMPILE_SSE)
 114 static void MergeSSE2    ( void *, const void *, const void *, size_t );
 115 #endif
 116 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
 117 static void EndMMX       ( void );
 118 #endif
 119 #if defined(CAN_COMPILE_3DNOW)
 120 static void End3DNow     ( void );
 121 #endif
 122 #if defined __ARM_NEON__
 123 static void MergeNEON (void *, const void *, const void *, size_t);
 124 #endif
 125
 126 static const char *const ppsz_filter_options[] = {
 127     "mode", NULL
 128 };
 129
 130 #define HISTORY_SIZE (3)
 131 struct filter_sys_t
 132 {
 133     int  i_mode;        /* Deinterlace mode */
 134     bool b_double_rate; /* Shall we double the framerate? */
 135     bool b_half_height; /* Shall be divide the height by 2 */
 136
 137     void (*pf_merge) ( void *, const void *, const void *, size_t );
 138     void (*pf_end_merge) ( void );
 139
 140     /* Yadif */
 141     picture_t *pp_history[HISTORY_SIZE];
 142 };
 143
 144 /*****************************************************************************
 145  * SetFilterMethod: setup the deinterlace method to use.
 146  *****************************************************************************/
 147 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
 148 {
 149     filter_sys_t *p_sys = p_filter->p_sys;
 150
 151     if( !psz_method )
 152         psz_method = "";
 153
 154     if( !strcmp( psz_method, "mean" ) )
 155     {
 156         p_sys->i_mode = DEINTERLACE_MEAN;
 157         p_sys->b_double_rate = false;
 158         p_sys->b_half_height = true;
 159     }
 160     else if( !strcmp( psz_method, "bob" )
 161              || !strcmp( psz_method, "progressive-scan" ) )
 162     {
 163         p_sys->i_mode = DEINTERLACE_BOB;
 164         p_sys->b_double_rate = true;
 165         p_sys->b_half_height = false;
 166     }
 167     else if( !strcmp( psz_method, "linear" ) )
 168     {
 169         p_sys->i_mode = DEINTERLACE_LINEAR;
 170         p_sys->b_double_rate = true;
 171         p_sys->b_half_height = false;
 172     }
 173     else if( !strcmp( psz_method, "x" ) )
 174     {
 175         p_sys->i_mode = DEINTERLACE_X;
 176         p_sys->b_double_rate = false;
 177         p_sys->b_half_height = false;
 178     }
 179     else if( !strcmp( psz_method, "yadif" ) )
 180     {
 181         p_sys->i_mode = DEINTERLACE_YADIF;
 182         p_sys->b_double_rate = false;
 183         p_sys->b_half_height = false;
 184     }
 185     else if( !strcmp( psz_method, "yadif2x" ) )
 186     {
 187         p_sys->i_mode = DEINTERLACE_YADIF2X;
 188         p_sys->b_double_rate = true;
 189         p_sys->b_half_height = false;
 190     }
 191     else if( !strcmp( psz_method, "discard" ) )
 192     {
 193         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
 194                             i_chroma == VLC_CODEC_J422;
 195
 196         p_sys->i_mode = DEINTERLACE_DISCARD;
 197         p_sys->b_double_rate = false;
 198         p_sys->b_half_height = !b_i422;
 199     }
 200     else
 201     {
 202         if( strcmp( psz_method, "blend" ) )
 203             msg_Err( p_filter,
 204                      "no valid deinterlace mode provided, using \"blend\"" );
 205
 206         p_sys->i_mode = DEINTERLACE_BLEND;
 207         p_sys->b_double_rate = false;
 208         p_sys->b_half_height = false;
 209     }
 210
 211     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
 212 }
 213
 214 static void GetOutputFormat( filter_t *p_filter,
 215                              video_format_t *p_dst, const video_format_t *p_src )
 216 {
 217     filter_sys_t *p_sys = p_filter->p_sys;
 218     *p_dst = *p_src;
 219
 220     if( p_sys->b_half_height )
 221     {
 222         p_dst->i_height /= 2;
 223         p_dst->i_visible_height /= 2;
 224         p_dst->i_y_offset /= 2;
 225         p_dst->i_sar_den *= 2;
 226     }
 227
 228     if( p_src->i_chroma == VLC_CODEC_I422 ||
 229         p_src->i_chroma == VLC_CODEC_J422 )
 230     {
 231         switch( p_sys->i_mode )
 232         {
 233         case DEINTERLACE_MEAN:
 234         case DEINTERLACE_LINEAR:
 235         case DEINTERLACE_X:
 236         case DEINTERLACE_YADIF:
 237         case DEINTERLACE_YADIF2X:
 238             p_dst->i_chroma = p_src->i_chroma;
 239             break;
 240         default:
 241             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
 242                                                                   VLC_CODEC_J420;
 243             break;
 244         }
 245     }
 246 }
 247
 248 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
 249 {
 250     return i_chroma == VLC_CODEC_I420 ||
 251            i_chroma == VLC_CODEC_J420 ||
 252            i_chroma == VLC_CODEC_YV12 ||
 253            i_chroma == VLC_CODEC_I422 ||
 254            i_chroma == VLC_CODEC_J422;
 255 }
 256
 257 /*****************************************************************************
 258  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 259  *****************************************************************************/
 260 static void RenderDiscard( filter_t *p_filter,
 261                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 262 {
 263     int i_plane;
 264
 265     /* Copy image and skip lines */
 266     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 267     {
 268         uint8_t *p_in, *p_out_end, *p_out;
 269         int i_increment;
 270
 271         p_in = p_pic->p[i_plane].p_pixels
 272                    + i_field * p_pic->p[i_plane].i_pitch;
 273
 274         p_out = p_outpic->p[i_plane].p_pixels;
 275         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 276                              * p_outpic->p[i_plane].i_visible_lines;
 277
 278         switch( p_filter->fmt_in.video.i_chroma )
 279         {
 280         case VLC_CODEC_I420:
 281         case VLC_CODEC_J420:
 282         case VLC_CODEC_YV12:
 283
 284             for( ; p_out < p_out_end ; )
 285             {
 286                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 287
 288                 p_out += p_outpic->p[i_plane].i_pitch;
 289                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 290             }
 291             break;
 292
 293         case VLC_CODEC_I422:
 294         case VLC_CODEC_J422:
 295
 296             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 297
 298             if( i_plane == Y_PLANE )
 299             {
 300                 for( ; p_out < p_out_end ; )
 301                 {
 302                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 303                     p_out += p_outpic->p[i_plane].i_pitch;
 304                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 305                     p_out += p_outpic->p[i_plane].i_pitch;
 306                     p_in += i_increment;
 307                 }
 308             }
 309             else
 310             {
 311                 for( ; p_out < p_out_end ; )
 312                 {
 313                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 314                     p_out += p_outpic->p[i_plane].i_pitch;
 315                     p_in += i_increment;
 316                 }
 317             }
 318             break;
 319
 320         default:
 321             break;
 322         }
 323     }
 324 }
 325
 326 /*****************************************************************************
 327  * RenderBob: renders a BOB picture - simple copy
 328  *****************************************************************************/
 329 static void RenderBob( filter_t *p_filter,
 330                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 331 {
 332     int i_plane;
 333
 334     /* Copy image and skip lines */
 335     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 336     {
 337         uint8_t *p_in, *p_out_end, *p_out;
 338
 339         p_in = p_pic->p[i_plane].p_pixels;
 340         p_out = p_outpic->p[i_plane].p_pixels;
 341         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 342                              * p_outpic->p[i_plane].i_visible_lines;
 343
 344         switch( p_filter->fmt_in.video.i_chroma )
 345         {
 346             case VLC_CODEC_I420:
 347             case VLC_CODEC_J420:
 348             case VLC_CODEC_YV12:
 349                 /* For BOTTOM field we need to add the first line */
 350                 if( i_field == 1 )
 351                 {
 352                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 353                     p_in += p_pic->p[i_plane].i_pitch;
 354                     p_out += p_outpic->p[i_plane].i_pitch;
 355                 }
 356
 357                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 358
 359                 for( ; p_out < p_out_end ; )
 360                 {
 361                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 362
 363                     p_out += p_outpic->p[i_plane].i_pitch;
 364
 365                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 366
 367                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 368                     p_out += p_outpic->p[i_plane].i_pitch;
 369                 }
 370
 371                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 372
 373                 /* For TOP field we need to add the last line */
 374                 if( i_field == 0 )
 375                 {
 376                     p_in += p_pic->p[i_plane].i_pitch;
 377                     p_out += p_outpic->p[i_plane].i_pitch;
 378                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 379                 }
 380                 break;
 381
 382             case VLC_CODEC_I422:
 383             case VLC_CODEC_J422:
 384                 /* For BOTTOM field we need to add the first line */
 385                 if( i_field == 1 )
 386                 {
 387                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 388                     p_in += p_pic->p[i_plane].i_pitch;
 389                     p_out += p_outpic->p[i_plane].i_pitch;
 390                 }
 391
 392                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 393
 394                 if( i_plane == Y_PLANE )
 395                 {
 396                     for( ; p_out < p_out_end ; )
 397                     {
 398                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 399
 400                         p_out += p_outpic->p[i_plane].i_pitch;
 401
 402                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 403
 404                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 405                         p_out += p_outpic->p[i_plane].i_pitch;
 406                     }
 407                 }
 408                 else
 409                 {
 410                     for( ; p_out < p_out_end ; )
 411                     {
 412                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 413
 414                         p_out += p_outpic->p[i_plane].i_pitch;
 415                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 416                     }
 417                 }
 418
 419                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 420
 421                 /* For TOP field we need to add the last line */
 422                 if( i_field == 0 )
 423                 {
 424                     p_in += p_pic->p[i_plane].i_pitch;
 425                     p_out += p_outpic->p[i_plane].i_pitch;
 426                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 427                 }
 428                 break;
 429         }
 430     }
 431 }
 432
 433 #define Merge p_filter->p_sys->pf_merge
 434 #define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
 435
 436 /*****************************************************************************
 437  * RenderLinear: BOB with linear interpolation
 438  *****************************************************************************/
 439 static void RenderLinear( filter_t *p_filter,
 440                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 441 {
 442     int i_plane;
 443
 444     /* Copy image and skip lines */
 445     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 446     {
 447         uint8_t *p_in, *p_out_end, *p_out;
 448
 449         p_in = p_pic->p[i_plane].p_pixels;
 450         p_out = p_outpic->p[i_plane].p_pixels;
 451         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 452                              * p_outpic->p[i_plane].i_visible_lines;
 453
 454         /* For BOTTOM field we need to add the first line */
 455         if( i_field == 1 )
 456         {
 457             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 458             p_in += p_pic->p[i_plane].i_pitch;
 459             p_out += p_outpic->p[i_plane].i_pitch;
 460         }
 461
 462         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 463
 464         for( ; p_out < p_out_end ; )
 465         {
 466             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 467
 468             p_out += p_outpic->p[i_plane].i_pitch;
 469
 470             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 471                    p_pic->p[i_plane].i_pitch );
 472
 473             p_in += 2 * p_pic->p[i_plane].i_pitch;
 474             p_out += p_outpic->p[i_plane].i_pitch;
 475         }
 476
 477         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 478
 479         /* For TOP field we need to add the last line */
 480         if( i_field == 0 )
 481         {
 482             p_in += p_pic->p[i_plane].i_pitch;
 483             p_out += p_outpic->p[i_plane].i_pitch;
 484             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 485         }
 486     }
 487     EndMerge();
 488 }
 489
 490 static void RenderMean( filter_t *p_filter,
 491                         picture_t *p_outpic, picture_t *p_pic )
 492 {
 493     int i_plane;
 494
 495     /* Copy image and skip lines */
 496     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 497     {
 498         uint8_t *p_in, *p_out_end, *p_out;
 499
 500         p_in = p_pic->p[i_plane].p_pixels;
 501
 502         p_out = p_outpic->p[i_plane].p_pixels;
 503         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 504                              * p_outpic->p[i_plane].i_visible_lines;
 505
 506         /* All lines: mean value */
 507         for( ; p_out < p_out_end ; )
 508         {
 509             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 510                    p_pic->p[i_plane].i_pitch );
 511
 512             p_out += p_outpic->p[i_plane].i_pitch;
 513             p_in += 2 * p_pic->p[i_plane].i_pitch;
 514         }
 515     }
 516     EndMerge();
 517 }
 518
 519 static void RenderBlend( filter_t *p_filter,
 520                          picture_t *p_outpic, picture_t *p_pic )
 521 {
 522     int i_plane;
 523
 524     /* Copy image and skip lines */
 525     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 526     {
 527         uint8_t *p_in, *p_out_end, *p_out;
 528
 529         p_in = p_pic->p[i_plane].p_pixels;
 530
 531         p_out = p_outpic->p[i_plane].p_pixels;
 532         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 533                              * p_outpic->p[i_plane].i_visible_lines;
 534
 535         switch( p_filter->fmt_in.video.i_chroma )
 536         {
 537             case VLC_CODEC_I420:
 538             case VLC_CODEC_J420:
 539             case VLC_CODEC_YV12:
 540                 /* First line: simple copy */
 541                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 542                 p_out += p_outpic->p[i_plane].i_pitch;
 543
 544                 /* Remaining lines: mean value */
 545                 for( ; p_out < p_out_end ; )
 546                 {
 547                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 548                            p_pic->p[i_plane].i_pitch );
 549
 550                     p_out += p_outpic->p[i_plane].i_pitch;
 551                     p_in += p_pic->p[i_plane].i_pitch;
 552                 }
 553                 break;
 554
 555             case VLC_CODEC_I422:
 556             case VLC_CODEC_J422:
 557                 /* First line: simple copy */
 558                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 559                 p_out += p_outpic->p[i_plane].i_pitch;
 560
 561                 /* Remaining lines: mean value */
 562                 if( i_plane == Y_PLANE )
 563                 {
 564                     for( ; p_out < p_out_end ; )
 565                     {
 566                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 567                                p_pic->p[i_plane].i_pitch );
 568
 569                         p_out += p_outpic->p[i_plane].i_pitch;
 570                         p_in += p_pic->p[i_plane].i_pitch;
 571                     }
 572                 }
 573
 574                 else
 575                 {
 576                     for( ; p_out < p_out_end ; )
 577                     {
 578                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 579                                p_pic->p[i_plane].i_pitch );
 580
 581                         p_out += p_outpic->p[i_plane].i_pitch;
 582                         p_in += 2*p_pic->p[i_plane].i_pitch;
 583                     }
 584                 }
 585                 break;
 586         }
 587     }
 588     EndMerge();
 589 }
 590
 591 #undef Merge
 592
 593 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 594                           const void *_p_s2, size_t i_bytes )
 595 {
 596     uint8_t* p_dest = (uint8_t*)_p_dest;
 597     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 598     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 599     uint8_t* p_end = p_dest + i_bytes - 8;
 600
 601     while( p_dest < p_end )
 602     {
 603         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 604         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 605         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 606         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 607         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 608         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 609         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 610         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 611     }
 612
 613     p_end += 8;
 614
 615     while( p_dest < p_end )
 616     {
 617         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 618     }
 619 }
 620
 621 #if defined(CAN_COMPILE_MMXEXT)
 622 static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
 623                          size_t i_bytes )
 624 {
 625     uint8_t* p_dest = (uint8_t*)_p_dest;
 626     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 627     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 628     uint8_t* p_end = p_dest + i_bytes - 8;
 629     while( p_dest < p_end )
 630     {
 631         __asm__  __volatile__( "movq %2,%%mm1;"
 632                                "pavgb %1, %%mm1;"
 633                                "movq %%mm1, %0" :"=m" (*p_dest):
 634                                                  "m" (*p_s1),
 635                                                  "m" (*p_s2) );
 636         p_dest += 8;
 637         p_s1 += 8;
 638         p_s2 += 8;
 639     }
 640
 641     p_end += 8;
 642
 643     while( p_dest < p_end )
 644     {
 645         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 646     }
 647 }
 648 #endif
 649
 650 #if defined(CAN_COMPILE_3DNOW)
 651 static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
 652                         size_t i_bytes )
 653 {
 654     uint8_t* p_dest = (uint8_t*)_p_dest;
 655     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 656     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 657     uint8_t* p_end = p_dest + i_bytes - 8;
 658     while( p_dest < p_end )
 659     {
 660         __asm__  __volatile__( "movq %2,%%mm1;"
 661                                "pavgusb %1, %%mm1;"
 662                                "movq %%mm1, %0" :"=m" (*p_dest):
 663                                                  "m" (*p_s1),
 664                                                  "m" (*p_s2) );
 665         p_dest += 8;
 666         p_s1 += 8;
 667         p_s2 += 8;
 668     }
 669
 670     p_end += 8;
 671
 672     while( p_dest < p_end )
 673     {
 674         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 675     }
 676 }
 677 #endif
 678
 679 #if defined(CAN_COMPILE_SSE)
 680 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 681                        size_t i_bytes )
 682 {
 683     uint8_t* p_dest = (uint8_t*)_p_dest;
 684     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 685     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 686     uint8_t* p_end;
 687     while( (uintptr_t)p_s1 % 16 )
 688     {
 689         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 690     }
 691     p_end = p_dest + i_bytes - 16;
 692     while( p_dest < p_end )
 693     {
 694         __asm__  __volatile__( "movdqu %2,%%xmm1;"
 695                                "pavgb %1, %%xmm1;"
 696                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
 697                                                  "m" (*p_s1),
 698                                                  "m" (*p_s2) );
 699         p_dest += 16;
 700         p_s1 += 16;
 701         p_s2 += 16;
 702     }
 703
 704     p_end += 16;
 705
 706     while( p_dest < p_end )
 707     {
 708         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 709     }
 710 }
 711 #endif
 712
 713 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
 714 static void EndMMX( void )
 715 {
 716     __asm__ __volatile__( "emms" :: );
 717 }
 718 #endif
 719
 720 #if defined(CAN_COMPILE_3DNOW)
 721 static void End3DNow( void )
 722 {
 723     __asm__ __volatile__( "femms" :: );
 724 }
 725 #endif
 726
 727 #ifdef CAN_COMPILE_C_ALTIVEC
 728 static void MergeAltivec( void *_p_dest, const void *_p_s1,
 729                           const void *_p_s2, size_t i_bytes )
 730 {
 731     uint8_t *p_dest = (uint8_t *)_p_dest;
 732     uint8_t *p_s1   = (uint8_t *)_p_s1;
 733     uint8_t *p_s2   = (uint8_t *)_p_s2;
 734     uint8_t *p_end  = p_dest + i_bytes - 15;
 735
 736     /* Use C until the first 16-bytes aligned destination pixel */
 737     while( (uintptr_t)p_dest & 0xF )
 738     {
 739         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 740     }
 741
 742     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
 743     {
 744         /* Unaligned source */
 745         vector unsigned char s1v, s2v, destv;
 746         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
 747         vector unsigned char perm1v, perm2v;
 748
 749         perm1v = vec_lvsl( 0, p_s1 );
 750         perm2v = vec_lvsl( 0, p_s2 );
 751         s1oldv = vec_ld( 0, p_s1 );
 752         s2oldv = vec_ld( 0, p_s2 );
 753
 754         while( p_dest < p_end )
 755         {
 756             s1newv = vec_ld( 16, p_s1 );
 757             s2newv = vec_ld( 16, p_s2 );
 758             s1v    = vec_perm( s1oldv, s1newv, perm1v );
 759             s2v    = vec_perm( s2oldv, s2newv, perm2v );
 760             s1oldv = s1newv;
 761             s2oldv = s2newv;
 762             destv  = vec_avg( s1v, s2v );
 763             vec_st( destv, 0, p_dest );
 764
 765             p_s1   += 16;
 766             p_s2   += 16;
 767             p_dest += 16;
 768         }
 769     }
 770     else
 771     {
 772         /* Aligned source */
 773         vector unsigned char s1v, s2v, destv;
 774
 775         while( p_dest < p_end )
 776         {
 777             s1v   = vec_ld( 0, p_s1 );
 778             s2v   = vec_ld( 0, p_s2 );
 779             destv = vec_avg( s1v, s2v );
 780             vec_st( destv, 0, p_dest );
 781
 782             p_s1   += 16;
 783             p_s2   += 16;
 784             p_dest += 16;
 785         }
 786     }
 787
 788     p_end += 15;
 789
 790     while( p_dest < p_end )
 791     {
 792         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 793     }
 794 }
 795 #endif
 796
 797 #ifdef __ARM_NEON__
 798 static void MergeNEON (void *restrict out, const void *in1,
 799                        const void *in2, size_t n)
 800 {
 801     uint8_t *outp = out;
 802     const uint8_t *in1p = in1;
 803     const uint8_t *in2p = in2;
 804     size_t mis = ((uintptr_t)outp) & 15;
 805
 806     if (mis)
 807     {
 808         MergeGeneric (outp, in1p, in2p, mis);
 809         outp += mis;
 810         in1p += mis;
 811         in2p += mis;
 812         n -= mis;
 813     }
 814
 815     uint8_t *end = outp + (n & ~15);
 816
 817     if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
 818         while (outp < end)
 819             asm volatile (
 820                 "vld1.u8  {q0-q1}, [%[in1]]!\n"
 821                 "vld1.u8  {q2-q3}, [%[in2]]!\n"
 822                 "vhadd.u8 q4, q0, q2\n"
 823                 "vld1.u8  {q6-q7}, [%[in1]]!\n"
 824                 "vhadd.u8 q5, q1, q3\n"
 825                 "vld1.u8  {q8-q9}, [%[in2]]!\n"
 826                 "vhadd.u8 q10, q6, q8\n"
 827                 "vhadd.u8 q11, q7, q9\n"
 828                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 829                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 830                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 831                 :
 832                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 833                   "q8", "q9", "q10", "q11", "memory");
 834     else
 835          while (outp < end)
 836             asm volatile (
 837                 "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
 838                 "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
 839                 "vhadd.u8 q4, q0, q2\n"
 840                 "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
 841                 "vhadd.u8 q5, q1, q3\n"
 842                 "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
 843                 "vhadd.u8 q10, q6, q8\n"
 844                 "vhadd.u8 q11, q7, q9\n"
 845                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 846                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 847                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 848                 :
 849                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 850                   "q8", "q9", "q10", "q11", "memory");
 851     n &= 15;
 852     if (n)
 853         MergeGeneric (outp, in1p, in2p, n);
 854 }
 855 #endif
 856
/*****************************************************************************
 * RenderX: This algorithm works on an 8x8 block basis; it copies the top
 * field and applies a process to recreate the bottom field:
 *  If an 8x8 block is classified as:
 *   - progressive: it applies a small blend (1,6,1)
 *   - interlaced:
 *    * in the MMX version: we do a ME between the 2 fields, if there is a
 *    good match we use MC to recreate the bottom field (with a small
 *    blend (1,6,1) )
 *    * otherwise: it recreates the bottom field by an edge oriented
 *    interpolation.
 *****************************************************************************/
 869
/* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
 * XXX: It needs access to an 8x10 area.
 * We use more than 8 lines to help with scrolling (text)
 * (and because XDeint8x8Frame uses line 9)
 * XXX: detection in smooth/uniform areas with noise doesn't work well,
 * but it's not really a problem because they don't have much detail anyway
 */
 877 static inline int ssd( int a ) { return a*a; }
 878 static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 879 {
 880     int y, x;
 881     int ff, fr;
 882     int fc;
 883
 884     /* Detect interlacing */
 885     fc = 0;
 886     for( y = 0; y < 7; y += 2 )
 887     {
 888         ff = fr = 0;
 889         for( x = 0; x < 8; x++ )
 890         {
 891             fr += ssd(src[      x] - src[1*i_src+x]) +
 892                   ssd(src[i_src+x] - src[2*i_src+x]);
 893             ff += ssd(src[      x] - src[2*i_src+x]) +
 894                   ssd(src[i_src+x] - src[3*i_src+x]);
 895         }
 896         if( ff < 6*fr/8 && fr > 32 )
 897             fc++;
 898
 899         src += 2*i_src;
 900     }
 901
 902     return fc < 1 ? false : true;
 903 }
 904 #ifdef CAN_COMPILE_MMXEXT
 905 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
 906 {
 907
 908     int y, x;
 909     int32_t ff, fr;
 910     int fc;
 911
 912     /* Detect interlacing */
 913     fc = 0;
 914     pxor_r2r( mm7, mm7 );
 915     for( y = 0; y < 9; y += 2 )
 916     {
 917         ff = fr = 0;
 918         pxor_r2r( mm5, mm5 );
 919         pxor_r2r( mm6, mm6 );
 920         for( x = 0; x < 8; x+=4 )
 921         {
 922             movd_m2r( src[        x], mm0 );
 923             movd_m2r( src[1*i_src+x], mm1 );
 924             movd_m2r( src[2*i_src+x], mm2 );
 925             movd_m2r( src[3*i_src+x], mm3 );
 926
 927             punpcklbw_r2r( mm7, mm0 );
 928             punpcklbw_r2r( mm7, mm1 );
 929             punpcklbw_r2r( mm7, mm2 );
 930             punpcklbw_r2r( mm7, mm3 );
 931
 932             movq_r2r( mm0, mm4 );
 933
 934             psubw_r2r( mm1, mm0 );
 935             psubw_r2r( mm2, mm4 );
 936
 937             psubw_r2r( mm1, mm2 );
 938             psubw_r2r( mm1, mm3 );
 939
 940             pmaddwd_r2r( mm0, mm0 );
 941             pmaddwd_r2r( mm4, mm4 );
 942             pmaddwd_r2r( mm2, mm2 );
 943             pmaddwd_r2r( mm3, mm3 );
 944             paddd_r2r( mm0, mm2 );
 945             paddd_r2r( mm4, mm3 );
 946             paddd_r2r( mm2, mm5 );
 947             paddd_r2r( mm3, mm6 );
 948         }
 949
 950         movq_r2r( mm5, mm0 );
 951         psrlq_i2r( 32, mm0 );
 952         paddd_r2r( mm0, mm5 );
 953         movd_r2m( mm5, fr );
 954
 955         movq_r2r( mm6, mm0 );
 956         psrlq_i2r( 32, mm0 );
 957         paddd_r2r( mm0, mm6 );
 958         movd_r2m( mm6, ff );
 959
 960         if( ff < 6*fr/8 && fr > 32 )
 961             fc++;
 962
 963         src += 2*i_src;
 964     }
 965     return fc;
 966 }
 967 #endif
 968
 969 static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
 970                                     uint8_t *src1, int i_src1,
 971                                     uint8_t *src2, int i_src2 )
 972 {
 973     int y, x;
 974
 975     /* Progressive */
 976     for( y = 0; y < 8; y += 2 )
 977     {
 978         memcpy( dst, src1, 8 );
 979         dst  += i_dst;
 980
 981         for( x = 0; x < 8; x++ )
 982             dst[x] = (src1[x] + 6*src2[x] + src1[i_src1+x] + 4 ) >> 3;
 983         dst += i_dst;
 984
 985         src1 += i_src1;
 986         src2 += i_src2;
 987     }
 988 }
 989
 990 #ifdef CAN_COMPILE_MMXEXT
 991 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
 992                                          uint8_t *src1, int i_src1,
 993                                          uint8_t *src2, int i_src2 )
 994 {
 995     static const uint64_t m_4 = INT64_C(0x0004000400040004);
 996     int y, x;
 997
 998     /* Progressive */
 999     pxor_r2r( mm7, mm7 );
1000     for( y = 0; y < 8; y += 2 )
1001     {
1002         for( x = 0; x < 8; x +=4 )
1003         {
1004             movd_m2r( src1[x], mm0 );
1005             movd_r2m( mm0, dst[x] );
1006
1007             movd_m2r( src2[x], mm1 );
1008             movd_m2r( src1[i_src1+x], mm2 );
1009
1010             punpcklbw_r2r( mm7, mm0 );
1011             punpcklbw_r2r( mm7, mm1 );
1012             punpcklbw_r2r( mm7, mm2 );
1013             paddw_r2r( mm1, mm1 );
1014             movq_r2r( mm1, mm3 );
1015             paddw_r2r( mm3, mm3 );
1016             paddw_r2r( mm2, mm0 );
1017             paddw_r2r( mm3, mm1 );
1018             paddw_m2r( m_4, mm1 );
1019             paddw_r2r( mm1, mm0 );
1020             psraw_i2r( 3, mm0 );
1021             packuswb_r2r( mm7, mm0 );
1022             movd_r2m( mm0, dst[i_dst+x] );
1023         }
1024         dst += 2*i_dst;
1025         src1 += i_src1;
1026         src2 += i_src2;
1027     }
1028 }
1029
1030 #endif
1031
/* Debug helper: paint a whole 8x8 block with the value v.
 * dst is the top-left pixel of the block, i_dst the pitch of the plane. */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_line = 8;

    while( i_line-- > 0 )
        memset( dst + i_line * i_dst, v, 8 );
}
1039
/* XDeint8x8FieldE: plain (1,0,1) deinterlacing, for the blocks that miss a
 * horizontal neighbour.
 * Every other source line is copied; each line in between is the truncated
 * average of the kept lines above and below it. It reads 8x9 pixels.
 * TODO: a better one for the inner part.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    /* Interlaced */
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = src + 2 * i_pair * i_src;
        const uint8_t *p_below = p_above + 2 * i_src;
        uint8_t *p_out = dst + 2 * i_pair * i_dst;

        memcpy( p_out, p_above, 8 );
        p_out += i_dst;

        for( int col = 0; col < 8; col++ )
            p_out[col] = ( p_above[col] + p_below[col] ) >> 1;
    }
}
1062 #ifdef CAN_COMPILE_MMXEXT
1063 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1064                                           uint8_t *src, int i_src )
1065 {
1066     int y;
1067
1068     /* Interlaced */
1069     for( y = 0; y < 8; y += 2 )
1070     {
1071         movq_m2r( src[0], mm0 );
1072         movq_r2m( mm0, dst[0] );
1073         dst += i_dst;
1074
1075         movq_m2r( src[2*i_src], mm1 );
1076         pavgb_r2r( mm1, mm0 );
1077
1078         movq_r2m( mm0, dst[0] );
1079
1080         dst += 1*i_dst;
1081         src += 2*i_src;
1082     }
1083 }
1084 #endif
1085
/* XDeint8x8Field: edge oriented interpolation.
 * Every other source line is copied. Each pixel of the lines in between is
 * the average of one pixel of the kept line above and one of the kept line
 * below, taken along the direction (diagonal one way, diagonal the other
 * way, or vertical) selected by three 8 pixel wide sums of absolute
 * differences.
 * (Needs -4 and +5 pixels horizontally, +1 line)
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    /* Interlaced */
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = src;
        const uint8_t *p_below = src + 2 * i_src;

        memcpy( dst, p_above, 8 );
        dst += i_dst;

        for( int col = 0; col < 8; col++ )
        {
            /* 8 pixels are used just to match the MMX version, 5 would be
             * enough (less isn't good) */
            int i_cost_fwd = 0;   /* lower line shifted by +2 */
            int i_cost_mid = 0;   /* no shift */
            int i_cost_bwd = 0;   /* lower line shifted by -2 */

            for( int k = 0; k < 8; k++ )
            {
                i_cost_fwd += abs( p_above[col - 4 + k] - p_below[col - 2 + k] );
                i_cost_mid += abs( p_above[col - 3 + k] - p_below[col - 3 + k] );
                i_cost_bwd += abs( p_above[col - 2 + k] - p_below[col - 4 + k] );
            }

            /* Column offsets of the two pixels to average */
            int i_off_above = 0;
            int i_off_below = 0;
            if( i_cost_fwd < i_cost_mid && i_cost_mid <= i_cost_bwd )
            {
                i_off_above = -1;
                i_off_below = +1;
            }
            else if( i_cost_bwd < i_cost_mid && i_cost_mid <= i_cost_fwd )
            {
                i_off_above = +1;
                i_off_below = -1;
            }

            dst[col] = ( p_above[col + i_off_above] +
                         p_below[col + i_off_below] ) >> 1;
        }

        dst += i_dst;
        src += 2 * i_src;
    }
}
1132 #ifdef CAN_COMPILE_MMXEXT
1133 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1134                                          uint8_t *src, int i_src )
1135 {
1136     int y, x;
1137
1138     /* Interlaced */
1139     for( y = 0; y < 8; y += 2 )
1140     {
1141         memcpy( dst, src, 8 );
1142         dst += i_dst;
1143
1144         for( x = 0; x < 8; x++ )
1145         {
1146             uint8_t *src2 = &src[2*i_src];
1147             int32_t c0, c1, c2;
1148
1149             movq_m2r( src[x-2], mm0 );
1150             movq_m2r( src[x-3], mm1 );
1151             movq_m2r( src[x-4], mm2 );
1152
1153             psadbw_m2r( src2[x-4], mm0 );
1154             psadbw_m2r( src2[x-3], mm1 );
1155             psadbw_m2r( src2[x-2], mm2 );
1156
1157             movd_r2m( mm0, c2 );
1158             movd_r2m( mm1, c1 );
1159             movd_r2m( mm2, c0 );
1160
1161             if( c0 < c1 && c1 <= c2 )
1162                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1163             else if( c2 < c1 && c1 <= c0 )
1164                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1165             else
1166                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1167         }
1168
1169         dst += 1*i_dst;
1170         src += 2*i_src;
1171     }
1172 }
1173 #endif
1174
/* NxN arbitrary size (and then only uses pixels in the NxN block)
 */
/* Tell whether an arbitrarily sized block looks interlaced.
 *
 * Beware of the parameter order: the height comes before the width.
 * The two metrics are accumulated over the whole block (they are not reset
 * between line pairs), and a pair votes "interlaced" when the running
 * same-parity difference is below the running adjacent-line difference.
 *
 * Returns 1 when at least two line pairs vote "interlaced", 0 otherwise.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    int i_frame = 0;    /* squared differences between adjacent lines */
    int i_field = 0;    /* squared differences between lines two apart */
    int i_votes = 0;

    /* Detect interlacing */
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    for( int row = 0; row < i_height - 2; row += 2 )
    {
        const uint8_t *p_l0 = src + row * i_src;
        const uint8_t *p_l1 = p_l0 + i_src;
        const uint8_t *p_l2 = p_l1 + i_src;

        for( int col = 0; col < i_width; col++ )
        {
            i_frame += ssd( p_l0[col] - p_l1[col] );
            i_field += ssd( p_l0[col] - p_l2[col] );
        }

        if( i_frame > i_width / 2 && i_field < i_frame )
            i_votes++;
    }

    return i_votes >= 2;
}
1203
/* Rebuild an arbitrarily sized block classified as progressive.
 *
 * Even lines are copied. Odd lines are a (1,2,1) blend of the source line
 * above, the source line itself and the source line below; the last odd line
 * of the block, which has no line below inside the block, is the average of
 * the two last source lines.
 *
 * When i_height is odd the block ends on a copied line: nothing is read or
 * written past line i_height - 1 (the loop used to produce one extra output
 * line from one extra source line in that case).
 *
 * dst/i_dst: output block and its pitch
 * src/i_src: input block and its pitch
 * i_width, i_height: size of the block in pixels
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Progressive */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );

        /* Odd height: the last line has no partner inside the block */
        if( y + 1 >= i_height )
            break;
        dst += i_dst;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1231
/* Rebuild an arbitrarily sized block classified as interlaced.
 *
 * Even lines are copied. Odd lines are the average of the even lines above
 * and below; the last odd line of the block, which has no even line below
 * inside the block, is the average of the two last source lines.
 *
 * When i_height is odd the block ends on a copied line: nothing is read or
 * written past line i_height - 1 (the loop used to produce one extra output
 * line from one extra source line in that case).
 *
 * dst/i_dst: output block and its pitch
 * src/i_src: input block and its pitch
 * i_width, i_height: size of the block in pixels
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );

        /* Odd height: the last line has no partner inside the block */
        if( y + 1 >= i_height )
            break;
        dst += i_dst;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1259
/* Deinterlace an arbitrarily sized block (used for the right and bottom
 * borders that do not fill a whole 8x8 block): classify it, then rebuild it
 * with the field or the frame method accordingly.
 *
 * dst/i_dst: output block and its pitch
 * src/i_src: input block and its pitch
 * i_width, i_height: size of the block in pixels
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect() takes the height BEFORE the width; passing them in
     * the (width, height) order made it scan the wrong area. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1268
1269
/* Return the median (middle value) of three integers. */
static inline int median( int a, int b, int c )
{
    const int lo = ( b < a ) ? b : a;
    const int hi = ( b < a ) ? a : b;

    /* c is either below, above or inside [lo, hi] */
    if( c < lo )
        return lo;
    if( c > hi )
        return hi;
    return c;
}
1285
1286
/* XDeintBand8x8: deinterlace one band of 8 lines (C version).
 *
 * The band is processed as i_mbx blocks of 8x8 pixels followed, when
 * i_modx is not zero, by a narrower block of i_modx x 8 pixels.
 * Blocks detected as interlaced are rebuilt by field interpolation: the
 * first and the last 8x8 blocks use the simple variant, the others the edge
 * oriented one (which reads pixels left and right of the block). Blocks
 * detected as progressive get the (1,6,1) blend.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive: even lines kept, odd lines blended */
            XDeint8x8MergeC( dst, i_dst,
                             &src[0*i_src], 2*i_src,
                             &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced */
        const int b_border = ( i_mb == 0 || i_mb == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    /* Remaining columns on the right */
    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1319 #ifdef CAN_COMPILE_MMXEXT
/* MMXEXT version of XDeintBand8x8C: deinterlace one band of 8 lines.
 *
 * The band is processed as i_mbx blocks of 8x8 pixels with the MMXEXT
 * routines followed, when i_modx is not zero, by a narrower block of
 * i_modx x 8 pixels handled by the C only XDeintNxN().
 * The MMX state is left dirty (no emms() here).
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive: even lines kept, odd lines blended */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  &src[0*i_src], 2*i_src,
                                  &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced: the first and last blocks miss a neighbour */
        const int b_border = ( i_mb == 0 || i_mb == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    /* Remaining columns on the right */
    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1351 #endif
1352
1353 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1354 {
1355     int i_plane;
1356
1357     /* Copy image and skip lines */
1358     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1359     {
1360         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1361         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1362
1363         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1364         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1365
1366         const int i_dst = p_outpic->p[i_plane].i_pitch;
1367         const int i_src = p_pic->p[i_plane].i_pitch;
1368
1369         int y, x;
1370
1371         for( y = 0; y < i_mby; y++ )
1372         {
1373             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1374             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1375
1376 #ifdef CAN_COMPILE_MMXEXT
1377             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1378                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1379             else
1380 #endif
1381                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1382         }
1383
1384         /* Last line (C only)*/
1385         if( i_mody )
1386         {
1387             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1388             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1389
1390             for( x = 0; x < i_mbx; x++ )
1391             {
1392                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1393
1394                 dst += 8;
1395                 src += 8;
1396             }
1397
1398             if( i_modx )
1399                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1400         }
1401     }
1402
1403 #ifdef CAN_COMPILE_MMXEXT
1404     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1405         emms();
1406 #endif
1407 }
1408
1409 /*****************************************************************************
1410  * Yadif (Yet Another DeInterlacing Filter).
1411  *****************************************************************************/
/* Configuration handed to the yadif line filters. */
struct vf_priv_s {
    /*
     * Filter mode:
     *   0: Output 1 frame for each frame.
     *   1: Output 1 frame for each field.
     *   2: Like 0 but skips the spatial interlacing check.
     *   3: Like 1 but skips the spatial interlacing check.
     *
     * In vlc, only the 0x02 bit has a meaning, as the 0x01 part is handled
     * by the deinterlacer itself.
     */
    int mode;
};
1424
/* Pointer sized integer type.
 * NOTE(review): the original author was unsure intptr_t is the right
 * definition; presumably it is consumed by yadif.h -- confirm. */
typedef intptr_t x86_reg;

/* FFmpeg-style helpers mapped on the vlc ones.
 * Beware: being macros, they may evaluate their arguments more than once. */
#define FFABS(a)        ((a) >= 0 ? (a) : -(a))
#define FFMIN(a,b)      __MIN(a,b)
#define FFMAX(a,b)      __MAX(a,b)
#define FFMIN3(a,b,c)   __MIN(__MIN(a,b),c)
#define FFMAX3(a,b,c)   __MAX(__MAX(a,b),c)
1433
1434 /* yadif.h comes from vf_yadif.c of mplayer project */
1435 #include "yadif.h"
1436
1437 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1438 {
1439     filter_sys_t *p_sys = p_filter->p_sys;
1440
1441     /* */
1442     assert( i_order == 0 || i_order == 1 );
1443     assert( i_field == 0 || i_field == 1 );
1444
1445     if( i_order == 0 )
1446     {
1447         /* Duplicate the picture
1448          * TODO when the vout rework is finished, picture_Hold() might be enough
1449          * but becarefull, the pitches must match */
1450         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1451         if( p_dup )
1452             picture_Copy( p_dup, p_src );
1453
1454         /* Slide the history */
1455         if( p_sys->pp_history[0] )
1456             picture_Release( p_sys->pp_history[0]  );
1457         for( int i = 1; i < HISTORY_SIZE; i++ )
1458             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1459         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1460     }
1461
1462     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1463     picture_t *p_prev = p_sys->pp_history[0];
1464     picture_t *p_cur  = p_sys->pp_history[1];
1465     picture_t *p_next = p_sys->pp_history[2];
1466
1467     /* Filter if we have all the pictures we need */
1468     if( p_prev && p_cur && p_next )
1469     {
1470         /* */
1471         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1472 #if defined(HAVE_YADIF_SSE2)
1473         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1474             filter = yadif_filter_line_mmx2;
1475         else
1476 #endif
1477             filter = yadif_filter_line_c;
1478
1479         for( int n = 0; n < p_dst->i_planes; n++ )
1480         {
1481             const plane_t *prevp = &p_prev->p[n];
1482             const plane_t *curp  = &p_cur->p[n];
1483             const plane_t *nextp = &p_next->p[n];
1484             plane_t *dstp        = &p_dst->p[n];
1485
1486             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1487             {
1488                 if( (y % 2) == i_field )
1489                 {
1490                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1491                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1492                 }
1493                 else
1494                 {
1495                     struct vf_priv_s cfg;
1496                     /* Spatial checks only when enough data */
1497                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1498
1499                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1500                     filter( &cfg,
1501                             &dstp->p_pixels[y * dstp->i_pitch],
1502                             &prevp->p_pixels[y * prevp->i_pitch],
1503                             &curp->p_pixels[y * curp->i_pitch],
1504                             &nextp->p_pixels[y * nextp->i_pitch],
1505                             dstp->i_visible_pitch,
1506                             curp->i_pitch,
1507                             (i_field ^ (i_order == i_field)) & 1 );
1508                 }
1509
1510                 /* We duplicate the first and last lines */
1511                 if( y == 1 )
1512                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1513                 else if( y == dstp->i_visible_lines - 2 )
1514                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1515             }
1516         }
1517
1518         /* */
1519         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1520         return VLC_SUCCESS;
1521     }
1522     else if( !p_prev && !p_cur && p_next )
1523     {
1524         RenderX( p_dst, p_next );
1525         return VLC_SUCCESS;
1526     }
1527     else
1528     {
1529         return VLC_EGENERIC;
1530     }
1531 }
1532
1533 /*****************************************************************************
1534  * video filter2 functions
1535  *****************************************************************************/
1536 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1537 {
1538     filter_sys_t *p_sys = p_filter->p_sys;
1539     picture_t *p_pic_dst;
1540
1541     /* Request output picture */
1542     p_pic_dst = filter_NewPicture( p_filter );
1543     if( p_pic_dst == NULL )
1544     {
1545         picture_Release( p_pic );
1546         return NULL;
1547     }
1548
1549     picture_CopyProperties( p_pic_dst, p_pic );
1550
1551     switch( p_sys->i_mode )
1552     {
1553         case DEINTERLACE_DISCARD:
1554             RenderDiscard( p_filter, p_pic_dst, p_pic, 0 );
1555             break;
1556
1557         case DEINTERLACE_BOB:
1558 #if 0
1559             RenderBob( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1560             RenderBob( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1561             break;
1562 #endif
1563
1564         case DEINTERLACE_LINEAR:
1565 #if 0
1566             RenderLinear( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1567             RenderLinear( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1568 #endif
1569             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1570             goto drop;
1571
1572         case DEINTERLACE_MEAN:
1573             RenderMean( p_filter, p_pic_dst, p_pic );
1574             break;
1575
1576         case DEINTERLACE_BLEND:
1577             RenderBlend( p_filter, p_pic_dst, p_pic );
1578             break;
1579
1580         case DEINTERLACE_X:
1581             RenderX( p_pic_dst, p_pic );
1582             break;
1583
1584         case DEINTERLACE_YADIF:
1585             if( RenderYadif( p_filter, p_pic_dst, p_pic, 0, 0 ) )
1586                 goto drop;
1587             break;
1588
1589         case DEINTERLACE_YADIF2X:
1590             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1591             //RenderYadif( p_vout, pp_outpic[0], p_pic, 0, !p_pic->b_top_field_first );
1592             //RenderYadif( p_vout, pp_outpic[1], p_pic, 1, p_pic->b_top_field_first );
1593             goto drop;
1594     }
1595
1596     p_pic_dst->b_progressive = true;
1597
1598     picture_Release( p_pic );
1599     return p_pic_dst;
1600
1601 drop:
1602     picture_Release( p_pic_dst );
1603     picture_Release( p_pic );
1604     return NULL;
1605 }
1606
1607 static void Flush( filter_t *p_filter )
1608 {
1609     filter_sys_t *p_sys = p_filter->p_sys;
1610
1611     for( int i = 0; i < HISTORY_SIZE; i++ )
1612     {
1613         if( p_sys->pp_history[i] )
1614             picture_Release( p_sys->pp_history[i] );
1615         p_sys->pp_history[i] = NULL;
1616     }
1617 }
1618
1619 static int Mouse( filter_t *p_filter,
1620                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1621 {
1622     *p_mouse = *p_new;
1623     if( p_filter->p_sys->b_half_height )
1624         p_mouse->i_y *= 2;
1625     return VLC_SUCCESS;
1626 }
1627
1628
1629 /*****************************************************************************
1630  * Open
1631  *****************************************************************************/
1632 static int Open( vlc_object_t *p_this )
1633 {
1634     filter_t *p_filter = (filter_t*)p_this;
1635     filter_sys_t *p_sys;
1636
1637     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
1638         return VLC_EGENERIC;
1639
1640     /* */
1641     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
1642     if( !p_sys )
1643         return VLC_ENOMEM;
1644
1645     p_sys->i_mode = DEINTERLACE_BLEND;
1646     p_sys->b_double_rate = false;
1647     p_sys->b_half_height = true;
1648     for( int i = 0; i < HISTORY_SIZE; i++ )
1649         p_sys->pp_history[i] = NULL;
1650
1651 #if defined(CAN_COMPILE_C_ALTIVEC)
1652     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
1653     {
1654         p_sys->pf_merge = MergeAltivec;
1655         p_sys->pf_end_merge = NULL;
1656     }
1657     else
1658 #endif
1659 #if defined(CAN_COMPILE_SSE)
1660     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1661     {
1662         p_sys->pf_merge = MergeSSE2;
1663         p_sys->pf_end_merge = EndMMX;
1664     }
1665     else
1666 #endif
1667 #if defined(CAN_COMPILE_MMXEXT)
1668     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1669     {
1670         p_sys->pf_merge = MergeMMXEXT;
1671         p_sys->pf_end_merge = EndMMX;
1672     }
1673     else
1674 #endif
1675 #if defined(CAN_COMPILE_3DNOW)
1676     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
1677     {
1678         p_sys->pf_merge = Merge3DNow;
1679         p_sys->pf_end_merge = End3DNow;
1680     }
1681     else
1682 #endif
1683 #if defined __ARM_NEON__
1684     if( vlc_CPU() & CPU_CAPABILITY_NEON )
1685     {
1686         p_sys->pf_merge = MergeNEON;
1687         p_sys->pf_end_merge = NULL;
1688     }
1689     else
1690 #endif
1691     {
1692         p_sys->pf_merge = MergeGeneric;
1693         p_sys->pf_end_merge = NULL;
1694     }
1695
1696     /* */
1697     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
1698                        p_filter->p_cfg );
1699
1700     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
1701     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
1702     free( psz_mode );
1703
1704     /* */
1705     video_format_t fmt;
1706     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
1707     if( !p_filter->b_allow_fmt_out_change &&
1708         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
1709           fmt.i_height != p_filter->fmt_in.video.i_height ) )
1710     {
1711         Close( VLC_OBJECT(p_filter) );
1712         return VLC_EGENERIC;
1713     }
1714     p_filter->fmt_out.video = fmt;
1715     p_filter->fmt_out.i_codec = fmt.i_chroma;
1716     p_filter->pf_video_filter = Deinterlace;
1717     p_filter->pf_video_flush  = Flush;
1718     p_filter->pf_video_mouse  = Mouse;
1719
1720     msg_Dbg( p_filter, "deinterlacing" );
1721
1722     return VLC_SUCCESS;
1723 }
1724
1725 /*****************************************************************************
1726  * Close: clean up the filter
1727  *****************************************************************************/
1728 static void Close( vlc_object_t *p_this )
1729 {
1730     filter_t *p_filter = (filter_t*)p_this;
1731
1732     Flush( p_filter );
1733     free( p_filter->p_sys );
1734 }
1735