git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000-2009 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27
  28 #ifdef HAVE_CONFIG_H
  29 # include "config.h"
  30 #endif
  31
  32 #include <assert.h>
  33
  34 #ifdef HAVE_ALTIVEC_H
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include <vlc_common.h>
  39 #include <vlc_plugin.h>
  40 #include <vlc_filter.h>
  41 #include <vlc_cpu.h>
  42
  43 #ifdef CAN_COMPILE_MMXEXT
  44 #   include "mmx.h"
  45 #endif
  46
  47 #define DEINTERLACE_DISCARD 1
  48 #define DEINTERLACE_MEAN    2
  49 #define DEINTERLACE_BLEND   3
  50 #define DEINTERLACE_BOB     4
  51 #define DEINTERLACE_LINEAR  5
  52 #define DEINTERLACE_X       6
  53 #define DEINTERLACE_YADIF   7
  54 #define DEINTERLACE_YADIF2X 8
  55
  56 /*****************************************************************************
  57  * Module descriptor
  58  *****************************************************************************/
/* Module entry points; they are registered below through set_callbacks(). */
static int  Open ( vlc_object_t * );
static void Close( vlc_object_t * );
  61
  62 #define MODE_TEXT N_("Deinterlace mode")
  63 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
  64
  65 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
  66 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
  67
  68 #define FILTER_CFG_PREFIX "sout-deinterlace-"
  69
/* Accepted values for the "mode" option. mode_list holds the option strings
 * and mode_list_text the matching display labels; both arrays are handed
 * together to change_string_list() and must keep the same order and length. */
static const char *const mode_list[] = {
    "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
static const char *const mode_list_text[] = {
    N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
  74
/* Plugin descriptor: names the module, declares its capability and its
 * "sout-deinterlace-mode" string option (default "blend"), and registers
 * Open/Close as the activation callbacks. */
vlc_module_begin ()
    set_description( N_("Deinterlacing video filter") )
    set_shortname( N_("Deinterlace" ))
    /* NOTE(review): set_capability() is invoked a second time below with
     * "video filter2"; presumably only one of the two calls is effective --
     * confirm which capability is intended and drop the other. */
    set_capability( "video filter", 0 )
    set_category( CAT_VIDEO )
    set_subcategory( SUBCAT_VIDEO_VFILTER )

    set_capability( "video filter2", 0 )
    /* Option name is FILTER_CFG_PREFIX "mode", i.e. "sout-deinterlace-mode" */
    add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT,
                SOUT_MODE_LONGTEXT, false )
        change_string_list( mode_list, mode_list_text, 0 )
        change_safe ()
    add_shortcut( "deinterlace" )
    set_callbacks( Open, Close )
vlc_module_end ()
  90
  91
  92 /*****************************************************************************
 * Local prototypes
  94  *****************************************************************************/
/* Per-mode renderers. The trailing int of RenderDiscard, RenderBob and
 * RenderLinear is the field index used by their definitions (0 = top,
 * 1 = bottom). */
static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
static void RenderMean   ( filter_t *, picture_t *, picture_t * );
static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
static void RenderX      ( picture_t *, picture_t * );
static void RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );

/* Line-averaging primitives: (destination, source 1, source 2, byte count).
 * All variants share this signature so that any of them can be stored in
 * filter_sys_t::pf_merge. */
static void MergeGeneric ( void *, const void *, const void *, size_t );
#if defined(CAN_COMPILE_C_ALTIVEC)
static void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_MMXEXT)
static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void Merge3DNow   ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_SSE)
static void MergeSSE2    ( void *, const void *, const void *, size_t );
#endif
/* Clean-up routines matching filter_sys_t::pf_end_merge */
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
static void EndMMX       ( void );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void End3DNow     ( void );
#endif
#if defined __ARM_NEON__
static void MergeNEON (void *, const void *, const void *, size_t);
#endif
 125
/* NULL-terminated list of the option names understood by this filter.
 * NOTE(review): presumably combined with FILTER_CFG_PREFIX when the options
 * are parsed -- confirm against the (not shown) Open() code. */
static const char *const ppsz_filter_options[] = {
    "mode", NULL
};
 129
/* Number of picture slots kept in filter_sys_t::pp_history */
#define HISTORY_SIZE (3)
/* Private state of one deinterlace filter instance */
struct filter_sys_t
{
    int  i_mode;        /* Deinterlace mode (one of the DEINTERLACE_* values) */
    bool b_double_rate; /* Shall we double the framerate? */
    bool b_half_height; /* Shall we divide the height by 2? */

    /* Line averaging routine: (dest, src1, src2, byte count) */
    void (*pf_merge) ( void *, const void *, const void *, size_t );
    /* Optional clean-up to run after a batch of pf_merge calls; the EndMerge
     * macro checks it for NULL before calling it */
    void (*pf_end_merge) ( void );

    /* Yadif */
    picture_t *pp_history[HISTORY_SIZE];
};
 143
 144 /*****************************************************************************
 145  * SetFilterMethod: setup the deinterlace method to use.
 146  *****************************************************************************/
 147 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
 148 {
 149     filter_sys_t *p_sys = p_filter->p_sys;
 150
 151     if( !psz_method )
 152         psz_method = "";
 153
 154     if( !strcmp( psz_method, "mean" ) )
 155     {
 156         p_sys->i_mode = DEINTERLACE_MEAN;
 157         p_sys->b_double_rate = false;
 158         p_sys->b_half_height = true;
 159     }
 160     else if( !strcmp( psz_method, "bob" )
 161              || !strcmp( psz_method, "progressive-scan" ) )
 162     {
 163         p_sys->i_mode = DEINTERLACE_BOB;
 164         p_sys->b_double_rate = true;
 165         p_sys->b_half_height = false;
 166     }
 167     else if( !strcmp( psz_method, "linear" ) )
 168     {
 169         p_sys->i_mode = DEINTERLACE_LINEAR;
 170         p_sys->b_double_rate = true;
 171         p_sys->b_half_height = false;
 172     }
 173     else if( !strcmp( psz_method, "x" ) )
 174     {
 175         p_sys->i_mode = DEINTERLACE_X;
 176         p_sys->b_double_rate = false;
 177         p_sys->b_half_height = false;
 178     }
 179     else if( !strcmp( psz_method, "yadif" ) )
 180     {
 181         p_sys->i_mode = DEINTERLACE_YADIF;
 182         p_sys->b_double_rate = false;
 183         p_sys->b_half_height = false;
 184     }
 185     else if( !strcmp( psz_method, "yadif2x" ) )
 186     {
 187         p_sys->i_mode = DEINTERLACE_YADIF2X;
 188         p_sys->b_double_rate = true;
 189         p_sys->b_half_height = false;
 190     }
 191     else if( !strcmp( psz_method, "discard" ) )
 192     {
 193         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
 194                             i_chroma == VLC_CODEC_J422;
 195
 196         p_sys->i_mode = DEINTERLACE_DISCARD;
 197         p_sys->b_double_rate = false;
 198         p_sys->b_half_height = !b_i422;
 199     }
 200     else
 201     {
 202         if( strcmp( psz_method, "blend" ) )
 203             msg_Err( p_filter,
 204                      "no valid deinterlace mode provided, using \"blend\"" );
 205
 206         p_sys->i_mode = DEINTERLACE_BLEND;
 207         p_sys->b_double_rate = false;
 208         p_sys->b_half_height = false;
 209     }
 210
 211     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
 212 }
 213
 214 static void GetOutputFormat( filter_t *p_filter,
 215                              video_format_t *p_dst, const video_format_t *p_src )
 216 {
 217     filter_sys_t *p_sys = p_filter->p_sys;
 218     *p_dst = *p_src;
 219
 220     if( p_sys->b_half_height )
 221     {
 222         p_dst->i_height /= 2;
 223         p_dst->i_visible_height /= 2;
 224         p_dst->i_y_offset /= 2;
 225         p_dst->i_sar_den *= 2;
 226     }
 227
 228     if( p_src->i_chroma == VLC_CODEC_I422 ||
 229         p_src->i_chroma == VLC_CODEC_J422 )
 230     {
 231         switch( p_sys->i_mode )
 232         {
 233         case DEINTERLACE_MEAN:
 234         case DEINTERLACE_LINEAR:
 235         case DEINTERLACE_X:
 236         case DEINTERLACE_YADIF:
 237         case DEINTERLACE_YADIF2X:
 238             p_dst->i_chroma = p_src->i_chroma;
 239             break;
 240         default:
 241             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
 242                                                                   VLC_CODEC_J420;
 243             break;
 244         }
 245     }
 246 }
 247
 248 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
 249 {
 250     return i_chroma == VLC_CODEC_I420 ||
 251            i_chroma == VLC_CODEC_J420 ||
 252            i_chroma == VLC_CODEC_YV12 ||
 253            i_chroma == VLC_CODEC_I422 ||
 254            i_chroma == VLC_CODEC_J422;
 255 }
 256
 257 /*****************************************************************************
 258  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 259  *****************************************************************************/
 260 static void RenderDiscard( filter_t *p_filter,
 261                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 262 {
 263     int i_plane;
 264
 265     /* Copy image and skip lines */
 266     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 267     {
 268         uint8_t *p_in, *p_out_end, *p_out;
 269         int i_increment;
 270
 271         p_in = p_pic->p[i_plane].p_pixels
 272                    + i_field * p_pic->p[i_plane].i_pitch;
 273
 274         p_out = p_outpic->p[i_plane].p_pixels;
 275         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 276                              * p_outpic->p[i_plane].i_visible_lines;
 277
 278         switch( p_filter->fmt_in.video.i_chroma )
 279         {
 280         case VLC_CODEC_I420:
 281         case VLC_CODEC_J420:
 282         case VLC_CODEC_YV12:
 283
 284             for( ; p_out < p_out_end ; )
 285             {
 286                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 287
 288                 p_out += p_outpic->p[i_plane].i_pitch;
 289                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 290             }
 291             break;
 292
 293         case VLC_CODEC_I422:
 294         case VLC_CODEC_J422:
 295
 296             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 297
 298             if( i_plane == Y_PLANE )
 299             {
 300                 for( ; p_out < p_out_end ; )
 301                 {
 302                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 303                     p_out += p_outpic->p[i_plane].i_pitch;
 304                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 305                     p_out += p_outpic->p[i_plane].i_pitch;
 306                     p_in += i_increment;
 307                 }
 308             }
 309             else
 310             {
 311                 for( ; p_out < p_out_end ; )
 312                 {
 313                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 314                     p_out += p_outpic->p[i_plane].i_pitch;
 315                     p_in += i_increment;
 316                 }
 317             }
 318             break;
 319
 320         default:
 321             break;
 322         }
 323     }
 324 }
 325
 326 /*****************************************************************************
 327  * RenderBob: renders a BOB picture - simple copy
 328  *****************************************************************************/
 329 static void RenderBob( filter_t *p_filter,
 330                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 331 {
 332     int i_plane;
 333
 334     /* Copy image and skip lines */
 335     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 336     {
 337         uint8_t *p_in, *p_out_end, *p_out;
 338
 339         p_in = p_pic->p[i_plane].p_pixels;
 340         p_out = p_outpic->p[i_plane].p_pixels;
 341         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 342                              * p_outpic->p[i_plane].i_visible_lines;
 343
 344         switch( p_filter->fmt_in.video.i_chroma )
 345         {
 346             case VLC_CODEC_I420:
 347             case VLC_CODEC_J420:
 348             case VLC_CODEC_YV12:
 349                 /* For BOTTOM field we need to add the first line */
 350                 if( i_field == 1 )
 351                 {
 352                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 353                     p_in += p_pic->p[i_plane].i_pitch;
 354                     p_out += p_outpic->p[i_plane].i_pitch;
 355                 }
 356
 357                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 358
 359                 for( ; p_out < p_out_end ; )
 360                 {
 361                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 362
 363                     p_out += p_outpic->p[i_plane].i_pitch;
 364
 365                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 366
 367                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 368                     p_out += p_outpic->p[i_plane].i_pitch;
 369                 }
 370
 371                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 372
 373                 /* For TOP field we need to add the last line */
 374                 if( i_field == 0 )
 375                 {
 376                     p_in += p_pic->p[i_plane].i_pitch;
 377                     p_out += p_outpic->p[i_plane].i_pitch;
 378                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 379                 }
 380                 break;
 381
 382             case VLC_CODEC_I422:
 383             case VLC_CODEC_J422:
 384                 /* For BOTTOM field we need to add the first line */
 385                 if( i_field == 1 )
 386                 {
 387                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 388                     p_in += p_pic->p[i_plane].i_pitch;
 389                     p_out += p_outpic->p[i_plane].i_pitch;
 390                 }
 391
 392                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 393
 394                 if( i_plane == Y_PLANE )
 395                 {
 396                     for( ; p_out < p_out_end ; )
 397                     {
 398                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 399
 400                         p_out += p_outpic->p[i_plane].i_pitch;
 401
 402                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 403
 404                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 405                         p_out += p_outpic->p[i_plane].i_pitch;
 406                     }
 407                 }
 408                 else
 409                 {
 410                     for( ; p_out < p_out_end ; )
 411                     {
 412                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 413
 414                         p_out += p_outpic->p[i_plane].i_pitch;
 415                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 416                     }
 417                 }
 418
 419                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 420
 421                 /* For TOP field we need to add the last line */
 422                 if( i_field == 0 )
 423                 {
 424                     p_in += p_pic->p[i_plane].i_pitch;
 425                     p_out += p_outpic->p[i_plane].i_pitch;
 426                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 427                 }
 428                 break;
 429         }
 430     }
 431 }
 432
/* Merge: shorthand for the merge routine stored in the filter state. It
 * relies on a variable named p_filter being in scope at the point of use. */
#define Merge p_filter->p_sys->pf_merge
/* EndMerge: call the optional post-merge hook when one is set; used as
 * "EndMerge();". The expansion is a bare if statement, so it must not be
 * placed directly before an else or inside an unbraced if/else. */
#define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
 435
 436 /*****************************************************************************
 437  * RenderLinear: BOB with linear interpolation
 438  *****************************************************************************/
 439 static void RenderLinear( filter_t *p_filter,
 440                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 441 {
 442     int i_plane;
 443
 444     /* Copy image and skip lines */
 445     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 446     {
 447         uint8_t *p_in, *p_out_end, *p_out;
 448
 449         p_in = p_pic->p[i_plane].p_pixels;
 450         p_out = p_outpic->p[i_plane].p_pixels;
 451         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 452                              * p_outpic->p[i_plane].i_visible_lines;
 453
 454         /* For BOTTOM field we need to add the first line */
 455         if( i_field == 1 )
 456         {
 457             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 458             p_in += p_pic->p[i_plane].i_pitch;
 459             p_out += p_outpic->p[i_plane].i_pitch;
 460         }
 461
 462         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 463
 464         for( ; p_out < p_out_end ; )
 465         {
 466             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 467
 468             p_out += p_outpic->p[i_plane].i_pitch;
 469
 470             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 471                    p_pic->p[i_plane].i_pitch );
 472
 473             p_in += 2 * p_pic->p[i_plane].i_pitch;
 474             p_out += p_outpic->p[i_plane].i_pitch;
 475         }
 476
 477         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 478
 479         /* For TOP field we need to add the last line */
 480         if( i_field == 0 )
 481         {
 482             p_in += p_pic->p[i_plane].i_pitch;
 483             p_out += p_outpic->p[i_plane].i_pitch;
 484             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 485         }
 486     }
 487     EndMerge();
 488 }
 489
 490 static void RenderMean( filter_t *p_filter,
 491                         picture_t *p_outpic, picture_t *p_pic )
 492 {
 493     int i_plane;
 494
 495     /* Copy image and skip lines */
 496     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 497     {
 498         uint8_t *p_in, *p_out_end, *p_out;
 499
 500         p_in = p_pic->p[i_plane].p_pixels;
 501
 502         p_out = p_outpic->p[i_plane].p_pixels;
 503         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 504                              * p_outpic->p[i_plane].i_visible_lines;
 505
 506         /* All lines: mean value */
 507         for( ; p_out < p_out_end ; )
 508         {
 509             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 510                    p_pic->p[i_plane].i_pitch );
 511
 512             p_out += p_outpic->p[i_plane].i_pitch;
 513             p_in += 2 * p_pic->p[i_plane].i_pitch;
 514         }
 515     }
 516     EndMerge();
 517 }
 518
 519 static void RenderBlend( filter_t *p_filter,
 520                          picture_t *p_outpic, picture_t *p_pic )
 521 {
 522     int i_plane;
 523
 524     /* Copy image and skip lines */
 525     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 526     {
 527         uint8_t *p_in, *p_out_end, *p_out;
 528
 529         p_in = p_pic->p[i_plane].p_pixels;
 530
 531         p_out = p_outpic->p[i_plane].p_pixels;
 532         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 533                              * p_outpic->p[i_plane].i_visible_lines;
 534
 535         switch( p_filter->fmt_in.video.i_chroma )
 536         {
 537             case VLC_CODEC_I420:
 538             case VLC_CODEC_J420:
 539             case VLC_CODEC_YV12:
 540                 /* First line: simple copy */
 541                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 542                 p_out += p_outpic->p[i_plane].i_pitch;
 543
 544                 /* Remaining lines: mean value */
 545                 for( ; p_out < p_out_end ; )
 546                 {
 547                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 548                            p_pic->p[i_plane].i_pitch );
 549
 550                     p_out += p_outpic->p[i_plane].i_pitch;
 551                     p_in += p_pic->p[i_plane].i_pitch;
 552                 }
 553                 break;
 554
 555             case VLC_CODEC_I422:
 556             case VLC_CODEC_J422:
 557                 /* First line: simple copy */
 558                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 559                 p_out += p_outpic->p[i_plane].i_pitch;
 560
 561                 /* Remaining lines: mean value */
 562                 if( i_plane == Y_PLANE )
 563                 {
 564                     for( ; p_out < p_out_end ; )
 565                     {
 566                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 567                                p_pic->p[i_plane].i_pitch );
 568
 569                         p_out += p_outpic->p[i_plane].i_pitch;
 570                         p_in += p_pic->p[i_plane].i_pitch;
 571                     }
 572                 }
 573
 574                 else
 575                 {
 576                     for( ; p_out < p_out_end ; )
 577                     {
 578                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 579                                p_pic->p[i_plane].i_pitch );
 580
 581                         p_out += p_outpic->p[i_plane].i_pitch;
 582                         p_in += 2*p_pic->p[i_plane].i_pitch;
 583                     }
 584                 }
 585                 break;
 586         }
 587     }
 588     EndMerge();
 589 }
 590
 591 #undef Merge
 592
 593 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 594                           const void *_p_s2, size_t i_bytes )
 595 {
 596     uint8_t* p_dest = (uint8_t*)_p_dest;
 597     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 598     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 599     uint8_t* p_end = p_dest + i_bytes - 8;
 600
 601     while( p_dest < p_end )
 602     {
 603         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 604         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 605         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 606         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 607         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 608         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 609         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 610         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 611     }
 612
 613     p_end += 8;
 614
 615     while( p_dest < p_end )
 616     {
 617         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 618     }
 619 }
 620
 621 #if defined(CAN_COMPILE_MMXEXT)
/* MergeMMXEXT: average two byte buffers, 8 bytes at a time with pavgb.
 *
 * _p_dest receives the i_bytes averaged bytes of _p_s1 and _p_s2. The
 * vector loop runs while more than 8 bytes remain; the rest is done with
 * the scalar expression.
 * NOTE(review): p_end is formed as p_dest + i_bytes - 8, which assumes
 * i_bytes >= 8 -- confirm that callers never pass less.
 * NOTE(review): the asm statement uses mm1 without listing it as a clobber,
 * and the MMX state is presumably reset later through EndMMX -- confirm. */
static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes )
{
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t* p_end = p_dest + i_bytes - 8;
    while( p_dest < p_end )
    {
        /* mm1 = s2[0..7]; mm1 = avg(mm1, s1[0..7]); dest[0..7] = mm1 */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Restore the real end of the buffer for the scalar tail */
    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
 648 #endif
 649
 650 #if defined(CAN_COMPILE_3DNOW)
/* Merge3DNow: average two byte buffers, 8 bytes at a time with pavgusb.
 *
 * Same structure as the MMXEXT variant: the vector loop runs while more
 * than 8 bytes remain, then a scalar loop finishes the buffer.
 * NOTE(review): p_end is formed as p_dest + i_bytes - 8, which assumes
 * i_bytes >= 8 -- confirm that callers never pass less.
 * NOTE(review): the asm statement uses mm1 without listing it as a clobber,
 * and the MMX state is presumably reset later through End3DNow -- confirm. */
static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                        size_t i_bytes )
{
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t* p_end = p_dest + i_bytes - 8;
    while( p_dest < p_end )
    {
        /* mm1 = s2[0..7]; mm1 = avg(mm1, s1[0..7]); dest[0..7] = mm1 */
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Restore the real end of the buffer for the scalar tail */
    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
 677 #endif
 678
 679 #if defined(CAN_COMPILE_SSE)
 680 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 681                        size_t i_bytes )
 682 {
 683     uint8_t* p_dest = (uint8_t*)_p_dest;
 684     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 685     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 686     uint8_t* p_end;
 687     while( (uintptr_t)p_s1 % 16 )
 688     {
 689         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 690     }
 691     p_end = p_dest + i_bytes - 16;
 692     while( p_dest < p_end )
 693     {
 694         __asm__  __volatile__( "movdqu %2,%%xmm1;"
 695                                "pavgb %1, %%xmm1;"
 696                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
 697                                                  "m" (*p_s1),
 698                                                  "m" (*p_s2) );
 699         p_dest += 16;
 700         p_s1 += 16;
 701         p_s2 += 16;
 702     }
 703
 704     p_end += 16;
 705
 706     while( p_dest < p_end )
 707     {
 708         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 709     }
 710 }
 711 #endif
 712
 713 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/* EndMMX: execute the emms instruction. It has the signature expected by
 * filter_sys_t::pf_end_merge. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
 718 #endif
 719
 720 #if defined(CAN_COMPILE_3DNOW)
/* End3DNow: execute the femms instruction. It has the signature expected by
 * filter_sys_t::pf_end_merge. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
 725 #endif
 726
 727 #ifdef CAN_COMPILE_C_ALTIVEC
 728 static void MergeAltivec( void *_p_dest, const void *_p_s1,
 729                           const void *_p_s2, size_t i_bytes )
 730 {
 731     uint8_t *p_dest = (uint8_t *)_p_dest;
 732     uint8_t *p_s1   = (uint8_t *)_p_s1;
 733     uint8_t *p_s2   = (uint8_t *)_p_s2;
 734     uint8_t *p_end  = p_dest + i_bytes - 15;
 735
 736     /* Use C until the first 16-bytes aligned destination pixel */
 737     while( (uintptr_t)p_dest & 0xF )
 738     {
 739         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 740     }
 741
 742     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
 743     {
 744         /* Unaligned source */
 745         vector unsigned char s1v, s2v, destv;
 746         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
 747         vector unsigned char perm1v, perm2v;
 748
 749         perm1v = vec_lvsl( 0, p_s1 );
 750         perm2v = vec_lvsl( 0, p_s2 );
 751         s1oldv = vec_ld( 0, p_s1 );
 752         s2oldv = vec_ld( 0, p_s2 );
 753
 754         while( p_dest < p_end )
 755         {
 756             s1newv = vec_ld( 16, p_s1 );
 757             s2newv = vec_ld( 16, p_s2 );
 758             s1v    = vec_perm( s1oldv, s1newv, perm1v );
 759             s2v    = vec_perm( s2oldv, s2newv, perm2v );
 760             s1oldv = s1newv;
 761             s2oldv = s2newv;
 762             destv  = vec_avg( s1v, s2v );
 763             vec_st( destv, 0, p_dest );
 764
 765             p_s1   += 16;
 766             p_s2   += 16;
 767             p_dest += 16;
 768         }
 769     }
 770     else
 771     {
 772         /* Aligned source */
 773         vector unsigned char s1v, s2v, destv;
 774
 775         while( p_dest < p_end )
 776         {
 777             s1v   = vec_ld( 0, p_s1 );
 778             s2v   = vec_ld( 0, p_s2 );
 779             destv = vec_avg( s1v, s2v );
 780             vec_st( destv, 0, p_dest );
 781
 782             p_s1   += 16;
 783             p_s2   += 16;
 784             p_dest += 16;
 785         }
 786     }
 787
 788     p_end += 15;
 789
 790     while( p_dest < p_end )
 791     {
 792         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 793     }
 794 }
 795 #endif
 796
 797 #ifdef __ARM_NEON__
 798 static void MergeNEON (void *restrict out, const void *in1,
 799                        const void *in2, size_t n)
 800 {
 801     uint8_t *outp = out;
 802     const uint8_t *in1p = in1;
 803     const uint8_t *in2p = in2;
 804     size_t mis = ((uintptr_t)outp) & 15;
 805
 806     if (mis)
 807     {
 808         MergeGeneric (outp, in1p, in2p, mis);
 809         outp += mis;
 810         in1p += mis;
 811         in2p += mis;
 812         n -= mis;
 813     }
 814
 815     uint8_t *end = outp + (n & ~15);
 816
 817     if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
 818         while (outp < end)
 819             asm volatile (
 820                 "vld1.u8  {q0-q1}, [%[in1]]!\n"
 821                 "vld1.u8  {q2-q3}, [%[in2]]!\n"
 822                 "vhadd.u8 q4, q0, q2\n"
 823                 "vld1.u8  {q6-q7}, [%[in1]]!\n"
 824                 "vhadd.u8 q5, q1, q3\n"
 825                 "vld1.u8  {q8-q9}, [%[in2]]!\n"
 826                 "vhadd.u8 q10, q6, q8\n"
 827                 "vhadd.u8 q11, q7, q9\n"
 828                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 829                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 830                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 831                 :
 832                 : "q0", "q1", "q2", "memory");
 833     else
 834          while (outp < end)
 835             asm volatile (
 836                 "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
 837                 "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
 838                 "vhadd.u8 q4, q0, q2\n"
 839                 "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
 840                 "vhadd.u8 q5, q1, q3\n"
 841                 "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
 842                 "vhadd.u8 q10, q6, q8\n"
 843                 "vhadd.u8 q11, q7, q9\n"
 844                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 845                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 846                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 847                 :
 848                 : "q0", "q1", "q2", "memory");
 849     n &= 15;
 850     if (n)
 851         MergeGeneric (outp, in1p, in2p, n);
 852 }
 853 #endif
 854
 855 /*****************************************************************************
 * RenderX: This algorithm works on an 8x8 block basis: it copies the top
 * field and applies a process to recreate the bottom field:
 858  *  If a 8x8 block is classified as :
 859  *   - progressive: it applies a small blend (1,6,1)
 860  *   - interlaced:
 861  *    * in the MMX version: we do a ME between the 2 fields, if there is a
 862  *    good match we use MC to recreate the bottom field (with a small
 863  *    blend (1,6,1) )
 864  *    * otherwise: it recreates the bottom field by an edge oriented
 865  *    interpolation.
 866   *****************************************************************************/
 867
/* XDeint8x8Detect: detect if an 8x8 block is interlaced.
 * XXX: It needs access to an 8x10 area.
 * We use more than 8 lines to help with scrolling (text)
 * (and because XDeint8x8Frame uses line 9)
 * XXX: detection in smooth/uniform areas with noise doesn't work well,
 * but it's not really a problem because they don't have much detail anyway
 */
 875 static inline int ssd( int a ) { return a*a; }
 876 static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 877 {
 878     int y, x;
 879     int ff, fr;
 880     int fc;
 881
 882     /* Detect interlacing */
 883     fc = 0;
 884     for( y = 0; y < 7; y += 2 )
 885     {
 886         ff = fr = 0;
 887         for( x = 0; x < 8; x++ )
 888         {
 889             fr += ssd(src[      x] - src[1*i_src+x]) +
 890                   ssd(src[i_src+x] - src[2*i_src+x]);
 891             ff += ssd(src[      x] - src[2*i_src+x]) +
 892                   ssd(src[i_src+x] - src[3*i_src+x]);
 893         }
 894         if( ff < 6*fr/8 && fr > 32 )
 895             fc++;
 896
 897         src += 2*i_src;
 898     }
 899
 900     return fc < 1 ? false : true;
 901 }
 902 #ifdef CAN_COMPILE_MMXEXT
 903 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
 904 {
 905
 906     int y, x;
 907     int32_t ff, fr;
 908     int fc;
 909
 910     /* Detect interlacing */
 911     fc = 0;
 912     pxor_r2r( mm7, mm7 );
 913     for( y = 0; y < 9; y += 2 )
 914     {
 915         ff = fr = 0;
 916         pxor_r2r( mm5, mm5 );
 917         pxor_r2r( mm6, mm6 );
 918         for( x = 0; x < 8; x+=4 )
 919         {
 920             movd_m2r( src[        x], mm0 );
 921             movd_m2r( src[1*i_src+x], mm1 );
 922             movd_m2r( src[2*i_src+x], mm2 );
 923             movd_m2r( src[3*i_src+x], mm3 );
 924
 925             punpcklbw_r2r( mm7, mm0 );
 926             punpcklbw_r2r( mm7, mm1 );
 927             punpcklbw_r2r( mm7, mm2 );
 928             punpcklbw_r2r( mm7, mm3 );
 929
 930             movq_r2r( mm0, mm4 );
 931
 932             psubw_r2r( mm1, mm0 );
 933             psubw_r2r( mm2, mm4 );
 934
 935             psubw_r2r( mm1, mm2 );
 936             psubw_r2r( mm1, mm3 );
 937
 938             pmaddwd_r2r( mm0, mm0 );
 939             pmaddwd_r2r( mm4, mm4 );
 940             pmaddwd_r2r( mm2, mm2 );
 941             pmaddwd_r2r( mm3, mm3 );
 942             paddd_r2r( mm0, mm2 );
 943             paddd_r2r( mm4, mm3 );
 944             paddd_r2r( mm2, mm5 );
 945             paddd_r2r( mm3, mm6 );
 946         }
 947
 948         movq_r2r( mm5, mm0 );
 949         psrlq_i2r( 32, mm0 );
 950         paddd_r2r( mm0, mm5 );
 951         movd_r2m( mm5, fr );
 952
 953         movq_r2r( mm6, mm0 );
 954         psrlq_i2r( 32, mm0 );
 955         paddd_r2r( mm0, mm6 );
 956         movd_r2m( mm6, ff );
 957
 958         if( ff < 6*fr/8 && fr > 32 )
 959             fc++;
 960
 961         src += 2*i_src;
 962     }
 963     return fc;
 964 }
 965 #endif
 966
 967 static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
 968                                     uint8_t *src1, int i_src1,
 969                                     uint8_t *src2, int i_src2 )
 970 {
 971     int y, x;
 972
 973     /* Progressive */
 974     for( y = 0; y < 8; y += 2 )
 975     {
 976         memcpy( dst, src1, 8 );
 977         dst  += i_dst;
 978
 979         for( x = 0; x < 8; x++ )
 980             dst[x] = (src1[x] + 6*src2[x] + src1[i_src1+x] + 4 ) >> 3;
 981         dst += i_dst;
 982
 983         src1 += i_src1;
 984         src2 += i_src2;
 985     }
 986 }
 987
 988 #ifdef CAN_COMPILE_MMXEXT
 989 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
 990                                          uint8_t *src1, int i_src1,
 991                                          uint8_t *src2, int i_src2 )
 992 {
 993     static const uint64_t m_4 = INT64_C(0x0004000400040004);
 994     int y, x;
 995
 996     /* Progressive */
 997     pxor_r2r( mm7, mm7 );
 998     for( y = 0; y < 8; y += 2 )
 999     {
1000         for( x = 0; x < 8; x +=4 )
1001         {
1002             movd_m2r( src1[x], mm0 );
1003             movd_r2m( mm0, dst[x] );
1004
1005             movd_m2r( src2[x], mm1 );
1006             movd_m2r( src1[i_src1+x], mm2 );
1007
1008             punpcklbw_r2r( mm7, mm0 );
1009             punpcklbw_r2r( mm7, mm1 );
1010             punpcklbw_r2r( mm7, mm2 );
1011             paddw_r2r( mm1, mm1 );
1012             movq_r2r( mm1, mm3 );
1013             paddw_r2r( mm3, mm3 );
1014             paddw_r2r( mm2, mm0 );
1015             paddw_r2r( mm3, mm1 );
1016             paddw_m2r( m_4, mm1 );
1017             paddw_r2r( mm1, mm0 );
1018             psraw_i2r( 3, mm0 );
1019             packuswb_r2r( mm7, mm0 );
1020             movd_r2m( mm0, dst[i_dst+x] );
1021         }
1022         dst += 2*i_dst;
1023         src1 += i_src1;
1024         src2 += i_src2;
1025     }
1026 }
1027
1028 #endif
1029
1030 /* For debug */
/* Debug helper: paint the 8x8 block at dst (row pitch i_dst) with the single
 * value v. */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    for( int i_row = 0; i_row != 8; ++i_row )
        memset( dst + i_row * i_dst, v, 8 );
}
1037
/* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for blocks that lack a
 * neighbour
 * (Uses 8x9 pixels)
 * TODO: a better one for the inner part.
 */
/* Plain C version.
 *
 * dst, i_dst : output block and its row pitch
 * src, i_src : input block and its row pitch
 *
 * Only the even input rows (0,2,4,6,8) are read.  Each of rows 0..6 is
 * copied to the matching output row, and the odd output row below it is the
 * truncating average of that row and the next even row.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_top = src;
        const uint8_t *p_bot = src + 2*i_src;
        uint8_t *p_mid = dst + i_dst;

        /* Kept line */
        memcpy( dst, p_top, 8 );

        /* Rebuilt line */
        for( int i_col = 0; i_col < 8; i_col++ )
            p_mid[i_col] = ( p_top[i_col] + p_bot[i_col] ) >> 1;

        dst += i_dst;
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1060 #ifdef CAN_COMPILE_MMXEXT
1061 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1062                                           uint8_t *src, int i_src )
1063 {
1064     int y;
1065
1066     /* Interlaced */
1067     for( y = 0; y < 8; y += 2 )
1068     {
1069         movq_m2r( src[0], mm0 );
1070         movq_r2m( mm0, dst[0] );
1071         dst += i_dst;
1072
1073         movq_m2r( src[2*i_src], mm1 );
1074         pavgb_r2r( mm1, mm0 );
1075
1076         movq_r2m( mm0, dst[0] );
1077
1078         dst += 1*i_dst;
1079         src += 2*i_src;
1080     }
1081 }
1082 #endif
1083
1084 /* XDeint8x8Field: Edge oriented interpolation
1085  * (Need -4 and +5 pixels H, +1 line)
1086  */
/* Plain C version.
 *
 * dst, i_dst : output block and its row pitch
 * src, i_src : input block and its row pitch; pixels from column -4 to
 *              column +12 of the even rows 0..8 are read
 *
 * Each even input row is copied as is.  For every pixel of the row to
 * rebuild, three 8-pixel sums of absolute differences are taken between the
 * row above and the row below (two input rows further down):
 *   - c0: the row below shifted two pixels to the right
 *   - c1: both rows aligned
 *   - c2: the row below shifted two pixels to the left
 * When the costs are ordered c0 < c1 <= c2 (resp. c2 < c1 <= c0) the pixel
 * is averaged along the corresponding diagonal, otherwise vertically.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_up = src;
        const uint8_t *p_dn = src + 2*i_src;
        uint8_t *p_mid = dst + i_dst;

        /* Kept line */
        memcpy( dst, p_up, 8 );

        for( int x = 0; x < 8; x++ )
        {
            /* 8 pixels are used just to match the MMX version, but it's
             * overkill: 5 would be enough (less isn't good) */
            int c0 = 0, c1 = 0, c2 = 0;
            for( int k = 0; k < 8; k++ )
            {
                c0 += abs( p_up[x-4+k] - p_dn[x-2+k] );
                c1 += abs( p_up[x-3+k] - p_dn[x-3+k] );
                c2 += abs( p_up[x-2+k] - p_dn[x-4+k] );
            }

            /* Horizontal offset applied to the upper pixel; the lower one
             * gets the opposite offset */
            int i_shift = 0;
            if( c0 < c1 && c1 <= c2 )
                i_shift = -1;
            else if( c2 < c1 && c1 <= c0 )
                i_shift = 1;

            p_mid[x] = ( p_up[x+i_shift] + p_dn[x-i_shift] ) >> 1;
        }

        dst += 2*i_dst;
        src += 2*i_src;
    }
}
1130 #ifdef CAN_COMPILE_MMXEXT
1131 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1132                                          uint8_t *src, int i_src )
1133 {
1134     int y, x;
1135
1136     /* Interlaced */
1137     for( y = 0; y < 8; y += 2 )
1138     {
1139         memcpy( dst, src, 8 );
1140         dst += i_dst;
1141
1142         for( x = 0; x < 8; x++ )
1143         {
1144             uint8_t *src2 = &src[2*i_src];
1145             int32_t c0, c1, c2;
1146
1147             movq_m2r( src[x-2], mm0 );
1148             movq_m2r( src[x-3], mm1 );
1149             movq_m2r( src[x-4], mm2 );
1150
1151             psadbw_m2r( src2[x-4], mm0 );
1152             psadbw_m2r( src2[x-3], mm1 );
1153             psadbw_m2r( src2[x-2], mm2 );
1154
1155             movd_r2m( mm0, c2 );
1156             movd_r2m( mm1, c1 );
1157             movd_r2m( mm2, c0 );
1158
1159             if( c0 < c1 && c1 <= c2 )
1160                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1161             else if( c2 < c1 && c1 <= c0 )
1162                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1163             else
1164                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1165         }
1166
1167         dst += 1*i_dst;
1168         src += 2*i_src;
1169     }
1170 }
1171 #endif
1172
/* NxN arbitrary size (and then only use pixels in the NxN block)
 */
/* Interlacing detector for a block of arbitrary size.
 *
 * src, i_src : block and its row pitch
 * i_height   : number of rows of the block
 * i_width    : number of columns of the block
 *
 * Note the parameter order: height comes before width here.
 *
 * For every even row y with y < i_height - 2, squared differences against
 * the next row go into i_frame and against the row two below into i_field.
 * Both sums keep accumulating from one row to the next (they are not reset).
 * A row votes "interlaced" when i_field < i_frame and
 * i_frame > i_width / 2.
 *
 * Returns 1 when at least two rows voted "interlaced", 0 otherwise.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    int i_field = 0;
    int i_frame = 0;
    int i_votes = 0;

    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    for( int y = 0; y < i_height - 2; y += 2 )
    {
        const uint8_t *p_row = src + y*i_src;

        for( int x = 0; x < i_width; x++ )
        {
            const int i_cur = p_row[x];

            i_frame += ssd( i_cur - p_row[1*i_src+x] );
            i_field += ssd( i_cur - p_row[2*i_src+x] );
        }

        if( i_frame > i_width / 2 && i_field < i_frame )
            i_votes++;
    }

    return i_votes >= 2;
}
1201
/* Progressive path for a block of arbitrary size.
 *
 * dst, i_dst : output block and its row pitch
 * src, i_src : input block and its row pitch
 * i_width    : number of columns of the block
 * i_height   : number of rows of the block
 *
 * Even rows are copied.  An odd row is the (1,2,1)/4 blend of the three
 * input rows centred on it, except the last one which has no row below it
 * inside the block and is the average of the two rows that exist.
 *
 * When i_height is odd the final even row has no odd row after it inside
 * the block: it is copied and nothing else is read or written, so the
 * function never touches rows at or beyond i_height.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Nothing to do (and no valid memcpy size) for an empty block */
    if( i_width <= 0 || i_height <= 0 )
        return;

    /* Progressive */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: row y+1 lies outside the block, stop here */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1229
/* Interlaced path for a block of arbitrary size.
 *
 * dst, i_dst : output block and its row pitch
 * src, i_src : input block and its row pitch
 * i_width    : number of columns of the block
 * i_height   : number of rows of the block
 *
 * Even rows are copied.  An odd row is the truncating average of the even
 * rows above and below it, except the last one which has no even row below
 * it inside the block and is the average of the two rows that exist.
 *
 * When i_height is odd the final even row has no odd row after it inside
 * the block: it is copied and nothing else is read or written, so the
 * function never touches rows at or beyond i_height.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Nothing to do (and no valid memcpy size) for an empty block */
    if( i_width <= 0 || i_height <= 0 )
        return;

    /* Interlaced */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: row y+1 lies outside the block, stop here */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1257
/* Deinterlace one block of arbitrary size.
 *
 * dst, i_dst : output block and its row pitch
 * src, i_src : input block and its row pitch
 * i_width    : number of columns of the block
 * i_height   : number of rows of the block
 *
 * Runs the detector and dispatches to the interlaced (Field) or progressive
 * (Frame) renderer.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect() takes the height BEFORE the width, unlike the two
     * renderers: pass them in that order so that it scans the block rows
     * and columns rather than their transpose. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1266
1267
/* Median of three integers, obtained as the sum of the three values minus
 * their minimum and their maximum. */
static inline int median( int a, int b, int c )
{
    /* Order a and b first ... */
    const int i_lo_ab = b < a ? b : a;
    const int i_hi_ab = b < a ? a : b;

    /* ... then widen the range with c */
    const int i_lo = c < i_lo_ab ? c : i_lo_ab;
    const int i_hi = c > i_hi_ab ? c : i_hi_ab;

    return a + b + c - i_lo - i_hi;
}
1283
1284
1285 /* XDeintBand8x8:
1286  */
/* Plain C version: deinterlace one band of 8 rows.
 *
 * dst, i_dst : first output row of the band and the output pitch
 * src, i_src : first input row of the band and the input pitch
 * i_mbx      : number of complete 8-pixel wide blocks in the band
 * i_modx     : width of the remaining partial block (0 if none)
 *
 * Blocks detected as progressive go through XDeint8x8MergeC.  Interlaced
 * blocks use the edge oriented XDeint8x8FieldC, except the first and last
 * complete blocks which use XDeint8x8FieldEC.  The partial block, if any,
 * is handled by XDeintNxN.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive block: even rows are kept, odd rows smoothed */
            XDeint8x8MergeC( dst, i_dst,
                             src,         2*i_src,
                             src + i_src, 2*i_src );
            continue;
        }

        /* Interlaced block */
        if( i_mb == 0 || i_mb == i_last )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1317 #ifdef CAN_COMPILE_MMXEXT
/* MMXEXT version: deinterlace one band of 8 rows.
 *
 * dst, i_dst : first output row of the band and the output pitch
 * src, i_src : first input row of the band and the input pitch
 * i_mbx      : number of complete 8-pixel wide blocks in the band
 * i_modx     : width of the remaining partial block (0 if none)
 *
 * Same dispatch as XDeintBand8x8C but with the MMXEXT block routines; the
 * partial block, if any, still goes through the C-only XDeintNxN.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive block: even rows are kept, odd rows smoothed */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  src,         2*i_src,
                                  src + i_src, 2*i_src );
            continue;
        }

        /* Interlaced block */
        if( i_mb == 0 || i_mb == i_last )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1349 #endif
1350
1351 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1352 {
1353     int i_plane;
1354
1355     /* Copy image and skip lines */
1356     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1357     {
1358         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1359         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1360
1361         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1362         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1363
1364         const int i_dst = p_outpic->p[i_plane].i_pitch;
1365         const int i_src = p_pic->p[i_plane].i_pitch;
1366
1367         int y, x;
1368
1369         for( y = 0; y < i_mby; y++ )
1370         {
1371             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1372             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1373
1374 #ifdef CAN_COMPILE_MMXEXT
1375             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1376                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1377             else
1378 #endif
1379                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1380         }
1381
1382         /* Last line (C only)*/
1383         if( i_mody )
1384         {
1385             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1386             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1387
1388             for( x = 0; x < i_mbx; x++ )
1389             {
1390                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1391
1392                 dst += 8;
1393                 src += 8;
1394             }
1395
1396             if( i_modx )
1397                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1398         }
1399     }
1400
1401 #ifdef CAN_COMPILE_MMXEXT
1402     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1403         emms();
1404 #endif
1405 }
1406
1407 /*****************************************************************************
1408  * Yadif (Yet Another DeInterlacing Filter).
1409  *****************************************************************************/
1410 /* */
/* Configuration handed to the yadif line filter; RenderYadif() fills one in
 * for every line it filters. */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only the 0x02 bit has a meaning, as we handle the 0x01 bit
     * ourselves.
     */
    int mode;
};
1422
/* Pointer-sized integer type.
 * XXX: the original author was not sure this is the right definition. */
typedef intptr_t x86_reg;

/* Small arithmetic helpers; FFMAX/FFMIN simply forward to __MAX/__MIN.
 * FFABS evaluates its argument twice, so do not pass expressions with side
 * effects.
 * NOTE(review): presumably these names are what yadif.h expects -- confirm. */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)
1431
1432 /* yadif.h comes from vf_yadif.c of mplayer project */
1433 #include "yadif.h"
1434
1435 static void RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1436 {
1437     filter_sys_t *p_sys = p_filter->p_sys;
1438
1439     /* */
1440     assert( i_order == 0 || i_order == 1 );
1441     assert( i_field == 0 || i_field == 1 );
1442
1443     if( i_order == 0 )
1444     {
1445         /* Duplicate the picture
1446          * TODO when the vout rework is finished, picture_Hold() might be enough
1447          * but becarefull, the pitches must match */
1448         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1449         if( p_dup )
1450             picture_Copy( p_dup, p_src );
1451
1452         /* Slide the history */
1453         if( p_sys->pp_history[0] )
1454             picture_Release( p_sys->pp_history[0]  );
1455         for( int i = 1; i < HISTORY_SIZE; i++ )
1456             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1457         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1458     }
1459
1460     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1461     picture_t *p_prev = p_sys->pp_history[0];
1462     picture_t *p_cur  = p_sys->pp_history[1];
1463     picture_t *p_next = p_sys->pp_history[2];
1464
1465     /* Filter if we have all the pictures we need */
1466     if( p_prev && p_cur && p_next )
1467     {
1468         /* */
1469         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1470 #if defined(HAVE_YADIF_SSE2)
1471         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1472             filter = yadif_filter_line_mmx2;
1473         else
1474 #endif
1475             filter = yadif_filter_line_c;
1476
1477         for( int n = 0; n < p_dst->i_planes; n++ )
1478         {
1479             const plane_t *prevp = &p_prev->p[n];
1480             const plane_t *curp  = &p_cur->p[n];
1481             const plane_t *nextp = &p_next->p[n];
1482             plane_t *dstp        = &p_dst->p[n];
1483
1484             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1485             {
1486                 if( (y % 2) == i_field )
1487                 {
1488                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1489                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1490                 }
1491                 else
1492                 {
1493                     struct vf_priv_s cfg;
1494                     /* Spatial checks only when enough data */
1495                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1496
1497                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1498                     filter( &cfg,
1499                             &dstp->p_pixels[y * dstp->i_pitch],
1500                             &prevp->p_pixels[y * prevp->i_pitch],
1501                             &curp->p_pixels[y * curp->i_pitch],
1502                             &nextp->p_pixels[y * nextp->i_pitch],
1503                             dstp->i_visible_pitch,
1504                             curp->i_pitch,
1505                             (i_field ^ (i_order == i_field)) & 1 );
1506                 }
1507
1508                 /* We duplicate the first and last lines */
1509                 if( y == 1 )
1510                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1511                 else if( y == dstp->i_visible_lines - 2 )
1512                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1513             }
1514         }
1515
1516         /* */
1517         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1518     }
1519     else
1520     {
1521         /* Fallback to something simple
1522          * XXX it is wrong when we have 2 pictures, we should not output a picture */
1523         RenderX( p_dst, p_src );
1524     }
1525 }
1526
1527 /*****************************************************************************
1528  * video filter2 functions
1529  *****************************************************************************/
1530 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1531 {
1532     filter_sys_t *p_sys = p_filter->p_sys;
1533     picture_t *p_pic_dst;
1534
1535     /* Request output picture */
1536     p_pic_dst = filter_NewPicture( p_filter );
1537     if( p_pic_dst == NULL )
1538     {
1539         picture_Release( p_pic );
1540         return NULL;
1541     }
1542
1543     switch( p_sys->i_mode )
1544     {
1545         case DEINTERLACE_DISCARD:
1546             RenderDiscard( p_filter, p_pic_dst, p_pic, 0 );
1547             break;
1548
1549         case DEINTERLACE_BOB:
1550 #if 0
1551             RenderBob( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1552             RenderBob( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1553             break;
1554 #endif
1555
1556         case DEINTERLACE_LINEAR:
1557 #if 0
1558             RenderLinear( p_filter, pp_outpic[0], p_pic, !p_pic->b_top_field_first );
1559             RenderLinear( p_filter, pp_outpic[1], p_pic, p_pic->b_top_field_first );
1560 #endif
1561             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1562             picture_Release( p_pic_dst );
1563             picture_Release( p_pic );
1564             return NULL;
1565
1566         case DEINTERLACE_MEAN:
1567             RenderMean( p_filter, p_pic_dst, p_pic );
1568             break;
1569
1570         case DEINTERLACE_BLEND:
1571             RenderBlend( p_filter, p_pic_dst, p_pic );
1572             break;
1573
1574         case DEINTERLACE_X:
1575             RenderX( p_pic_dst, p_pic );
1576             break;
1577
1578         case DEINTERLACE_YADIF:
1579             msg_Err( p_filter, "delaying frames is not supported yet" );
1580             //RenderYadif( p_vout, pp_outpic[0], p_pic, 0, 0 );
1581             picture_Release( p_pic_dst );
1582             picture_Release( p_pic );
1583             return NULL;
1584
1585         case DEINTERLACE_YADIF2X:
1586             msg_Err( p_filter, "doubling the frame rate is not supported yet" );
1587             //RenderYadif( p_vout, pp_outpic[0], p_pic, 0, !p_pic->b_top_field_first );
1588             //RenderYadif( p_vout, pp_outpic[1], p_pic, 1, p_pic->b_top_field_first );
1589             picture_Release( p_pic_dst );
1590             picture_Release( p_pic );
1591             return NULL;
1592     }
1593
1594     picture_CopyProperties( p_pic_dst, p_pic );
1595     p_pic_dst->b_progressive = true;
1596
1597     picture_Release( p_pic );
1598     return p_pic_dst;
1599 }
1600
1601 static int Mouse( filter_t *p_filter,
1602                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1603 {
1604     *p_mouse = *p_new;
1605     if( p_filter->p_sys->b_half_height )
1606         p_mouse->i_y *= 2;
1607     return VLC_SUCCESS;
1608 }
1609
1610
1611 /*****************************************************************************
1612  * Open
1613  *****************************************************************************/
1614 static int Open( vlc_object_t *p_this )
1615 {
1616     filter_t *p_filter = (filter_t*)p_this;
1617     filter_sys_t *p_sys;
1618
1619     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
1620         return VLC_EGENERIC;
1621
1622     /* */
1623     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
1624     if( !p_sys )
1625         return VLC_ENOMEM;
1626
1627     p_sys->i_mode = DEINTERLACE_BLEND;
1628     p_sys->b_double_rate = false;
1629     p_sys->b_half_height = true;
1630
1631 #if defined(CAN_COMPILE_C_ALTIVEC)
1632     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
1633     {
1634         p_sys->pf_merge = MergeAltivec;
1635         p_sys->pf_end_merge = NULL;
1636     }
1637     else
1638 #endif
1639 #if defined(CAN_COMPILE_SSE)
1640     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1641     {
1642         p_sys->pf_merge = MergeSSE2;
1643         p_sys->pf_end_merge = EndMMX;
1644     }
1645     else
1646 #endif
1647 #if defined(CAN_COMPILE_MMXEXT)
1648     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1649     {
1650         p_sys->pf_merge = MergeMMXEXT;
1651         p_sys->pf_end_merge = EndMMX;
1652     }
1653     else
1654 #endif
1655 #if defined(CAN_COMPILE_3DNOW)
1656     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
1657     {
1658         p_sys->pf_merge = Merge3DNow;
1659         p_sys->pf_end_merge = End3DNow;
1660     }
1661     else
1662 #endif
1663 #if defined __ARM_NEON__
1664     if( vlc_CPU() & CPU_CAPABILITY_NEON )
1665     {
1666         p_sys->pf_merge = MergeNEON;
1667         p_sys->pf_end_merge = NULL;
1668     }
1669     else
1670 #endif
1671     {
1672         p_sys->pf_merge = MergeGeneric;
1673         p_sys->pf_end_merge = NULL;
1674     }
1675
1676     /* */
1677     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
1678                        p_filter->p_cfg );
1679
1680     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
1681     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
1682     free( psz_mode );
1683
1684     /* */
1685     video_format_t fmt;
1686     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
1687     if( !p_filter->b_allow_fmt_out_change &&
1688         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
1689           fmt.i_height != p_filter->fmt_in.video.i_height ) )
1690     {
1691         Close( VLC_OBJECT(p_filter) );
1692         return VLC_EGENERIC;
1693     }
1694     p_filter->fmt_out.video = fmt;
1695     p_filter->fmt_out.i_codec = fmt.i_chroma;
1696     p_filter->pf_video_filter = Deinterlace;
1697     p_filter->pf_video_mouse  = Mouse;
1698
1699     msg_Dbg( p_filter, "deinterlacing" );
1700
1701     return VLC_SUCCESS;
1702 }
1703
1704 /*****************************************************************************
1705  * Close: clean up the filter
1706  *****************************************************************************/
1707 static void Close( vlc_object_t *p_this )
1708 {
1709     filter_t *p_filter = (filter_t*)p_this;
1710
1711     free( p_filter->p_sys );
1712 }
1713