git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000-2009 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27
  28 #ifdef HAVE_CONFIG_H
  29 # include "config.h"
  30 #endif
  31
  32 #include <assert.h>
  33
  34 #ifdef HAVE_ALTIVEC_H
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include <vlc_common.h>
  39 #include <vlc_plugin.h>
  40 #include <vlc_filter.h>
  41 #include <vlc_cpu.h>
  42
  43 #ifdef CAN_COMPILE_MMXEXT
  44 #   include "mmx.h"
  45 #endif
  46
/* Identifiers of the supported deinterlace algorithms.  SetFilterMethod()
 * stores one of these values in filter_sys_t.i_mode. */
#define DEINTERLACE_DISCARD 1
#define DEINTERLACE_MEAN    2
#define DEINTERLACE_BLEND   3
#define DEINTERLACE_BOB     4
#define DEINTERLACE_LINEAR  5
#define DEINTERLACE_X       6
#define DEINTERLACE_YADIF   7
#define DEINTERLACE_YADIF2X 8
  55
  56 /*****************************************************************************
  57  * Module descriptor
  58  *****************************************************************************/
/* Module entry points, registered with set_callbacks() in the module
 * descriptor. */
static int  Open ( vlc_object_t * );
static void Close( vlc_object_t * );

/* User-visible (translatable) option labels and help texts. */
#define MODE_TEXT N_("Deinterlace mode")
#define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")

#define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
#define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")

/* Prefix of the configuration variables declared by this module. */
#define FILTER_CFG_PREFIX "sout-deinterlace-"

/* Accepted values of the "mode" option and their display names.  Both arrays
 * are handed together to change_string_list(), so entry N of one must
 * correspond to entry N of the other. */
static const char *const mode_list[] = {
    "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
static const char *const mode_list_text[] = {
    N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
  74
/* Module descriptor: declares the plugin, its "mode" option and its
 * Open/Close callbacks. */
vlc_module_begin ()
    set_description( N_("Deinterlacing video filter") )
    set_shortname( N_("Deinterlace" ))
    /* NOTE(review): the capability is set here and again a few lines below
     * with a different string ("video filter2"); presumably only one of the
     * two is meant to apply -- confirm which one is intended. */
    set_capability( "video filter", 0 )
    set_category( CAT_VIDEO )
    set_subcategory( SUBCAT_VIDEO_VFILTER )

    set_capability( "video filter2", 0 )
    /* "sout-deinterlace-mode": one of mode_list, defaulting to "blend" */
    add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT,
                SOUT_MODE_LONGTEXT, false )
        change_string_list( mode_list, mode_list_text, 0 )
        change_safe ()
    add_shortcut( "deinterlace" )
    set_callbacks( Open, Close )
vlc_module_end ()
  90
  91
  92 /*****************************************************************************
 * Local prototypes
  94  *****************************************************************************/
/* Renderers, one per deinterlace method.  For RenderDiscard, RenderBob,
 * RenderMean, RenderBlend and RenderLinear the picture arguments are
 * (output, input) and the trailing int, where present, is the field index
 * (0 or 1). */
static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
static void RenderMean   ( filter_t *, picture_t *, picture_t * );
static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
static void RenderX      ( picture_t *, picture_t * );
static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );

/* Line mergers: (destination, source 1, source 2, byte count).  Each writes
 * the per-byte average of the two sources to the destination.  The variants
 * are only compiled when the matching instruction set is available. */
static void MergeGeneric ( void *, const void *, const void *, size_t );
#if defined(CAN_COMPILE_C_ALTIVEC)
static void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_MMXEXT)
static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void Merge3DNow   ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_SSE)
static void MergeSSE2    ( void *, const void *, const void *, size_t );
#endif
/* Clean-up routines matching the MMX-register based mergers */
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
static void EndMMX       ( void );
#endif
#if defined(CAN_COMPILE_3DNOW)
static void End3DNow     ( void );
#endif
#if defined __ARM_NEON__
static void MergeNEON (void *, const void *, const void *, size_t);
#endif
 125
/* NULL-terminated list of option names (without the FILTER_CFG_PREFIX
 * prefix) understood by this filter. */
static const char *const ppsz_filter_options[] = {
    "mode", NULL
};
 129
/* Number of entries in filter_sys_t.pp_history */
#define HISTORY_SIZE (3)
/* Private state of one deinterlace filter instance */
struct filter_sys_t
{
    int  i_mode;        /* Deinterlace mode (one of the DEINTERLACE_* values) */
    bool b_double_rate; /* Shall we double the framerate? */
    bool b_half_height; /* Shall we divide the height by 2? */

    /* Line-averaging routine; the renderers call it through the Merge macro */
    void (*pf_merge) ( void *, const void *, const void *, size_t );
    /* Optional hook run once merging is done; EndMerge skips it when NULL */
    void (*pf_end_merge) ( void );

    mtime_t i_last_date;

    /* Yadif */
    picture_t *pp_history[HISTORY_SIZE];
};
 145
 146 /*****************************************************************************
 147  * SetFilterMethod: setup the deinterlace method to use.
 148  *****************************************************************************/
 149 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
 150 {
 151     filter_sys_t *p_sys = p_filter->p_sys;
 152
 153     if( !psz_method )
 154         psz_method = "";
 155
 156     if( !strcmp( psz_method, "mean" ) )
 157     {
 158         p_sys->i_mode = DEINTERLACE_MEAN;
 159         p_sys->b_double_rate = false;
 160         p_sys->b_half_height = true;
 161     }
 162     else if( !strcmp( psz_method, "bob" )
 163              || !strcmp( psz_method, "progressive-scan" ) )
 164     {
 165         p_sys->i_mode = DEINTERLACE_BOB;
 166         p_sys->b_double_rate = true;
 167         p_sys->b_half_height = false;
 168     }
 169     else if( !strcmp( psz_method, "linear" ) )
 170     {
 171         p_sys->i_mode = DEINTERLACE_LINEAR;
 172         p_sys->b_double_rate = true;
 173         p_sys->b_half_height = false;
 174     }
 175     else if( !strcmp( psz_method, "x" ) )
 176     {
 177         p_sys->i_mode = DEINTERLACE_X;
 178         p_sys->b_double_rate = false;
 179         p_sys->b_half_height = false;
 180     }
 181     else if( !strcmp( psz_method, "yadif" ) )
 182     {
 183         p_sys->i_mode = DEINTERLACE_YADIF;
 184         p_sys->b_double_rate = false;
 185         p_sys->b_half_height = false;
 186     }
 187     else if( !strcmp( psz_method, "yadif2x" ) )
 188     {
 189         p_sys->i_mode = DEINTERLACE_YADIF2X;
 190         p_sys->b_double_rate = true;
 191         p_sys->b_half_height = false;
 192     }
 193     else if( !strcmp( psz_method, "discard" ) )
 194     {
 195         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
 196                             i_chroma == VLC_CODEC_J422;
 197
 198         p_sys->i_mode = DEINTERLACE_DISCARD;
 199         p_sys->b_double_rate = false;
 200         p_sys->b_half_height = !b_i422;
 201     }
 202     else
 203     {
 204         if( strcmp( psz_method, "blend" ) )
 205             msg_Err( p_filter,
 206                      "no valid deinterlace mode provided, using \"blend\"" );
 207
 208         p_sys->i_mode = DEINTERLACE_BLEND;
 209         p_sys->b_double_rate = false;
 210         p_sys->b_half_height = false;
 211     }
 212
 213     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
 214 }
 215
 216 static void GetOutputFormat( filter_t *p_filter,
 217                              video_format_t *p_dst, const video_format_t *p_src )
 218 {
 219     filter_sys_t *p_sys = p_filter->p_sys;
 220     *p_dst = *p_src;
 221
 222     if( p_sys->b_half_height )
 223     {
 224         p_dst->i_height /= 2;
 225         p_dst->i_visible_height /= 2;
 226         p_dst->i_y_offset /= 2;
 227         p_dst->i_sar_den *= 2;
 228     }
 229
 230     if( p_src->i_chroma == VLC_CODEC_I422 ||
 231         p_src->i_chroma == VLC_CODEC_J422 )
 232     {
 233         switch( p_sys->i_mode )
 234         {
 235         case DEINTERLACE_MEAN:
 236         case DEINTERLACE_LINEAR:
 237         case DEINTERLACE_X:
 238         case DEINTERLACE_YADIF:
 239         case DEINTERLACE_YADIF2X:
 240             p_dst->i_chroma = p_src->i_chroma;
 241             break;
 242         default:
 243             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
 244                                                                   VLC_CODEC_J420;
 245             break;
 246         }
 247     }
 248 }
 249
 250 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
 251 {
 252     return i_chroma == VLC_CODEC_I420 ||
 253            i_chroma == VLC_CODEC_J420 ||
 254            i_chroma == VLC_CODEC_YV12 ||
 255            i_chroma == VLC_CODEC_I422 ||
 256            i_chroma == VLC_CODEC_J422;
 257 }
 258
 259 /*****************************************************************************
 260  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 261  *****************************************************************************/
 262 static void RenderDiscard( filter_t *p_filter,
 263                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 264 {
 265     int i_plane;
 266
 267     /* Copy image and skip lines */
 268     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 269     {
 270         uint8_t *p_in, *p_out_end, *p_out;
 271         int i_increment;
 272
 273         p_in = p_pic->p[i_plane].p_pixels
 274                    + i_field * p_pic->p[i_plane].i_pitch;
 275
 276         p_out = p_outpic->p[i_plane].p_pixels;
 277         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 278                              * p_outpic->p[i_plane].i_visible_lines;
 279
 280         switch( p_filter->fmt_in.video.i_chroma )
 281         {
 282         case VLC_CODEC_I420:
 283         case VLC_CODEC_J420:
 284         case VLC_CODEC_YV12:
 285
 286             for( ; p_out < p_out_end ; )
 287             {
 288                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 289
 290                 p_out += p_outpic->p[i_plane].i_pitch;
 291                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 292             }
 293             break;
 294
 295         case VLC_CODEC_I422:
 296         case VLC_CODEC_J422:
 297
 298             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 299
 300             if( i_plane == Y_PLANE )
 301             {
 302                 for( ; p_out < p_out_end ; )
 303                 {
 304                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 305                     p_out += p_outpic->p[i_plane].i_pitch;
 306                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 307                     p_out += p_outpic->p[i_plane].i_pitch;
 308                     p_in += i_increment;
 309                 }
 310             }
 311             else
 312             {
 313                 for( ; p_out < p_out_end ; )
 314                 {
 315                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 316                     p_out += p_outpic->p[i_plane].i_pitch;
 317                     p_in += i_increment;
 318                 }
 319             }
 320             break;
 321
 322         default:
 323             break;
 324         }
 325     }
 326 }
 327
 328 /*****************************************************************************
 329  * RenderBob: renders a BOB picture - simple copy
 330  *****************************************************************************/
 331 static void RenderBob( filter_t *p_filter,
 332                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 333 {
 334     int i_plane;
 335
 336     /* Copy image and skip lines */
 337     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 338     {
 339         uint8_t *p_in, *p_out_end, *p_out;
 340
 341         p_in = p_pic->p[i_plane].p_pixels;
 342         p_out = p_outpic->p[i_plane].p_pixels;
 343         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 344                              * p_outpic->p[i_plane].i_visible_lines;
 345
 346         switch( p_filter->fmt_in.video.i_chroma )
 347         {
 348             case VLC_CODEC_I420:
 349             case VLC_CODEC_J420:
 350             case VLC_CODEC_YV12:
 351                 /* For BOTTOM field we need to add the first line */
 352                 if( i_field == 1 )
 353                 {
 354                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 355                     p_in += p_pic->p[i_plane].i_pitch;
 356                     p_out += p_outpic->p[i_plane].i_pitch;
 357                 }
 358
 359                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 360
 361                 for( ; p_out < p_out_end ; )
 362                 {
 363                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 364
 365                     p_out += p_outpic->p[i_plane].i_pitch;
 366
 367                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 368
 369                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 370                     p_out += p_outpic->p[i_plane].i_pitch;
 371                 }
 372
 373                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 374
 375                 /* For TOP field we need to add the last line */
 376                 if( i_field == 0 )
 377                 {
 378                     p_in += p_pic->p[i_plane].i_pitch;
 379                     p_out += p_outpic->p[i_plane].i_pitch;
 380                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 381                 }
 382                 break;
 383
 384             case VLC_CODEC_I422:
 385             case VLC_CODEC_J422:
 386                 /* For BOTTOM field we need to add the first line */
 387                 if( i_field == 1 )
 388                 {
 389                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 390                     p_in += p_pic->p[i_plane].i_pitch;
 391                     p_out += p_outpic->p[i_plane].i_pitch;
 392                 }
 393
 394                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 395
 396                 if( i_plane == Y_PLANE )
 397                 {
 398                     for( ; p_out < p_out_end ; )
 399                     {
 400                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 401
 402                         p_out += p_outpic->p[i_plane].i_pitch;
 403
 404                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 405
 406                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 407                         p_out += p_outpic->p[i_plane].i_pitch;
 408                     }
 409                 }
 410                 else
 411                 {
 412                     for( ; p_out < p_out_end ; )
 413                     {
 414                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 415
 416                         p_out += p_outpic->p[i_plane].i_pitch;
 417                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 418                     }
 419                 }
 420
 421                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 422
 423                 /* For TOP field we need to add the last line */
 424                 if( i_field == 0 )
 425                 {
 426                     p_in += p_pic->p[i_plane].i_pitch;
 427                     p_out += p_outpic->p[i_plane].i_pitch;
 428                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 429                 }
 430                 break;
 431         }
 432     }
 433 }
 434
/* Shorthands for the renderers; both expect a variable named p_filter to be
 * in scope.
 *  - Merge( dst, src1, src2, bytes ) calls the configured pf_merge routine.
 *  - EndMerge() calls pf_end_merge when it is non-NULL.  Its expansion starts
 *    with a bare `if`, so only use it as a standalone "EndMerge();"
 *    statement, never directly in front of an `else`. */
#define Merge p_filter->p_sys->pf_merge
#define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
 437
 438 /*****************************************************************************
 439  * RenderLinear: BOB with linear interpolation
 440  *****************************************************************************/
 441 static void RenderLinear( filter_t *p_filter,
 442                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 443 {
 444     int i_plane;
 445
 446     /* Copy image and skip lines */
 447     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 448     {
 449         uint8_t *p_in, *p_out_end, *p_out;
 450
 451         p_in = p_pic->p[i_plane].p_pixels;
 452         p_out = p_outpic->p[i_plane].p_pixels;
 453         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 454                              * p_outpic->p[i_plane].i_visible_lines;
 455
 456         /* For BOTTOM field we need to add the first line */
 457         if( i_field == 1 )
 458         {
 459             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 460             p_in += p_pic->p[i_plane].i_pitch;
 461             p_out += p_outpic->p[i_plane].i_pitch;
 462         }
 463
 464         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 465
 466         for( ; p_out < p_out_end ; )
 467         {
 468             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 469
 470             p_out += p_outpic->p[i_plane].i_pitch;
 471
 472             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 473                    p_pic->p[i_plane].i_pitch );
 474
 475             p_in += 2 * p_pic->p[i_plane].i_pitch;
 476             p_out += p_outpic->p[i_plane].i_pitch;
 477         }
 478
 479         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 480
 481         /* For TOP field we need to add the last line */
 482         if( i_field == 0 )
 483         {
 484             p_in += p_pic->p[i_plane].i_pitch;
 485             p_out += p_outpic->p[i_plane].i_pitch;
 486             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 487         }
 488     }
 489     EndMerge();
 490 }
 491
 492 static void RenderMean( filter_t *p_filter,
 493                         picture_t *p_outpic, picture_t *p_pic )
 494 {
 495     int i_plane;
 496
 497     /* Copy image and skip lines */
 498     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 499     {
 500         uint8_t *p_in, *p_out_end, *p_out;
 501
 502         p_in = p_pic->p[i_plane].p_pixels;
 503
 504         p_out = p_outpic->p[i_plane].p_pixels;
 505         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 506                              * p_outpic->p[i_plane].i_visible_lines;
 507
 508         /* All lines: mean value */
 509         for( ; p_out < p_out_end ; )
 510         {
 511             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 512                    p_pic->p[i_plane].i_pitch );
 513
 514             p_out += p_outpic->p[i_plane].i_pitch;
 515             p_in += 2 * p_pic->p[i_plane].i_pitch;
 516         }
 517     }
 518     EndMerge();
 519 }
 520
 521 static void RenderBlend( filter_t *p_filter,
 522                          picture_t *p_outpic, picture_t *p_pic )
 523 {
 524     int i_plane;
 525
 526     /* Copy image and skip lines */
 527     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 528     {
 529         uint8_t *p_in, *p_out_end, *p_out;
 530
 531         p_in = p_pic->p[i_plane].p_pixels;
 532
 533         p_out = p_outpic->p[i_plane].p_pixels;
 534         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 535                              * p_outpic->p[i_plane].i_visible_lines;
 536
 537         switch( p_filter->fmt_in.video.i_chroma )
 538         {
 539             case VLC_CODEC_I420:
 540             case VLC_CODEC_J420:
 541             case VLC_CODEC_YV12:
 542                 /* First line: simple copy */
 543                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 544                 p_out += p_outpic->p[i_plane].i_pitch;
 545
 546                 /* Remaining lines: mean value */
 547                 for( ; p_out < p_out_end ; )
 548                 {
 549                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 550                            p_pic->p[i_plane].i_pitch );
 551
 552                     p_out += p_outpic->p[i_plane].i_pitch;
 553                     p_in += p_pic->p[i_plane].i_pitch;
 554                 }
 555                 break;
 556
 557             case VLC_CODEC_I422:
 558             case VLC_CODEC_J422:
 559                 /* First line: simple copy */
 560                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 561                 p_out += p_outpic->p[i_plane].i_pitch;
 562
 563                 /* Remaining lines: mean value */
 564                 if( i_plane == Y_PLANE )
 565                 {
 566                     for( ; p_out < p_out_end ; )
 567                     {
 568                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 569                                p_pic->p[i_plane].i_pitch );
 570
 571                         p_out += p_outpic->p[i_plane].i_pitch;
 572                         p_in += p_pic->p[i_plane].i_pitch;
 573                     }
 574                 }
 575
 576                 else
 577                 {
 578                     for( ; p_out < p_out_end ; )
 579                     {
 580                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 581                                p_pic->p[i_plane].i_pitch );
 582
 583                         p_out += p_outpic->p[i_plane].i_pitch;
 584                         p_in += 2*p_pic->p[i_plane].i_pitch;
 585                     }
 586                 }
 587                 break;
 588         }
 589     }
 590     EndMerge();
 591 }
 592
 593 #undef Merge
 594
 595 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 596                           const void *_p_s2, size_t i_bytes )
 597 {
 598     uint8_t* p_dest = (uint8_t*)_p_dest;
 599     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 600     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 601     uint8_t* p_end = p_dest + i_bytes - 8;
 602
 603     while( p_dest < p_end )
 604     {
 605         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 606         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 607         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 608         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 609         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 610         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 611         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 612         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 613     }
 614
 615     p_end += 8;
 616
 617     while( p_dest < p_end )
 618     {
 619         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 620     }
 621 }
 622
 623 #if defined(CAN_COMPILE_MMXEXT)
 624 static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
 625                          size_t i_bytes )
 626 {
 627     uint8_t* p_dest = (uint8_t*)_p_dest;
 628     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 629     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 630     uint8_t* p_end = p_dest + i_bytes - 8;
 631     while( p_dest < p_end )
 632     {
 633         __asm__  __volatile__( "movq %2,%%mm1;"
 634                                "pavgb %1, %%mm1;"
 635                                "movq %%mm1, %0" :"=m" (*p_dest):
 636                                                  "m" (*p_s1),
 637                                                  "m" (*p_s2) );
 638         p_dest += 8;
 639         p_s1 += 8;
 640         p_s2 += 8;
 641     }
 642
 643     p_end += 8;
 644
 645     while( p_dest < p_end )
 646     {
 647         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 648     }
 649 }
 650 #endif
 651
 652 #if defined(CAN_COMPILE_3DNOW)
 653 static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
 654                         size_t i_bytes )
 655 {
 656     uint8_t* p_dest = (uint8_t*)_p_dest;
 657     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 658     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 659     uint8_t* p_end = p_dest + i_bytes - 8;
 660     while( p_dest < p_end )
 661     {
 662         __asm__  __volatile__( "movq %2,%%mm1;"
 663                                "pavgusb %1, %%mm1;"
 664                                "movq %%mm1, %0" :"=m" (*p_dest):
 665                                                  "m" (*p_s1),
 666                                                  "m" (*p_s2) );
 667         p_dest += 8;
 668         p_s1 += 8;
 669         p_s2 += 8;
 670     }
 671
 672     p_end += 8;
 673
 674     while( p_dest < p_end )
 675     {
 676         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 677     }
 678 }
 679 #endif
 680
 681 #if defined(CAN_COMPILE_SSE)
 682 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 683                        size_t i_bytes )
 684 {
 685     uint8_t* p_dest = (uint8_t*)_p_dest;
 686     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 687     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 688     uint8_t* p_end;
 689     while( (uintptr_t)p_s1 % 16 )
 690     {
 691         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 692     }
 693     p_end = p_dest + i_bytes - 16;
 694     while( p_dest < p_end )
 695     {
 696         __asm__  __volatile__( "movdqu %2,%%xmm1;"
 697                                "pavgb %1, %%xmm1;"
 698                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
 699                                                  "m" (*p_s1),
 700                                                  "m" (*p_s2) );
 701         p_dest += 16;
 702         p_s1 += 16;
 703         p_s2 += 16;
 704     }
 705
 706     p_end += 16;
 707
 708     while( p_dest < p_end )
 709     {
 710         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 711     }
 712 }
 713 #endif
 714
 715 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/* Executes the x86 `emms` instruction, which marks the MMX/x87 register file
 * as empty again after MMX code has run.
 * NOTE(review): presumably installed as pf_end_merge next to the MMX-based
 * mergers; the assignment is not in this part of the file -- confirm. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
 720 #endif
 721
 722 #if defined(CAN_COMPILE_3DNOW)
/* Executes the 3DNow! `femms` instruction, the counterpart of `emms` on CPUs
 * with 3DNow! support.
 * NOTE(review): presumably installed as pf_end_merge next to Merge3DNow; the
 * assignment is not in this part of the file -- confirm. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
 727 #endif
 728
 729 #ifdef CAN_COMPILE_C_ALTIVEC
 730 static void MergeAltivec( void *_p_dest, const void *_p_s1,
 731                           const void *_p_s2, size_t i_bytes )
 732 {
 733     uint8_t *p_dest = (uint8_t *)_p_dest;
 734     uint8_t *p_s1   = (uint8_t *)_p_s1;
 735     uint8_t *p_s2   = (uint8_t *)_p_s2;
 736     uint8_t *p_end  = p_dest + i_bytes - 15;
 737
 738     /* Use C until the first 16-bytes aligned destination pixel */
 739     while( (uintptr_t)p_dest & 0xF )
 740     {
 741         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 742     }
 743
 744     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
 745     {
 746         /* Unaligned source */
 747         vector unsigned char s1v, s2v, destv;
 748         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
 749         vector unsigned char perm1v, perm2v;
 750
 751         perm1v = vec_lvsl( 0, p_s1 );
 752         perm2v = vec_lvsl( 0, p_s2 );
 753         s1oldv = vec_ld( 0, p_s1 );
 754         s2oldv = vec_ld( 0, p_s2 );
 755
 756         while( p_dest < p_end )
 757         {
 758             s1newv = vec_ld( 16, p_s1 );
 759             s2newv = vec_ld( 16, p_s2 );
 760             s1v    = vec_perm( s1oldv, s1newv, perm1v );
 761             s2v    = vec_perm( s2oldv, s2newv, perm2v );
 762             s1oldv = s1newv;
 763             s2oldv = s2newv;
 764             destv  = vec_avg( s1v, s2v );
 765             vec_st( destv, 0, p_dest );
 766
 767             p_s1   += 16;
 768             p_s2   += 16;
 769             p_dest += 16;
 770         }
 771     }
 772     else
 773     {
 774         /* Aligned source */
 775         vector unsigned char s1v, s2v, destv;
 776
 777         while( p_dest < p_end )
 778         {
 779             s1v   = vec_ld( 0, p_s1 );
 780             s2v   = vec_ld( 0, p_s2 );
 781             destv = vec_avg( s1v, s2v );
 782             vec_st( destv, 0, p_dest );
 783
 784             p_s1   += 16;
 785             p_s2   += 16;
 786             p_dest += 16;
 787         }
 788     }
 789
 790     p_end += 15;
 791
 792     while( p_dest < p_end )
 793     {
 794         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 795     }
 796 }
 797 #endif
 798
 799 #ifdef __ARM_NEON__
 800 static void MergeNEON (void *restrict out, const void *in1,
 801                        const void *in2, size_t n)
 802 {
 803     uint8_t *outp = out;
 804     const uint8_t *in1p = in1;
 805     const uint8_t *in2p = in2;
 806     size_t mis = ((uintptr_t)outp) & 15;
 807
 808     if (mis)
 809     {
 810         MergeGeneric (outp, in1p, in2p, mis);
 811         outp += mis;
 812         in1p += mis;
 813         in2p += mis;
 814         n -= mis;
 815     }
 816
 817     uint8_t *end = outp + (n & ~15);
 818
 819     if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
 820         while (outp < end)
 821             asm volatile (
 822                 "vld1.u8  {q0-q1}, [%[in1]]!\n"
 823                 "vld1.u8  {q2-q3}, [%[in2]]!\n"
 824                 "vhadd.u8 q4, q0, q2\n"
 825                 "vld1.u8  {q6-q7}, [%[in1]]!\n"
 826                 "vhadd.u8 q5, q1, q3\n"
 827                 "vld1.u8  {q8-q9}, [%[in2]]!\n"
 828                 "vhadd.u8 q10, q6, q8\n"
 829                 "vhadd.u8 q11, q7, q9\n"
 830                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 831                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 832                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 833                 :
 834                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 835                   "q8", "q9", "q10", "q11", "memory");
 836     else
 837          while (outp < end)
 838             asm volatile (
 839                 "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
 840                 "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
 841                 "vhadd.u8 q4, q0, q2\n"
 842                 "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
 843                 "vhadd.u8 q5, q1, q3\n"
 844                 "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
 845                 "vhadd.u8 q10, q6, q8\n"
 846                 "vhadd.u8 q11, q7, q9\n"
 847                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 848                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 849                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 850                 :
 851                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 852                   "q8", "q9", "q10", "q11", "memory");
 853     n &= 15;
 854     if (n)
 855         MergeGeneric (outp, in1p, in2p, n);
 856 }
 857 #endif
 858
 859 /*****************************************************************************
 860  * RenderX: This algorithm works on an 8x8 block basis: it copies the top
 861  * field and applies a process to recreate the bottom field:
 862  *  If a 8x8 block is classified as :
 863  *   - progressive: it applies a small blend (1,6,1)
 864  *   - interlaced:
 865  *    * in the MMX version: we do a ME between the 2 fields, if there is a
 866  *    good match we use MC to recreate the bottom field (with a small
 867  *    blend (1,6,1) )
 868  *    * otherwise: it recreates the bottom field by an edge oriented
 869  *    interpolation.
 870   *****************************************************************************/
 871
 872 /* XDeint8x8Detect: detect if a 8x8 block is interlaced.
 873  * XXX: It needs access to an 8x10 area.
 874  * We use more than 8 lines to help with scrolling (text)
 875  * (and because XDeint8x8Frame uses line 9)
 876  * XXX: detection in smooth/uniform areas with noise doesn't work well,
 877  * but it's not really a problem because they don't have much detail anyway
 878  */
 879 static inline int ssd( int a ) { return a*a; }
 880 static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 881 {
 882     int y, x;
 883     int ff, fr;
 884     int fc;
 885
 886     /* Detect interlacing */
 887     fc = 0;
 888     for( y = 0; y < 7; y += 2 )
 889     {
 890         ff = fr = 0;
 891         for( x = 0; x < 8; x++ )
 892         {
 893             fr += ssd(src[      x] - src[1*i_src+x]) +
 894                   ssd(src[i_src+x] - src[2*i_src+x]);
 895             ff += ssd(src[      x] - src[2*i_src+x]) +
 896                   ssd(src[i_src+x] - src[3*i_src+x]);
 897         }
 898         if( ff < 6*fr/8 && fr > 32 )
 899             fc++;
 900
 901         src += 2*i_src;
 902     }
 903
 904     return fc < 1 ? false : true;
 905 }
 906 #ifdef CAN_COMPILE_MMXEXT
 907 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
 908 {
 909
 910     int y, x;
 911     int32_t ff, fr;
 912     int fc;
 913
 914     /* Detect interlacing */
 915     fc = 0;
 916     pxor_r2r( mm7, mm7 );
 917     for( y = 0; y < 9; y += 2 )
 918     {
 919         ff = fr = 0;
 920         pxor_r2r( mm5, mm5 );
 921         pxor_r2r( mm6, mm6 );
 922         for( x = 0; x < 8; x+=4 )
 923         {
 924             movd_m2r( src[        x], mm0 );
 925             movd_m2r( src[1*i_src+x], mm1 );
 926             movd_m2r( src[2*i_src+x], mm2 );
 927             movd_m2r( src[3*i_src+x], mm3 );
 928
 929             punpcklbw_r2r( mm7, mm0 );
 930             punpcklbw_r2r( mm7, mm1 );
 931             punpcklbw_r2r( mm7, mm2 );
 932             punpcklbw_r2r( mm7, mm3 );
 933
 934             movq_r2r( mm0, mm4 );
 935
 936             psubw_r2r( mm1, mm0 );
 937             psubw_r2r( mm2, mm4 );
 938
 939             psubw_r2r( mm1, mm2 );
 940             psubw_r2r( mm1, mm3 );
 941
 942             pmaddwd_r2r( mm0, mm0 );
 943             pmaddwd_r2r( mm4, mm4 );
 944             pmaddwd_r2r( mm2, mm2 );
 945             pmaddwd_r2r( mm3, mm3 );
 946             paddd_r2r( mm0, mm2 );
 947             paddd_r2r( mm4, mm3 );
 948             paddd_r2r( mm2, mm5 );
 949             paddd_r2r( mm3, mm6 );
 950         }
 951
 952         movq_r2r( mm5, mm0 );
 953         psrlq_i2r( 32, mm0 );
 954         paddd_r2r( mm0, mm5 );
 955         movd_r2m( mm5, fr );
 956
 957         movq_r2r( mm6, mm0 );
 958         psrlq_i2r( 32, mm0 );
 959         paddd_r2r( mm0, mm6 );
 960         movd_r2m( mm6, ff );
 961
 962         if( ff < 6*fr/8 && fr > 32 )
 963             fc++;
 964
 965         src += 2*i_src;
 966     }
 967     return fc;
 968 }
 969 #endif
 970
 971 static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
 972                                     uint8_t *src1, int i_src1,
 973                                     uint8_t *src2, int i_src2 )
 974 {
 975     int y, x;
 976
 977     /* Progressive */
 978     for( y = 0; y < 8; y += 2 )
 979     {
 980         memcpy( dst, src1, 8 );
 981         dst  += i_dst;
 982
 983         for( x = 0; x < 8; x++ )
 984             dst[x] = (src1[x] + 6*src2[x] + src1[i_src1+x] + 4 ) >> 3;
 985         dst += i_dst;
 986
 987         src1 += i_src1;
 988         src2 += i_src2;
 989     }
 990 }
 991
 992 #ifdef CAN_COMPILE_MMXEXT
 993 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
 994                                          uint8_t *src1, int i_src1,
 995                                          uint8_t *src2, int i_src2 )
 996 {
 997     static const uint64_t m_4 = INT64_C(0x0004000400040004);
 998     int y, x;
 999
1000     /* Progressive */
1001     pxor_r2r( mm7, mm7 );
1002     for( y = 0; y < 8; y += 2 )
1003     {
1004         for( x = 0; x < 8; x +=4 )
1005         {
1006             movd_m2r( src1[x], mm0 );
1007             movd_r2m( mm0, dst[x] );
1008
1009             movd_m2r( src2[x], mm1 );
1010             movd_m2r( src1[i_src1+x], mm2 );
1011
1012             punpcklbw_r2r( mm7, mm0 );
1013             punpcklbw_r2r( mm7, mm1 );
1014             punpcklbw_r2r( mm7, mm2 );
1015             paddw_r2r( mm1, mm1 );
1016             movq_r2r( mm1, mm3 );
1017             paddw_r2r( mm3, mm3 );
1018             paddw_r2r( mm2, mm0 );
1019             paddw_r2r( mm3, mm1 );
1020             paddw_m2r( m_4, mm1 );
1021             paddw_r2r( mm1, mm0 );
1022             psraw_i2r( 3, mm0 );
1023             packuswb_r2r( mm7, mm0 );
1024             movd_r2m( mm0, dst[i_dst+x] );
1025         }
1026         dst += 2*i_dst;
1027         src1 += i_src1;
1028         src2 += i_src2;
1029     }
1030 }
1031
1032 #endif
1033
1034 /* For debug */
/* XDeint8x8Set: fill an 8x8 block with the value v (debugging helper).
 *
 * dst, i_dst : block to fill and its line stride
 * v          : byte value written to each of the 64 pixels
 */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_line = 0;

    while( i_line < 8 )
    {
        memset( dst + i_line * i_dst, v, 8 );
        i_line++;
    }
}
1041
1042 /* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for blocks that miss a
1043  * neighbour
1044  * (Use 8x9 pixels)
1045  * TODO: a better one for the inner part.
1046  */
/* XDeint8x8FieldEC: rebuild an 8x8 block from one field by plain vertical
 * averaging.
 *
 * dst, i_dst : output block and its line stride
 * src, i_src : input block and its line stride
 *
 * Even output lines are copies of source lines 0, 2, 4 and 6; each odd output
 * line is the truncated average of the source lines two apart around it, so
 * source line 8 is read as well.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_below = &src[2*i_src];

        /* Kept line */
        memcpy( dst, src, 8 );
        dst += i_dst;

        /* Interpolated line */
        for( int x = 0; x < 8; x++ )
        {
            const int i_sum = src[x] + p_below[x];
            dst[x] = i_sum >> 1;
        }
        dst += i_dst;

        src += 2*i_src;
    }
}
1064 #ifdef CAN_COMPILE_MMXEXT
1065 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1066                                           uint8_t *src, int i_src )
1067 {
1068     int y;
1069
1070     /* Interlaced */
1071     for( y = 0; y < 8; y += 2 )
1072     {
1073         movq_m2r( src[0], mm0 );
1074         movq_r2m( mm0, dst[0] );
1075         dst += i_dst;
1076
1077         movq_m2r( src[2*i_src], mm1 );
1078         pavgb_r2r( mm1, mm0 );
1079
1080         movq_r2m( mm0, dst[0] );
1081
1082         dst += 1*i_dst;
1083         src += 2*i_src;
1084     }
1085 }
1086 #endif
1087
1088 /* XDeint8x8Field: Edge oriented interpolation
1089  * (Need -4 and +5 pixels H, +1 line)
1090  */
/* XDeint8x8FieldC: rebuild an 8x8 block from one field with an
 * edge-oriented interpolation.
 *
 * dst, i_dst : output block and its line stride
 * src, i_src : input block and its line stride
 *
 * Even output lines are copies of source lines 0, 2, 4 and 6. For every pixel
 * of an odd output line, three 8-pixel sums of absolute differences between
 * the line above and the line below are compared:
 *   - lower line shifted by +2 (edge going down-right),
 *   - no shift (vertical),
 *   - lower line shifted by -2 (edge going down-left).
 * The pixel is then the truncated average taken along the winning direction.
 * Pixels from src[-4] up to src[12] are read on each line used, and source
 * line 8 is read too.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_up   = src;
        const uint8_t *p_down = &src[2*i_src];

        /* Kept line */
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( int x = 0; x < 8; x++ )
        {
            /* 8 taps to match the MMX version; 5 would be enough
             * (less isn't good) */
            int i_cost_dr = 0;  /* upper pixel j matched with lower pixel j+2 */
            int i_cost_v  = 0;  /* upper pixel j matched with lower pixel j */
            int i_cost_dl = 0;  /* upper pixel j matched with lower pixel j-2 */

            for( int j = 0; j < 8; j++ )
            {
                i_cost_dr += abs( p_up[x-4+j] - p_down[x-2+j] );
                i_cost_v  += abs( p_up[x-3+j] - p_down[x-3+j] );
                i_cost_dl += abs( p_up[x-2+j] - p_down[x-4+j] );
            }

            int i_sum;
            if( i_cost_dr < i_cost_v && i_cost_v <= i_cost_dl )
                i_sum = p_up[x-1] + p_down[x+1];
            else if( i_cost_dl < i_cost_v && i_cost_v <= i_cost_dr )
                i_sum = p_up[x+1] + p_down[x-1];
            else
                i_sum = p_up[x+0] + p_down[x+0];

            dst[x] = i_sum >> 1;
        }

        dst += i_dst;
        src += 2*i_src;
    }
}
1134 #ifdef CAN_COMPILE_MMXEXT
1135 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1136                                          uint8_t *src, int i_src )
1137 {
1138     int y, x;
1139
1140     /* Interlaced */
1141     for( y = 0; y < 8; y += 2 )
1142     {
1143         memcpy( dst, src, 8 );
1144         dst += i_dst;
1145
1146         for( x = 0; x < 8; x++ )
1147         {
1148             uint8_t *src2 = &src[2*i_src];
1149             int32_t c0, c1, c2;
1150
1151             movq_m2r( src[x-2], mm0 );
1152             movq_m2r( src[x-3], mm1 );
1153             movq_m2r( src[x-4], mm2 );
1154
1155             psadbw_m2r( src2[x-4], mm0 );
1156             psadbw_m2r( src2[x-3], mm1 );
1157             psadbw_m2r( src2[x-2], mm2 );
1158
1159             movd_r2m( mm0, c2 );
1160             movd_r2m( mm1, c1 );
1161             movd_r2m( mm2, c0 );
1162
1163             if( c0 < c1 && c1 <= c2 )
1164                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1165             else if( c2 < c1 && c1 <= c0 )
1166                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1167             else
1168                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1169         }
1170
1171         dst += 1*i_dst;
1172         src += 2*i_src;
1173     }
1174 }
1175 #endif
1176
1177 /* NxN arbitrary size (and then only uses pixels in the NxN block)
1178  */
/* XDeintNxNDetect: interlacing detection for a block of arbitrary size.
 *
 * src, i_src : block to analyse and its line stride
 * i_height   : number of lines of the block (note: height comes BEFORE width)
 * i_width    : number of pixels per line
 *
 * For every even line y below i_height - 2, the squared differences with
 * line y+1 and with line y+2 are added to two running totals. The totals are
 * NOT reset between lines. A line counts as a hit when the same-parity total
 * is smaller than the adjacent-line total and the latter exceeds i_width / 2.
 * Returns 1 when at least two hits were found, 0 otherwise.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    /* Detect interlacing */
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    int i_adjacent = 0;
    int i_same = 0;
    int i_hits = 0;

    for( int y = 0; y < i_height - 2; y += 2 )
    {
        const uint8_t *p_l0 = &src[y*i_src];
        const uint8_t *p_l1 = &p_l0[1*i_src];
        const uint8_t *p_l2 = &p_l0[2*i_src];

        for( int x = 0; x < i_width; x++ )
        {
            i_adjacent += ssd( p_l0[x] - p_l1[x] );
            i_same     += ssd( p_l0[x] - p_l2[x] );
        }

        if( i_adjacent > i_width / 2 && i_same < i_adjacent )
            i_hits++;
    }

    return i_hits >= 2;
}
1205
/* XDeintNxNFrame: progressive rendering of a block of arbitrary size.
 *
 * dst, i_dst : output block and its line stride
 * src, i_src : input block and its line stride
 * i_width    : number of pixels per line
 * i_height   : number of lines
 *
 * Even lines are copied. Each odd line is a (1,2,1) blend of the source lines
 * y, y+1 and y+2, except for the last pair where only lines y and y+1 are
 * averaged.
 *
 * When i_height is odd, the last line has no partner inside the block: it is
 * copied and nothing else is done. The former code went on to read source
 * line i_height and write destination line i_height, both outside the block.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Progressive */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );

        /* Odd height: line y+1 does not belong to the block */
        if( y + 1 >= i_height )
            break;
        dst += i_dst;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1233
/* XDeintNxNField: interlaced rendering of a block of arbitrary size.
 *
 * dst, i_dst : output block and its line stride
 * src, i_src : input block and its line stride
 * i_width    : number of pixels per line
 * i_height   : number of lines
 *
 * Even lines are copied. Each odd line is the truncated average of the source
 * lines y and y+2, except for the last pair where lines y and y+1 are
 * averaged instead (line y+2 would be outside the block).
 *
 * When i_height is odd, the last line has no partner inside the block: it is
 * copied and nothing else is done. The former code went on to read source
 * line i_height and write destination line i_height, both outside the block.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );

        /* Odd height: line y+1 does not belong to the block */
        if( y + 1 >= i_height )
            break;
        dst += i_dst;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1261
/* XDeintNxN: deinterlace a block of arbitrary size.
 *
 * dst, i_dst : output block and its line stride
 * src, i_src : input block and its line stride
 * i_width    : number of pixels per line
 * i_height   : number of lines
 *
 * Runs XDeintNxNDetect on the block and renders it with XDeintNxNField when
 * it is classified as interlaced, with XDeintNxNFrame otherwise.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect takes the height BEFORE the width, unlike the two
     * renderers. Passing them in the renderers' order made the detector scan
     * a transposed area and read pixels outside the block. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1270
1271
/* median: returns the middle value of a, b and c. */
static inline int median( int a, int b, int c )
{
    /* Order the first two values... */
    const int i_low  = ( b < a ) ? b : a;
    const int i_high = ( b < a ) ? a : b;

    /* ...then clamp the third one between them */
    if( c < i_low )
        return i_low;
    if( c > i_high )
        return i_high;
    return c;
}
1287
1288
1289 /* XDeintBand8x8:
1290  */
/* XDeintBand8x8C: process one band of 8 lines with the C routines.
 *
 * dst, i_dst : start of the output band and its line stride
 * src, i_src : start of the input band and its line stride
 * i_mbx      : number of complete 8x8 blocks in the band
 * i_modx     : width of the leftover block at the right end (0 if none)
 *
 * Every block is classified with XDeint8x8DetectC. Progressive blocks are
 * merged from the even lines (kept) and the odd lines (filtered). Interlaced
 * blocks use the edge-oriented interpolation, except the first and the last
 * complete block, which use the simple vertical one. The leftover columns are
 * handed over to XDeintNxN with a height of 8.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_block = 0; i_block < i_mbx; i_block++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive block */
            XDeint8x8MergeC( dst, i_dst,
                             &src[0*i_src], 2*i_src,
                             &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced block */
        if( i_block != 0 && i_block != i_last )
            XDeint8x8FieldC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1321 #ifdef CAN_COMPILE_MMXEXT
/* XDeintBand8x8MMXEXT: process one band of 8 lines with the MMXEXT routines.
 *
 * dst, i_dst : start of the output band and its line stride
 * src, i_src : start of the input band and its line stride
 * i_mbx      : number of complete 8x8 blocks in the band
 * i_modx     : width of the leftover block at the right end (0 if none)
 *
 * Same dispatch as XDeintBand8x8C, using the MMXEXT detection, merge and
 * field routines. The leftover columns still go through the C-only XDeintNxN.
 * emms() is not called here.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_block = 0; i_block < i_mbx; i_block++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive block */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  &src[0*i_src], 2*i_src,
                                  &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced block */
        if( i_block != 0 && i_block != i_last )
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1353 #endif
1354
1355 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1356 {
1357     int i_plane;
1358
1359     /* Copy image and skip lines */
1360     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1361     {
1362         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1363         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1364
1365         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1366         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1367
1368         const int i_dst = p_outpic->p[i_plane].i_pitch;
1369         const int i_src = p_pic->p[i_plane].i_pitch;
1370
1371         int y, x;
1372
1373         for( y = 0; y < i_mby; y++ )
1374         {
1375             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1376             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1377
1378 #ifdef CAN_COMPILE_MMXEXT
1379             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1380                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1381             else
1382 #endif
1383                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1384         }
1385
1386         /* Last line (C only)*/
1387         if( i_mody )
1388         {
1389             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1390             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1391
1392             for( x = 0; x < i_mbx; x++ )
1393             {
1394                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1395
1396                 dst += 8;
1397                 src += 8;
1398             }
1399
1400             if( i_modx )
1401                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1402         }
1403     }
1404
1405 #ifdef CAN_COMPILE_MMXEXT
1406     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1407         emms();
1408 #endif
1409 }
1410
1411 /*****************************************************************************
1412  * Yadif (Yet Another DeInterlacing Filter).
1413  *****************************************************************************/
1414 /* */
/* Per-call configuration handed to the yadif line filter: RenderYadif fills
 * one in for every filtered line and passes its address as the first argument
 * of the line filter function. */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only & 0x02 has meaning, as we do the & 0x01 ourself.
     * (RenderYadif only ever stores 0 or 2 here.)
     */
    int mode;
};
1426
/* Integer type wide enough to hold a pointer, presumably required by the code
 * in yadif.h included below - TODO confirm.
 * The original author was unsure intptr_t is the right underlying type. */
typedef intptr_t x86_reg;

/* FFmpeg-style helper macros, presumably expected by yadif.h included below
 * (TODO confirm); the min/max ones are mapped onto __MIN/__MAX.
 * FFABS evaluates its argument twice: do not pass an expression with side
 * effects. */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)
1435
1436 /* yadif.h comes from vf_yadif.c of mplayer project */
1437 #include "yadif.h"
1438
1439 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1440 {
1441     filter_sys_t *p_sys = p_filter->p_sys;
1442
1443     /* */
1444     assert( i_order == 0 || i_order == 1 );
1445     assert( i_field == 0 || i_field == 1 );
1446
1447     if( i_order == 0 )
1448     {
1449         /* Duplicate the picture
1450          * TODO when the vout rework is finished, picture_Hold() might be enough
1451          * but becarefull, the pitches must match */
1452         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1453         if( p_dup )
1454             picture_Copy( p_dup, p_src );
1455
1456         /* Slide the history */
1457         if( p_sys->pp_history[0] )
1458             picture_Release( p_sys->pp_history[0]  );
1459         for( int i = 1; i < HISTORY_SIZE; i++ )
1460             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1461         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1462     }
1463
1464     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1465     picture_t *p_prev = p_sys->pp_history[0];
1466     picture_t *p_cur  = p_sys->pp_history[1];
1467     picture_t *p_next = p_sys->pp_history[2];
1468
1469     /* Filter if we have all the pictures we need */
1470     if( p_prev && p_cur && p_next )
1471     {
1472         /* */
1473         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1474 #if defined(HAVE_YADIF_SSE2)
1475         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1476             filter = yadif_filter_line_mmx2;
1477         else
1478 #endif
1479             filter = yadif_filter_line_c;
1480
1481         for( int n = 0; n < p_dst->i_planes; n++ )
1482         {
1483             const plane_t *prevp = &p_prev->p[n];
1484             const plane_t *curp  = &p_cur->p[n];
1485             const plane_t *nextp = &p_next->p[n];
1486             plane_t *dstp        = &p_dst->p[n];
1487
1488             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1489             {
1490                 if( (y % 2) == i_field )
1491                 {
1492                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1493                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1494                 }
1495                 else
1496                 {
1497                     struct vf_priv_s cfg;
1498                     /* Spatial checks only when enough data */
1499                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1500
1501                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1502                     filter( &cfg,
1503                             &dstp->p_pixels[y * dstp->i_pitch],
1504                             &prevp->p_pixels[y * prevp->i_pitch],
1505                             &curp->p_pixels[y * curp->i_pitch],
1506                             &nextp->p_pixels[y * nextp->i_pitch],
1507                             dstp->i_visible_pitch,
1508                             curp->i_pitch,
1509                             (i_field ^ (i_order == i_field)) & 1 );
1510                 }
1511
1512                 /* We duplicate the first and last lines */
1513                 if( y == 1 )
1514                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1515                 else if( y == dstp->i_visible_lines - 2 )
1516                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1517             }
1518         }
1519
1520         /* */
1521         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1522         return VLC_SUCCESS;
1523     }
1524     else if( !p_prev && !p_cur && p_next )
1525     {
1526         /* FIXME not good as it does not use i_order/i_field */
1527         RenderX( p_dst, p_next );
1528         return VLC_SUCCESS;
1529     }
1530     else
1531     {
1532         return VLC_EGENERIC;
1533     }
1534 }
1535
1536 /*****************************************************************************
1537  * video filter2 functions
1538  *****************************************************************************/
1539 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1540 {
1541     filter_sys_t *p_sys = p_filter->p_sys;
1542     picture_t *p_dst[2];
1543
1544     /* Request output picture */
1545     p_dst[0] = filter_NewPicture( p_filter );
1546     if( p_dst[0] == NULL )
1547     {
1548         picture_Release( p_pic );
1549         return NULL;
1550     }
1551     picture_CopyProperties( p_dst[0], p_pic );
1552
1553     if( p_sys->b_double_rate )
1554     {
1555         p_dst[0]->p_next =
1556         p_dst[1]         = filter_NewPicture( p_filter );
1557         if( p_dst[1] )
1558         {
1559             picture_CopyProperties( p_dst[1], p_pic );
1560             /* XXX it's not really good especially for the first picture, but
1561              * I don't think that delaying by one frame is worth it */
1562             if( p_sys->i_last_date > VLC_TS_INVALID && p_pic->date > VLC_TS_INVALID )
1563                 p_dst[1]->date = p_pic->date + (p_pic->date - p_sys->i_last_date) / 2;
1564         }
1565         p_sys->i_last_date = p_pic->date;
1566     }
1567     else
1568     {
1569         p_dst[1] = NULL;
1570     }
1571
1572     switch( p_sys->i_mode )
1573     {
1574         case DEINTERLACE_DISCARD:
1575             RenderDiscard( p_filter, p_dst[0], p_pic, 0 );
1576             break;
1577
1578         case DEINTERLACE_BOB:
1579             RenderBob( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1580             if( p_dst[1] )
1581                 RenderBob( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1582             break;;
1583
1584         case DEINTERLACE_LINEAR:
1585             RenderLinear( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1586             if( p_dst[1] )
1587                 RenderLinear( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1588             break;
1589
1590         case DEINTERLACE_MEAN:
1591             RenderMean( p_filter, p_dst[0], p_pic );
1592             break;
1593
1594         case DEINTERLACE_BLEND:
1595             RenderBlend( p_filter, p_dst[0], p_pic );
1596             break;
1597
1598         case DEINTERLACE_X:
1599             RenderX( p_dst[0], p_pic );
1600             break;
1601
1602         case DEINTERLACE_YADIF:
1603             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, 0 ) )
1604                 goto drop;
1605             break;
1606
1607         case DEINTERLACE_YADIF2X:
1608             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, !p_pic->b_top_field_first ) )
1609                 goto drop;
1610             if( p_dst[1] )
1611                 RenderYadif( p_filter, p_dst[1], p_pic, 1, p_pic->b_top_field_first );
1612             break;
1613     }
1614
1615     p_dst[0]->b_progressive = true;
1616     if( p_dst[1] )
1617         p_dst[1]->b_progressive = true;
1618
1619     picture_Release( p_pic );
1620     return p_dst[0];
1621
1622 drop:
1623     picture_Release( p_dst[0] );
1624     if( p_dst[1] )
1625         picture_Release( p_dst[1] );
1626     picture_Release( p_pic );
1627     return NULL;
1628 }
1629
1630 static void Flush( filter_t *p_filter )
1631 {
1632     filter_sys_t *p_sys = p_filter->p_sys;
1633
1634     p_sys->i_last_date = VLC_TS_INVALID;
1635     for( int i = 0; i < HISTORY_SIZE; i++ )
1636     {
1637         if( p_sys->pp_history[i] )
1638             picture_Release( p_sys->pp_history[i] );
1639         p_sys->pp_history[i] = NULL;
1640     }
1641 }
1642
1643 static int Mouse( filter_t *p_filter,
1644                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1645 {
1646     VLC_UNUSED(p_old);
1647     *p_mouse = *p_new;
1648     if( p_filter->p_sys->b_half_height )
1649         p_mouse->i_y *= 2;
1650     return VLC_SUCCESS;
1651 }
1652
1653
1654 /*****************************************************************************
1655  * Open
1656  *****************************************************************************/
1657 static int Open( vlc_object_t *p_this )
1658 {
1659     filter_t *p_filter = (filter_t*)p_this;
1660     filter_sys_t *p_sys;
1661
1662     if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
1663         return VLC_EGENERIC;
1664
1665     /* */
1666     p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
1667     if( !p_sys )
1668         return VLC_ENOMEM;
1669
1670     p_sys->i_mode = DEINTERLACE_BLEND;
1671     p_sys->b_double_rate = false;
1672     p_sys->b_half_height = true;
1673     p_sys->i_last_date = VLC_TS_INVALID;
1674     for( int i = 0; i < HISTORY_SIZE; i++ )
1675         p_sys->pp_history[i] = NULL;
1676
1677 #if defined(CAN_COMPILE_C_ALTIVEC)
1678     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
1679     {
1680         p_sys->pf_merge = MergeAltivec;
1681         p_sys->pf_end_merge = NULL;
1682     }
1683     else
1684 #endif
1685 #if defined(CAN_COMPILE_SSE)
1686     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1687     {
1688         p_sys->pf_merge = MergeSSE2;
1689         p_sys->pf_end_merge = EndMMX;
1690     }
1691     else
1692 #endif
1693 #if defined(CAN_COMPILE_MMXEXT)
1694     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1695     {
1696         p_sys->pf_merge = MergeMMXEXT;
1697         p_sys->pf_end_merge = EndMMX;
1698     }
1699     else
1700 #endif
1701 #if defined(CAN_COMPILE_3DNOW)
1702     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
1703     {
1704         p_sys->pf_merge = Merge3DNow;
1705         p_sys->pf_end_merge = End3DNow;
1706     }
1707     else
1708 #endif
1709 #if defined __ARM_NEON__
1710     if( vlc_CPU() & CPU_CAPABILITY_NEON )
1711     {
1712         p_sys->pf_merge = MergeNEON;
1713         p_sys->pf_end_merge = NULL;
1714     }
1715     else
1716 #endif
1717     {
1718         p_sys->pf_merge = MergeGeneric;
1719         p_sys->pf_end_merge = NULL;
1720     }
1721
1722     /* */
1723     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
1724                        p_filter->p_cfg );
1725
1726     char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
1727     SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
1728     free( psz_mode );
1729
1730     /* */
1731     video_format_t fmt;
1732     GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
1733     if( !p_filter->b_allow_fmt_out_change &&
1734         ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
1735           fmt.i_height != p_filter->fmt_in.video.i_height ) )
1736     {
1737         Close( VLC_OBJECT(p_filter) );
1738         return VLC_EGENERIC;
1739     }
1740     p_filter->fmt_out.video = fmt;
1741     p_filter->fmt_out.i_codec = fmt.i_chroma;
1742     p_filter->pf_video_filter = Deinterlace;
1743     p_filter->pf_video_flush  = Flush;
1744     p_filter->pf_video_mouse  = Mouse;
1745
1746     msg_Dbg( p_filter, "deinterlacing" );
1747
1748     return VLC_SUCCESS;
1749 }
1750
1751 /*****************************************************************************
1752  * Close: clean up the filter
1753  *****************************************************************************/
1754 static void Close( vlc_object_t *p_this )
1755 {
1756     filter_t *p_filter = (filter_t*)p_this;
1757
1758     Flush( p_filter );
1759     free( p_filter->p_sys );
1760 }
1761