git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000-2009 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27
  28 #ifdef HAVE_CONFIG_H
  29 # include "config.h"
  30 #endif
  31
  32 #include <assert.h>
  33
  34 #ifdef HAVE_ALTIVEC_H
  35 #   include <altivec.h>
  36 #endif
  37
  38 #include <vlc_common.h>
  39 #include <vlc_plugin.h>
  40 #include <vlc_filter.h>
  41 #include <vlc_cpu.h>
  42
  43 #ifdef CAN_COMPILE_MMXEXT
  44 #   include "mmx.h"
  45 #endif
  46
  47 #define DEINTERLACE_DISCARD 1 /* selected by "discard" */
  48 #define DEINTERLACE_MEAN    2 /* selected by "mean" */
  49 #define DEINTERLACE_BLEND   3 /* selected by "blend"; also the fallback mode */
  50 #define DEINTERLACE_BOB     4 /* selected by "bob" or "progressive-scan" */
  51 #define DEINTERLACE_LINEAR  5 /* selected by "linear" */
  52 #define DEINTERLACE_X       6 /* selected by "x" */
  53 #define DEINTERLACE_YADIF   7 /* selected by "yadif" */
  54 #define DEINTERLACE_YADIF2X 8 /* selected by "yadif2x" */
  55
  56 /*****************************************************************************
  57  * Module descriptor
     *
     * Declares the plugin to VLC: description, capability, the single
     * "sout-deinterlace-mode" string option and the Open/Close entry points.
  58  *****************************************************************************/
  59 static int  Open ( vlc_object_t * );
  60 static void Close( vlc_object_t * );
  61
     /* NOTE(review): MODE_TEXT / MODE_LONGTEXT are not referenced by the module
      * descriptor below; check the rest of the file before removing them. */
  62 #define MODE_TEXT N_("Deinterlace mode")
  63 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
  64
  65 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
  66 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
  67
     /* Prefix prepended to every option name of this filter */
  68 #define FILTER_CFG_PREFIX "sout-deinterlace-"
  69
     /* mode_list holds the accepted option values and mode_list_text the matching
      * display labels; both are handed to change_string_list() together, so the
      * two arrays must keep the same length and ordering. */
  70 static const char *const mode_list[] = {
  71     "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
  72 static const char *const mode_list_text[] = {
  73     N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
  74
  75 vlc_module_begin ()
  76     set_description( N_("Deinterlacing video filter") )
  77     set_shortname( N_("Deinterlace" ))
  78     set_capability( "video filter2", 0 )
  79     set_category( CAT_VIDEO )
  80     set_subcategory( SUBCAT_VIDEO_VFILTER )
  81
         /* "sout-deinterlace-mode", defaulting to "blend" */
  82     add_string( FILTER_CFG_PREFIX "mode", "blend", SOUT_MODE_TEXT,
  83                 SOUT_MODE_LONGTEXT, false )
  84         change_string_list( mode_list, mode_list_text, 0 )
  85         change_safe ()
  86     add_shortcut( "deinterlace" )
  87     set_callbacks( Open, Close )
  88 vlc_module_end ()
  89
  90
  91 /*****************************************************************************
  92  * Local prototypes
  93  *****************************************************************************/
     /* Renderers. For the ones defined with a filter argument the parameters are
      * (filter, output picture, input picture[, field]). RenderX and RenderYadif
      * are defined further down the file. */
  94 static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
  95 static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
  96 static void RenderMean   ( filter_t *, picture_t *, picture_t * );
  97 static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
  98 static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
  99 static void RenderX      ( picture_t *, picture_t * );
 100 static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );
 101
     /* Line mergers: (destination, source 1, source 2, byte count). They all share
      * the signature of filter_sys_t.pf_merge; the accelerated variants only exist
      * when the matching CAN_COMPILE_* / __ARM_NEON__ macro is defined. */
 102 static void MergeGeneric ( void *, const void *, const void *, size_t );
 103 #if defined(CAN_COMPILE_C_ALTIVEC)
 104 static void MergeAltivec ( void *, const void *, const void *, size_t );
 105 #endif
 106 #if defined(CAN_COMPILE_MMXEXT)
 107 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
 108 #endif
 109 #if defined(CAN_COMPILE_3DNOW)
 110 static void Merge3DNow   ( void *, const void *, const void *, size_t );
 111 #endif
 112 #if defined(CAN_COMPILE_SSE)
 113 static void MergeSSE2    ( void *, const void *, const void *, size_t );
 114 #endif
     /* Epilogues matching the signature of filter_sys_t.pf_end_merge */
 115 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
 116 static void EndMMX       ( void );
 117 #endif
 118 #if defined(CAN_COMPILE_3DNOW)
 119 static void End3DNow     ( void );
 120 #endif
 121 #if defined __ARM_NEON__
 122 static void MergeNEON (void *, const void *, const void *, size_t);
 123 #endif
 124
 125 static const char *const ppsz_filter_options[] = {
 126     "mode", NULL /* option names without FILTER_CFG_PREFIX; NULL ends the list */
 127 };
 128
 129 #define HISTORY_SIZE (3) /* number of slots in filter_sys_t.pp_history */
 130 struct filter_sys_t
 131 {
 132     int  i_mode;        /* Deinterlace mode (one of the DEINTERLACE_* values) */
 133     bool b_double_rate; /* Shall we double the framerate? */
 134     bool b_half_height; /* Shall we divide the height by 2? */
 135
         /* Line merger used through the Merge() macro: (dest, src1, src2, bytes) */
 136     void (*pf_merge) ( void *, const void *, const void *, size_t );
         /* Optional epilogue run by EndMerge() after merging; may be NULL */
 137     void (*pf_end_merge) ( void );
 138
         /* NOTE(review): not used in this part of the file; presumably the date of
          * the previously processed picture - confirm against the filter callback */
 139     mtime_t i_last_date;
 140
 141     /* Yadif */
 142     picture_t *pp_history[HISTORY_SIZE];
 143 };
 144
 145 /*****************************************************************************
 146  * SetFilterMethod: setup the deinterlace method to use.
 147  *****************************************************************************/
 148 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
 149 {
 150     filter_sys_t *p_sys = p_filter->p_sys;
 151
 152     if( !psz_method )
 153         psz_method = "";
 154
 155     if( !strcmp( psz_method, "mean" ) )
 156     {
 157         p_sys->i_mode = DEINTERLACE_MEAN;
 158         p_sys->b_double_rate = false;
 159         p_sys->b_half_height = true;
 160     }
 161     else if( !strcmp( psz_method, "bob" )
 162              || !strcmp( psz_method, "progressive-scan" ) )
 163     {
 164         p_sys->i_mode = DEINTERLACE_BOB;
 165         p_sys->b_double_rate = true;
 166         p_sys->b_half_height = false;
 167     }
 168     else if( !strcmp( psz_method, "linear" ) )
 169     {
 170         p_sys->i_mode = DEINTERLACE_LINEAR;
 171         p_sys->b_double_rate = true;
 172         p_sys->b_half_height = false;
 173     }
 174     else if( !strcmp( psz_method, "x" ) )
 175     {
 176         p_sys->i_mode = DEINTERLACE_X;
 177         p_sys->b_double_rate = false;
 178         p_sys->b_half_height = false;
 179     }
 180     else if( !strcmp( psz_method, "yadif" ) )
 181     {
 182         p_sys->i_mode = DEINTERLACE_YADIF;
 183         p_sys->b_double_rate = false;
 184         p_sys->b_half_height = false;
 185     }
 186     else if( !strcmp( psz_method, "yadif2x" ) )
 187     {
 188         p_sys->i_mode = DEINTERLACE_YADIF2X;
 189         p_sys->b_double_rate = true;
 190         p_sys->b_half_height = false;
 191     }
 192     else if( !strcmp( psz_method, "discard" ) )
 193     {
 194         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
 195                             i_chroma == VLC_CODEC_J422;
 196
 197         p_sys->i_mode = DEINTERLACE_DISCARD;
 198         p_sys->b_double_rate = false;
 199         p_sys->b_half_height = !b_i422;
 200     }
 201     else
 202     {
 203         if( strcmp( psz_method, "blend" ) )
 204             msg_Err( p_filter,
 205                      "no valid deinterlace mode provided, using \"blend\"" );
 206
 207         p_sys->i_mode = DEINTERLACE_BLEND;
 208         p_sys->b_double_rate = false;
 209         p_sys->b_half_height = false;
 210     }
 211
 212     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
 213 }
 214
 215 static void GetOutputFormat( filter_t *p_filter,
 216                              video_format_t *p_dst, const video_format_t *p_src )
 217 {
 218     filter_sys_t *p_sys = p_filter->p_sys;
 219     *p_dst = *p_src;
 220
 221     if( p_sys->b_half_height )
 222     {
 223         p_dst->i_height /= 2;
 224         p_dst->i_visible_height /= 2;
 225         p_dst->i_y_offset /= 2;
 226         p_dst->i_sar_den *= 2;
 227     }
 228
 229     if( p_src->i_chroma == VLC_CODEC_I422 ||
 230         p_src->i_chroma == VLC_CODEC_J422 )
 231     {
 232         switch( p_sys->i_mode )
 233         {
 234         case DEINTERLACE_MEAN:
 235         case DEINTERLACE_LINEAR:
 236         case DEINTERLACE_X:
 237         case DEINTERLACE_YADIF:
 238         case DEINTERLACE_YADIF2X:
 239             p_dst->i_chroma = p_src->i_chroma;
 240             break;
 241         default:
 242             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
 243                                                                   VLC_CODEC_J420;
 244             break;
 245         }
 246     }
 247 }
 248
 249 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
 250 {
 251     return i_chroma == VLC_CODEC_I420 ||
 252            i_chroma == VLC_CODEC_J420 ||
 253            i_chroma == VLC_CODEC_YV12 ||
 254            i_chroma == VLC_CODEC_I422 ||
 255            i_chroma == VLC_CODEC_J422;
 256 }
 257
 258 /*****************************************************************************
 259  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 260  *****************************************************************************/
 261 static void RenderDiscard( filter_t *p_filter,
 262                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 263 {
 264     int i_plane;
 265
 266     /* Copy image and skip lines */
 267     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 268     {
 269         uint8_t *p_in, *p_out_end, *p_out;
 270         int i_increment;
 271
 272         p_in = p_pic->p[i_plane].p_pixels
 273                    + i_field * p_pic->p[i_plane].i_pitch;
 274
 275         p_out = p_outpic->p[i_plane].p_pixels;
 276         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 277                              * p_outpic->p[i_plane].i_visible_lines;
 278
 279         switch( p_filter->fmt_in.video.i_chroma )
 280         {
 281         case VLC_CODEC_I420:
 282         case VLC_CODEC_J420:
 283         case VLC_CODEC_YV12:
 284
 285             for( ; p_out < p_out_end ; )
 286             {
 287                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 288
 289                 p_out += p_outpic->p[i_plane].i_pitch;
 290                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 291             }
 292             break;
 293
 294         case VLC_CODEC_I422:
 295         case VLC_CODEC_J422:
 296
 297             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 298
 299             if( i_plane == Y_PLANE )
 300             {
 301                 for( ; p_out < p_out_end ; )
 302                 {
 303                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 304                     p_out += p_outpic->p[i_plane].i_pitch;
 305                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 306                     p_out += p_outpic->p[i_plane].i_pitch;
 307                     p_in += i_increment;
 308                 }
 309             }
 310             else
 311             {
 312                 for( ; p_out < p_out_end ; )
 313                 {
 314                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 315                     p_out += p_outpic->p[i_plane].i_pitch;
 316                     p_in += i_increment;
 317                 }
 318             }
 319             break;
 320
 321         default:
 322             break;
 323         }
 324     }
 325 }
 326
 327 /*****************************************************************************
 328  * RenderBob: renders a BOB picture - simple copy
 329  *****************************************************************************/
 330 static void RenderBob( filter_t *p_filter,
 331                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 332 {
 333     int i_plane;
 334
 335     /* Copy image and skip lines */
 336     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 337     {
 338         uint8_t *p_in, *p_out_end, *p_out;
 339
 340         p_in = p_pic->p[i_plane].p_pixels;
 341         p_out = p_outpic->p[i_plane].p_pixels;
 342         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 343                              * p_outpic->p[i_plane].i_visible_lines;
 344
 345         switch( p_filter->fmt_in.video.i_chroma )
 346         {
 347             case VLC_CODEC_I420:
 348             case VLC_CODEC_J420:
 349             case VLC_CODEC_YV12:
 350                 /* For BOTTOM field we need to add the first line */
 351                 if( i_field == 1 )
 352                 {
 353                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 354                     p_in += p_pic->p[i_plane].i_pitch;
 355                     p_out += p_outpic->p[i_plane].i_pitch;
 356                 }
 357
 358                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 359
 360                 for( ; p_out < p_out_end ; )
 361                 {
 362                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 363
 364                     p_out += p_outpic->p[i_plane].i_pitch;
 365
 366                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 367
 368                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 369                     p_out += p_outpic->p[i_plane].i_pitch;
 370                 }
 371
 372                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 373
 374                 /* For TOP field we need to add the last line */
 375                 if( i_field == 0 )
 376                 {
 377                     p_in += p_pic->p[i_plane].i_pitch;
 378                     p_out += p_outpic->p[i_plane].i_pitch;
 379                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 380                 }
 381                 break;
 382
 383             case VLC_CODEC_I422:
 384             case VLC_CODEC_J422:
 385                 /* For BOTTOM field we need to add the first line */
 386                 if( i_field == 1 )
 387                 {
 388                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 389                     p_in += p_pic->p[i_plane].i_pitch;
 390                     p_out += p_outpic->p[i_plane].i_pitch;
 391                 }
 392
 393                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 394
 395                 if( i_plane == Y_PLANE )
 396                 {
 397                     for( ; p_out < p_out_end ; )
 398                     {
 399                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 400
 401                         p_out += p_outpic->p[i_plane].i_pitch;
 402
 403                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 404
 405                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 406                         p_out += p_outpic->p[i_plane].i_pitch;
 407                     }
 408                 }
 409                 else
 410                 {
 411                     for( ; p_out < p_out_end ; )
 412                     {
 413                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 414
 415                         p_out += p_outpic->p[i_plane].i_pitch;
 416                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 417                     }
 418                 }
 419
 420                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 421
 422                 /* For TOP field we need to add the last line */
 423                 if( i_field == 0 )
 424                 {
 425                     p_in += p_pic->p[i_plane].i_pitch;
 426                     p_out += p_outpic->p[i_plane].i_pitch;
 427                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 428                 }
 429                 break;
 430         }
 431     }
 432 }
 433
 434 #define Merge p_filter->p_sys->pf_merge
 435 #define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
     /* Both macros expect a variable named p_filter in the calling scope.
      * Merge( dst, a, b, n ) calls the merger stored in pf_merge.
      * EndMerge() calls pf_end_merge when it is not NULL. EndMerge expands to a
      * bare `if`, so it must be used as a statement of its own: an `else`
      * written right after it would attach to that hidden `if`. */
 436
 437 /*****************************************************************************
 438  * RenderLinear: BOB with linear interpolation
 439  *****************************************************************************/
 440 static void RenderLinear( filter_t *p_filter,
 441                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 442 {
 443     int i_plane;
 444
 445     /* Copy image and skip lines */
 446     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 447     {
 448         uint8_t *p_in, *p_out_end, *p_out;
 449
 450         p_in = p_pic->p[i_plane].p_pixels;
 451         p_out = p_outpic->p[i_plane].p_pixels;
 452         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 453                              * p_outpic->p[i_plane].i_visible_lines;
 454
 455         /* For BOTTOM field we need to add the first line */
 456         if( i_field == 1 )
 457         {
 458             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 459             p_in += p_pic->p[i_plane].i_pitch;
 460             p_out += p_outpic->p[i_plane].i_pitch;
 461         }
 462
 463         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 464
 465         for( ; p_out < p_out_end ; )
 466         {
 467             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 468
 469             p_out += p_outpic->p[i_plane].i_pitch;
 470
 471             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 472                    p_pic->p[i_plane].i_pitch );
 473
 474             p_in += 2 * p_pic->p[i_plane].i_pitch;
 475             p_out += p_outpic->p[i_plane].i_pitch;
 476         }
 477
 478         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 479
 480         /* For TOP field we need to add the last line */
 481         if( i_field == 0 )
 482         {
 483             p_in += p_pic->p[i_plane].i_pitch;
 484             p_out += p_outpic->p[i_plane].i_pitch;
 485             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 486         }
 487     }
 488     EndMerge();
 489 }
 490
 491 static void RenderMean( filter_t *p_filter,
 492                         picture_t *p_outpic, picture_t *p_pic )
 493 {
 494     int i_plane;
 495
 496     /* Copy image and skip lines */
 497     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 498     {
 499         uint8_t *p_in, *p_out_end, *p_out;
 500
 501         p_in = p_pic->p[i_plane].p_pixels;
 502
 503         p_out = p_outpic->p[i_plane].p_pixels;
 504         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 505                              * p_outpic->p[i_plane].i_visible_lines;
 506
 507         /* All lines: mean value */
 508         for( ; p_out < p_out_end ; )
 509         {
 510             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 511                    p_pic->p[i_plane].i_pitch );
 512
 513             p_out += p_outpic->p[i_plane].i_pitch;
 514             p_in += 2 * p_pic->p[i_plane].i_pitch;
 515         }
 516     }
 517     EndMerge();
 518 }
 519
 520 static void RenderBlend( filter_t *p_filter,
 521                          picture_t *p_outpic, picture_t *p_pic )
 522 {
 523     int i_plane;
 524
 525     /* Copy image and skip lines */
 526     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 527     {
 528         uint8_t *p_in, *p_out_end, *p_out;
 529
 530         p_in = p_pic->p[i_plane].p_pixels;
 531
 532         p_out = p_outpic->p[i_plane].p_pixels;
 533         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 534                              * p_outpic->p[i_plane].i_visible_lines;
 535
 536         switch( p_filter->fmt_in.video.i_chroma )
 537         {
 538             case VLC_CODEC_I420:
 539             case VLC_CODEC_J420:
 540             case VLC_CODEC_YV12:
 541                 /* First line: simple copy */
 542                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 543                 p_out += p_outpic->p[i_plane].i_pitch;
 544
 545                 /* Remaining lines: mean value */
 546                 for( ; p_out < p_out_end ; )
 547                 {
 548                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 549                            p_pic->p[i_plane].i_pitch );
 550
 551                     p_out += p_outpic->p[i_plane].i_pitch;
 552                     p_in += p_pic->p[i_plane].i_pitch;
 553                 }
 554                 break;
 555
 556             case VLC_CODEC_I422:
 557             case VLC_CODEC_J422:
 558                 /* First line: simple copy */
 559                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 560                 p_out += p_outpic->p[i_plane].i_pitch;
 561
 562                 /* Remaining lines: mean value */
 563                 if( i_plane == Y_PLANE )
 564                 {
 565                     for( ; p_out < p_out_end ; )
 566                     {
 567                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 568                                p_pic->p[i_plane].i_pitch );
 569
 570                         p_out += p_outpic->p[i_plane].i_pitch;
 571                         p_in += p_pic->p[i_plane].i_pitch;
 572                     }
 573                 }
 574
 575                 else
 576                 {
 577                     for( ; p_out < p_out_end ; )
 578                     {
 579                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 580                                p_pic->p[i_plane].i_pitch );
 581
 582                         p_out += p_outpic->p[i_plane].i_pitch;
 583                         p_in += 2*p_pic->p[i_plane].i_pitch;
 584                     }
 585                 }
 586                 break;
 587         }
 588     }
 589     EndMerge();
 590 }
 591
 592 #undef Merge
 593
 594 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 595                           const void *_p_s2, size_t i_bytes )
 596 {
 597     uint8_t* p_dest = (uint8_t*)_p_dest;
 598     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 599     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 600     uint8_t* p_end = p_dest + i_bytes - 8;
 601
 602     while( p_dest < p_end )
 603     {
 604         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 605         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 606         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 607         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 608         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 609         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 610         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 611         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 612     }
 613
 614     p_end += 8;
 615
 616     while( p_dest < p_end )
 617     {
 618         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 619     }
 620 }
 621
 622 #if defined(CAN_COMPILE_MMXEXT)
 623 static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
 624                          size_t i_bytes )
 625 {
 626     uint8_t* p_dest = (uint8_t*)_p_dest;
 627     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 628     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 629     uint8_t* p_end = p_dest + i_bytes - 8;
 630     while( p_dest < p_end )
 631     {
 632         __asm__  __volatile__( "movq %2,%%mm1;"
 633                                "pavgb %1, %%mm1;"
 634                                "movq %%mm1, %0" :"=m" (*p_dest):
 635                                                  "m" (*p_s1),
 636                                                  "m" (*p_s2) );
 637         p_dest += 8;
 638         p_s1 += 8;
 639         p_s2 += 8;
 640     }
 641
 642     p_end += 8;
 643
 644     while( p_dest < p_end )
 645     {
 646         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 647     }
 648 }
 649 #endif
 650
 651 #if defined(CAN_COMPILE_3DNOW)
 652 static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
 653                         size_t i_bytes )
 654 {
 655     uint8_t* p_dest = (uint8_t*)_p_dest;
 656     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 657     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 658     uint8_t* p_end = p_dest + i_bytes - 8;
 659     while( p_dest < p_end )
 660     {
 661         __asm__  __volatile__( "movq %2,%%mm1;"
 662                                "pavgusb %1, %%mm1;"
 663                                "movq %%mm1, %0" :"=m" (*p_dest):
 664                                                  "m" (*p_s1),
 665                                                  "m" (*p_s2) );
 666         p_dest += 8;
 667         p_s1 += 8;
 668         p_s2 += 8;
 669     }
 670
 671     p_end += 8;
 672
 673     while( p_dest < p_end )
 674     {
 675         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 676     }
 677 }
 678 #endif
 679
 680 #if defined(CAN_COMPILE_SSE)
 681 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 682                        size_t i_bytes )
 683 {
 684     uint8_t* p_dest = (uint8_t*)_p_dest;
 685     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 686     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 687     uint8_t* p_end;
 688     while( (uintptr_t)p_s1 % 16 )
 689     {
 690         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 691     }
 692     p_end = p_dest + i_bytes - 16;
 693     while( p_dest < p_end )
 694     {
 695         __asm__  __volatile__( "movdqu %2,%%xmm1;"
 696                                "pavgb %1, %%xmm1;"
 697                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
 698                                                  "m" (*p_s1),
 699                                                  "m" (*p_s2) );
 700         p_dest += 16;
 701         p_s1 += 16;
 702         p_s2 += 16;
 703     }
 704
 705     p_end += 16;
 706
 707     while( p_dest < p_end )
 708     {
 709         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 710     }
 711 }
 712 #endif
 713
 714 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
     /* EndMMX: merge epilogue with the signature of filter_sys_t.pf_end_merge.
      * Executes the x86 `emms` instruction, which marks the MMX registers as
      * empty again. NOTE(review): presumably installed next to MergeMMXEXT /
      * MergeSSE2; the code that selects it is not in this part of the file. */
 715 static void EndMMX( void )
 716 {
 717     __asm__ __volatile__( "emms" :: );
 718 }
 719 #endif
 720
 721 #if defined(CAN_COMPILE_3DNOW)
     /* End3DNow: merge epilogue with the signature of filter_sys_t.pf_end_merge.
      * Executes `femms`, the 3DNow! counterpart of `emms`. NOTE(review):
      * presumably installed next to Merge3DNow; the code that selects it is not
      * in this part of the file. */
 722 static void End3DNow( void )
 723 {
 724     __asm__ __volatile__( "femms" :: );
 725 }
 726 #endif
 727
 728 #ifdef CAN_COMPILE_C_ALTIVEC
 729 static void MergeAltivec( void *_p_dest, const void *_p_s1,
 730                           const void *_p_s2, size_t i_bytes )
 731 {
 732     uint8_t *p_dest = (uint8_t *)_p_dest;
 733     uint8_t *p_s1   = (uint8_t *)_p_s1;
 734     uint8_t *p_s2   = (uint8_t *)_p_s2;
 735     uint8_t *p_end  = p_dest + i_bytes - 15;
 736
 737     /* Use C until the first 16-bytes aligned destination pixel */
 738     while( (uintptr_t)p_dest & 0xF )
 739     {
 740         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 741     }
 742
 743     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
 744     {
 745         /* Unaligned source */
 746         vector unsigned char s1v, s2v, destv;
 747         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
 748         vector unsigned char perm1v, perm2v;
 749
 750         perm1v = vec_lvsl( 0, p_s1 );
 751         perm2v = vec_lvsl( 0, p_s2 );
 752         s1oldv = vec_ld( 0, p_s1 );
 753         s2oldv = vec_ld( 0, p_s2 );
 754
 755         while( p_dest < p_end )
 756         {
 757             s1newv = vec_ld( 16, p_s1 );
 758             s2newv = vec_ld( 16, p_s2 );
 759             s1v    = vec_perm( s1oldv, s1newv, perm1v );
 760             s2v    = vec_perm( s2oldv, s2newv, perm2v );
 761             s1oldv = s1newv;
 762             s2oldv = s2newv;
 763             destv  = vec_avg( s1v, s2v );
 764             vec_st( destv, 0, p_dest );
 765
 766             p_s1   += 16;
 767             p_s2   += 16;
 768             p_dest += 16;
 769         }
 770     }
 771     else
 772     {
 773         /* Aligned source */
 774         vector unsigned char s1v, s2v, destv;
 775
 776         while( p_dest < p_end )
 777         {
 778             s1v   = vec_ld( 0, p_s1 );
 779             s2v   = vec_ld( 0, p_s2 );
 780             destv = vec_avg( s1v, s2v );
 781             vec_st( destv, 0, p_dest );
 782
 783             p_s1   += 16;
 784             p_s2   += 16;
 785             p_dest += 16;
 786         }
 787     }
 788
 789     p_end += 15;
 790
 791     while( p_dest < p_end )
 792     {
 793         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 794     }
 795 }
 796 #endif
 797
 798 #ifdef __ARM_NEON__
 799 static void MergeNEON (void *restrict out, const void *in1,
 800                        const void *in2, size_t n)
 801 {
 802     uint8_t *outp = out;
 803     const uint8_t *in1p = in1;
 804     const uint8_t *in2p = in2;
 805     size_t mis = ((uintptr_t)outp) & 15;
 806
 807     if (mis)
 808     {
 809         MergeGeneric (outp, in1p, in2p, mis);
 810         outp += mis;
 811         in1p += mis;
 812         in2p += mis;
 813         n -= mis;
 814     }
 815
 816     uint8_t *end = outp + (n & ~15);
 817
 818     if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
 819         while (outp < end)
 820             asm volatile (
 821                 "vld1.u8  {q0-q1}, [%[in1]]!\n"
 822                 "vld1.u8  {q2-q3}, [%[in2]]!\n"
 823                 "vhadd.u8 q4, q0, q2\n"
 824                 "vld1.u8  {q6-q7}, [%[in1]]!\n"
 825                 "vhadd.u8 q5, q1, q3\n"
 826                 "vld1.u8  {q8-q9}, [%[in2]]!\n"
 827                 "vhadd.u8 q10, q6, q8\n"
 828                 "vhadd.u8 q11, q7, q9\n"
 829                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 830                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 831                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 832                 :
 833                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 834                   "q8", "q9", "q10", "q11", "memory");
 835     else
 836          while (outp < end)
 837             asm volatile (
 838                 "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
 839                 "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
 840                 "vhadd.u8 q4, q0, q2\n"
 841                 "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
 842                 "vhadd.u8 q5, q1, q3\n"
 843                 "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
 844                 "vhadd.u8 q10, q6, q8\n"
 845                 "vhadd.u8 q11, q7, q9\n"
 846                 "vst1.u8  {q4-q5}, [%[out],:128]!\n"
 847                 "vst1.u8  {q10-q11}, [%[out],:128]!\n"
 848                 : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
 849                 :
 850                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
 851                   "q8", "q9", "q10", "q11", "memory");
 852     n &= 15;
 853     if (n)
 854         MergeGeneric (outp, in1p, in2p, n);
 855 }
 856 #endif
 857
 858 /*****************************************************************************
 859  * RenderX: This algorithm works on an 8x8 block basis: it copies the top field
 860  * and applies a process to recreate the bottom field:
 861  *  If a 8x8 block is classified as :
 862  *   - progressive: it applies a small blend (1,6,1)
 863  *   - interlaced:
 864  *    * in the MMX version: we do a ME between the 2 fields, if there is a
 865  *    good match we use MC to recreate the bottom field (with a small
 866  *    blend (1,6,1) )
 867  *    * otherwise: it recreates the bottom field by an edge oriented
 868  *    interpolation.
 869   *****************************************************************************/
 870
 871 /* XDeint8x8Detect: detect if a 8x8 block is interlaced.
 872  * XXX: It need to access to 8x10
 873  * We use more than 8 lines to help with scrolling (text)
 874  * (and because XDeint8x8Frame use line 9)
 875  * XXX: smooth/uniform area with noise detection doesn't works well
 876  * but it's not really a problem because they don't have much details anyway
 877  */
 878 static inline int ssd( int a ) { return a*a; }
 879 static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 880 {
 881     int y, x;
 882     int ff, fr;
 883     int fc;
 884
 885     /* Detect interlacing */
 886     fc = 0;
 887     for( y = 0; y < 7; y += 2 )
 888     {
 889         ff = fr = 0;
 890         for( x = 0; x < 8; x++ )
 891         {
 892             fr += ssd(src[      x] - src[1*i_src+x]) +
 893                   ssd(src[i_src+x] - src[2*i_src+x]);
 894             ff += ssd(src[      x] - src[2*i_src+x]) +
 895                   ssd(src[i_src+x] - src[3*i_src+x]);
 896         }
 897         if( ff < 6*fr/8 && fr > 32 )
 898             fc++;
 899
 900         src += 2*i_src;
 901     }
 902
 903     return fc < 1 ? false : true;
 904 }
 905 #ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8DetectC.
 *
 * src is the top-left pixel of the block and i_src the line pitch.
 * Returns the number of line groups classified as interlaced (callers only
 * test it against zero).
 *
 * NOTE(review): the outer loop runs for y = 0, 2, 4, 6, 8 (five groups) while
 * the C version stops at y = 6 (four groups). Each group reads lines 0..3
 * relative to the current src, so this version touches lines 0..11 instead of
 * the 8x10 area the C version reads - confirm this is intended.
 *
 * The *_r2r/_m2r/_r2m macros are not defined in this part of the file;
 * presumably they come from mmx.h and each emits the like-named MMX
 * instruction. The per-line comments below rely on that assumption.
 */
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
{

    int y, x;
    int32_t ff, fr;
    int fc;

    /* Detect interlacing */
    fc = 0;
    /* mm7 = 0, used below to widen bytes to 16-bit words */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 9; y += 2 )
    {
        ff = fr = 0;
        /* mm5 accumulates the "frame" distance, mm6 the "field" distance */
        pxor_r2r( mm5, mm5 );
        pxor_r2r( mm6, mm6 );
        /* 4 pixels per iteration, 2 iterations for the 8 columns */
        for( x = 0; x < 8; x+=4 )
        {
            /* Load 4 pixels from each of the 4 consecutive lines */
            movd_m2r( src[        x], mm0 );
            movd_m2r( src[1*i_src+x], mm1 );
            movd_m2r( src[2*i_src+x], mm2 );
            movd_m2r( src[3*i_src+x], mm3 );

            /* Widen to words */
            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            punpcklbw_r2r( mm7, mm3 );

            movq_r2r( mm0, mm4 );

            /* mm0 = line0 - line1, mm4 = line0 - line2 */
            psubw_r2r( mm1, mm0 );
            psubw_r2r( mm2, mm4 );

            /* mm2 = line2 - line1, mm3 = line3 - line1 */
            psubw_r2r( mm1, mm2 );
            psubw_r2r( mm1, mm3 );

            /* Square the differences and sum them pairwise */
            pmaddwd_r2r( mm0, mm0 );
            pmaddwd_r2r( mm4, mm4 );
            pmaddwd_r2r( mm2, mm2 );
            pmaddwd_r2r( mm3, mm3 );
            /* frame part: (l0-l1)^2 + (l2-l1)^2, field part: (l0-l2)^2 + (l3-l1)^2 */
            paddd_r2r( mm0, mm2 );
            paddd_r2r( mm4, mm3 );
            paddd_r2r( mm2, mm5 );
            paddd_r2r( mm3, mm6 );
        }

        /* Horizontal add of the two 32-bit halves of mm5 -> fr */
        movq_r2r( mm5, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm5 );
        movd_r2m( mm5, fr );

        /* Same for mm6 -> ff */
        movq_r2r( mm6, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm6 );
        movd_r2m( mm6, ff );

        /* Same decision rule as the C version */
        if( ff < 6*fr/8 && fr > 32 )
            fc++;

        src += 2*i_src;
    }
    return fc;
}
 968 #endif
 969
 970 static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
 971                                     uint8_t *src1, int i_src1,
 972                                     uint8_t *src2, int i_src2 )
 973 {
 974     int y, x;
 975
 976     /* Progressive */
 977     for( y = 0; y < 8; y += 2 )
 978     {
 979         memcpy( dst, src1, 8 );
 980         dst  += i_dst;
 981
 982         for( x = 0; x < 8; x++ )
 983             dst[x] = (src1[x] + 6*src2[x] + src1[i_src1+x] + 4 ) >> 3;
 984         dst += i_dst;
 985
 986         src1 += i_src1;
 987         src2 += i_src2;
 988     }
 989 }
 990
 991 #ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8MergeC: copies the lines read through src1 and
 * fills each following output line with the rounded (1,6,1) blend of
 * src1[line], src2[line] and src1[line + 1].
 *
 * The register macros are not defined in this part of the file (presumably
 * mmx.h); the comments assume each one emits the like-named MMX instruction.
 */
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    /* Rounding term: 4 in each of the four 16-bit lanes */
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    /* mm7 = 0, used to widen bytes and as the upper half when packing */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 8; y += 2 )
    {
        /* 4 pixels at a time */
        for( x = 0; x < 8; x +=4 )
        {
            /* Copy the kept line as is */
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            /* mm1 = 2*src2, mm3 = 4*src2 */
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            /* mm0 = src1[line] + src1[line + 1] */
            paddw_r2r( mm2, mm0 );
            /* mm1 = 6*src2 + 4 */
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );
            /* mm0 = (sum) >> 3, then back to bytes */
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
1030
1031 #endif
1032
/* Debugging aid: paint a whole 8x8 block of dst (pitch i_dst) with value v. */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    for( int i_line = 0; i_line != 8; ++i_line )
    {
        uint8_t *p_line = dst + i_line * i_dst;
        memset( p_line, v, 8 );
    }
}
1040
/* XDeint8x8FieldE: basic (1,0,1) deinterlacing for blocks that miss a
 * neighbour.
 * (Uses 8x9 pixels: source lines 0, 2, 4, 6 and 8.)
 * TODO: a better one for the inner part.
 *
 * Every even source line is copied; the output line after it is the
 * truncated average of that line and the source line two below.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *p_cur  = src;
        uint8_t *p_next = src + 2 * i_src;

        /* Field line that is kept */
        memcpy( dst, p_cur, 8 );
        dst += i_dst;

        /* Missing line: mean of the field lines above and below */
        for( int i_col = 0; i_col < 8; i_col++ )
        {
            const int i_sum = p_cur[i_col] + p_next[i_col];
            dst[i_col] = i_sum >> 1;
        }
        dst += i_dst;

        src = p_next;
    }
}
1063 #ifdef CAN_COMPILE_MMXEXT
/* MMX counterpart of XDeint8x8FieldEC: copies source lines 0, 2, 4, 6 and
 * fills each following output line with the average of the copied line and
 * the source line two below it (source line 8 is read for the last one).
 *
 * NOTE(review): the x86 PAVGB instruction rounds up ((a + b + 1) >> 1) whereas
 * the C version truncates ((a + b) >> 1); assuming pavgb_r2r() maps to that
 * instruction, both versions can differ by one - confirm this is acceptable.
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Copy the 8 pixels of the kept line */
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );
        dst += i_dst;

        /* Average with the next line of the same field */
        movq_m2r( src[2*i_src], mm1 );
        pavgb_r2r( mm1, mm0 );

        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1085 #endif
1086
/* XDeint8x8Field: edge-oriented interpolation.
 * (Needs 4 pixels to the left and 5 pixels to the right of the block, and
 * one more field line below it.)
 *
 * Even source lines are copied. Each pixel of the missing lines is
 * interpolated between the field line above (src) and the one below (src2)
 * along the direction whose 8-pixel sum of absolute differences is the
 * smallest: leaning one way, the other way, or straight vertical.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *src2 = src + 2 * i_src;

        memcpy( dst, src, 8 );
        dst += i_dst;

        for( int x = 0; x < 8; x++ )
        {
            /* 8 pixels are used just to match the MMX version, but it is
             * overkill: 5 would be enough (less is not good) */
            int i_sad_m = 0;    /* upper line shifted by -1, lower by +1 */
            int i_sad_0 = 0;    /* no shift */
            int i_sad_p = 0;    /* upper line shifted by +1, lower by -1 */
            int i_shift;

            for( int k = 0; k < 8; k++ )
            {
                i_sad_m += abs( src[x-4+k] - src2[x-2+k] );
                i_sad_0 += abs( src[x-3+k] - src2[x-3+k] );
                i_sad_p += abs( src[x-2+k] - src2[x-4+k] );
            }

            /* A diagonal is only chosen when the costs are ordered
             * consistently across the three directions */
            if( i_sad_m < i_sad_0 && i_sad_0 <= i_sad_p )
                i_shift = -1;
            else if( i_sad_p < i_sad_0 && i_sad_0 <= i_sad_m )
                i_shift = 1;
            else
                i_shift = 0;

            dst[x] = ( src[x + i_shift] + src2[x - i_shift] ) >> 1;
        }

        dst += i_dst;
        src = src2;
    }
}
1133 #ifdef CAN_COMPILE_MMXEXT
/* MMX-assisted counterpart of XDeint8x8FieldC (edge-oriented interpolation).
 *
 * Only the three 8-pixel sums of absolute differences are computed with MMX
 * (psadbw); the direction choice and the interpolation itself are plain C and
 * identical to the C version. Like the C version it reads pixels x-4..x+5
 * around each column and the source line two below the current one.
 *
 * The register macros are not defined in this part of the file (presumably
 * mmx.h); the comments assume each one emits the like-named instruction.
 */
static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src, int i_src )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Keep the current field line */
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( x = 0; x < 8; x++ )
        {
            /* Next line of the same field */
            uint8_t *src2 = &src[2*i_src];
            int32_t c0, c1, c2;

            /* 8 upper-line pixels starting at x-2, x-3 and x-4 */
            movq_m2r( src[x-2], mm0 );
            movq_m2r( src[x-3], mm1 );
            movq_m2r( src[x-4], mm2 );

            /* SAD against the lower line with the opposite shift */
            psadbw_m2r( src2[x-4], mm0 );
            psadbw_m2r( src2[x-3], mm1 );
            psadbw_m2r( src2[x-2], mm2 );

            /* Same cost naming as the C version: c0/c2 diagonals, c1 vertical */
            movd_r2m( mm0, c2 );
            movd_r2m( mm1, c1 );
            movd_r2m( mm2, c0 );

            if( c0 < c1 && c1 <= c2 )
                dst[x] = (src[x-1] + src2[x+1]) >> 1;
            else if( c2 < c1 && c1 <= c0 )
                dst[x] = (src[x+1] + src2[x-1]) >> 1;
            else
                dst[x] = (src[x+0] + src2[x+0]) >> 1;
        }

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1174 #endif
1175
/* NxN: arbitrary block size (only pixels inside the NxN block are used).
 *
 * XDeintNxNDetect: returns non-zero when the block looks interlaced.
 * Beware of the parameter order: the height comes BEFORE the width here,
 * unlike in the other XDeintNxN* functions.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    int i_frame = 0;    /* squared distance between neighbouring lines */
    int i_field = 0;    /* squared distance between same-field lines */
    int i_combed = 0;

    /* Both distances keep accumulating from one line pair to the next */
    for( int i_line = 0; i_line < i_height - 2; i_line += 2 )
    {
        const uint8_t *p_cur  = src + i_line * i_src;
        const uint8_t *p_adj  = p_cur + i_src;
        const uint8_t *p_same = p_adj + i_src;

        for( int i_col = 0; i_col < i_width; i_col++ )
        {
            i_frame += ssd( p_cur[i_col] - p_adj[i_col] );
            i_field += ssd( p_cur[i_col] - p_same[i_col] );
        }

        if( i_frame > i_width / 2 && i_field < i_frame )
            i_combed++;
    }

    return i_combed >= 2;
}
1204
/* XDeintNxNFrame: progressive reconstruction of an i_width x i_height block.
 *
 * Even lines are copied. Odd lines are a rounded (1,2,1) blend of the three
 * source lines around them, except the last line of the block which is the
 * average of the two last source lines (nothing below it may be read).
 *
 * Only lines 0..i_height-1 of src and dst are accessed: when i_height is odd
 * the last (even) line is copied and nothing is produced after it.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Progressive */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd block height: line y was the last one of the block, there is
         * neither a source line y+1 to blend nor an output line y+1 to fill */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1232
/* XDeintNxNField: interlaced reconstruction of an i_width x i_height block.
 *
 * Even lines are copied. Odd lines are the average of the field lines above
 * and below them, except the last line of the block which is the average of
 * the two last source lines (nothing below it may be read).
 *
 * Only lines 0..i_height-1 of src and dst are accessed: when i_height is odd
 * the last (even) line is copied and nothing is produced after it.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    /* Interlaced */
    for( int y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd block height: line y was the last one of the block, there is
         * neither a source line y+1 to blend nor an output line y+1 to fill */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( int x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1260
/* XDeintNxN: deinterlace a block of arbitrary size (i_width x i_height).
 * The block is first classified with XDeintNxNDetect(), then rebuilt with
 * XDeintNxNField() when it looks interlaced, XDeintNxNFrame() otherwise.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect() takes the height BEFORE the width, unlike the other
     * NxN helpers. Passing them the other way round made it scan i_width
     * lines of i_height pixels, i.e. possibly outside the block. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1269
1270
/* Median of three integers: the sum of the three values minus the smallest
 * and the largest one. */
static inline int median( int a, int b, int c )
{
    /* Order a and b first ... */
    int i_low  = ( b < a ) ? b : a;
    int i_high = ( b < a ) ? a : b;

    /* ... then let c widen the range if it falls outside of it */
    if( c < i_low )
        i_low = c;
    else if( c > i_high )
        i_high = c;

    return a + b + c - i_low - i_high;
}
1286
1287
/* XDeintBand8x8: deinterlace one band of 8 lines (C version).
 *
 * The band is made of i_mbx full 8x8 blocks followed by an optional narrower
 * block of i_modx columns. Each full block is classified, then:
 *  - progressive blocks get the (1,6,1) merge,
 *  - interlaced blocks get the edge-oriented interpolation, except the
 *    first and last full blocks of the band which use the simpler (1,0,1)
 *    variant.
 * The trailing i_modx columns, if any, go through the generic NxN code.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive: even lines kept, odd lines blended */
            XDeint8x8MergeC( dst, i_dst,
                             src,         2 * i_src,
                             src + i_src, 2 * i_src );
            continue;
        }

        /* Interlaced */
        if( i_mb == 0 || i_mb == i_last )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1320 #ifdef CAN_COMPILE_MMXEXT
/* MMXEXT version of XDeintBand8x8C: same block walk, but the full 8x8 blocks
 * are classified and rebuilt with the *MMXEXT helpers. The trailing i_modx
 * columns, if any, still go through the generic (C) NxN code.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    const int i_last = i_mbx - 1;

    for( int i_mb = 0; i_mb < i_mbx; i_mb++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive: even lines kept, odd lines blended */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  src,         2 * i_src,
                                  src + i_src, 2 * i_src );
            continue;
        }

        /* Interlaced: the blocks at both ends use the simpler variant */
        if( i_mb == 0 || i_mb == i_last )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1352 #endif
1353
1354 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1355 {
1356     int i_plane;
1357
1358     /* Copy image and skip lines */
1359     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1360     {
1361         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1362         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1363
1364         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1365         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1366
1367         const int i_dst = p_outpic->p[i_plane].i_pitch;
1368         const int i_src = p_pic->p[i_plane].i_pitch;
1369
1370         int y, x;
1371
1372         for( y = 0; y < i_mby; y++ )
1373         {
1374             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1375             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1376
1377 #ifdef CAN_COMPILE_MMXEXT
1378             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1379                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1380             else
1381 #endif
1382                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1383         }
1384
1385         /* Last line (C only)*/
1386         if( i_mody )
1387         {
1388             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1389             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1390
1391             for( x = 0; x < i_mbx; x++ )
1392             {
1393                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1394
1395                 dst += 8;
1396                 src += 8;
1397             }
1398
1399             if( i_modx )
1400                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1401         }
1402     }
1403
1404 #ifdef CAN_COMPILE_MMXEXT
1405     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1406         emms();
1407 #endif
1408 }
1409
1410 /*****************************************************************************
1411  * Yadif (Yet Another DeInterlacing Filter).
1412  *****************************************************************************/
1413 /* */
/* Settings handed to the yadif line filter (first argument of the filter
 * function called from RenderYadif()). */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only the 0x02 bit has a meaning, as we handle the 0x01 part
     * (frame vs field rate) ourselves.
     */
    int mode;
};
1425
/* Compatibility definitions set up right before yadif.h is included;
 * presumably yadif.h (taken from MPlayer's vf_yadif.c) relies on these
 * names - TODO confirm against yadif.h. */

/* Integer type as wide as a pointer.
 * I am unsure it is the right one */
typedef intptr_t x86_reg;

/* FFmpeg-style helpers. Note that FFABS() evaluates its argument twice.
 * __MAX/__MIN are not defined in this part of the file (presumably provided
 * by vlc_common.h). */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)
1434
1435 /* yadif.h comes from vf_yadif.c of mplayer project */
1436 #include "yadif.h"
1437
/* RenderYadif: produce one yadif-deinterlaced picture in p_dst.
 *
 * p_filter: the filter, whose p_sys holds the picture history
 * p_dst:    output picture
 * p_src:    newest input picture; it is copied into the history when
 *           i_order is 0 and otherwise not used by this function
 * i_order:  0 for the first output picture of an input picture, 1 for the
 *           second one (the history is only advanced when it is 0)
 * i_field:  parity (0 or 1) of the lines copied verbatim from the current
 *           picture; the other lines are rebuilt by the yadif line filter
 *
 * Returns VLC_SUCCESS when p_dst was rendered (with yadif, or with RenderX()
 * as a fallback while only the newest history slot is filled) and
 * VLC_EGENERIC when there is not enough history to output anything.
 *
 * On yadif success p_dst->date is rewritten from the history: the date of the
 * current picture for i_order 0, halfway to the next picture for i_order 1.
 */
static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
{
    filter_sys_t *p_sys = p_filter->p_sys;

    /* Both parameters are used as 0/1 values below */
    assert( i_order == 0 || i_order == 1 );
    assert( i_field == 0 || i_field == 1 );

    if( i_order == 0 )
    {
        /* Duplicate the picture
         * TODO when the vout rework is finished, picture_Hold() might be enough
         * but be careful, the pitches must match */
        picture_t *p_dup = picture_NewFromFormat( &p_src->format );
        if( p_dup )
            picture_Copy( p_dup, p_src );

        /* Slide the history: drop the oldest picture, append the copy.
         * If the allocation failed, NULL is appended and the checks below
         * reject the incomplete history. */
        if( p_sys->pp_history[0] )
            picture_Release( p_sys->pp_history[0]  );
        for( int i = 1; i < HISTORY_SIZE; i++ )
            p_sys->pp_history[i-1] = p_sys->pp_history[i];
        p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
    }

    /* As the pitches must match, use ONLY pictures coming from picture_New()! */
    picture_t *p_prev = p_sys->pp_history[0];
    picture_t *p_cur  = p_sys->pp_history[1];
    picture_t *p_next = p_sys->pp_history[2];

    /* Filter if we have all the pictures we need */
    if( p_prev && p_cur && p_next )
    {
        /* Line filter: optimized version when available, C otherwise */
        void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
#if defined(HAVE_YADIF_SSE2)
        if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
            filter = yadif_filter_line_mmx2;
        else
#endif
            filter = yadif_filter_line_c;

        for( int n = 0; n < p_dst->i_planes; n++ )
        {
            const plane_t *prevp = &p_prev->p[n];
            const plane_t *curp  = &p_cur->p[n];
            const plane_t *nextp = &p_next->p[n];
            plane_t *dstp        = &p_dst->p[n];

            /* The first and last lines are not filtered, they are filled by
             * duplication at the end of the loop body */
            for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
            {
                if( (y % 2) == i_field )
                {
                    /* Line of the kept field: plain copy from the current picture */
                    vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
                                &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
                }
                else
                {
                    struct vf_priv_s cfg;
                    /* Spatial checks only when enough data */
                    cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;

                    /* A single pitch (curp->i_pitch) is given to the filter
                     * for the three input pictures */
                    assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
                    filter( &cfg,
                            &dstp->p_pixels[y * dstp->i_pitch],
                            &prevp->p_pixels[y * prevp->i_pitch],
                            &curp->p_pixels[y * curp->i_pitch],
                            &nextp->p_pixels[y * nextp->i_pitch],
                            dstp->i_visible_pitch,
                            curp->i_pitch,
                            (i_field ^ (i_order == i_field)) & 1 );
                }

                /* We duplicate the first and last lines */
                if( y == 1 )
                    vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
                else if( y == dstp->i_visible_lines - 2 )
                    vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
            }
        }

        /* The output corresponds to the current (not the newest) picture:
         * its date for i_order 0, halfway to the next one for i_order 1 */
        p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
        return VLC_SUCCESS;
    }
    else if( !p_prev && !p_cur && p_next )
    {
        /* Only the newest picture is available: fall back to RenderX.
         * FIXME not good as it does not use i_order/i_field */
        RenderX( p_dst, p_next );
        return VLC_SUCCESS;
    }
    else
    {
        /* Incomplete history: nothing can be rendered */
        return VLC_EGENERIC;
    }
}
1534
1535 /*****************************************************************************
1536  * video filter2 functions
1537  *****************************************************************************/
1538 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1539 {
1540     filter_sys_t *p_sys = p_filter->p_sys;
1541     picture_t *p_dst[2];
1542
1543     /* Request output picture */
1544     p_dst[0] = filter_NewPicture( p_filter );
1545     if( p_dst[0] == NULL )
1546     {
1547         picture_Release( p_pic );
1548         return NULL;
1549     }
1550     picture_CopyProperties( p_dst[0], p_pic );
1551
1552     if( p_sys->b_double_rate )
1553     {
1554         p_dst[0]->p_next =
1555         p_dst[1]         = filter_NewPicture( p_filter );
1556         if( p_dst[1] )
1557         {
1558             picture_CopyProperties( p_dst[1], p_pic );
1559             /* XXX it's not really good especially for the first picture, but
1560              * I don't think that delaying by one frame is worth it */
1561             if( p_sys->i_last_date > VLC_TS_INVALID && p_pic->date > VLC_TS_INVALID )
1562                 p_dst[1]->date = p_pic->date + (p_pic->date - p_sys->i_last_date) / 2;
1563         }
1564         p_sys->i_last_date = p_pic->date;
1565     }
1566     else
1567     {
1568         p_dst[1] = NULL;
1569     }
1570
1571     switch( p_sys->i_mode )
1572     {
1573         case DEINTERLACE_DISCARD:
1574             RenderDiscard( p_filter, p_dst[0], p_pic, 0 );
1575             break;
1576
1577         case DEINTERLACE_BOB:
1578             RenderBob( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1579             if( p_dst[1] )
1580                 RenderBob( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1581             break;;
1582
1583         case DEINTERLACE_LINEAR:
1584             RenderLinear( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1585             if( p_dst[1] )
1586                 RenderLinear( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1587             break;
1588
1589         case DEINTERLACE_MEAN:
1590             RenderMean( p_filter, p_dst[0], p_pic );
1591             break;
1592
1593         case DEINTERLACE_BLEND:
1594             RenderBlend( p_filter, p_dst[0], p_pic );
1595             break;
1596
1597         case DEINTERLACE_X:
1598             RenderX( p_dst[0], p_pic );
1599             break;
1600
1601         case DEINTERLACE_YADIF:
1602             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, 0 ) )
1603                 goto drop;
1604             break;
1605
1606         case DEINTERLACE_YADIF2X:
1607             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, !p_pic->b_top_field_first ) )
1608                 goto drop;
1609             if( p_dst[1] )
1610                 RenderYadif( p_filter, p_dst[1], p_pic, 1, p_pic->b_top_field_first );
1611             break;
1612     }
1613
1614     p_dst[0]->b_progressive = true;
1615     if( p_dst[1] )
1616         p_dst[1]->b_progressive = true;
1617
1618     picture_Release( p_pic );
1619     return p_dst[0];
1620
1621 drop:
1622     picture_Release( p_dst[0] );
1623     if( p_dst[1] )
1624         picture_Release( p_dst[1] );
1625     picture_Release( p_pic );
1626     return NULL;
1627 }
1628
1629 static void Flush( filter_t *p_filter )
1630 {
1631     filter_sys_t *p_sys = p_filter->p_sys;
1632
1633     p_sys->i_last_date = VLC_TS_INVALID;
1634     for( int i = 0; i < HISTORY_SIZE; i++ )
1635     {
1636         if( p_sys->pp_history[i] )
1637             picture_Release( p_sys->pp_history[i] );
1638         p_sys->pp_history[i] = NULL;
1639     }
1640 }
1641
1642 static int Mouse( filter_t *p_filter,
1643                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1644 {
1645     VLC_UNUSED(p_old);
1646     *p_mouse = *p_new;
1647     if( p_filter->p_sys->b_half_height )
1648         p_mouse->i_y *= 2;
1649     return VLC_SUCCESS;
1650 }
1651
1652
1653 /*****************************************************************************
1654  * Open
1655  *****************************************************************************/
/* Open: set up the deinterlace filter.
 *
 * Rejects unsupported input chromas, allocates and initializes the private
 * state, picks the line merge routine matching the CPU, applies the
 * configured mode and computes the output format.
 *
 * Returns VLC_SUCCESS on success, VLC_ENOMEM if the private state cannot be
 * allocated, and VLC_EGENERIC if the chroma is not supported or if the mode
 * needs an output chroma/height change that the caller does not allow.
 */
static int Open( vlc_object_t *p_this )
{
    filter_t *p_filter = (filter_t*)p_this;
    filter_sys_t *p_sys;

    if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
        return VLC_EGENERIC;

    /* Private state, with defaults in place before the mode is parsed */
    p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
    if( !p_sys )
        return VLC_ENOMEM;

    p_sys->i_mode = DEINTERLACE_BLEND;
    p_sys->b_double_rate = false;
    p_sys->b_half_height = true;
    p_sys->i_last_date = VLC_TS_INVALID;
    for( int i = 0; i < HISTORY_SIZE; i++ )
        p_sys->pp_history[i] = NULL;

    /* Merge routine selection: the preprocessor blocks below form a single
     * if / else if chain that ends with the generic C fallback. Each
     * candidate is tested at run time against the CPU capabilities. */
#if defined(CAN_COMPILE_C_ALTIVEC)
    if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
    {
        p_sys->pf_merge = MergeAltivec;
        p_sys->pf_end_merge = NULL;
    }
    else
#endif
#if defined(CAN_COMPILE_SSE)
    if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
    {
        p_sys->pf_merge = MergeSSE2;
        p_sys->pf_end_merge = EndMMX;
    }
    else
#endif
#if defined(CAN_COMPILE_MMXEXT)
    if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
    {
        p_sys->pf_merge = MergeMMXEXT;
        p_sys->pf_end_merge = EndMMX;
    }
    else
#endif
#if defined(CAN_COMPILE_3DNOW)
    if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
    {
        p_sys->pf_merge = Merge3DNow;
        p_sys->pf_end_merge = End3DNow;
    }
    else
#endif
#if defined __ARM_NEON__
    if( vlc_CPU() & CPU_CAPABILITY_NEON )
    {
        p_sys->pf_merge = MergeNEON;
        p_sys->pf_end_merge = NULL;
    }
    else
#endif
    {
        p_sys->pf_merge = MergeGeneric;
        p_sys->pf_end_merge = NULL;
    }

    /* Read the filter options */
    config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
                       p_filter->p_cfg );

    /* Apply the requested deinterlacing mode */
    char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
    SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
    free( psz_mode );

    /* Output format: give up if it differs from the input in chroma or
     * height while the caller does not allow the output format to change */
    video_format_t fmt;
    GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
    if( !p_filter->b_allow_fmt_out_change &&
        ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
          fmt.i_height != p_filter->fmt_in.video.i_height ) )
    {
        /* Close() releases the state allocated above */
        Close( VLC_OBJECT(p_filter) );
        return VLC_EGENERIC;
    }
    p_filter->fmt_out.video = fmt;
    p_filter->fmt_out.i_codec = fmt.i_chroma;
    p_filter->pf_video_filter = Deinterlace;
    p_filter->pf_video_flush  = Flush;
    p_filter->pf_video_mouse  = Mouse;

    msg_Dbg( p_filter, "deinterlacing" );

    return VLC_SUCCESS;
}
1749
1750 /*****************************************************************************
1751  * Close: clean up the filter
1752  *****************************************************************************/
1753 static void Close( vlc_object_t *p_this )
1754 {
1755     filter_t *p_filter = (filter_t*)p_this;
1756
1757     Flush( p_filter );
1758     free( p_filter->p_sys );
1759 }
1760