git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c

   1 /*****************************************************************************
   2  * deinterlace.c : deinterlacer plugin for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001, 2002, 2003 the VideoLAN team
   5  * $Id$
   6  *
   7  * Author: Sam Hocevar <sam@zoy.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27 #include <errno.h>
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #ifdef HAVE_ALTIVEC_H
  34 #   include <altivec.h>
  35 #endif
  36
  37 #include <vlc_common.h>
  38 #include <vlc_plugin.h>
  39 #include <vlc_vout.h>
  40 #include <vlc_sout.h>
  41 #include "vlc_filter.h"
  42
  43 #ifdef CAN_COMPILE_MMXEXT
  44 #   include "mmx.h"
  45 #endif
  46
  47 #include "filter_common.h"
  48
  49 #define DEINTERLACE_DISCARD 1 /* vout_sys_t.i_mode value: Render() uses RenderDiscard() */
  50 #define DEINTERLACE_MEAN    2 /* vout_sys_t.i_mode value: Render() uses RenderMean() */
  51 #define DEINTERLACE_BLEND   3 /* vout_sys_t.i_mode value: Render() uses RenderBlend() */
  52 #define DEINTERLACE_BOB     4 /* vout_sys_t.i_mode value: Render() uses RenderBob(), two pictures out */
  53 #define DEINTERLACE_LINEAR  5 /* vout_sys_t.i_mode value: Render() uses RenderLinear(), two pictures out */
  54 #define DEINTERLACE_X       6 /* vout_sys_t.i_mode value: Render() uses RenderX() */
  55
  56 /*****************************************************************************
  57  * Local prototypes
  58  *****************************************************************************/
  59 static int  Create    ( vlc_object_t * );
  60 static void Destroy   ( vlc_object_t * );
  61
  62 static int  Init      ( vout_thread_t * );
  63 static void End       ( vout_thread_t * );
  64 static void Render    ( vout_thread_t *, picture_t * );
  65
  66 static void RenderDiscard( vout_thread_t *, picture_t *, picture_t *, int );
  67 static void RenderBob    ( vout_thread_t *, picture_t *, picture_t *, int );
  68 static void RenderMean   ( vout_thread_t *, picture_t *, picture_t * );
  69 static void RenderBlend  ( vout_thread_t *, picture_t *, picture_t * );
  70 static void RenderLinear ( vout_thread_t *, picture_t *, picture_t *, int );
  71 static void RenderX      ( picture_t *, picture_t * );
  72
  73 static void MergeGeneric ( void *, const void *, const void *, size_t );
  74 #if defined(CAN_COMPILE_C_ALTIVEC)
  75 static void MergeAltivec ( void *, const void *, const void *, size_t );
  76 #endif
  77 #if defined(CAN_COMPILE_MMXEXT)
  78 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
  79 #endif
  80 #if defined(CAN_COMPILE_3DNOW)
  81 static void Merge3DNow   ( void *, const void *, const void *, size_t );
  82 #endif
  83 #if defined(CAN_COMPILE_SSE)
  84 static void MergeSSE2    ( void *, const void *, const void *, size_t );
  85 #endif
  86 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
  87 static void EndMMX       ( void );
  88 #endif
  89 #if defined(CAN_COMPILE_3DNOW)
  90 static void End3DNow     ( void );
  91 #endif
  92
  93 static int  SendEvents   ( vlc_object_t *, char const *,
  94                            vlc_value_t, vlc_value_t, void * );
  95
  96 static void SetFilterMethod( vout_thread_t *p_vout, char *psz_method );
  97 static vout_thread_t *SpawnRealVout( vout_thread_t *p_vout );
  98
  99 static int OpenFilter( vlc_object_t *p_this );
 100 static void CloseFilter( vlc_object_t *p_this );
 101
 102 /*****************************************************************************
 103  * Callback prototypes
 104  *****************************************************************************/
 105 static int FilterCallback ( vlc_object_t *, char const *,
 106                             vlc_value_t, vlc_value_t, void * );
 107
 108 /*****************************************************************************
 109  * Module descriptor
 110  *****************************************************************************/
 111 #define MODE_TEXT N_("Deinterlace mode")
 112 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
 113
 114 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
 115 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
 116
 117 #define FILTER_CFG_PREFIX "sout-deinterlace-"
 118
 119 static const char *const mode_list[] = { /* values offered for the mode options */
 120     "discard", "blend", "mean", "bob", "linear", "x" };
 121 static const char *const mode_list_text[] = { /* display labels, same order as mode_list */
 122     N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X" };
 123
 124 vlc_module_begin(); /* main module: "video filter" capability, entry points Create/Destroy */
 125     set_description( N_("Deinterlacing video filter") );
 126     set_shortname( N_("Deinterlace" ));
 127     set_capability( "video filter", 0 );
 128     set_category( CAT_VIDEO );
 129     set_subcategory( SUBCAT_VIDEO_VFILTER );
 130
 131     set_section( N_("Display"),NULL);
 132     add_string( "deinterlace-mode", "discard", NULL, MODE_TEXT, /* read back in Create() */
 133                 MODE_LONGTEXT, false );
 134         change_string_list( mode_list, mode_list_text, 0 );
 135
 136     add_shortcut( "deinterlace" );
 137     set_callbacks( Create, Destroy );
 138
 139     add_submodule(); /* second entry: "video filter2" capability, entry points OpenFilter/CloseFilter */
 140     set_capability( "video filter2", 0 );
 141     set_section( N_("Streaming"),NULL);
 142     add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT, /* i.e. "sout-deinterlace-mode" */
 143                 SOUT_MODE_LONGTEXT, false );
 144         change_string_list( mode_list, mode_list_text, 0 );
 145     set_callbacks( OpenFilter, CloseFilter );
 146 vlc_module_end();
 147
 148 static const char *const ppsz_filter_options[] = { /* NULL-terminated option-name list */
 149     "mode", NULL /* NOTE(review): presumably combined with FILTER_CFG_PREFIX by the "video filter2" code - confirm in OpenFilter */
 150 };
 151
 152 /*****************************************************************************
 153  * vout_sys_t: Deinterlace video output method descriptor
 154  *****************************************************************************
 155  * This structure is part of the video output thread descriptor.
 156  * It describes the Deinterlace specific properties of an output thread.
 157  *****************************************************************************/
 158 struct vout_sys_t
 159 {
 160     int        i_mode;        /* Deinterlace mode: one of the DEINTERLACE_* constants */
 161     bool b_double_rate; /* Shall we double the framerate? If true, Render() emits two pictures per input */
 162     /* last_date: date of the previous input picture, stored by Render() in double-rate mode only; 0 until then */
 163     mtime_t    last_date;
 164     mtime_t    next_date; /* not referenced in the reviewed part of this file */
 165     /* p_vout: child ("real") video output created by SpawnRealVout(); receives the rendered pictures */
 166     vout_thread_t *p_vout;
 167     /* filter_lock: held by Render() while it obtains, fills and displays output pictures */
 168     vlc_mutex_t filter_lock;
 169     /* pf_merge( dest, src1, src2, byte count ): chosen in Create() from the Merge* variants, called via Merge */
 170     void (*pf_merge) ( void *, const void *, const void *, size_t );
 171     void (*pf_end_merge) ( void ); /* optional routine run via EndMerge after merging; may be NULL */
 172 };
 173
 174 /*****************************************************************************
 175  * Control: control facility for the vout (forwards to child vout)
 176  *****************************************************************************/
 177 static int Control( vout_thread_t *p_vout, int i_query, va_list args )
 178 {
 179     return vout_vaControl( p_vout->p_sys->p_vout, i_query, args );
 180 }
 181
 182 /*****************************************************************************
 183  * Create: allocates Deinterlace video thread output method
 184  *****************************************************************************
 185  * This function allocates and initializes a Deinterlace vout method.
 186  *****************************************************************************/
 187 static int Create( vlc_object_t *p_this )
 188 {
 189     vout_thread_t *p_vout = (vout_thread_t *)p_this;
 190     vlc_value_t val;
 191
 192     /* Allocate structure */
 193     p_vout->p_sys = malloc( sizeof( vout_sys_t ) );
 194     if( p_vout->p_sys == NULL )
 195     {
 196         msg_Err( p_vout, "out of memory" );
 197         return VLC_ENOMEM;
 198     }
 199
 200     p_vout->pf_init = Init;
 201     p_vout->pf_end = End;
 202     p_vout->pf_manage = NULL;
 203     p_vout->pf_render = Render;
 204     p_vout->pf_display = NULL;
 205     p_vout->pf_control = Control;
 206
 207     p_vout->p_sys->i_mode = DEINTERLACE_DISCARD;
 208     p_vout->p_sys->b_double_rate = false;
 209     p_vout->p_sys->last_date = 0;
 210     p_vout->p_sys->p_vout = 0;
 211     vlc_mutex_init( &p_vout->p_sys->filter_lock );
 212
 213 #if defined(CAN_COMPILE_C_ALTIVEC)
 214     if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
 215     {
 216         p_vout->p_sys->pf_merge = MergeAltivec;
 217         p_vout->p_sys->pf_end_merge = NULL;
 218     }
 219     else
 220 #endif
 221 #if defined(CAN_COMPILE_SSE)
 222     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
 223     {
 224         p_vout->p_sys->pf_merge = MergeSSE2;
 225         p_vout->p_sys->pf_end_merge = EndMMX;
 226     }
 227     else
 228 #endif
 229 #if defined(CAN_COMPILE_MMXEXT)
 230     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
 231     {
 232         p_vout->p_sys->pf_merge = MergeMMXEXT;
 233         p_vout->p_sys->pf_end_merge = EndMMX;
 234     }
 235     else
 236 #endif
 237 #if defined(CAN_COMPILE_3DNOW)
 238     if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
 239     {
 240         p_vout->p_sys->pf_merge = Merge3DNow;
 241         p_vout->p_sys->pf_end_merge = End3DNow;
 242     }
 243     else
 244 #endif
 245     {
 246         p_vout->p_sys->pf_merge = MergeGeneric;
 247         p_vout->p_sys->pf_end_merge = NULL;
 248     }
 249
 250     /* Look what method was requested */
 251     var_Create( p_vout, "deinterlace-mode", VLC_VAR_STRING );
 252     var_Change( p_vout, "deinterlace-mode", VLC_VAR_INHERITVALUE, &val, NULL );
 253
 254     if( val.psz_string == NULL )
 255     {
 256         msg_Err( p_vout, "configuration variable deinterlace-mode empty" );
 257         msg_Err( p_vout, "no deinterlace mode provided, using \"discard\"" );
 258
 259         val.psz_string = strdup( "discard" );
 260     }
 261
 262     msg_Dbg( p_vout, "using %s deinterlace mode", val.psz_string );
 263
 264     SetFilterMethod( p_vout, val.psz_string );
 265
 266     free( val.psz_string );
 267
 268     return VLC_SUCCESS;
 269 }
 270
 271 /*****************************************************************************
 272  * SetFilterMethod: setup the deinterlace method to use.
 273  *****************************************************************************/
 274 static void SetFilterMethod( vout_thread_t *p_vout, char *psz_method )
 275 {
 276     if( !strcmp( psz_method, "discard" ) )
 277     {
 278         p_vout->p_sys->i_mode = DEINTERLACE_DISCARD;
 279         p_vout->p_sys->b_double_rate = false;
 280     }
 281     else if( !strcmp( psz_method, "mean" ) )
 282     {
 283         p_vout->p_sys->i_mode = DEINTERLACE_MEAN;
 284         p_vout->p_sys->b_double_rate = false;
 285     }
 286     else if( !strcmp( psz_method, "blend" )
 287              || !strcmp( psz_method, "average" )
 288              || !strcmp( psz_method, "combine-fields" ) )
 289     {
 290         p_vout->p_sys->i_mode = DEINTERLACE_BLEND;
 291         p_vout->p_sys->b_double_rate = false;
 292     }
 293     else if( !strcmp( psz_method, "bob" )
 294              || !strcmp( psz_method, "progressive-scan" ) )
 295     {
 296         p_vout->p_sys->i_mode = DEINTERLACE_BOB;
 297         p_vout->p_sys->b_double_rate = true;
 298     }
 299     else if( !strcmp( psz_method, "linear" ) )
 300     {
 301         p_vout->p_sys->i_mode = DEINTERLACE_LINEAR;
 302         p_vout->p_sys->b_double_rate = true;
 303     }
 304     else if( !strcmp( psz_method, "x" ) )
 305     {
 306         p_vout->p_sys->i_mode = DEINTERLACE_X;
 307         p_vout->p_sys->b_double_rate = false;
 308     }
 309     else
 310     {
 311         msg_Err( p_vout, "no valid deinterlace mode provided, "
 312                  "using \"discard\"" );
 313     }
 314
 315     msg_Dbg( p_vout, "using %s deinterlace method", psz_method );
 316 }
 317
 318 /*****************************************************************************
 319  * Init: initialize Deinterlace video thread output method
 320  *****************************************************************************/
 321 static int Init( vout_thread_t *p_vout )
 322 {
 323     int i_index;
 324     picture_t *p_pic;
 325
 326     I_OUTPUTPICTURES = 0;
 327
 328     /* Initialize the output structure, full of directbuffers since we want
 329      * the decoder to output directly to our structures. */
 330     switch( p_vout->render.i_chroma )
 331     {
 332         case VLC_FOURCC('I','4','2','0'):
 333         case VLC_FOURCC('I','Y','U','V'):
 334         case VLC_FOURCC('Y','V','1','2'):
 335         case VLC_FOURCC('I','4','2','2'):
 336             p_vout->output.i_chroma = p_vout->render.i_chroma;
 337             p_vout->output.i_width  = p_vout->render.i_width;
 338             p_vout->output.i_height = p_vout->render.i_height;
 339             p_vout->output.i_aspect = p_vout->render.i_aspect;
 340             p_vout->fmt_out = p_vout->fmt_in;
 341             break;
 342
 343         default:
 344             return VLC_EGENERIC; /* unknown chroma */
 345             break;
 346     }
 347
 348     /* Try to open the real video output */
 349     p_vout->p_sys->p_vout = SpawnRealVout( p_vout );
 350
 351     if( p_vout->p_sys->p_vout == NULL )
 352     {
 353         /* Everything failed */
 354         msg_Err( p_vout, "cannot open vout, aborting" );
 355
 356         return VLC_EGENERIC;
 357     }
 358
 359     var_AddCallback( p_vout, "deinterlace-mode", FilterCallback, NULL );
 360
 361     ALLOCATE_DIRECTBUFFERS( VOUT_MAX_PICTURES );
 362
 363     ADD_CALLBACKS( p_vout->p_sys->p_vout, SendEvents );
 364
 365     ADD_PARENT_CALLBACKS( SendEventsToChild );
 366
 367     return VLC_SUCCESS;
 368 }
 369
 370 /*****************************************************************************
 371  * SpawnRealVout: spawn the real video output.
 372  *****************************************************************************/
 373 static vout_thread_t *SpawnRealVout( vout_thread_t *p_vout )
 374 {
 375     vout_thread_t *p_real_vout = NULL;
 376     video_format_t fmt;
 377     memset( &fmt, 0, sizeof( video_format_t ) );
 378
 379     msg_Dbg( p_vout, "spawning the real video output" );
 380
 381     fmt = p_vout->fmt_out;
 382
 383     switch( p_vout->render.i_chroma )
 384     {
 385     case VLC_FOURCC('I','4','2','0'):
 386     case VLC_FOURCC('I','Y','U','V'):
 387     case VLC_FOURCC('Y','V','1','2'):
 388         switch( p_vout->p_sys->i_mode )
 389         {
 390         case DEINTERLACE_MEAN:
 391         case DEINTERLACE_DISCARD:
 392             fmt.i_height /= 2; fmt.i_visible_height /= 2; fmt.i_y_offset /= 2;
 393             fmt.i_sar_den *= 2;
 394             p_real_vout = vout_Create( p_vout, &fmt );
 395             break;
 396
 397         case DEINTERLACE_BOB:
 398         case DEINTERLACE_BLEND:
 399         case DEINTERLACE_LINEAR:
 400         case DEINTERLACE_X:
 401             p_real_vout = vout_Create( p_vout, &fmt );
 402             break;
 403         }
 404         break;
 405
 406     case VLC_FOURCC('I','4','2','2'):
 407         fmt.i_chroma = VLC_FOURCC('I','4','2','0');
 408         p_real_vout = vout_Create( p_vout, &fmt );
 409         break;
 410
 411     default:
 412         break;
 413     }
 414
 415     return p_real_vout;
 416 }
 417
 418 /*****************************************************************************
 419  * End: terminate Deinterlace video thread output method
 420  *****************************************************************************/
 421 static void End( vout_thread_t *p_vout )
 422 {
 423     int i_index;
 424
 425     /* Free the fake output buffers we allocated */
 426     for( i_index = I_OUTPUTPICTURES ; i_index ; )
 427     {
 428         i_index--;
 429         free( PP_OUTPUTPICTURE[ i_index ]->p_data_orig );
 430     }
 431
 432     if( p_vout->p_sys->p_vout )
 433     {
 434         DEL_CALLBACKS( p_vout->p_sys->p_vout, SendEvents );
 435         vlc_object_detach( p_vout->p_sys->p_vout );
 436         vlc_object_release( p_vout->p_sys->p_vout );
 437     }
 438
 439     DEL_PARENT_CALLBACKS( SendEventsToChild );
 440 }
 441
 442 /*****************************************************************************
 443  * Destroy: destroy Deinterlace video thread output method
 444  *****************************************************************************
 445  * Terminate an output method created by DeinterlaceCreateOutputMethod
 446  *****************************************************************************/
 447 static void Destroy( vlc_object_t *p_this )
 448 {
 449     vout_thread_t *p_vout = (vout_thread_t *)p_this;
 450     vlc_mutex_destroy( &p_vout->p_sys->filter_lock );
 451     free( p_vout->p_sys );
 452 }
 453
 454 /*****************************************************************************
 455  * Render: displays previously rendered output
 456  *****************************************************************************
 457  * This function sends the currently rendered image to the Deinterlace image,
 458  * waits until it is displayed and switches the two rendering buffers, preparing
 459  * the next frame.
 460  *****************************************************************************/
 461 static void Render ( vout_thread_t *p_vout, picture_t *p_pic )
 462 {
 463     vout_sys_t *p_sys = p_vout->p_sys;
 464     picture_t *pp_outpic[2];
 465
 466     p_vout->fmt_out.i_x_offset = p_sys->p_vout->fmt_in.i_x_offset =
 467         p_vout->fmt_in.i_x_offset;
 468     p_vout->fmt_out.i_y_offset = p_sys->p_vout->fmt_in.i_y_offset =
 469         p_vout->fmt_in.i_y_offset;
 470     p_vout->fmt_out.i_visible_width = p_sys->p_vout->fmt_in.i_visible_width =
 471         p_vout->fmt_in.i_visible_width;
 472     p_vout->fmt_out.i_visible_height = p_sys->p_vout->fmt_in.i_visible_height =
 473         p_vout->fmt_in.i_visible_height;
 474     if( p_vout->p_sys->i_mode == DEINTERLACE_MEAN ||
 475         p_vout->p_sys->i_mode == DEINTERLACE_DISCARD )
 476     {
 477         p_vout->fmt_out.i_y_offset /= 2; p_sys->p_vout->fmt_in.i_y_offset /= 2;
 478         p_vout->fmt_out.i_visible_height /= 2;
 479         p_sys->p_vout->fmt_in.i_visible_height /= 2;
 480     }
 481
 482     pp_outpic[0] = pp_outpic[1] = NULL;
 483
 484     vlc_mutex_lock( &p_vout->p_sys->filter_lock );
 485
 486     /* Get a new picture */
 487     while( ( pp_outpic[0] = vout_CreatePicture( p_vout->p_sys->p_vout,
 488                                                 0, 0, 0 ) )
 489               == NULL )
 490     {
 491         if( p_vout->b_die || p_vout->b_error )
 492         {
 493             vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
 494             return;
 495         }
 496         msleep( VOUT_OUTMEM_SLEEP );
 497     }
 498
 499     vout_DatePicture( p_vout->p_sys->p_vout, pp_outpic[0], p_pic->date );
 500
 501     /* If we are using double rate, get an additional new picture */
 502     if( p_vout->p_sys->b_double_rate )
 503     {
 504         while( ( pp_outpic[1] = vout_CreatePicture( p_vout->p_sys->p_vout,
 505                                                  0, 0, 0 ) )
 506                   == NULL )
 507         {
 508             if( p_vout->b_die || p_vout->b_error )
 509             {
 510                 vout_DestroyPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 511                 vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
 512                 return;
 513             }
 514             msleep( VOUT_OUTMEM_SLEEP );
 515         }
 516
 517         /* 20ms is a bit arbitrary, but it's only for the first image we get */
 518         if( !p_vout->p_sys->last_date )
 519         {
 520             vout_DatePicture( p_vout->p_sys->p_vout, pp_outpic[1],
 521                               p_pic->date + 20000 );
 522         }
 523         else
 524         {
 525             vout_DatePicture( p_vout->p_sys->p_vout, pp_outpic[1],
 526                       (3 * p_pic->date - p_vout->p_sys->last_date) / 2 );
 527         }
 528         p_vout->p_sys->last_date = p_pic->date;
 529     }
 530
 531     switch( p_vout->p_sys->i_mode )
 532     {
 533         case DEINTERLACE_DISCARD:
 534             RenderDiscard( p_vout, pp_outpic[0], p_pic, 0 );
 535             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 536             break;
 537
 538         case DEINTERLACE_BOB:
 539             RenderBob( p_vout, pp_outpic[0], p_pic, p_pic->b_top_field_first ? 0 : 1 );
 540             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 541             RenderBob( p_vout, pp_outpic[1], p_pic, p_pic->b_top_field_first ? 1 : 0 );
 542             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[1] );
 543             break;
 544
 545         case DEINTERLACE_LINEAR:
 546             RenderLinear( p_vout, pp_outpic[0], p_pic, p_pic->b_top_field_first ? 0 : 1 );
 547             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 548             RenderLinear( p_vout, pp_outpic[1], p_pic, p_pic->b_top_field_first ? 1 : 0 );
 549             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[1] );
 550             break;
 551
 552         case DEINTERLACE_MEAN:
 553             RenderMean( p_vout, pp_outpic[0], p_pic );
 554             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 555             break;
 556
 557         case DEINTERLACE_BLEND:
 558             RenderBlend( p_vout, pp_outpic[0], p_pic );
 559             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 560             break;
 561
 562         case DEINTERLACE_X:
 563             RenderX( pp_outpic[0], p_pic );
 564             vout_DisplayPicture( p_vout->p_sys->p_vout, pp_outpic[0] );
 565             break;
 566     }
 567     vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
 568 }
 569
 570 /*****************************************************************************
 571  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
 572  *****************************************************************************/
 573 static void RenderDiscard( vout_thread_t *p_vout,
 574                            picture_t *p_outpic, picture_t *p_pic, int i_field )
 575 {
 576     int i_plane;
 577
 578     /* Copy image and skip lines */
 579     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 580     {
 581         uint8_t *p_in, *p_out_end, *p_out;
 582         int i_increment;
 583
 584         p_in = p_pic->p[i_plane].p_pixels
 585                    + i_field * p_pic->p[i_plane].i_pitch;
 586
 587         p_out = p_outpic->p[i_plane].p_pixels;
 588         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 589                              * p_outpic->p[i_plane].i_visible_lines;
 590
 591         switch( p_vout->render.i_chroma )
 592         {
 593         case VLC_FOURCC('I','4','2','0'):
 594         case VLC_FOURCC('I','Y','U','V'):
 595         case VLC_FOURCC('Y','V','1','2'):
 596
 597             for( ; p_out < p_out_end ; )
 598             {
 599                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 600
 601                 p_out += p_outpic->p[i_plane].i_pitch;
 602                 p_in += 2 * p_pic->p[i_plane].i_pitch;
 603             }
 604             break;
 605
 606         case VLC_FOURCC('I','4','2','2'):
 607
 608             i_increment = 2 * p_pic->p[i_plane].i_pitch;
 609
 610             if( i_plane == Y_PLANE )
 611             {
 612                 for( ; p_out < p_out_end ; )
 613                 {
 614                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 615                     p_out += p_outpic->p[i_plane].i_pitch;
 616                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 617                     p_out += p_outpic->p[i_plane].i_pitch;
 618                     p_in += i_increment;
 619                 }
 620             }
 621             else
 622             {
 623                 for( ; p_out < p_out_end ; )
 624                 {
 625                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 626                     p_out += p_outpic->p[i_plane].i_pitch;
 627                     p_in += i_increment;
 628                 }
 629             }
 630             break;
 631
 632         default:
 633             break;
 634         }
 635     }
 636 }
 637
 638 /*****************************************************************************
 639  * RenderBob: renders a BOB picture - simple copy
 640  *****************************************************************************/
 641 static void RenderBob( vout_thread_t *p_vout,
 642                        picture_t *p_outpic, picture_t *p_pic, int i_field )
 643 {
 644     int i_plane;
 645
 646     /* Copy image and skip lines */
 647     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 648     {
 649         uint8_t *p_in, *p_out_end, *p_out;
 650
 651         p_in = p_pic->p[i_plane].p_pixels;
 652         p_out = p_outpic->p[i_plane].p_pixels;
 653         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 654                              * p_outpic->p[i_plane].i_visible_lines;
 655
 656         switch( p_vout->render.i_chroma )
 657         {
 658             case VLC_FOURCC('I','4','2','0'):
 659             case VLC_FOURCC('I','Y','U','V'):
 660             case VLC_FOURCC('Y','V','1','2'):
 661                 /* For BOTTOM field we need to add the first line */
 662                 if( i_field == 1 )
 663                 {
 664                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 665                     p_in += p_pic->p[i_plane].i_pitch;
 666                     p_out += p_outpic->p[i_plane].i_pitch;
 667                 }
 668
 669                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 670
 671                 for( ; p_out < p_out_end ; )
 672                 {
 673                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 674
 675                     p_out += p_outpic->p[i_plane].i_pitch;
 676
 677                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 678
 679                     p_in += 2 * p_pic->p[i_plane].i_pitch;
 680                     p_out += p_outpic->p[i_plane].i_pitch;
 681                 }
 682
 683                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 684
 685                 /* For TOP field we need to add the last line */
 686                 if( i_field == 0 )
 687                 {
 688                     p_in += p_pic->p[i_plane].i_pitch;
 689                     p_out += p_outpic->p[i_plane].i_pitch;
 690                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 691                 }
 692                 break;
 693
 694             case VLC_FOURCC('I','4','2','2'):
 695                 /* For BOTTOM field we need to add the first line */
 696                 if( i_field == 1 )
 697                 {
 698                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 699                     p_in += p_pic->p[i_plane].i_pitch;
 700                     p_out += p_outpic->p[i_plane].i_pitch;
 701                 }
 702
 703                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 704
 705                 if( i_plane == Y_PLANE )
 706                 {
 707                     for( ; p_out < p_out_end ; )
 708                     {
 709                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 710
 711                         p_out += p_outpic->p[i_plane].i_pitch;
 712
 713                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 714
 715                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 716                         p_out += p_outpic->p[i_plane].i_pitch;
 717                     }
 718                 }
 719                 else
 720                 {
 721                     for( ; p_out < p_out_end ; )
 722                     {
 723                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 724
 725                         p_out += p_outpic->p[i_plane].i_pitch;
 726                         p_in += 2 * p_pic->p[i_plane].i_pitch;
 727                     }
 728                 }
 729
 730                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 731
 732                 /* For TOP field we need to add the last line */
 733                 if( i_field == 0 )
 734                 {
 735                     p_in += p_pic->p[i_plane].i_pitch;
 736                     p_out += p_outpic->p[i_plane].i_pitch;
 737                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 738                 }
 739                 break;
 740         }
 741     }
 742 }
 743
 744 #define Merge p_vout->p_sys->pf_merge /* needs a variable named p_vout in scope at the point of use */
 745 #define EndMerge if(p_vout->p_sys->pf_end_merge) p_vout->p_sys->pf_end_merge /* unbraced if: use only as the statement "EndMerge();", never right before an else */
 746
 747 /*****************************************************************************
 748  * RenderLinear: BOB with linear interpolation
 749  *****************************************************************************/
 750 static void RenderLinear( vout_thread_t *p_vout,
 751                           picture_t *p_outpic, picture_t *p_pic, int i_field )
 752 {
 753     int i_plane;
 754
 755     /* Copy image and skip lines */
 756     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 757     {
 758         uint8_t *p_in, *p_out_end, *p_out;
 759
 760         p_in = p_pic->p[i_plane].p_pixels;
 761         p_out = p_outpic->p[i_plane].p_pixels;
 762         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 763                              * p_outpic->p[i_plane].i_visible_lines;
 764
 765         /* For BOTTOM field we need to add the first line */
 766         if( i_field == 1 )
 767         {
 768             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 769             p_in += p_pic->p[i_plane].i_pitch;
 770             p_out += p_outpic->p[i_plane].i_pitch;
 771         }
 772
 773         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
 774
 775         for( ; p_out < p_out_end ; )
 776         {
 777             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 778
 779             p_out += p_outpic->p[i_plane].i_pitch;
 780
 781             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
 782                    p_pic->p[i_plane].i_pitch );
 783
 784             p_in += 2 * p_pic->p[i_plane].i_pitch;
 785             p_out += p_outpic->p[i_plane].i_pitch;
 786         }
 787
 788         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 789
 790         /* For TOP field we need to add the last line */
 791         if( i_field == 0 )
 792         {
 793             p_in += p_pic->p[i_plane].i_pitch;
 794             p_out += p_outpic->p[i_plane].i_pitch;
 795             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 796         }
 797     }
 798     EndMerge();
 799 }
 800
 801 static void RenderMean( vout_thread_t *p_vout,
 802                         picture_t *p_outpic, picture_t *p_pic )
 803 {
 804     int i_plane;
 805
 806     /* Copy image and skip lines */
 807     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 808     {
 809         uint8_t *p_in, *p_out_end, *p_out;
 810
 811         p_in = p_pic->p[i_plane].p_pixels;
 812
 813         p_out = p_outpic->p[i_plane].p_pixels;
 814         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 815                              * p_outpic->p[i_plane].i_visible_lines;
 816
 817         /* All lines: mean value */
 818         for( ; p_out < p_out_end ; )
 819         {
 820             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 821                    p_pic->p[i_plane].i_pitch );
 822
 823             p_out += p_outpic->p[i_plane].i_pitch;
 824             p_in += 2 * p_pic->p[i_plane].i_pitch;
 825         }
 826     }
 827     EndMerge();
 828 }
 829
 830 static void RenderBlend( vout_thread_t *p_vout,
 831                          picture_t *p_outpic, picture_t *p_pic )
 832 {
 833     int i_plane;
 834
 835     /* Copy image and skip lines */
 836     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
 837     {
 838         uint8_t *p_in, *p_out_end, *p_out;
 839
 840         p_in = p_pic->p[i_plane].p_pixels;
 841
 842         p_out = p_outpic->p[i_plane].p_pixels;
 843         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
 844                              * p_outpic->p[i_plane].i_visible_lines;
 845
 846         switch( p_vout->render.i_chroma )
 847         {
 848             case VLC_FOURCC('I','4','2','0'):
 849             case VLC_FOURCC('I','Y','U','V'):
 850             case VLC_FOURCC('Y','V','1','2'):
 851                 /* First line: simple copy */
 852                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 853                 p_out += p_outpic->p[i_plane].i_pitch;
 854
 855                 /* Remaining lines: mean value */
 856                 for( ; p_out < p_out_end ; )
 857                 {
 858                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 859                            p_pic->p[i_plane].i_pitch );
 860
 861                     p_out += p_outpic->p[i_plane].i_pitch;
 862                     p_in += p_pic->p[i_plane].i_pitch;
 863                 }
 864                 break;
 865
 866             case VLC_FOURCC('I','4','2','2'):
 867                 /* First line: simple copy */
 868                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
 869                 p_out += p_outpic->p[i_plane].i_pitch;
 870
 871                 /* Remaining lines: mean value */
 872                 if( i_plane == Y_PLANE )
 873                 {
 874                     for( ; p_out < p_out_end ; )
 875                     {
 876                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 877                                p_pic->p[i_plane].i_pitch );
 878
 879                         p_out += p_outpic->p[i_plane].i_pitch;
 880                         p_in += p_pic->p[i_plane].i_pitch;
 881                     }
 882                 }
 883
 884                 else
 885                 {
 886                     for( ; p_out < p_out_end ; )
 887                     {
 888                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
 889                                p_pic->p[i_plane].i_pitch );
 890
 891                         p_out += p_outpic->p[i_plane].i_pitch;
 892                         p_in += 2*p_pic->p[i_plane].i_pitch;
 893                     }
 894                 }
 895                 break;
 896         }
 897     }
 898     EndMerge();
 899 }
 900
 901 #undef Merge
 902
 903 static void MergeGeneric( void *_p_dest, const void *_p_s1,
 904                           const void *_p_s2, size_t i_bytes )
 905 {
 906     uint8_t* p_dest = (uint8_t*)_p_dest;
 907     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 908     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 909     uint8_t* p_end = p_dest + i_bytes - 8;
 910
 911     while( p_dest < p_end )
 912     {
 913         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 914         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 915         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 916         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 917         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 918         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 919         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 920         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 921     }
 922
 923     p_end += 8;
 924
 925     while( p_dest < p_end )
 926     {
 927         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 928     }
 929 }
 930
 931 #if defined(CAN_COMPILE_MMXEXT)
 932 static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
 933                          size_t i_bytes )
 934 {
 935     uint8_t* p_dest = (uint8_t*)_p_dest;
 936     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 937     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 938     uint8_t* p_end = p_dest + i_bytes - 8;
 939     while( p_dest < p_end )
 940     {
 941         __asm__  __volatile__( "movq %2,%%mm1;"
 942                                "pavgb %1, %%mm1;"
 943                                "movq %%mm1, %0" :"=m" (*p_dest):
 944                                                  "m" (*p_s1),
 945                                                  "m" (*p_s2) );
 946         p_dest += 8;
 947         p_s1 += 8;
 948         p_s2 += 8;
 949     }
 950
 951     p_end += 8;
 952
 953     while( p_dest < p_end )
 954     {
 955         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 956     }
 957 }
 958 #endif
 959
 960 #if defined(CAN_COMPILE_3DNOW)
 961 static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
 962                         size_t i_bytes )
 963 {
 964     uint8_t* p_dest = (uint8_t*)_p_dest;
 965     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 966     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 967     uint8_t* p_end = p_dest + i_bytes - 8;
 968     while( p_dest < p_end )
 969     {
 970         __asm__  __volatile__( "movq %2,%%mm1;"
 971                                "pavgusb %1, %%mm1;"
 972                                "movq %%mm1, %0" :"=m" (*p_dest):
 973                                                  "m" (*p_s1),
 974                                                  "m" (*p_s2) );
 975         p_dest += 8;
 976         p_s1 += 8;
 977         p_s2 += 8;
 978     }
 979
 980     p_end += 8;
 981
 982     while( p_dest < p_end )
 983     {
 984         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
 985     }
 986 }
 987 #endif
 988
 989 #if defined(CAN_COMPILE_SSE)
 990 static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 991                        size_t i_bytes )
 992 {
 993     uint8_t* p_dest = (uint8_t*)_p_dest;
 994     const uint8_t *p_s1 = (const uint8_t *)_p_s1;
 995     const uint8_t *p_s2 = (const uint8_t *)_p_s2;
 996     uint8_t* p_end;
 997     while( (uintptr_t)p_s1 % 16 )
 998     {
 999         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
1000     }
1001     p_end = p_dest + i_bytes - 16;
1002     while( p_dest < p_end )
1003     {
1004         __asm__  __volatile__( "movdqu %2,%%xmm1;"
1005                                "pavgb %1, %%xmm1;"
1006                                "movdqu %%xmm1, %0" :"=m" (*p_dest):
1007                                                  "m" (*p_s1),
1008                                                  "m" (*p_s2) );
1009         p_dest += 16;
1010         p_s1 += 16;
1011         p_s2 += 16;
1012     }
1013
1014     p_end += 16;
1015
1016     while( p_dest < p_end )
1017     {
1018         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
1019     }
1020 }
1021 #endif
1022
#if defined(CAN_COMPILE_SSE) || defined(CAN_COMPILE_MMXEXT)
/* EndMMX: execute the x86 "emms" instruction; takes and returns nothing. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" );
}
#endif
1029
#if defined(CAN_COMPILE_3DNOW)
/* End3DNow: execute the 3DNow! "femms" instruction; takes and returns
 * nothing. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" );
}
#endif
1036
#ifdef CAN_COMPILE_C_ALTIVEC
/*****************************************************************************
 * MergeAltivec: byte-wise average of two buffers, 16 bytes at a time (vec_avg)
 *****************************************************************************
 *  _p_dest: destination buffer, at least i_bytes long
 *  _p_s1, _p_s2: source buffers, at least i_bytes long
 *  i_bytes: number of bytes to produce
 *
 * Three phases:
 *  1. scalar bytes until the destination is 16-byte aligned;
 *  2. 16-byte vector blocks while at least 16 bytes remain. If either source
 *     is unaligned, each source vector is rebuilt with vec_perm from two
 *     aligned loads; otherwise the sources are loaded directly;
 *  3. scalar bytes for the rest.
 *
 * Changes compared to the previous version:
 *  - alignment tests cast the pointers to uintptr_t instead of int, which
 *    is narrower than a pointer on 64-bit targets;
 *  - phase 1 is bounded by i_bytes, so a request shorter than the alignment
 *    gap can no longer write past the destination;
 *  - the loops count bytes instead of using "dest + i_bytes - 15", which
 *    pointed in front of the buffer for small i_bytes.
 *****************************************************************************/
static void MergeAltivec( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    uint8_t *p_s1   = (uint8_t *)_p_s1;
    uint8_t *p_s2   = (uint8_t *)_p_s2;

    /* Use C until the first 16-bytes aligned destination pixel */
    for( ; i_bytes > 0 && ( (uintptr_t)p_dest & 0xF ); i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    if( ( (uintptr_t)p_s1 & 0xF ) | ( (uintptr_t)p_s2 & 0xF ) )
    {
        /* Unaligned source */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        for( ; i_bytes >= 16; i_bytes -= 16 )
        {
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v    = vec_perm( s1oldv, s1newv, perm1v );
            s2v    = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            destv  = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source */
        vector unsigned char s1v, s2v, destv;

        for( ; i_bytes >= 16; i_bytes -= 16 )
        {
            s1v   = vec_ld( 0, p_s1 );
            s2v   = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }

    /* Scalar tail */
    for( ; i_bytes > 0; i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
1106
1107 /*****************************************************************************
1108  * RenderX: This algo works on an 8x8 block basis: it copies the top field
1109  * and applies a process to recreate the bottom field:
1110  *  If a 8x8 block is classified as :
1111  *   - progressive: it applies a small blend (1,6,1)
1112  *   - interlaced:
1113  *    * in the MMX version: we do a ME between the 2 fields, if there is a
1114  *    good match we use MC to recreate the bottom field (with a small
1115  *    blend (1,6,1) )
1116  *    * otherwise: it recreates the bottom field by an edge oriented
1117  *    interpolation.
1118   *****************************************************************************/
1119
1120 /* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
1121  * XXX: It needs access to an 8x10 area.
1122  * We use more than 8 lines to help with scrolling (text)
1123  * (and because XDeint8x8Frame uses line 9).
1124  * XXX: detection doesn't work well on smooth/uniform areas with noise,
1125  * but it's not really a problem because they don't have much detail anyway.
1126  */
/* ssd: return a squared; callers pass a pixel difference and sum the results */
static inline int ssd( int a )
{
    const int i_square = a * a;

    return i_square;
}
/* XDeint8x8DetectC: C version of the 8x8 interlacing detector.
 *
 *  src:   top-left pixel of the block; lines 0..9 of the 8 columns are read
 *  i_src: line pitch of src in bytes
 *
 * Four groups of lines are examined (starting at lines 0, 2, 4 and 6). For
 * each group two sums of squared differences are taken over the 8 columns:
 *  - "frame": line0 vs line1 and line1 vs line2 (adjacent lines)
 *  - "field": line0 vs line2 and line1 vs line3 (lines two apart)
 * A group counts as interlaced when field < 6*frame/8 and frame > 32.
 *
 * Returns 1 if at least one group counted as interlaced, 0 otherwise.
 */
static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
{
    int i_hits = 0;
    int i_group, i_col;

    for( i_group = 0; i_group < 4; i_group++ )
    {
        const uint8_t *p_l0 = src;
        const uint8_t *p_l1 = src + i_src;
        const uint8_t *p_l2 = src + 2*i_src;
        const uint8_t *p_l3 = src + 3*i_src;
        int i_frame = 0;
        int i_field = 0;

        for( i_col = 0; i_col < 8; i_col++ )
        {
            const int d01 = p_l0[i_col] - p_l1[i_col];
            const int d12 = p_l1[i_col] - p_l2[i_col];
            const int d02 = p_l0[i_col] - p_l2[i_col];
            const int d13 = p_l1[i_col] - p_l3[i_col];

            i_frame += d01*d01 + d12*d12;
            i_field += d02*d02 + d13*d13;
        }

        if( i_frame > 32 && i_field < 6*i_frame/8 )
            i_hits++;

        src += 2*i_src;
    }

    return i_hits >= 1;
}
1154 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8DetectMMXEXT: MMX variant of the 8x8 interlacing detector.
 *
 *  src:   top-left pixel of the block
 *  i_src: line pitch of src in bytes
 *
 * For every group of lines the inner loop handles the 8 columns as two
 * chunks of 4 pixels and accumulates two 32-bit sums which are then stored
 * to fr and ff. A group counts as interlaced when ff < 6*fr/8 and fr > 32
 * (same test as XDeint8x8DetectC).
 *
 * Returns the number of groups that counted as interlaced (fc), so any
 * non-zero value means "interlaced".
 *
 * NOTE(review): the outer loop runs while y < 9, i.e. 5 groups reading
 * lines 0..11, whereas XDeint8x8DetectC runs 4 groups (lines 0..9) and
 * returns 0/1. Confirm that callers guarantee 12 readable lines or that the
 * bound was meant to be 7.
 * NOTE(review): the register-level remarks below assume the mmx.h macros take
 * (source, destination) in that order -- TODO confirm against mmx.h.
 */
1155 static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
1156 {
1157
1158     int y, x;
1159     int32_t ff, fr;
1160     int fc;
1161
1162     /* Detect interlacing */
1163     fc = 0;
1164     pxor_r2r( mm7, mm7 ); /* presumably mm7 = 0, the unpack partner below */
1165     for( y = 0; y < 9; y += 2 )
1166     {
1167         ff = fr = 0;
1168         pxor_r2r( mm5, mm5 ); /* mm5: accumulator later stored to fr */
1169         pxor_r2r( mm6, mm6 ); /* mm6: accumulator later stored to ff */
1170         for( x = 0; x < 8; x+=4 )
1171         {
1172             movd_m2r( src[        x], mm0 ); /* 4 pixels of lines 0..3 */
1173             movd_m2r( src[1*i_src+x], mm1 );
1174             movd_m2r( src[2*i_src+x], mm2 );
1175             movd_m2r( src[3*i_src+x], mm3 );
1176
1177             punpcklbw_r2r( mm7, mm0 ); /* presumably widen bytes to 16-bit */
1178             punpcklbw_r2r( mm7, mm1 );
1179             punpcklbw_r2r( mm7, mm2 );
1180             punpcklbw_r2r( mm7, mm3 );
1181
1182             movq_r2r( mm0, mm4 );
1183
1184             psubw_r2r( mm1, mm0 ); /* presumably line0 - line1 */
1185             psubw_r2r( mm2, mm4 ); /* presumably line0 - line2 */
1186
1187             psubw_r2r( mm1, mm2 ); /* presumably line2 - line1 */
1188             psubw_r2r( mm1, mm3 ); /* presumably line3 - line1 */
1189
1190             pmaddwd_r2r( mm0, mm0 ); /* each register multiplied by itself */
1191             pmaddwd_r2r( mm4, mm4 );
1192             pmaddwd_r2r( mm2, mm2 );
1193             pmaddwd_r2r( mm3, mm3 );
1194             paddd_r2r( mm0, mm2 );
1195             paddd_r2r( mm4, mm3 );
1196             paddd_r2r( mm2, mm5 );
1197             paddd_r2r( mm3, mm6 );
1198         }
1199
1200         movq_r2r( mm5, mm0 ); /* fold the two 32-bit halves of mm5 into fr */
1201         psrlq_i2r( 32, mm0 );
1202         paddd_r2r( mm0, mm5 );
1203         movd_r2m( mm5, fr );
1204
1205         movq_r2r( mm6, mm0 ); /* same folding for mm6 into ff */
1206         psrlq_i2r( 32, mm0 );
1207         paddd_r2r( mm0, mm6 );
1208         movd_r2m( mm6, ff );
1209
1210         if( ff < 6*fr/8 && fr > 32 )
1211             fc++;
1212
1213         src += 2*i_src;
1214     }
1215     return fc;
1216 }
1217 #endif
1218
1219 /* XDeint8x8Frame: apply a small blend between fields (1,6,1).
1220  * This won't destroy details, and helps if there is a bit of interlacing.
1221  * (It helps with panning to avoid flicker)
1222  * (Uses 8x9 pixels)
1223  */
1224 #if 0
/* XDeint8x8FrameC: (1,6,1) vertical blend for a block treated as progressive.
 *
 *  dst/i_dst: output block (8 columns x 8 lines) and its line pitch
 *  src/i_src: input block and its line pitch; lines 0..8 are read
 *
 * Even output lines are copies of the matching input line. Each odd output
 * line is ( above + 6*current + below + 4 ) >> 3 of the input lines.
 */
static inline void XDeint8x8FrameC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    int i_pair, i_col;

    for( i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = src;
        const uint8_t *p_cur   = src + i_src;
        const uint8_t *p_below = src + 2*i_src;
        uint8_t *p_blend = dst + i_dst;

        memcpy( dst, p_above, 8 );

        for( i_col = 0; i_col < 8; i_col++ )
            p_blend[i_col] = ( p_above[i_col] + 6*p_cur[i_col]
                               + p_below[i_col] + 4 ) >> 3;

        dst = p_blend + i_dst;
        src = (uint8_t *)p_below;
    }
}
1242 #endif
/* XDeint8x8MergeC: interleave src1 lines with (1,6,1) blends against src2.
 *
 *  dst/i_dst:   output block (8 columns x 8 lines) and its line pitch
 *  src1/i_src1: lines copied to the even output lines; lines 0..4 are read
 *  src2/i_src2: lines weighted by 6 in the odd output lines; lines 0..3 read
 *
 * For k = 0..3: output line 2k is src1 line k, output line 2k+1 is
 * ( src1[k] + 6*src2[k] + src1[k+1] + 4 ) >> 3.
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    int i_pair, i_col;

    for( i_pair = 0; i_pair < 4; i_pair++ )
    {
        uint8_t *p_next1 = src1 + i_src1;
        uint8_t *p_blend = dst + i_dst;

        memcpy( dst, src1, 8 );

        for( i_col = 0; i_col < 8; i_col++ )
            p_blend[i_col] = ( src1[i_col] + 6*src2[i_col]
                               + p_next1[i_col] + 4 ) >> 3;

        dst   = p_blend + i_dst;
        src1  = p_next1;
        src2 += i_src2;
    }
}
1263
1264 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8MergeMMXEXT: MMX counterpart of XDeint8x8MergeC.
 *
 *  dst/i_dst:   output block (8 columns x 8 lines) and its line pitch
 *  src1/i_src1: source copied to the even output lines
 *  src2/i_src2: source that gets the weight 6 in the odd output lines
 *
 * Each outer iteration writes two output lines (dst advances by 2*i_dst)
 * while src1 and src2 advance by one line each. The 8 columns are handled as
 * two chunks of 4 pixels.
 *
 * NOTE(review): the register-level remarks below assume the mmx.h macros take
 * (source, destination) in that order -- TODO confirm against mmx.h.
 */
1265 static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
1266                                          uint8_t *src1, int i_src1,
1267                                          uint8_t *src2, int i_src2 )
1268 {
1269     static const uint64_t m_4 = INT64_C(0x0004000400040004); /* four 16-bit lanes holding 4 */
1270     int y, x;
1271
1272     /* Progressive */
1273     pxor_r2r( mm7, mm7 ); /* presumably mm7 = 0, the unpack partner below */
1274     for( y = 0; y < 8; y += 2 )
1275     {
1276         for( x = 0; x < 8; x +=4 )
1277         {
1278             movd_m2r( src1[x], mm0 );
1279             movd_r2m( mm0, dst[x] ); /* even line: 4 pixels of src1 copied */
1280
1281             movd_m2r( src2[x], mm1 );
1282             movd_m2r( src1[i_src1+x], mm2 );
1283
1284             punpcklbw_r2r( mm7, mm0 );
1285             punpcklbw_r2r( mm7, mm1 );
1286             punpcklbw_r2r( mm7, mm2 );
1287             paddw_r2r( mm1, mm1 ); /* presumably mm1 = 2*src2 */
1288             movq_r2r( mm1, mm3 );
1289             paddw_r2r( mm3, mm3 ); /* presumably mm3 = 4*src2 */
1290             paddw_r2r( mm2, mm0 ); /* presumably mm0 = src1 + next src1 line */
1291             paddw_r2r( mm3, mm1 ); /* presumably mm1 = 6*src2 */
1292             paddw_m2r( m_4, mm1 ); /* add the constant 4 to every lane */
1293             paddw_r2r( mm1, mm0 );
1294             psraw_i2r( 3, mm0 ); /* shift by 3, matching ">> 3" in the C version */
1295             packuswb_r2r( mm7, mm0 );
1296             movd_r2m( mm0, dst[i_dst+x] ); /* odd line: 4 blended pixels */
1297         }
1298         dst += 2*i_dst;
1299         src1 += i_src1;
1300         src2 += i_src2;
1301     }
1302 }
1303
1304 #endif
1305
/* XDeint8x8Set: debug helper, fills the 8x8 block at dst (line pitch i_dst)
 * with the byte v. Bytes outside the 8 columns of each line are not touched.
 */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_line = 0;

    while( i_line < 8 )
    {
        memset( dst + i_line * i_dst, v, 8 );
        i_line++;
    }
}
1313
1314 /* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for blocks that are missing
1315  * a neighbour
1316  * (Uses 8x9 pixels)
1317  * TODO: a better one for the inner part.
1318  */
/* XDeint8x8FieldEC: keep one field and rebuild the other by vertical average.
 *
 *  dst/i_dst: output block (8 columns x 8 lines) and its line pitch
 *  src/i_src: input block and its line pitch; lines 0, 2, 4, 6 and 8 are read
 *
 * Even output lines are copies of the matching input line. Each odd output
 * line is the truncating average of the input lines directly above and below
 * it in the kept field (two lines apart); the odd input lines are ignored.
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    int i_pair, i_col;

    for( i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = src;
        const uint8_t *p_below = src + 2*i_src;
        uint8_t *p_made = dst + i_dst;

        memcpy( dst, p_above, 8 );

        for( i_col = 0; i_col < 8; i_col++ )
            p_made[i_col] = ( p_above[i_col] + p_below[i_col] ) >> 1;

        dst  = p_made + i_dst;
        src += 2*i_src;
    }
}
1336 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8FieldEMMXEXT: MMX counterpart of XDeint8x8FieldEC.
 *
 *  dst/i_dst: output block (8 columns x 8 lines) and its line pitch
 *  src/i_src: input block and its line pitch; lines 0, 2, 4, 6 and 8 are read
 *
 * Each iteration copies one 8-byte input line to the output, then writes the
 * pavgb combination of that line and the input line two below it as the next
 * output line.
 *
 * NOTE(review): pavgb presumably rounds the average up, whereas the C
 * version truncates with ">> 1" -- results may differ by one; confirm this
 * is acceptable.
 */
1337 static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
1338                                           uint8_t *src, int i_src )
1339 {
1340     int y;
1341
1342     /* Interlaced */
1343     for( y = 0; y < 8; y += 2 )
1344     {
1345         movq_m2r( src[0], mm0 );
1346         movq_r2m( mm0, dst[0] ); /* kept line: 8 bytes copied as is */
1347         dst += i_dst;
1348
1349         movq_m2r( src[2*i_src], mm1 ); /* next line of the same field */
1350         pavgb_r2r( mm1, mm0 );
1351
1352         movq_r2m( mm0, dst[0] ); /* rebuilt line */
1353
1354         dst += 1*i_dst;
1355         src += 2*i_src;
1356     }
1357 }
1358 #endif
1359
/* XDeint8x8FieldC: edge oriented interpolation of the missing field.
 *
 *  dst/i_dst: output block (8 columns x 8 lines) and its line pitch
 *  src/i_src: input block and its line pitch
 *
 * Reads lines 0, 2, 4, 6 and 8 of src, and on each of them columns -4..+12
 * relative to the block (4 pixels to the left, 5 to the right).
 *
 * Even output lines are copies of the matching input line. For every pixel x
 * of an odd output line, three 8-pixel sums of absolute differences between
 * the line above (top) and the line below (bot) are compared:
 *   i_sad0: top[x-4..x+3] against bot[x-2..x+5]
 *   i_sad1: top[x-3..x+4] against bot[x-3..x+4]
 *   i_sad2: top[x-2..x+5] against bot[x-4..x+3]
 * The pixel is then the truncating average of
 *   top[x-1], bot[x+1]  if i_sad0 < i_sad1 <= i_sad2,
 *   top[x+1], bot[x-1]  if i_sad2 < i_sad1 <= i_sad0,
 *   top[x],   bot[x]    otherwise.
 * (8 pixels are used to match the MMX version; 5 would be enough.)
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    int i_pair, i_col, k;

    for( i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *top = src;
        const uint8_t *bot = src + 2*i_src;

        memcpy( dst, top, 8 );
        dst += i_dst;

        for( i_col = 0; i_col < 8; i_col++ )
        {
            int i_sad0 = 0;
            int i_sad1 = 0;
            int i_sad2 = 0;

            for( k = 0; k < 8; k++ )
            {
                i_sad0 += abs( top[i_col-4+k] - bot[i_col-2+k] );
                i_sad1 += abs( top[i_col-3+k] - bot[i_col-3+k] );
                i_sad2 += abs( top[i_col-2+k] - bot[i_col-4+k] );
            }

            if( i_sad0 < i_sad1 && i_sad1 <= i_sad2 )
                dst[i_col] = ( top[i_col-1] + bot[i_col+1] ) >> 1;
            else if( i_sad2 < i_sad1 && i_sad1 <= i_sad0 )
                dst[i_col] = ( top[i_col+1] + bot[i_col-1] ) >> 1;
            else
                dst[i_col] = ( top[i_col] + bot[i_col] ) >> 1;
        }

        dst += i_dst;
        src += 2*i_src;
    }
}
1406 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8FieldMMXEXT: MMX counterpart of XDeint8x8FieldC.
 *
 *  dst/i_dst: output block (8 columns x 8 lines) and its line pitch
 *  src/i_src: input block and its line pitch
 *
 * Even output lines are copies of the matching input line. For every pixel
 * of an odd output line, c0, c1 and c2 are obtained with psadbw over 8 bytes
 * of the line above (src) and the line two below it (src2), then the same
 * three-way choice as in the C version selects which pair of pixels is
 * averaged.
 *
 * NOTE(review): the register-level remarks below assume the mmx.h macros take
 * (source, destination) in that order -- TODO confirm against mmx.h.
 */
1407 static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
1408                                          uint8_t *src, int i_src )
1409 {
1410     int y, x;
1411
1412     /* Interlaced */
1413     for( y = 0; y < 8; y += 2 )
1414     {
1415         memcpy( dst, src, 8 );
1416         dst += i_dst;
1417
1418         for( x = 0; x < 8; x++ )
1419         {
1420             uint8_t *src2 = &src[2*i_src];
1421             int32_t c0, c1, c2;
1422
1423             movq_m2r( src[x-2], mm0 ); /* 8 bytes starting at src[x-2] */
1424             movq_m2r( src[x-3], mm1 ); /* 8 bytes starting at src[x-3] */
1425             movq_m2r( src[x-4], mm2 ); /* 8 bytes starting at src[x-4] */
1426
1427             psadbw_m2r( src2[x-4], mm0 ); /* presumably SAD src[x-2..] vs src2[x-4..] */
1428             psadbw_m2r( src2[x-3], mm1 ); /* presumably SAD src[x-3..] vs src2[x-3..] */
1429             psadbw_m2r( src2[x-2], mm2 ); /* presumably SAD src[x-4..] vs src2[x-2..] */
1430
1431             movd_r2m( mm0, c2 );
1432             movd_r2m( mm1, c1 );
1433             movd_r2m( mm2, c0 );
1434
1435             if( c0 < c1 && c1 <= c2 )
1436                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
1437             else if( c2 < c1 && c1 <= c0 )
1438                 dst[x] = (src[x+1] + src2[x-1]) >> 1;
1439             else
1440                 dst[x] = (src[x+0] + src2[x+0]) >> 1;
1441         }
1442
1443         dst += 1*i_dst;
1444         src += 2*i_src;
1445     }
1446 }
1447 #endif
1448
1449 #if 0
/* XDeint8x8SsdC: sum of squared differences between two 8x8 blocks.
 *
 *  pix1/i_pix1: first block and its line pitch
 *  pix2/i_pix2: second block and its line pitch
 *
 * Returns the sum over the 64 pixel positions of (pix1 - pix2)^2.
 */
static inline int XDeint8x8SsdC( uint8_t *pix1, int i_pix1,
                                 uint8_t *pix2, int i_pix2 )
{
    int i_total = 0;
    int i_line, i_col;

    for( i_line = 0; i_line < 8; i_line++ )
    {
        const uint8_t *p_a = &pix1[i_line*i_pix1];
        const uint8_t *p_b = &pix2[i_line*i_pix2];

        for( i_col = 0; i_col < 8; i_col++ )
        {
            const int i_diff = p_a[i_col] - p_b[i_col];

            i_total += i_diff * i_diff;
        }
    }
    return i_total;
}
1461
1462 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8SsdMMXEXT: MMX counterpart of XDeint8x8SsdC (disabled code, inside
 * an #if 0 region).
 *
 *  pix1/i_pix1: first 8x8 block and its line pitch
 *  pix2/i_pix2: second 8x8 block and its line pitch
 *
 * Processes one 8-byte line of each block per iteration, accumulates in mm6
 * and returns the value stored to s at the end.
 *
 * NOTE(review): the register-level remarks below assume the mmx.h macros take
 * (source, destination) in that order -- TODO confirm against mmx.h.
 */
1463 static inline int XDeint8x8SsdMMXEXT( uint8_t *pix1, int i_pix1,
1464                                       uint8_t *pix2, int i_pix2 )
1465 {
1466     int y;
1467     int32_t s;
1468
1469     pxor_r2r( mm7, mm7 ); /* presumably mm7 = 0, the unpack partner below */
1470     pxor_r2r( mm6, mm6 ); /* mm6: running total */
1471
1472     for( y = 0; y < 8; y++ )
1473     {
1474         movq_m2r( pix1[0], mm0 );
1475         movq_m2r( pix2[0], mm1 );
1476
1477         movq_r2r( mm0, mm2 );
1478         movq_r2r( mm1, mm3 );
1479
1480         punpcklbw_r2r( mm7, mm0 ); /* low and high halves widened separately */
1481         punpckhbw_r2r( mm7, mm2 );
1482         punpcklbw_r2r( mm7, mm1 );
1483         punpckhbw_r2r( mm7, mm3 );
1484
1485         psubw_r2r( mm1, mm0 ); /* presumably pix1 - pix2, low half */
1486         psubw_r2r( mm3, mm2 ); /* presumably pix1 - pix2, high half */
1487
1488         pmaddwd_r2r( mm0, mm0 ); /* each register multiplied by itself */
1489         pmaddwd_r2r( mm2, mm2 );
1490
1491         paddd_r2r( mm2, mm0 );
1492         paddd_r2r( mm0, mm6 );
1493
1494         pix1 += i_pix1;
1495         pix2 += i_pix2;
1496     }
1497
1498     movq_r2r( mm6, mm7 ); /* fold the two 32-bit halves of mm6 into s */
1499     psrlq_i2r( 32, mm7 );
1500     paddd_r2r( mm6, mm7 );
1501     movd_r2m( mm7, s );
1502
1503     return s;
1504 }
1505 #endif
1506 #endif
1507
1508 #if 0
1509 /* A small experiment with motion; it doesn't work better than pure intra (and is slow) */
1510 #ifdef CAN_COMPILE_MMXEXT
/* XDeintMC:
 *  Bilinear MC QPel
 *  TODO: mmx version (easier in sse2)
 *
 *  dst/i_dst: output block and its line pitch
 *  src/i_src: reference picture position and its line pitch
 *  mvx/mvy:   motion vector; the two low bits are the fractional part and
 *             the remaining bits (mv >> 2) the whole-pixel offset
 *  i_width/i_height: size of the block to produce
 *
 * Each output pixel is a weighted sum of the 2x2 reference pixels at the
 * whole-pixel position; the four weights add up to 16 and the result is
 * ( sum + 8 ) >> 4. All four pixels are read even when a weight is zero, so
 * (i_width + 1) x (i_height + 1) reference pixels must be readable.
 */
static inline void XDeintMC( uint8_t *dst, int i_dst,
                             uint8_t *src, int i_src,
                             int mvx, int mvy,
                             int i_width, int i_height )
{
    const int i_fx = mvx & 0x03;
    const int i_fy = mvy & 0x03;

    /* weights of the top-left, top-right, bottom-left, bottom-right pixel */
    const int w_tl = (4 - i_fx) * (4 - i_fy);
    const int w_tr = i_fx       * (4 - i_fy);
    const int w_bl = (4 - i_fx) * i_fy;
    const int w_br = i_fx       * i_fy;

    const uint8_t *p_row = src + (mvy >> 2) * i_src + (mvx >> 2);
    int i_line, i_col;

    for( i_line = 0; i_line < i_height; i_line++ )
    {
        const uint8_t *p_below = p_row + i_src;

        for( i_col = 0; i_col < i_width; i_col++ )
        {
            dst[i_col] = ( w_tl*p_row[i_col]   + w_tr*p_row[i_col+1] +
                           w_bl*p_below[i_col] + w_br*p_below[i_col+1] + 8 ) >> 4;
        }

        dst  += i_dst;
        p_row = p_below;
    }
}
/* XDeint8x4SadMMXEXT: psadbw-based comparison of two blocks of 8 columns by
 * 4 lines (disabled code, inside an #if 0 region).
 *
 *  pix1/i_pix1: first block and its line pitch
 *  pix2/i_pix2: second block and its line pitch
 *
 * One 8-byte line of pix1 is loaded per register (mm0..mm3), psadbw is applied
 * against the matching line of pix2, the four results are added and the total
 * is returned -- presumably the sum of absolute differences over the 32
 * pixels (TODO confirm against mmx.h operand order).
 */
1548 static int XDeint8x4SadMMXEXT( uint8_t *pix1, int i_pix1,
1549                                uint8_t *pix2, int i_pix2 )
1550 {
1551     int32_t s;
1552
1553     movq_m2r( pix1[0*i_pix1], mm0 ); /* lines 0 and 1 */
1554     movq_m2r( pix1[1*i_pix1], mm1 );
1555
1556     psadbw_m2r( pix2[0*i_pix2], mm0 );
1557     psadbw_m2r( pix2[1*i_pix2], mm1 );
1558
1559     movq_m2r( pix1[2*i_pix1], mm2 ); /* lines 2 and 3 */
1560     movq_m2r( pix1[3*i_pix1], mm3 );
1561     psadbw_m2r( pix2[2*i_pix2], mm2 );
1562     psadbw_m2r( pix2[3*i_pix2], mm3 );
1563
1564     paddd_r2r( mm1, mm0 ); /* add the four per-line results */
1565     paddd_r2r( mm3, mm2 );
1566     paddd_r2r( mm2, mm0 );
1567     movd_r2m( mm0, s );
1568
1569     return s;
1570 }
1571
/* XDeint8x4TestQpel: cost of a quarter-pel motion vector candidate.
 *
 *  src/i_src:    8x4 block to match and its line pitch
 *  ref/i_stride: reference lines and their pitch
 *  mx/my:        candidate vector in quarter-pel units
 *  xmax/ymax:    search limits in whole pixels
 *
 * Returns 255*255*255 when |mx| >= 4*xmax or |my| >= 4*ymax. Otherwise the
 * candidate block is built with XDeintMC() into a local 8x4 buffer and the
 * result of XDeint8x4SadMMXEXT() against src is returned.
 */
static inline int XDeint8x4TestQpel( uint8_t *src, int i_src,
                                     uint8_t *ref, int i_stride,
                                     int mx, int my,
                                     int xmax, int ymax )
{
    uint8_t buffer[8*4];
    const int b_in_range = abs(mx) < 4*xmax && abs(my) < 4*ymax;

    if( !b_in_range )
        return 255*255*255;

    XDeintMC( buffer, 8, ref, i_stride, mx, my, 8, 4 );

    return XDeint8x4SadMMXEXT( src, i_src, buffer, 8 );
}
/* XDeint8x4TestInt: cost of a whole-pixel motion vector candidate.
 *
 *  src/i_src:    8x4 block to match and its line pitch
 *  ref/i_stride: reference lines and their pitch
 *  mx/my:        candidate vector in whole pixels
 *  xmax/ymax:    search limits in whole pixels
 *
 * Returns 255*255*255 when |mx| >= xmax or |my| >= ymax, otherwise the result
 * of XDeint8x4SadMMXEXT() between src and ref displaced by (mx, my).
 */
static inline int XDeint8x4TestInt( uint8_t *src, int i_src,
                                    uint8_t *ref, int i_stride,
                                    int mx, int my,
                                    int xmax, int ymax )
{
    const int b_in_range = abs(mx) < xmax && abs(my) < ymax;
    uint8_t *p_candidate;

    if( !b_in_range )
        return 255*255*255;

    p_candidate = &ref[my*i_stride+mx];

    return XDeint8x4SadMMXEXT( src, i_src, p_candidate, i_stride );
}
1595
/* XDeint8x8FieldMotion: intra interpolation followed by a small motion search
 * against the other field (disabled code, inside an #if 0 region).
 *
 *  dst/i_dst: output block and its line pitch
 *  src/i_src: input block and its line pitch
 *  mpx/mpy:   in: predicted whole-pixel motion vector;
 *             out: the whole-pixel vector found by the integer search
 *  xmax/ymax: search limits, clamped to at most 4 inside the function
 *
 * Steps, as written:
 *  1. XDeint8x8FieldMMXEXT() fills dst with the intra interpolation.
 *  2. The cost of the null vector is taken with XDeint8x4SadMMXEXT(), then the
 *     predicted vector and up to 4 refinement rounds over the first 4 entries
 *     of dx/dy are tried with XDeint8x4TestInt().
 *  3. If the best cost is in ]4, 256[, up to 4 further rounds over all 8
 *     entries of dx/dy are tried with XDeint8x4TestQpel() on the vector
 *     scaled by 4.
 *  4. If the final cost is below 128, XDeintMC() builds the compensated lines
 *     and XDeint8x8MergeMMXEXT() overwrites dst with the merge of src and
 *     those lines; otherwise the intra result from step 1 is kept.
 *
 * NOTE(review): rec points into dst (&dst[i_dst]) but every cost function is
 * called with the pitch i_src2 = 2*i_src for it; this only addresses the
 * intended lines if i_dst == i_src -- confirm.
 * NOTE(review): the "int s" declared inside both search loops shadows the
 * function-level s.
 */
1596 static inline void XDeint8x8FieldMotion( uint8_t *dst, int i_dst,
1597                                          uint8_t *src, int i_src,
1598                                          int *mpx, int *mpy,
1599                                          int xmax, int ymax )
1600 {
1601     static const int dx[8] = { 0,  0, -1, 1, -1, -1,  1, 1 };
1602     static const int dy[8] = {-1,  1,  0, 0, -1,  1, -1, 1 };
1603     uint8_t *next = &src[i_src]; /* second line of src */
1604     const int i_src2 = 2*i_src;
1605     int mvx, mvy;
1606     int mvs, s; /* mvs: best cost so far */
1607     int i_step;
1608
1609     uint8_t *rec = &dst[i_dst]; /* second line of dst */
1610
1611     /* We construct with intra method the missing field */
1612     XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
1613
1614     /* Now we will try to find a match with ME with the other field */
1615
1616     /* ME: A small/partial EPZS
1617      * We search only for small MV (with high motion intra will be perfect */
1618     if( xmax > 4 ) xmax = 4;
1619     if( ymax > 4 ) ymax = 4;
1620
1621     /* Init with NULL Mv */
1622     mvx = mvy = 0;
1623     mvs = XDeint8x4SadMMXEXT( rec, i_src2, next, i_src2 );
1624
1625     /* Try predicted Mv */
1626     if( (s=XDeint8x4TestInt( rec, i_src2, next, i_src2, *mpx, *mpy, xmax, ymax)) < mvs )
1627     {
1628         mvs = s;
1629         mvx = *mpx;
1630         mvy = *mpy;
1631     }
1632     /* Search integer pel (small mv) */
1633     for( i_step = 0; i_step < 4; i_step++ )
1634     {
1635         int c = 4; /* 4 = no better neighbour found in this round */
1636         int s;
1637         int i;
1638
1639         for( i = 0; i < 4; i++ )
1640         {
1641             s = XDeint8x4TestInt( rec, i_src2,
1642                                   next, i_src2, mvx+dx[i], mvy+dy[i],
1643                                   xmax, ymax );
1644             if( s < mvs )
1645             {
1646                 mvs = s;
1647                 c = i;
1648             }
1649         }
1650         if( c == 4 )
1651             break;
1652
1653         mvx += dx[c];
1654         mvy += dy[c];
1655     }
1656     *mpx = mvx;
1657     *mpy = mvy;
1658
1659     mvx <<= 2; /* from here on the vector is in quarter-pel units */
1660     mvy <<= 2;
1661
1662     if( mvs > 4 && mvs < 256 )
1663     {
1664         /* Search Qpel */
1665         /* XXX: for now only HPEL (too slow) */
1666         for( i_step = 0; i_step < 4; i_step++ )
1667         {
1668             int c = 8; /* 8 = no better neighbour found in this round */
1669             int s;
1670             int i;
1671
1672             for( i = 0; i < 8; i++ )
1673             {
1674                 s = XDeint8x4TestQpel( rec, i_src2, next, i_src2,
1675                                        mvx+dx[i], mvy+dy[i],
1676                                        xmax, ymax );
1677                 if( s < mvs )
1678                 {
1679                     mvs = s;
1680                     c = i;
1681                 }
1682             }
1683             if( c == 8 )
1684                 break;
1685
1686             mvx += dx[c];
1687             mvy += dy[c];
1688         }
1689     }
1690
1691     if( mvs < 128 )
1692     {
1693         uint8_t buffer[8*4];
1694         XDeintMC( buffer, 8, next, i_src2, mvx, mvy, 8, 4 );
1695         XDeint8x8MergeMMXEXT( dst, i_dst, src, 2*i_src, buffer, 8 );
1696
1697         //XDeint8x8Set( dst, i_dst, 0 );
1698     }
1699 }
1700 #endif
1701 #endif
1702
1703 #if 0
1704 /* Kernel interpolation (1,-5,20,20,-5,1)
1705  * Lose a bit more details+add aliasing than edge interpol but avoid
1706  * more artifacts
1707  */
/* clip1: clamp a to the range [0, 255] and return it as a byte */
static inline uint8_t clip1( int a )
{
    if( a <= 0 )
        return 0;

    return ( a >= 255 ) ? 255 : a;
}
/* XDeint8x8Field: rebuild the missing field with the 6-tap vertical kernel
 * (1,-5,20,20,-5,1) applied to lines of the kept field.
 *
 *  dst/i_dst: output block (8 columns x 8 lines) and its line pitch
 *  src/i_src: input block and its line pitch; every second line from -4 to
 *             +12 relative to src is read
 *
 * Even output lines are copies of the matching input line. Each odd output
 * pixel is ( kernel sum + 16 ) >> 5, clamped to [0, 255].
 */
static inline void XDeint8x8Field( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src )
{
    const int i_step = i_src*2;   /* distance between lines of one field */
    int i_pair, i_col;

    for( i_pair = 0; i_pair < 4; i_pair++ )
    {
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( i_col = 0; i_col < 8; i_col++ )
        {
            const int i_outer = src[-2*i_step+i_col] + src[3*i_step+i_col];
            const int i_mid   = src[-1*i_step+i_col] + src[2*i_step+i_col];
            const int i_inner = src[ 0*i_step+i_col] + src[1*i_step+i_col];
            int i_val = ( 1*i_outer + -5*i_mid + 20*i_inner + 16 ) >> 5;

            if( i_val <= 0 )
                i_val = 0;
            else if( i_val >= 255 )
                i_val = 255;

            dst[i_col] = i_val;
        }

        dst += i_dst;
        src += i_step;
    }
}
1745
1746 #endif
1747
1748 /* NxN arbitrary size (and then only use pixels in the NxN block)
1749  */
/* XDeintNxNDetect: interlacing detector for a block of arbitrary size.
 *
 *  src/i_src: block and its line pitch
 *  i_height:  number of lines in the block (note: height comes BEFORE width)
 *  i_width:   number of columns in the block
 *
 * For every even line y with y < i_height - 2, squared differences against
 * the next line ("frame") and against the line two below ("field") are added
 * to two running sums that are NOT reset between lines. After each line a
 * vote is counted when field < frame and frame > i_width / 2.
 *
 * Returns 1 when at least two votes were counted, 0 otherwise.
 * FIXME way too simple, need to be more like XDeint8x8Detect
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    int i_frame_sum = 0;
    int i_field_sum = 0;
    int i_votes = 0;
    int i_line, i_col;

    for( i_line = 0; i_line < i_height - 2; i_line += 2 )
    {
        const uint8_t *p_l0 = &src[i_line*i_src];
        const uint8_t *p_l1 = p_l0 + i_src;

        for( i_col = 0; i_col < i_width; i_col++ )
        {
            const int d_next  = p_l0[i_col] - p_l1[i_col];
            const int d_next2 = p_l0[i_col] - p_l0[2*i_src+i_col];

            i_frame_sum += d_next * d_next;
            i_field_sum += d_next2 * d_next2;
        }

        if( i_frame_sum > i_width / 2 && i_field_sum < i_frame_sum )
            i_votes++;
    }

    return i_votes >= 2;
}
1776
/* XDeintNxNFrame: (1,2,1) vertical blend for a block treated as progressive.
 *
 *  dst/i_dst: output block and its line pitch
 *  src/i_src: input block and its line pitch
 *  i_width/i_height: block size; only pixels inside the block are read
 *
 * Even output lines are copies of the matching input line. An odd output
 * line is ( above + 2*current + below + 2 ) >> 2, except for the last line
 * pair where no line below exists and ( above + current ) >> 1 is used.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int i_line, i_col;

    for( i_line = 0; i_line < i_height; i_line += 2 )
    {
        const uint8_t *p_above = src;
        const uint8_t *p_cur   = src + i_src;
        const int b_has_below  = i_line < i_height - 2;
        uint8_t *p_blend = dst + i_dst;

        memcpy( dst, p_above, i_width );

        for( i_col = 0; i_col < i_width; i_col++ )
        {
            if( b_has_below )
                p_blend[i_col] = ( p_above[i_col] + 2*p_cur[i_col]
                                   + p_above[2*i_src+i_col] + 2 ) >> 2;
            else
                /* Blend last line */
                p_blend[i_col] = ( p_above[i_col] + p_cur[i_col] ) >> 1;
        }

        dst  = p_blend + i_dst;
        src += 2*i_src;
    }
}
1804
/*
 * XDeintNxNField: render a block of arbitrary size as interlaced content.
 *
 *  dst / i_dst - destination block and its line stride in bytes
 *  src / i_src - source block and its line stride in bytes
 *  i_width     - number of pixels per line in the block
 *  i_height    - number of lines in the block
 *
 * Even lines are copied as is. Each odd line is replaced by the average of
 * the even lines y and y+2 (the original odd line is not used), except for
 * the last pair where line y+2 does not exist and lines y and y+1 are
 * averaged instead.
 *
 * When i_height is odd, the last line is only copied: there is no line
 * below it inside the block, so nothing past the block is read or written.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd block height: line y is the last one of the block, the line
         * pair is incomplete and must not be touched. */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1832
/*
 * XDeintNxN: deinterlace a block of arbitrary size.
 *
 *  dst / i_dst - destination block and its line stride in bytes
 *  src / i_src - source block and its line stride in bytes
 *  i_width     - number of pixels per line in the block
 *  i_height    - number of lines in the block
 *
 * The block is rendered with XDeintNxNField when XDeintNxNDetect reports it
 * as interlaced, and with XDeintNxNFrame otherwise.
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect takes (i_height, i_width), in that order: passing
     * width first makes it scan the wrong rectangle and read pixels outside
     * of the block. */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1841
1842
/*
 * median: return the middle value of three integers.
 *
 * The smallest and the largest of the three are tracked, then removed from
 * the sum of all three, which leaves the median.
 */
static inline int median( int a, int b, int c )
{
    int i_lowest, i_highest;

    /* Order the first two values */
    if( b < a )
    {
        i_lowest = b;
        i_highest = a;
    }
    else
    {
        i_lowest = a;
        i_highest = b;
    }

    /* Widen the range with the third one if needed */
    if( c < i_lowest )
        i_lowest = c;
    else if( c > i_highest )
        i_highest = c;

    return a + b + c - i_lowest - i_highest;
}
1858
1859
/* XDeintBand8x8C:
 *  Process one band of 8 lines, block by block (plain C version).
 *
 *  dst / i_dst - first destination line of the band and its stride in bytes
 *  src / i_src - first source line of the band and its stride in bytes
 *  i_mbx       - number of full 8 pixel wide blocks in the band
 *  i_modx      - number of remaining pixels on the right of the last block
 *
 * Every 8x8 block is tested with XDeint8x8DetectC. A non-zero result sends
 * it to a "Field" routine (the "E" variant for the first and the last full
 * block of the band), a zero result to XDeint8x8MergeC, which is given the
 * even and the odd source lines as two inputs of stride 2*i_src.
 * The remaining i_modx columns, if any, go through XDeintNxN.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    int i_block;

    for( i_block = 0; i_block < i_mbx; i_block++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Not detected: merge even lines with odd lines */
            XDeint8x8MergeC( dst, i_dst,
                             src, 2*i_src,
                             src + i_src, 2*i_src );
            continue;
        }

        /* Detected: first and last blocks use the "E" variant */
        if( i_block != 0 && i_block != i_mbx - 1 )
            XDeint8x8FieldC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
    }

    /* Columns that do not fill a whole 8 pixel wide block */
    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1892 #ifdef CAN_COMPILE_MMXEXT
/* XDeintBand8x8MMXEXT:
 *  Process one band of 8 lines, block by block (MMXEXT version).
 *
 *  dst / i_dst - first destination line of the band and its stride in bytes
 *  src / i_src - first source line of the band and its stride in bytes
 *  i_mbx       - number of full 8 pixel wide blocks in the band
 *  i_modx      - number of remaining pixels on the right of the last block
 *
 * Same walk as the C band routine, using the MMXEXT 8x8 helpers. The
 * remaining i_modx columns, if any, still go through the C XDeintNxN.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    int i_block;

    /* Walk the full 8x8 blocks of the band, left to right */
    for( i_block = 0; i_block < i_mbx; i_block++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Not detected: merge even lines with odd lines */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  src, 2*i_src,
                                  src + i_src, 2*i_src );
            continue;
        }

        /* Detected: first and last blocks use the "E" variant */
        if( i_block != 0 && i_block != i_mbx - 1 )
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
    }

    /* Columns that do not fill a whole 8 pixel wide block */
    if( i_modx != 0 )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1924 #endif
1925
1926 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1927 {
1928     int i_plane;
1929
1930     /* Copy image and skip lines */
1931     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1932     {
1933         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1934         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1935
1936         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1937         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1938
1939         const int i_dst = p_outpic->p[i_plane].i_pitch;
1940         const int i_src = p_pic->p[i_plane].i_pitch;
1941
1942         int y, x;
1943
1944         for( y = 0; y < i_mby; y++ )
1945         {
1946             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1947             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1948
1949 #ifdef CAN_COMPILE_MMXEXT
1950             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1951                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1952             else
1953 #endif
1954                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1955         }
1956
1957         /* Last line (C only)*/
1958         if( i_mody )
1959         {
1960             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1961             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1962
1963             for( x = 0; x < i_mbx; x++ )
1964             {
1965                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1966
1967                 dst += 8;
1968                 src += 8;
1969             }
1970
1971             if( i_modx )
1972                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1973         }
1974     }
1975
1976 #ifdef CAN_COMPILE_MMXEXT
1977     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1978         emms();
1979 #endif
1980 }
1981
1982 /*****************************************************************************
1983  * SendEvents: forward mouse and keyboard events to the parent p_vout
1984  *****************************************************************************/
1985 static int SendEvents( vlc_object_t *p_this, char const *psz_var,
1986                        vlc_value_t oldval, vlc_value_t newval, void *_p_vout )
1987 {
1988     VLC_UNUSED(p_this); VLC_UNUSED(oldval);
1989     vout_thread_t *p_vout = (vout_thread_t *)_p_vout;
1990     vlc_value_t sentval = newval;
1991
1992     if( !strcmp( psz_var, "mouse-y" ) )
1993     {
1994         switch( p_vout->p_sys->i_mode )
1995         {
1996             case DEINTERLACE_MEAN:
1997             case DEINTERLACE_DISCARD:
1998                 sentval.i_int *= 2;
1999                 break;
2000         }
2001     }
2002
2003     var_Set( p_vout, psz_var, sentval );
2004
2005     return VLC_SUCCESS;
2006 }
2007
2008 /*****************************************************************************
2009  * FilterCallback: called when changing the deinterlace method on the fly.
2010  *****************************************************************************/
2011 static int FilterCallback( vlc_object_t *p_this, char const *psz_cmd,
2012                            vlc_value_t oldval, vlc_value_t newval,
2013                            void *p_data )
2014 {
2015     VLC_UNUSED(psz_cmd); VLC_UNUSED(p_data); VLC_UNUSED(oldval);
2016     vout_thread_t * p_vout = (vout_thread_t *)p_this;
2017     int i_old_mode = p_vout->p_sys->i_mode;
2018
2019     msg_Dbg( p_vout, "using %s deinterlace mode", newval.psz_string );
2020
2021     vlc_mutex_lock( &p_vout->p_sys->filter_lock );
2022
2023     SetFilterMethod( p_vout, newval.psz_string );
2024
2025     switch( p_vout->render.i_chroma )
2026     {
2027     case VLC_FOURCC('I','4','2','2'):
2028         vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
2029         return VLC_SUCCESS;
2030         break;
2031
2032     case VLC_FOURCC('I','4','2','0'):
2033     case VLC_FOURCC('I','Y','U','V'):
2034     case VLC_FOURCC('Y','V','1','2'):
2035         switch( p_vout->p_sys->i_mode )
2036         {
2037         case DEINTERLACE_MEAN:
2038         case DEINTERLACE_DISCARD:
2039             if( ( i_old_mode == DEINTERLACE_MEAN )
2040                 || ( i_old_mode == DEINTERLACE_DISCARD ) )
2041             {
2042                 vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
2043                 return VLC_SUCCESS;
2044             }
2045             break;
2046
2047         case DEINTERLACE_BOB:
2048         case DEINTERLACE_BLEND:
2049         case DEINTERLACE_LINEAR:
2050             if( ( i_old_mode == DEINTERLACE_BOB )
2051                 || ( i_old_mode == DEINTERLACE_BLEND )
2052                 || ( i_old_mode == DEINTERLACE_LINEAR ) )
2053             {
2054                 vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
2055                 return VLC_SUCCESS;
2056             }
2057             break;
2058         }
2059         break;
2060
2061     default:
2062         break;
2063     }
2064
2065     /* We need to kill the old vout */
2066
2067     DEL_CALLBACKS( p_vout->p_sys->p_vout, SendEvents );
2068
2069     vlc_object_detach( p_vout->p_sys->p_vout );
2070     vlc_object_release( p_vout->p_sys->p_vout );
2071
2072     /* Try to open a new video output */
2073     p_vout->p_sys->p_vout = SpawnRealVout( p_vout );
2074
2075     if( p_vout->p_sys->p_vout == NULL )
2076     {
2077         /* Everything failed */
2078         msg_Err( p_vout, "cannot open vout, aborting" );
2079
2080         vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
2081         return VLC_EGENERIC;
2082     }
2083
2084     ADD_CALLBACKS( p_vout->p_sys->p_vout, SendEvents );
2085
2086     vlc_mutex_unlock( &p_vout->p_sys->filter_lock );
2087     return VLC_SUCCESS;
2088 }
2089
2090 /*****************************************************************************
2091  * SendEventsToChild: forward events to the child/children vout
2092  *****************************************************************************/
2093 static int SendEventsToChild( vlc_object_t *p_this, char const *psz_var,
2094                        vlc_value_t oldval, vlc_value_t newval, void *p_data )
2095 {
2096     VLC_UNUSED(p_data); VLC_UNUSED(oldval);
2097     vout_thread_t *p_vout = (vout_thread_t *)p_this;
2098     var_Set( p_vout->p_sys->p_vout, psz_var, newval );
2099     return VLC_SUCCESS;
2100 }
2101
2102
2103 /*****************************************************************************
2104  * video filter2 functions
2105  *****************************************************************************/
2106 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
2107 {
2108     vout_thread_t *p_vout = (vout_thread_t *)p_filter->p_sys;
2109     picture_t *p_pic_dst;
2110
2111     /* Request output picture */
2112     p_pic_dst = p_filter->pf_vout_buffer_new( p_filter );
2113     if( p_pic_dst == NULL )
2114     {
2115         msg_Warn( p_filter, "can't get output picture" );
2116         return NULL;
2117     }
2118
2119     switch( p_vout->p_sys->i_mode )
2120     {
2121         case DEINTERLACE_DISCARD:
2122 #if 0
2123             RenderDiscard( p_vout, p_pic_dst, p_pic, 0 );
2124 #endif
2125             msg_Err( p_vout, "discarding lines is not supported yet" );
2126             p_pic_dst->pf_release( p_pic_dst );
2127             return p_pic;
2128             break;
2129
2130         case DEINTERLACE_BOB:
2131 #if 0
2132             RenderBob( p_vout, pp_outpic[0], p_pic, 0 );
2133             RenderBob( p_vout, pp_outpic[1], p_pic, 1 );
2134             break;
2135 #endif
2136
2137         case DEINTERLACE_LINEAR:
2138 #if 0
2139             RenderLinear( p_vout, pp_outpic[0], p_pic, 0 );
2140             RenderLinear( p_vout, pp_outpic[1], p_pic, 1 );
2141 #endif
2142             msg_Err( p_vout, "doubling the frame rate is not supported yet" );
2143             p_pic_dst->pf_release( p_pic_dst );
2144             return p_pic;
2145             break;
2146
2147         case DEINTERLACE_MEAN:
2148             RenderMean( p_vout, p_pic_dst, p_pic );
2149             break;
2150
2151         case DEINTERLACE_BLEND:
2152             RenderBlend( p_vout, p_pic_dst, p_pic );
2153             break;
2154
2155         case DEINTERLACE_X:
2156             RenderX( p_pic_dst, p_pic );
2157             break;
2158     }
2159
2160     p_pic_dst->date = p_pic->date;
2161     p_pic_dst->b_force = p_pic->b_force;
2162     p_pic_dst->i_nb_fields = p_pic->i_nb_fields;
2163     p_pic_dst->b_progressive = true;
2164     p_pic_dst->b_top_field_first = p_pic->b_top_field_first;
2165
2166     p_pic->pf_release( p_pic );
2167     return p_pic_dst;
2168 }
2169
2170 /*****************************************************************************
2171  * OpenFilter:
2172  *****************************************************************************/
2173 static int OpenFilter( vlc_object_t *p_this )
2174 {
2175     filter_t *p_filter = (filter_t*)p_this;
2176     vout_thread_t *p_vout;
2177     vlc_value_t val;
2178
2179     if( ( p_filter->fmt_in.video.i_chroma != VLC_FOURCC('I','4','2','0') &&
2180           p_filter->fmt_in.video.i_chroma != VLC_FOURCC('I','Y','U','V') &&
2181           p_filter->fmt_in.video.i_chroma != VLC_FOURCC('Y','V','1','2') ) ||
2182         p_filter->fmt_in.video.i_chroma != p_filter->fmt_out.video.i_chroma )
2183     {
2184         return VLC_EGENERIC;
2185     }
2186
2187     /* Impossible to use VLC_OBJECT_VOUT here because it would be used
2188      * by spu filters */
2189     p_vout = vlc_object_create( p_filter, sizeof(vout_thread_t) );
2190     vlc_object_attach( p_vout, p_filter );
2191     p_filter->p_sys = (filter_sys_t *)p_vout;
2192     p_vout->render.i_chroma = p_filter->fmt_in.video.i_chroma;
2193
2194     config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
2195                    p_filter->p_cfg );
2196     var_Get( p_filter, FILTER_CFG_PREFIX "mode", &val );
2197     var_Create( p_filter, "deinterlace-mode", VLC_VAR_STRING );
2198     var_Set( p_filter, "deinterlace-mode", val );
2199
2200     if ( Create( VLC_OBJECT(p_vout) ) != VLC_SUCCESS )
2201     {
2202         vlc_object_detach( p_vout );
2203         vlc_object_release( p_vout );
2204         return VLC_EGENERIC;
2205     }
2206
2207     p_filter->pf_video_filter = Deinterlace;
2208
2209     msg_Dbg( p_filter, "deinterlacing" );
2210
2211     return VLC_SUCCESS;
2212 }
2213
2214 /*****************************************************************************
2215  * CloseFilter: clean up the filter
2216  *****************************************************************************/
2217 static void CloseFilter( vlc_object_t *p_this )
2218 {
2219     filter_t *p_filter = (filter_t*)p_this;
2220     vout_thread_t *p_vout = (vout_thread_t *)p_filter->p_sys;
2221
2222     Destroy( VLC_OBJECT(p_vout) );
2223     vlc_object_detach( p_vout );
2224     vlc_object_release( p_vout );
2225 }
2226