1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.10 2001/08/22 17:21:45 massiot Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define MODULE_NAME motionmmx
26 #include "modules_inner.h"
28 /*****************************************************************************
30 *****************************************************************************/
33 #include <stdlib.h> /* malloc(), free() */
36 #include "common.h" /* boolean_t, byte_t */
44 #include "modules_export.h"
46 /*****************************************************************************
47 * Local and extern prototypes.
48 *****************************************************************************/
49 static void motion_getfunctions( function_list_t * p_function_list );
51 /*****************************************************************************
52 * Build configuration tree.
53 *****************************************************************************/
/* Configuration window shown by the interface: nothing user-tunable yet. */
ADD_WINDOW( "Configuration for MMX motion compensation module" )
ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
/* Declare what this plugin provides to the module bank. */
p_module->i_capabilities = MODULE_CAPABILITY_NULL
| MODULE_CAPABILITY_MOTION;
p_module->psz_longname = "MMX motion compensation module";
/* Fill in the motion-compensation entry points (defined below). */
motion_getfunctions( &p_module->p_functions->motion );
/* No state is allocated at activation, so deactivation is empty. */
MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
72 /*****************************************************************************
* motion_Probe: probes the CPU and returns a score
74 *****************************************************************************/
/* motion_Probe: decide whether this module can run and how strongly it
 * should be preferred.  Requires MMX support in the CPU; an explicit
 * user request via the MOTION_METHOD_VAR variable wins outright.
 * NOTE(review): the braces and the returned score values are not visible
 * in this chunk — confirm against the full file. */
static int motion_Probe( probedata_t *p_data )
/* Bail out when the CPU has no MMX capability. */
if( !TestCPU( CPU_CAPABILITY_MMX ) )
/* The user explicitly asked for this module (either spelling). */
if( TestMethod( MOTION_METHOD_VAR, "motionmmx" )
|| TestMethod( MOTION_METHOD_VAR, "mmx" ) )
91 /*****************************************************************************
92 * Motion compensation in MMX
93 *****************************************************************************/
// some rounding constants
// round1: four packed 16-bit words of 0x0001 — added before a >>1 so that
// (a + b + 1) / 2 rounds to nearest instead of truncating
mmx_t round1 = {0x0001000100010001LL};
// round4: four packed 16-bit words of 0x0002 — added before a >>2 so that
// (a + b + c + d + 2) / 4 rounds to nearest
mmx_t round4 = {0x0002000200020002LL};
100 * This code should probably be compiled with loop unrolling
* (ie, -funroll-loops in gcc) because some of the loops
102 * use a small static number of iterations. This was written
103 * with the assumption the compiler knows best about when
104 * unrolling will help
107 static __inline__ void mmx_zero_reg ()
/* mmx_average_2_U8: rounded byte-wise average of two 8-byte vectors.
 * For each of the 8 unsigned bytes: *dest = (*src1 + *src2 + 1) / 2.
 * Bytes are widened to 16-bit words (via unpack against mm0) so the sum
 * cannot overflow before the shift.  Assumes mm0 == 0, i.e. that
 * mmx_zero_reg() was called beforehand. */
static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2)
// *dest = (*src1 + *src2 + 1)/ 2;
movq_m2r (*src1, mm1);        // load 8 src1 bytes
movq_r2r (mm1, mm2);          // copy 8 src1 bytes
movq_m2r (*src2, mm3);        // load 8 src2 bytes
movq_r2r (mm3, mm4);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes
paddw_r2r (mm3, mm1);         // add lows to mm1
paddw_m2r (round1, mm1);      // +1 per word for round-to-nearest
psraw_i2r (1, mm1);           // /2
paddw_r2r (mm4, mm2);         // add highs to mm2
paddw_m2r (round1, mm2);      // +1 per word for round-to-nearest
psraw_i2r (1, mm2);           // /2
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1, *dest);        // store result in dest
/* mmx_interp_average_2_U8: average of two sources, then averaged again
 * with the existing dest bytes (used for "avg" prediction on half-pel
 * positions).  Per byte: *dest = (*dest + (*src1 + *src2 + 1)/2 + 1)/2.
 * Assumes mm0 == 0 (mmx_zero_reg() called beforehand). */
static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2)
// *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
movq_m2r (*dest, mm1);        // load 8 dest bytes
movq_r2r (mm1, mm2);          // copy 8 dest bytes
movq_m2r (*src1, mm3);        // load 8 src1 bytes
movq_r2r (mm3, mm4);          // copy 8 src1 bytes
movq_m2r (*src2, mm5);        // load 8 src2 bytes
movq_r2r (mm5, mm6);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes
punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes
paddw_r2r (mm5, mm3);         // add lows (src1 + src2)
paddw_m2r (round1, mm3);      // round
psraw_i2r (1, mm3);           // /2
paddw_r2r (mm6, mm4);         // add highs (src1 + src2)
paddw_m2r (round1, mm4);      // round
psraw_i2r (1, mm4);           // /2
paddw_r2r (mm3, mm1);         // add lows (dest + interp)
paddw_m2r (round1, mm1);      // round
psraw_i2r (1, mm1);           // /2
paddw_r2r (mm4, mm2);         // add highs (dest + interp)
paddw_m2r (round1, mm2);      // round
psraw_i2r (1, mm2);           // /2
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1, *dest);        // store result in dest
/* mmx_average_4_U8: rounded byte-wise average of four 8-byte vectors
 * (used for x+y half-pel interpolation).  Per byte:
 * *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4.
 * The 16-bit word accumulators cannot overflow: 4 * 255 + 2 < 65536.
 * Assumes mm0 == 0 (mmx_zero_reg() called beforehand). */
static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2,
yuv_data_t * src3, yuv_data_t * src4)
// *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
movq_m2r (*src1, mm1);        // load 8 src1 bytes
movq_r2r (mm1, mm2);          // copy 8 src1 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes
movq_m2r (*src2, mm3);        // load 8 src2 bytes
movq_r2r (mm3, mm4);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
// now have partials in mm1 and mm2
movq_m2r (*src3, mm3);        // load 8 src3 bytes
movq_r2r (mm3, mm4);          // copy 8 src3 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
movq_m2r (*src4, mm5);        // load 8 src4 bytes
movq_r2r (mm5, mm6);          // copy 8 src4 bytes
punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes
paddw_r2r (mm5, mm1);         // add lows
paddw_r2r (mm6, mm2);         // add highs
// now have subtotal in mm1 and mm2
paddw_m2r (round4, mm1);      // +2 per word for round-to-nearest
psraw_i2r (2, mm1);           // /4
paddw_m2r (round4, mm2);      // +2 per word for round-to-nearest
psraw_i2r (2, mm2);           // /4
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1, *dest);        // store result in dest
/* mmx_interp_average_4_U8: four-way rounded average, then averaged with
 * the existing dest bytes ("avg" prediction on an x+y half-pel position).
 * Per byte: *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2.
 * Assumes mm0 == 0 (mmx_zero_reg() called beforehand). */
static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2,
yuv_data_t * src3, yuv_data_t * src4)
// *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
movq_m2r (*src1, mm1);        // load 8 src1 bytes
movq_r2r (mm1, mm2);          // copy 8 src1 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes
movq_m2r (*src2, mm3);        // load 8 src2 bytes
movq_r2r (mm3, mm4);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
// now have partials in mm1 and mm2
movq_m2r (*src3, mm3);        // load 8 src3 bytes
movq_r2r (mm3, mm4);          // copy 8 src3 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
movq_m2r (*src4, mm5);        // load 8 src4 bytes
movq_r2r (mm5, mm6);          // copy 8 src4 bytes
punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes
paddw_r2r (mm5, mm1);         // add lows
paddw_r2r (mm6, mm2);         // add highs
paddw_m2r (round4, mm1);      // +2 per word for round-to-nearest
psraw_i2r (2, mm1);           // /4
paddw_m2r (round4, mm2);      // +2 per word for round-to-nearest
psraw_i2r (2, mm2);           // /4
// now have subtotal/4 in mm1 and mm2
movq_m2r (*dest, mm3);        // load 8 dest bytes
movq_r2r (mm3, mm4);          // copy 8 dest bytes
punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
paddw_m2r (round1, mm1);      // +1 per word for round-to-nearest
psraw_i2r (1, mm1);           // /2
paddw_m2r (round1, mm2);      // +1 per word for round-to-nearest
psraw_i2r (1, mm2);           // /2
// now have end value in mm1 and mm2
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1,*dest);         // store result in dest
313 //-----------------------------------------------------------------------
/* MC_avg_mmx: "average" prediction for a width x height block —
 * dest bytes become the rounded average of dest and ref, 8 bytes at a
 * time; the second call covers bytes 8..15 of a 16-wide row.
 * NOTE(review): the per-row loop, the width==16 guard and the
 * dest/ref += stride advances are not visible in this chunk — confirm. */
static __inline__ void MC_avg_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
mmx_average_2_U8 (dest, dest, ref);
mmx_average_2_U8 (dest+8, dest+8, ref+8);
/* MC_avg_16_mmx: exported wrapper — average-predict a 16-wide block. */
static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_mmx (16, height, dest, ref, stride);
/* MC_avg_8_mmx: exported wrapper — average-predict an 8-wide block. */
static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_mmx (8, height, dest, ref, stride);
343 //-----------------------------------------------------------------------
/* MC_put_mmx: straight copy prediction — copy a width x height block
 * from ref to dest, 8 bytes per movq; the second pair of instructions
 * covers bytes 8..15 of a 16-wide row.
 * NOTE(review): the row loop, the width==16 guard and the stride
 * advances are not visible in this chunk — confirm. */
static __inline__ void MC_put_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
movq_m2r (* ref, mm1);        // load 8 ref bytes
movq_r2m (mm1,* dest);        // store 8 bytes at curr
movq_m2r (* (ref+8), mm1);    // load 8 ref bytes
movq_r2m (mm1,* (dest+8));    // store 8 bytes at curr
/* MC_put_16_mmx: exported wrapper — copy a 16-wide block. */
static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_mmx (16, height, dest, ref, stride);
/* MC_put_8_mmx: exported wrapper — copy an 8-wide block. */
static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_mmx (8, height, dest, ref, stride);
377 //-----------------------------------------------------------------------
379 // Half pixel interpolation in the x direction
// Half pixel interpolation in the x direction
/* MC_avg_x_mmx: "average" prediction at an x half-pel position —
 * dest is averaged with the mean of each ref byte and its right
 * neighbour (ref+1).  NOTE(review): row loop / width==16 guard not
 * visible in this chunk — confirm. */
static __inline__ void MC_avg_x_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
mmx_interp_average_2_U8 (dest, ref, ref+1);
mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
/* MC_avg_x16_mmx: exported wrapper — x half-pel avg, 16-wide block. */
static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_x_mmx (16, height, dest, ref, stride);
/* MC_avg_x8_mmx: exported wrapper — x half-pel avg, 8-wide block. */
static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_x_mmx (8, height, dest, ref, stride);
408 //-----------------------------------------------------------------------
/* MC_put_x_mmx: copy prediction at an x half-pel position — each dest
 * byte is the rounded mean of a ref byte and its right neighbour.
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_put_x_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
mmx_average_2_U8 (dest, ref, ref+1);
mmx_average_2_U8 (dest+8, ref+8, ref+9);
/* MC_put_x16_mmx: exported wrapper — x half-pel copy, 16-wide block. */
static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_x_mmx (16, height, dest, ref, stride);
/* MC_put_x8_mmx: exported wrapper — x half-pel copy, 8-wide block. */
static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_x_mmx (8, height, dest, ref, stride);
438 //-----------------------------------------------------------------------
/* MC_avg_xy_mmx: "average" prediction at an x+y half-pel position —
 * dest is averaged with the 4-way mean of ref, ref+1 and the two
 * corresponding bytes on the next line (ref_next, ref_next+1).
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_avg_xy_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
ref_next+8, ref_next+9);
/* MC_avg_xy16_mmx: exported wrapper — xy half-pel avg, 16-wide block. */
static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_xy_mmx (16, height, dest, ref, stride);
/* MC_avg_xy8_mmx: exported wrapper — xy half-pel avg, 8-wide block. */
static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_xy_mmx (8, height, dest, ref, stride);
472 //-----------------------------------------------------------------------
/* MC_put_xy_mmx: copy prediction at an x+y half-pel position — each
 * dest byte is the rounded 4-way mean of ref, ref+1 and the two bytes
 * below them on the next line.
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_put_xy_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
/* MC_put_xy16_mmx: exported wrapper — xy half-pel copy, 16-wide block. */
static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_xy_mmx (16, height, dest, ref, stride);
/* MC_put_xy8_mmx: exported wrapper — xy half-pel copy, 8-wide block. */
static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_xy_mmx (8, height, dest, ref, stride);
505 //-----------------------------------------------------------------------
/* MC_avg_y_mmx: "average" prediction at a y half-pel position — dest is
 * averaged with the mean of each ref byte and the byte directly below
 * it (ref_next).
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_avg_y_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_interp_average_2_U8 (dest, ref, ref_next);
mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
/* MC_avg_y16_mmx: exported wrapper — y half-pel avg, 16-wide block. */
static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_y_mmx (16, height, dest, ref, stride);
/* MC_avg_y8_mmx: exported wrapper — y half-pel avg, 8-wide block. */
static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_y_mmx (8, height, dest, ref, stride);
538 //-----------------------------------------------------------------------
/* MC_put_y_mmx: copy prediction at a y half-pel position — each dest
 * byte is the rounded mean of a ref byte and the byte directly below it.
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_put_y_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_average_2_U8 (dest, ref, ref_next);
mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
/* MC_put_y16_mmx: exported wrapper — y half-pel copy, 16-wide block. */
static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_y_mmx (16, height, dest, ref, stride);
/* MC_put_y8_mmx: exported wrapper — y half-pel copy, 8-wide block. */
static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_y_mmx (8, height, dest, ref, stride);
572 /*****************************************************************************
573 * Functions exported as capabilities. They are declared as static so that
574 * we don't pollute the namespace too much.
575 *****************************************************************************/
/* motion_getfunctions: export this module's capabilities — the probe
 * callback and the table of 2*2*4 = 16 motion-compensation routines.
 * Judging from the initializer order, ppppf_motion is indexed as
 * [put/avg][16-wide / 8-wide][full-pel, x, y, xy half-pel] — confirm
 * against the function_list_t declaration. */
static void motion_getfunctions( function_list_t * p_function_list )
static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
/* Copying functions */
MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
/* Averaging functions */
MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
p_function_list->pf_probe = motion_Probe;
#define list p_function_list->functions.motion
/* Copy all 16 function pointers into the exported table. */
memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );