git.sesse.net Git - vlc/blob - plugins/motion/motionmmx.c

   1 /*****************************************************************************
   2  * motionmmx.c : MMX motion compensation module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2001 VideoLAN
   5  * $Id: motionmmx.c,v 1.11 2001/09/06 14:02:56 massiot Exp $
   6  *
   7  * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
   8  *          Michel Lespinasse <walken@zoy.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define MODULE_NAME motionmmx
  26 #include "modules_inner.h"
  27
  28 /*****************************************************************************
  29  * Preamble
  30  *****************************************************************************/
  31 #include "defs.h"
  32
  33 #include <stdlib.h>                                      /* malloc(), free() */
  34 #include <string.h>
  35
  36 #include "config.h"
  37 #include "common.h"                                     /* boolean_t, byte_t */
  38 #include "threads.h"
  39 #include "mtime.h"
  40 #include "tests.h"
  41
  42 #include "mmx.h"
  43
  44 #include "modules.h"
  45 #include "modules_export.h"
  46
  47 /*****************************************************************************
  48  * Local and extern prototypes.
  49  *****************************************************************************/
  50 static void motion_getfunctions( function_list_t * p_function_list );
  51
  52 /*****************************************************************************
  53  * Build configuration tree.
  54  *****************************************************************************/
  /* Configuration tree of this module: one window that contains a single
   * comment entry and no settings.  The MODULE_CONFIG_* / ADD_* macros are
   * not defined in this file (presumably modules_inner.h -- TODO confirm). */
  55 MODULE_CONFIG_START
  56 ADD_WINDOW( "Configuration for MMX motion compensation module" )
  57     ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
  58 MODULE_CONFIG_STOP
  59
  /* Module initialisation: sets the capability mask to include
   * MODULE_CAPABILITY_MOTION and sets the long module name.
   * p_module is not declared in this file; it is presumably provided by the
   * MODULE_INIT_START expansion -- TODO confirm. */
  60 MODULE_INIT_START
  61     p_module->i_capabilities = MODULE_CAPABILITY_NULL
  62                                 | MODULE_CAPABILITY_MOTION;
  63     p_module->psz_longname = "MMX motion compensation module";
  64 MODULE_INIT_STOP
  65
  /* Module activation: hands the module's motion function list to
   * motion_getfunctions(), which stores the probe callback and the table of
   * MMX routines in it. */
  66 MODULE_ACTIVATE_START
  67     motion_getfunctions( &p_module->p_functions->motion );
  68 MODULE_ACTIVATE_STOP
  69
  /* Module deactivation: no statements are placed between the two macros. */
  70 MODULE_DEACTIVATE_START
  71 MODULE_DEACTIVATE_STOP
  72
  73 /*****************************************************************************
  74  * motion_Probe: probes the CPU and returns a score
  75  *****************************************************************************/
  76 static int motion_Probe( probedata_t *p_data )
  77 {
  78     if( !TestCPU( CPU_CAPABILITY_MMX ) )
  79     {
  80         return( 0 );
  81     }
  82
  83     if( TestMethod( MOTION_METHOD_VAR, "motionmmx" )
  84          || TestMethod( MOTION_METHOD_VAR, "mmx" ) )
  85     {
  86         return( 999 );
  87     }
  88
  89     return( 150 );
  90 }
  91
  92 /*****************************************************************************
  93  * Motion compensation in MMX
  94  *****************************************************************************/
  95
  96 // Rounding constants, one copy of the value in each of the four 16-bit words.
  // round1 (1 per word) is added before the shift right by 1 (divide by 2);
  // round4 (2 per word) is added before the shift right by 2 (divide by 4).
  // NOTE(review): neither object is declared static, so both names have
  // external linkage -- confirm whether anything outside this file needs them.
  97 mmx_t round1 = {0x0001000100010001LL};
  98 mmx_t round4 = {0x0002000200020002LL};
  99
 100 /*
 101  * This code should probably be compiled with loop unrolling
 102  * (ie, -funroll-loops in gcc) because some of the loops
 103  * use a small static number of iterations. This was written
 104  * with the assumption the compiler knows best about when
 105  * unrolling will help
 106  */
 107
 108 static __inline__ void mmx_zero_reg ()
 109 {
 110     // load 0 into mm0
 111     pxor_r2r (mm0, mm0);
 112 }
 113
 /*
  * Rounded average of two 8-byte groups, byte by byte:
  *     dest = (src1 + src2 + 1) / 2
  * Bytes are widened to 16-bit words by unpacking against mm0, so mm0 must
  * already be zero (see mmx_zero_reg).  mm1-mm4 are overwritten.
  * Both sources are loaded before the store, so dest may be the same address
  * as a source (MC_avg_mmx passes dest as src1).
  * NOTE(review): "8 bytes" assumes yuv_data_t is one byte wide -- confirm
  * against its declaration.
  */
 114 static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
 115                                      yuv_data_t * src1, yuv_data_t * src2)
 116 {
 117     //
 118     // *dest = (*src1 + *src2 + 1)/ 2;
 119     //
 120
 121     movq_m2r (*src1, mm1);        // load 8 src1 bytes
 122     movq_r2r (mm1, mm2);        // copy 8 src1 bytes
 123
 124     movq_m2r (*src2, mm3);        // load 8 src2 bytes
 125     movq_r2r (mm3, mm4);        // copy 8 src2 bytes
 126
 127     punpcklbw_r2r (mm0, mm1);        // unpack low src1 bytes
 128     punpckhbw_r2r (mm0, mm2);        // unpack high src1 bytes
 129
 130     punpcklbw_r2r (mm0, mm3);        // unpack low src2 bytes
 131     punpckhbw_r2r (mm0, mm4);        // unpack high src2 bytes
 132
 133     paddw_r2r (mm3, mm1);        // add lows to mm1
 134     paddw_m2r (round1, mm1);        // +1 per word so the shift rounds
 135     psraw_i2r (1, mm1);                // /2
 136
 137     paddw_r2r (mm4, mm2);        // add highs to mm2
 138     paddw_m2r (round1, mm2);        // +1 per word so the shift rounds
 139     psraw_i2r (1, mm2);                // /2
 140
 141     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 142     movq_r2m (mm1, *dest);        // store result in dest
 143 }
 144
 /*
  * Two-stage rounded average over 8-byte groups, byte by byte:
  *     dest = (dest + (src1 + src2 + 1) / 2 + 1) / 2
  * i.e. the two sources are averaged first, then that result is averaged
  * with the bytes already at dest.
  * mm0 must already be zero (bytes are unpacked against it); mm1-mm6 are
  * overwritten.  dest is read at the start and written at the very end.
  */
 145 static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
 146                                             yuv_data_t * src1, yuv_data_t * src2)
 147 {
 148     //
 149     // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
 150     //
 151
 152     movq_m2r (*dest, mm1);        // load 8 dest bytes
 153     movq_r2r (mm1, mm2);        // copy 8 dest bytes
 154
 155     movq_m2r (*src1, mm3);        // load 8 src1 bytes
 156     movq_r2r (mm3, mm4);        // copy 8 src1 bytes
 157
 158     movq_m2r (*src2, mm5);        // load 8 src2 bytes
 159     movq_r2r (mm5, mm6);        // copy 8 src2 bytes
 160
 161     punpcklbw_r2r (mm0, mm1);        // unpack low dest bytes
 162     punpckhbw_r2r (mm0, mm2);        // unpack high dest bytes
 163
 164     punpcklbw_r2r (mm0, mm3);        // unpack low src1 bytes
 165     punpckhbw_r2r (mm0, mm4);        // unpack high src1 bytes
 166
 167     punpcklbw_r2r (mm0, mm5);        // unpack low src2 bytes
 168     punpckhbw_r2r (mm0, mm6);        // unpack high src2 bytes
 169
          // first stage: (src1 + src2 + 1) / 2, lows in mm3, highs in mm4
 170     paddw_r2r (mm5, mm3);        // add lows
 171     paddw_m2r (round1, mm3);
 172     psraw_i2r (1, mm3);                // /2
 173
 174     paddw_r2r (mm6, mm4);        // add highs
 175     paddw_m2r (round1, mm4);
 176     psraw_i2r (1, mm4);                // /2
 177
          // second stage: (dest + first stage + 1) / 2, lows in mm1, highs in mm2
 178     paddw_r2r (mm3, mm1);        // add lows
 179     paddw_m2r (round1, mm1);
 180     psraw_i2r (1, mm1);                // /2
 181
 182     paddw_r2r (mm4, mm2);        // add highs
 183     paddw_m2r (round1, mm2);
 184     psraw_i2r (1, mm2);                // /2
 185
 186     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 187     movq_r2m (mm1, *dest);        // store result in dest
 188 }
 189
 /*
  * Rounded average of four 8-byte groups, byte by byte:
  *     dest = (src1 + src2 + src3 + src4 + 2) / 4
  * The four sources are accumulated as 16-bit words (lows in mm1, highs in
  * mm2), then rounded with round4 and shifted right by 2.
  * mm0 must already be zero (bytes are unpacked against it); mm1-mm6 are
  * overwritten.  dest is only written, as the last step.
  */
 190 static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
 191                                      yuv_data_t * src1, yuv_data_t * src2,
 192                                      yuv_data_t * src3, yuv_data_t * src4)
 193 {
 194     //
 195     // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
 196     //
 197
 198     movq_m2r (*src1, mm1);        // load 8 src1 bytes
 199     movq_r2r (mm1, mm2);        // copy 8 src1 bytes
 200
 201     punpcklbw_r2r (mm0, mm1);        // unpack low src1 bytes
 202     punpckhbw_r2r (mm0, mm2);        // unpack high src1 bytes
 203
 204     movq_m2r (*src2, mm3);        // load 8 src2 bytes
 205     movq_r2r (mm3, mm4);        // copy 8 src2 bytes
 206
 207     punpcklbw_r2r (mm0, mm3);        // unpack low src2 bytes
 208     punpckhbw_r2r (mm0, mm4);        // unpack high src2 bytes
 209
 210     paddw_r2r (mm3, mm1);        // add lows
 211     paddw_r2r (mm4, mm2);        // add highs
 212
 213     // now have partials in mm1 and mm2
 214
 215     movq_m2r (*src3, mm3);        // load 8 src3 bytes
 216     movq_r2r (mm3, mm4);        // copy 8 src3 bytes
 217
 218     punpcklbw_r2r (mm0, mm3);        // unpack low src3 bytes
 219     punpckhbw_r2r (mm0, mm4);        // unpack high src3 bytes
 220
 221     paddw_r2r (mm3, mm1);        // add lows
 222     paddw_r2r (mm4, mm2);        // add highs
 223
 224     movq_m2r (*src4, mm5);        // load 8 src4 bytes
 225     movq_r2r (mm5, mm6);        // copy 8 src4 bytes
 226
 227     punpcklbw_r2r (mm0, mm5);        // unpack low src4 bytes
 228     punpckhbw_r2r (mm0, mm6);        // unpack high src4 bytes
 229
 230     paddw_r2r (mm5, mm1);        // add lows
 231     paddw_r2r (mm6, mm2);        // add highs
 232
 233     // now have subtotal in mm1 and mm2
 234
 235     paddw_m2r (round4, mm1);        // +2 per word so the shift rounds
 236     psraw_i2r (2, mm1);                // /4
 237     paddw_m2r (round4, mm2);        // +2 per word so the shift rounds
 238     psraw_i2r (2, mm2);                // /4
 239
 240     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 241     movq_r2m (mm1, *dest);        // store result in dest
 242 }
 243
 /*
  * Two-stage rounded average over 8-byte groups, byte by byte:
  *     dest = (dest + (src1 + src2 + src3 + src4 + 2) / 4 + 1) / 2
  * The four sources are summed and divided by 4 first; dest is loaded only
  * afterwards, averaged with that result, and then overwritten.
  * mm0 must already be zero (bytes are unpacked against it); mm1-mm6 are
  * overwritten.
  */
 244 static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
 245                                             yuv_data_t * src1, yuv_data_t * src2,
 246                                             yuv_data_t * src3, yuv_data_t * src4)
 247 {
 248     //
 249     // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
 250     //
 251
 252     movq_m2r (*src1, mm1);        // load 8 src1 bytes
 253     movq_r2r (mm1, mm2);        // copy 8 src1 bytes
 254
 255     punpcklbw_r2r (mm0, mm1);        // unpack low src1 bytes
 256     punpckhbw_r2r (mm0, mm2);        // unpack high src1 bytes
 257
 258     movq_m2r (*src2, mm3);        // load 8 src2 bytes
 259     movq_r2r (mm3, mm4);        // copy 8 src2 bytes
 260
 261     punpcklbw_r2r (mm0, mm3);        // unpack low src2 bytes
 262     punpckhbw_r2r (mm0, mm4);        // unpack high src2 bytes
 263
 264     paddw_r2r (mm3, mm1);        // add lows
 265     paddw_r2r (mm4, mm2);        // add highs
 266
 267     // now have partials in mm1 and mm2
 268
 269     movq_m2r (*src3, mm3);        // load 8 src3 bytes
 270     movq_r2r (mm3, mm4);        // copy 8 src3 bytes
 271
 272     punpcklbw_r2r (mm0, mm3);        // unpack low src3 bytes
 273     punpckhbw_r2r (mm0, mm4);        // unpack high src3 bytes
 274
 275     paddw_r2r (mm3, mm1);        // add lows
 276     paddw_r2r (mm4, mm2);        // add highs
 277
 278     movq_m2r (*src4, mm5);        // load 8 src4 bytes
 279     movq_r2r (mm5, mm6);        // copy 8 src4 bytes
 280
 281     punpcklbw_r2r (mm0, mm5);        // unpack low src4 bytes
 282     punpckhbw_r2r (mm0, mm6);        // unpack high src4 bytes
 283
 284     paddw_r2r (mm5, mm1);        // add lows
 285     paddw_r2r (mm6, mm2);        // add highs
 286
 287     paddw_m2r (round4, mm1);        // +2 per word so the shift rounds
 288     psraw_i2r (2, mm1);                // /4
 289     paddw_m2r (round4, mm2);        // +2 per word so the shift rounds
 290     psraw_i2r (2, mm2);                // /4
 291
 292     // now have subtotal/4 in mm1 and mm2
 293
 294     movq_m2r (*dest, mm3);        // load 8 dest bytes
 295     movq_r2r (mm3, mm4);        // copy 8 dest bytes
 296
 297     punpcklbw_r2r (mm0, mm3);        // unpack low dest bytes
 298     punpckhbw_r2r (mm0, mm4);        // unpack high dest bytes
 299
 300     paddw_r2r (mm3, mm1);        // add lows
 301     paddw_r2r (mm4, mm2);        // add highs
 302
 303     paddw_m2r (round1, mm1);        // +1 per word so the shift rounds
 304     psraw_i2r (1, mm1);                // /2
 305     paddw_m2r (round1, mm2);        // +1 per word so the shift rounds
 306     psraw_i2r (1, mm2);                // /2
 307
 308     // now have end value in mm1 and mm2
 309
 310     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 311     movq_r2m (mm1,*dest);        // store result in dest
 312 }
 313
 314 //-----------------------------------------------------------------------
 315
 316 static __inline__ void MC_avg_mmx (int width, int height,
 317                                yuv_data_t * dest, yuv_data_t * ref, int stride)
 318 {
 319     mmx_zero_reg ();
 320
 321     do {
 322         mmx_average_2_U8 (dest, dest, ref);
 323
 324         if (width == 16)
 325             mmx_average_2_U8 (dest+8, dest+8, ref+8);
 326
 327         dest += stride;
 328         ref += stride;
 329     } while (--height);
 330 }
 331
 /* Averaging, no half-pel offset, width 16: forwards to MC_avg_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 332 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 333                            int stride, int height)
 334 {
 335     MC_avg_mmx (16, height, dest, ref, stride);
 336 }
 337
 /* Averaging, no half-pel offset, width 8: forwards to MC_avg_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 338 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 339                           int stride, int height)
 340 {
 341     MC_avg_mmx (8, height, dest, ref, stride);
 342 }
 343
 344 //-----------------------------------------------------------------------
 345
 346 static __inline__ void MC_put_mmx (int width, int height,
 347                                yuv_data_t * dest, yuv_data_t * ref, int stride)
 348 {
 349     mmx_zero_reg ();
 350
 351     do {
 352         movq_m2r (* ref, mm1);        // load 8 ref bytes
 353         movq_r2m (mm1,* dest);        // store 8 bytes at curr
 354
 355         if (width == 16)
 356             {
 357                 movq_m2r (* (ref+8), mm1);        // load 8 ref bytes
 358                 movq_r2m (mm1,* (dest+8));        // store 8 bytes at curr
 359             }
 360
 361         dest += stride;
 362         ref += stride;
 363     } while (--height);
 364 }
 365
 /* Copy, no half-pel offset, width 16: forwards to MC_put_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 366 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 367                            int stride, int height)
 368 {
 369     MC_put_mmx (16, height, dest, ref, stride);
 370 }
 371
 /* Copy, no half-pel offset, width 8: forwards to MC_put_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 372 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 373                           int stride, int height)
 374 {
 375     MC_put_mmx (8, height, dest, ref, stride);
 376 }
 377
 378 //-----------------------------------------------------------------------
 379
 380 // Half pixel interpolation in the x direction
 381 static __inline__ void MC_avg_x_mmx (int width, int height,
 382                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 383 {
 384     mmx_zero_reg ();
 385
 386     do {
 387         mmx_interp_average_2_U8 (dest, ref, ref+1);
 388
 389         if (width == 16)
 390             mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
 391
 392         dest += stride;
 393         ref += stride;
 394     } while (--height);
 395 }
 396
 /* Averaging, half-pel in x, width 16: forwards to MC_avg_x_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 397 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 398                             int stride, int height)
 399 {
 400     MC_avg_x_mmx (16, height, dest, ref, stride);
 401 }
 402
 /* Averaging, half-pel in x, width 8: forwards to MC_avg_x_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 403 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 404                            int stride, int height)
 405 {
 406     MC_avg_x_mmx (8, height, dest, ref, stride);
 407 }
 408
 409 //-----------------------------------------------------------------------
 410
 411 static __inline__ void MC_put_x_mmx (int width, int height,
 412                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 413 {
 414     mmx_zero_reg ();
 415
 416     do {
 417         mmx_average_2_U8 (dest, ref, ref+1);
 418
 419         if (width == 16)
 420             mmx_average_2_U8 (dest+8, ref+8, ref+9);
 421
 422         dest += stride;
 423         ref += stride;
 424     } while (--height);
 425 }
 426
 /* Copy, half-pel in x, width 16: forwards to MC_put_x_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 427 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 428                             int stride, int height)
 429 {
 430     MC_put_x_mmx (16, height, dest, ref, stride);
 431 }
 432
 /* Copy, half-pel in x, width 8: forwards to MC_put_x_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 433 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 434                            int stride, int height)
 435 {
 436     MC_put_x_mmx (8, height, dest, ref, stride);
 437 }
 438
 439 //-----------------------------------------------------------------------
 440
 441 static __inline__ void MC_avg_xy_mmx (int width, int height,
 442                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
 443 {
 444     yuv_data_t * ref_next = ref+stride;
 445
 446     mmx_zero_reg ();
 447
 448     do {
 449         mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
 450
 451         if (width == 16)
 452             mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
 453                                      ref_next+8, ref_next+9);
 454
 455         dest += stride;
 456         ref += stride;
 457         ref_next += stride;
 458     } while (--height);
 459 }
 460
 /* Averaging, half-pel in x and y, width 16: forwards to MC_avg_xy_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 461 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 462                              int stride, int height)
 463 {
 464     MC_avg_xy_mmx (16, height, dest, ref, stride);
 465 }
 466
 /* Averaging, half-pel in x and y, width 8: forwards to MC_avg_xy_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 467 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 468                             int stride, int height)
 469 {
 470     MC_avg_xy_mmx (8, height, dest, ref, stride);
 471 }
 472
 473 //-----------------------------------------------------------------------
 474
 475 static __inline__ void MC_put_xy_mmx (int width, int height,
 476                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
 477 {
 478     yuv_data_t * ref_next = ref+stride;
 479
 480     mmx_zero_reg ();
 481
 482     do {
 483         mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
 484
 485         if (width == 16)
 486             mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
 487
 488         dest += stride;
 489         ref += stride;
 490         ref_next += stride;
 491     } while (--height);
 492 }
 493
 /* Copy, half-pel in x and y, width 16: forwards to MC_put_xy_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 494 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 495                              int stride, int height)
 496 {
 497     MC_put_xy_mmx (16, height, dest, ref, stride);
 498 }
 499
 /* Copy, half-pel in x and y, width 8: forwards to MC_put_xy_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 500 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 501                             int stride, int height)
 502 {
 503     MC_put_xy_mmx (8, height, dest, ref, stride);
 504 }
 505
 506 //-----------------------------------------------------------------------
 507
 508 static __inline__ void MC_avg_y_mmx (int width, int height,
 509                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 510 {
 511     yuv_data_t * ref_next = ref+stride;
 512
 513     mmx_zero_reg ();
 514
 515     do {
 516         mmx_interp_average_2_U8 (dest, ref, ref_next);
 517
 518         if (width == 16)
 519             mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
 520
 521         dest += stride;
 522         ref += stride;
 523         ref_next += stride;
 524     } while (--height);
 525 }
 526
 /* Averaging, half-pel in y, width 16: forwards to MC_avg_y_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 527 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 528                             int stride, int height)
 529 {
 530     MC_avg_y_mmx (16, height, dest, ref, stride);
 531 }
 532
 /* Averaging, half-pel in y, width 8: forwards to MC_avg_y_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 533 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 534                            int stride, int height)
 535 {
 536     MC_avg_y_mmx (8, height, dest, ref, stride);
 537 }
 538
 539 //-----------------------------------------------------------------------
 540
 541 static __inline__ void MC_put_y_mmx (int width, int height,
 542                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 543 {
 544     yuv_data_t * ref_next = ref+stride;
 545
 546     mmx_zero_reg ();
 547
 548     do {
 549         mmx_average_2_U8 (dest, ref, ref_next);
 550
 551         if (width == 16)
 552             mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
 553
 554         dest += stride;
 555         ref += stride;
 556         ref_next += stride;
 557     } while (--height);
 558 }
 559
 /* Copy, half-pel in y, width 16: forwards to MC_put_y_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 560 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 561                             int stride, int height)
 562 {
 563     MC_put_y_mmx (16, height, dest, ref, stride);
 564 }
 565
 /* Copy, half-pel in y, width 8: forwards to MC_put_y_mmx.
  * Takes (dest, ref, stride, height) so it fits the ppppf_motion table. */
 566 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 567                            int stride, int height)
 568 {
 569     MC_put_y_mmx (8, height, dest, ref, stride);
 570 }
 571
 572
 573 /*****************************************************************************
 574  * Functions exported as capabilities. They are declared as static so that
 575  * we don't pollute the namespace too much.
 576  *****************************************************************************/
 577 static void motion_getfunctions( function_list_t * p_function_list )
 578 {
 579     static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
 580                                            int, int ) =
 581     {
 582         {
 583             /* Copying functions */
 584             {
 585                 /* Width == 16 */
 586                 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
 587             },
 588             {
 589                 /* Width == 8 */
 590                 MC_put_8_mmx,  MC_put_x8_mmx,  MC_put_y8_mmx, MC_put_xy8_mmx
 591             }
 592         },
 593         {
 594             /* Averaging functions */
 595             {
 596                 /* Width == 16 */
 597                 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
 598             },
 599             {
 600                 /* Width == 8 */
 601                 MC_avg_8_mmx,  MC_avg_x8_mmx,  MC_avg_y8_mmx,  MC_avg_xy8_mmx
 602             }
 603         }
 604     };
 605
 606     p_function_list->pf_probe = motion_Probe;
 607
 608 #define list p_function_list->functions.motion
 609     memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );
 610 #undef list
 611
 612     return;
 613 }
 614