/*****************************************************************************
 * motionmmx.c : MMX motion compensation module for vlc
 *****************************************************************************
 * Copyright (C) 2001 VideoLAN
 * $Id: motionmmx.c,v 1.12 2001/11/28 15:08:05 massiot Exp $
 *
 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *          Michel Lespinasse <walken@zoy.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
#define MODULE_NAME motionmmx
#include "modules_inner.h"
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                              /* memcpy() */

#include "config.h"
#include "common.h"                                     /* boolean_t, byte_t */
#include "threads.h"
#include "mtime.h"
#include "tests.h"                                /* TestCPU(), TestMethod() */

#include "video.h"                                             /* yuv_data_t */

#include "mmx.h"

#include "modules.h"
#include "modules_export.h"
/*****************************************************************************
 * Local and extern prototypes.
 *****************************************************************************/
static void motion_getfunctions( function_list_t * p_function_list );
/*****************************************************************************
 * Build configuration tree.
 *****************************************************************************/
MODULE_CONFIG_START
ADD_WINDOW( "Configuration for MMX motion compensation module" )
  ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
MODULE_CONFIG_STOP

MODULE_INIT_START
    p_module->i_capabilities = MODULE_CAPABILITY_NULL
                                | MODULE_CAPABILITY_MOTION;
    p_module->psz_longname = "MMX motion compensation module";
MODULE_INIT_STOP

MODULE_ACTIVATE_START
    motion_getfunctions( &p_module->p_functions->motion );
MODULE_ACTIVATE_STOP

MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
/*****************************************************************************
 * motion_Probe: probe the CPU and return a score
 *****************************************************************************/
static int motion_Probe( probedata_t *p_data )
{
    if( !TestCPU( CPU_CAPABILITY_MMX ) )
    {
        return( 0 );
    }

    if( TestMethod( MOTION_METHOD_VAR, "motionmmx" )
         || TestMethod( MOTION_METHOD_VAR, "mmx" ) )
    {
        return( 999 );
    }

    return( 150 );
}
/*****************************************************************************
 * Motion compensation in MMX
 *****************************************************************************/

// some rounding constants
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc), because some of the loops
 * use a small, static number of iterations. It was written
 * on the assumption that the compiler knows best about when
 * unrolling will help.
 */
static __inline__ void mmx_zero_reg ()
{
    // load 0 into mm0
    pxor_r2r (mm0, mm0);
}
static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
                                         yuv_data_t * src1, yuv_data_t * src2)
{
    // *dest = (*src1 + *src2 + 1)/ 2;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
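
/*
 * For reference, a scalar sketch (illustrative helper, not part of the
 * original module) of what mmx_average_2_U8 computes for each of its
 * 8 bytes: adding the round1 bias before the arithmetic shift makes the
 * halving round half-up.
 */
static __inline__ void scalar_average_2_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (src1[i] + src2[i] + 1) >> 1;
}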
static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
                                                yuv_data_t * src1, yuv_data_t * src2)
{
    // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;

    movq_m2r (*dest, mm1);        // load 8 dest bytes
    movq_r2r (mm1, mm2);          // copy 8 dest bytes

    movq_m2r (*src1, mm3);        // load 8 src1 bytes
    movq_r2r (mm3, mm4);          // copy 8 src1 bytes

    movq_m2r (*src2, mm5);        // load 8 src2 bytes
    movq_r2r (mm5, mm6);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes

    paddw_r2r (mm5, mm3);         // add lows
    paddw_m2r (round1, mm3);
    psraw_i2r (1, mm3);           // /2

    paddw_r2r (mm6, mm4);         // add highs
    paddw_m2r (round1, mm4);
    psraw_i2r (1, mm4);           // /2

    paddw_r2r (mm3, mm1);         // add lows
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
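
/*
 * Likewise, an illustrative scalar equivalent (not in the original module)
 * of mmx_interp_average_2_U8: the src1/src2 average is rounded first, then
 * averaged into dest with a second round1 bias, matching the two psraw
 * stages above.
 */
static __inline__ void scalar_interp_average_2_U8 (yuv_data_t * dest,
                                                   yuv_data_t * src1,
                                                   yuv_data_t * src2)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (dest[i] + ((src1[i] + src2[i] + 1) >> 1) + 1) >> 1;
}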
static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
                                         yuv_data_t * src1, yuv_data_t * src2,
                                         yuv_data_t * src3, yuv_data_t * src4)
{
    // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
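
/*
 * Illustrative scalar equivalent (not in the original module) of
 * mmx_average_4_U8: the four sources sum to at most 4*255 + 2, which still
 * fits in a 16-bit word, so a single round4 bias and a shift by two give
 * a rounded four-way average.
 */
static __inline__ void scalar_average_4_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2,
                                            yuv_data_t * src3, yuv_data_t * src4)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
}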
static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
                                                yuv_data_t * src1, yuv_data_t * src2,
                                                yuv_data_t * src3, yuv_data_t * src4)
{
    // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r (*dest, mm3);        // load 8 dest bytes
    movq_r2r (mm3, mm4);          // copy 8 dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1,*dest);         // store result in dest
}
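
/*
 * Illustrative scalar equivalent (not in the original module) of
 * mmx_interp_average_4_U8: the rounded four-way source average is computed
 * first, then averaged into dest with the round1 bias, exactly as the two
 * shift stages above do eight bytes at a time.
 */
static __inline__ void scalar_interp_average_4_U8 (yuv_data_t * dest,
                                                   yuv_data_t * src1,
                                                   yuv_data_t * src2,
                                                   yuv_data_t * src3,
                                                   yuv_data_t * src4)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (dest[i]
                    + ((src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2)
                    + 1) >> 1;
}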
//-----------------------------------------------------------------------

static __inline__ void MC_avg_mmx (int width, int height,
                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_mmx (int width, int height,
                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    do {
        movq_m2r (* ref, mm1);          // load 8 ref bytes
        movq_r2m (mm1,* dest);          // store 8 bytes at curr

        if (width == 16)
        {
            movq_m2r (* (ref+8), mm1);      // load 8 ref bytes
            movq_r2m (mm1,* (dest+8));      // store 8 bytes at curr
        }

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
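
/*
 * Usage sketch (hypothetical helper and names, not part of the original
 * module): copy one 16x16 block from a reference plane into the current
 * plane at pixel position (i_x, i_y); both planes share the same stride.
 */
static __inline__ void example_copy_16x16 (yuv_data_t * p_current,
                                           yuv_data_t * p_reference,
                                           int i_stride, int i_x, int i_y)
{
    MC_put_16_mmx (p_current + i_y * i_stride + i_x,
                   p_reference + i_y * i_stride + i_x,
                   i_stride, 16);
}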
//-----------------------------------------------------------------------

// Half pixel interpolation in the x direction
static __inline__ void MC_avg_x_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_x_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_avg_xy_mmx (int width, int height,
                                      yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_xy_mmx (int width, int height,
                                      yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_avg_y_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_y_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}
/*****************************************************************************
 * Functions exported as capabilities. They are declared as static so that
 * we don't pollute the namespace too much.
 *****************************************************************************/
static void motion_getfunctions( function_list_t * p_function_list )
{
    static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
                                           int, int ) =
    {
        {
            /* Copying functions */
            {
                /* Width == 16 */
                MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
            },
            {
                /* Width == 8 */
                MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
            }
        },
        {
            /* Averaging functions */
            {
                /* Width == 16 */
                MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
            },
            {
                /* Width == 8 */
                MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
            }
        }
    };

    p_function_list->pf_probe = motion_Probe;

#define list p_function_list->functions.motion
    memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );
#undef list
}
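
/*
 * Illustrative dispatch sketch (function and variable names are
 * hypothetical, not from this file): how a caller might index a [2][2][4]
 * table laid out like ppppf_motion above. The first index selects copy (0)
 * vs. average (1), the second selects the 16- (0) vs. 8-pixel-wide (1)
 * variants, and the third encodes the half-pel offsets read off the
 * initializer: 0 = none, 1 = x, 2 = y, 3 = both.
 */
static __inline__ void example_motion_dispatch (
        void (* pf_table[2][2][4])( yuv_data_t *, yuv_data_t *, int, int ),
        yuv_data_t * p_dest, yuv_data_t * p_ref, int i_stride, int i_height,
        int b_average, int b_8_wide, int b_half_x, int b_half_y )
{
    pf_table[b_average][b_8_wide][(b_half_y << 1) | b_half_x]
        ( p_dest, p_ref, i_stride, i_height );
}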