1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.14 2001/12/30 07:09:55 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                      /* memcpy() */

#include <videolan/vlc.h>
35 /*****************************************************************************
36 * Local and extern prototypes.
37 *****************************************************************************/
38 static void motion_getfunctions( function_list_t * p_function_list );
40 /*****************************************************************************
41 * Build configuration tree.
42 *****************************************************************************/
/* Module bootstrap macros (VLC 0.x plugin ABI).
 * NOTE(review): the surrounding MODULE_CONFIG_START/STOP, MODULE_INIT_START/
 * STOP and MODULE_ACTIVATE_START/STOP scaffolding lines are missing from this
 * extract — confirm against the upstream file before building. */
SET_DESCRIPTION( "MMX motion compensation module" )
ADD_CAPABILITY( MOTION, 150 )      /* default capability score, see probe */
ADD_REQUIREMENT( MMX )             /* module only loads on MMX-capable CPUs */
ADD_SHORTCUT( "motionmmx" )
    /* On activation, export this module's motion-compensation table. */
    motion_getfunctions( &p_module->p_functions->motion );
MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
61 /*****************************************************************************
 * motion_Probe: probes the CPU and returns a score
63 *****************************************************************************/
64 static int motion_Probe( probedata_t *p_data )
69 /*****************************************************************************
70 * Motion compensation in MMX
71 *****************************************************************************/
73 // some rounding constants
74 mmx_t round1 = {0x0001000100010001LL};
75 mmx_t round4 = {0x0002000200020002LL};
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption that the compiler knows best about when
 * unrolling will help.
 */
85 static __inline__ void mmx_zero_reg ()
91 static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
92 yuv_data_t * src1, yuv_data_t * src2)
95 // *dest = (*src1 + *src2 + 1)/ 2;
98 movq_m2r (*src1, mm1); // load 8 src1 bytes
99 movq_r2r (mm1, mm2); // copy 8 src1 bytes
101 movq_m2r (*src2, mm3); // load 8 src2 bytes
102 movq_r2r (mm3, mm4); // copy 8 src2 bytes
104 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
105 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
107 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
108 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
110 paddw_r2r (mm3, mm1); // add lows to mm1
111 paddw_m2r (round1, mm1);
112 psraw_i2r (1, mm1); // /2
114 paddw_r2r (mm4, mm2); // add highs to mm2
115 paddw_m2r (round1, mm2);
116 psraw_i2r (1, mm2); // /2
118 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
119 movq_r2m (mm1, *dest); // store result in dest
122 static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
123 yuv_data_t * src1, yuv_data_t * src2)
126 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
129 movq_m2r (*dest, mm1); // load 8 dest bytes
130 movq_r2r (mm1, mm2); // copy 8 dest bytes
132 movq_m2r (*src1, mm3); // load 8 src1 bytes
133 movq_r2r (mm3, mm4); // copy 8 src1 bytes
135 movq_m2r (*src2, mm5); // load 8 src2 bytes
136 movq_r2r (mm5, mm6); // copy 8 src2 bytes
138 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
139 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
141 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
142 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
144 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
145 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
147 paddw_r2r (mm5, mm3); // add lows
148 paddw_m2r (round1, mm3);
149 psraw_i2r (1, mm3); // /2
151 paddw_r2r (mm6, mm4); // add highs
152 paddw_m2r (round1, mm4);
153 psraw_i2r (1, mm4); // /2
155 paddw_r2r (mm3, mm1); // add lows
156 paddw_m2r (round1, mm1);
157 psraw_i2r (1, mm1); // /2
159 paddw_r2r (mm4, mm2); // add highs
160 paddw_m2r (round1, mm2);
161 psraw_i2r (1, mm2); // /2
163 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
164 movq_r2m (mm1, *dest); // store result in dest
167 static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
168 yuv_data_t * src1, yuv_data_t * src2,
169 yuv_data_t * src3, yuv_data_t * src4)
172 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
175 movq_m2r (*src1, mm1); // load 8 src1 bytes
176 movq_r2r (mm1, mm2); // copy 8 src1 bytes
178 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
179 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
181 movq_m2r (*src2, mm3); // load 8 src2 bytes
182 movq_r2r (mm3, mm4); // copy 8 src2 bytes
184 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
185 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
187 paddw_r2r (mm3, mm1); // add lows
188 paddw_r2r (mm4, mm2); // add highs
190 // now have partials in mm1 and mm2
192 movq_m2r (*src3, mm3); // load 8 src3 bytes
193 movq_r2r (mm3, mm4); // copy 8 src3 bytes
195 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
196 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
198 paddw_r2r (mm3, mm1); // add lows
199 paddw_r2r (mm4, mm2); // add highs
201 movq_m2r (*src4, mm5); // load 8 src4 bytes
202 movq_r2r (mm5, mm6); // copy 8 src4 bytes
204 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
205 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
207 paddw_r2r (mm5, mm1); // add lows
208 paddw_r2r (mm6, mm2); // add highs
210 // now have subtotal in mm1 and mm2
212 paddw_m2r (round4, mm1);
213 psraw_i2r (2, mm1); // /4
214 paddw_m2r (round4, mm2);
215 psraw_i2r (2, mm2); // /4
217 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
218 movq_r2m (mm1, *dest); // store result in dest
221 static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
222 yuv_data_t * src1, yuv_data_t * src2,
223 yuv_data_t * src3, yuv_data_t * src4)
226 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
229 movq_m2r (*src1, mm1); // load 8 src1 bytes
230 movq_r2r (mm1, mm2); // copy 8 src1 bytes
232 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
233 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
235 movq_m2r (*src2, mm3); // load 8 src2 bytes
236 movq_r2r (mm3, mm4); // copy 8 src2 bytes
238 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
239 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
241 paddw_r2r (mm3, mm1); // add lows
242 paddw_r2r (mm4, mm2); // add highs
244 // now have partials in mm1 and mm2
246 movq_m2r (*src3, mm3); // load 8 src3 bytes
247 movq_r2r (mm3, mm4); // copy 8 src3 bytes
249 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
250 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
252 paddw_r2r (mm3, mm1); // add lows
253 paddw_r2r (mm4, mm2); // add highs
255 movq_m2r (*src4, mm5); // load 8 src4 bytes
256 movq_r2r (mm5, mm6); // copy 8 src4 bytes
258 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
259 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
261 paddw_r2r (mm5, mm1); // add lows
262 paddw_r2r (mm6, mm2); // add highs
264 paddw_m2r (round4, mm1);
265 psraw_i2r (2, mm1); // /4
266 paddw_m2r (round4, mm2);
267 psraw_i2r (2, mm2); // /4
269 // now have subtotal/4 in mm1 and mm2
271 movq_m2r (*dest, mm3); // load 8 dest bytes
272 movq_r2r (mm3, mm4); // copy 8 dest bytes
274 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
275 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
277 paddw_r2r (mm3, mm1); // add lows
278 paddw_r2r (mm4, mm2); // add highs
280 paddw_m2r (round1, mm1);
281 psraw_i2r (1, mm1); // /2
282 paddw_m2r (round1, mm2);
283 psraw_i2r (1, mm2); // /2
285 // now have end value in mm1 and mm2
287 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
288 movq_r2m (mm1,*dest); // store result in dest
291 //-----------------------------------------------------------------------
293 static __inline__ void MC_avg_mmx (int width, int height,
294 yuv_data_t * dest, yuv_data_t * ref, int stride)
299 mmx_average_2_U8 (dest, dest, ref);
302 mmx_average_2_U8 (dest+8, dest+8, ref+8);
309 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
310 int stride, int height)
312 MC_avg_mmx (16, height, dest, ref, stride);
315 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
316 int stride, int height)
318 MC_avg_mmx (8, height, dest, ref, stride);
321 //-----------------------------------------------------------------------
323 static __inline__ void MC_put_mmx (int width, int height,
324 yuv_data_t * dest, yuv_data_t * ref, int stride)
329 movq_m2r (* ref, mm1); // load 8 ref bytes
330 movq_r2m (mm1,* dest); // store 8 bytes at curr
334 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
335 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
343 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
344 int stride, int height)
346 MC_put_mmx (16, height, dest, ref, stride);
349 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
350 int stride, int height)
352 MC_put_mmx (8, height, dest, ref, stride);
355 //-----------------------------------------------------------------------
357 // Half pixel interpolation in the x direction
358 static __inline__ void MC_avg_x_mmx (int width, int height,
359 yuv_data_t * dest, yuv_data_t * ref, int stride)
364 mmx_interp_average_2_U8 (dest, ref, ref+1);
367 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
374 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
375 int stride, int height)
377 MC_avg_x_mmx (16, height, dest, ref, stride);
380 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
381 int stride, int height)
383 MC_avg_x_mmx (8, height, dest, ref, stride);
386 //-----------------------------------------------------------------------
388 static __inline__ void MC_put_x_mmx (int width, int height,
389 yuv_data_t * dest, yuv_data_t * ref, int stride)
394 mmx_average_2_U8 (dest, ref, ref+1);
397 mmx_average_2_U8 (dest+8, ref+8, ref+9);
404 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
405 int stride, int height)
407 MC_put_x_mmx (16, height, dest, ref, stride);
410 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
411 int stride, int height)
413 MC_put_x_mmx (8, height, dest, ref, stride);
416 //-----------------------------------------------------------------------
418 static __inline__ void MC_avg_xy_mmx (int width, int height,
419 yuv_data_t * dest, yuv_data_t * ref, int stride)
421 yuv_data_t * ref_next = ref+stride;
426 mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
429 mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
430 ref_next+8, ref_next+9);
438 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
439 int stride, int height)
441 MC_avg_xy_mmx (16, height, dest, ref, stride);
444 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
445 int stride, int height)
447 MC_avg_xy_mmx (8, height, dest, ref, stride);
450 //-----------------------------------------------------------------------
452 static __inline__ void MC_put_xy_mmx (int width, int height,
453 yuv_data_t * dest, yuv_data_t * ref, int stride)
455 yuv_data_t * ref_next = ref+stride;
460 mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
463 mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
471 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
472 int stride, int height)
474 MC_put_xy_mmx (16, height, dest, ref, stride);
477 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
478 int stride, int height)
480 MC_put_xy_mmx (8, height, dest, ref, stride);
483 //-----------------------------------------------------------------------
485 static __inline__ void MC_avg_y_mmx (int width, int height,
486 yuv_data_t * dest, yuv_data_t * ref, int stride)
488 yuv_data_t * ref_next = ref+stride;
493 mmx_interp_average_2_U8 (dest, ref, ref_next);
496 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
504 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
505 int stride, int height)
507 MC_avg_y_mmx (16, height, dest, ref, stride);
510 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
511 int stride, int height)
513 MC_avg_y_mmx (8, height, dest, ref, stride);
516 //-----------------------------------------------------------------------
518 static __inline__ void MC_put_y_mmx (int width, int height,
519 yuv_data_t * dest, yuv_data_t * ref, int stride)
521 yuv_data_t * ref_next = ref+stride;
526 mmx_average_2_U8 (dest, ref, ref_next);
529 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
537 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
538 int stride, int height)
540 MC_put_y_mmx (16, height, dest, ref, stride);
543 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
544 int stride, int height)
546 MC_put_y_mmx (8, height, dest, ref, stride);
550 /*****************************************************************************
551 * Functions exported as capabilities. They are declared as static so that
552 * we don't pollute the namespace too much.
553 *****************************************************************************/
554 static void motion_getfunctions( function_list_t * p_function_list )
556 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
560 /* Copying functions */
563 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
567 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
571 /* Averaging functions */
574 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
578 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
583 p_function_list->pf_probe = motion_Probe;
585 #define list p_function_list->functions.motion
586 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );