git.sesse.net Git - vlc/blob - plugins/motion/motionmmx.c

   1 /*****************************************************************************
   2  * motionmmx.c : MMX motion compensation module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2001 VideoLAN
   5  * $Id: motionmmx.c,v 1.17 2002/05/18 17:47:47 sam Exp $
   6  *
   7  * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
   8  *          Michel Lespinasse <walken@zoy.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28 #include <stdlib.h>                                      /* malloc(), free() */
  29 #include <string.h>
  30
  31 #include <videolan/vlc.h>
  32
  33 #include "mmx.h"
  34
  35 /*****************************************************************************
  36  * Local and extern prototypes.
  37  *****************************************************************************/
  38 static void motion_getfunctions( function_list_t * p_function_list );
  39
  40 /*****************************************************************************
  41  * Build configuration tree.
  42  *****************************************************************************/
  43 MODULE_CONFIG_START
  44 MODULE_CONFIG_STOP
  45
  46 MODULE_INIT_START
  47     SET_DESCRIPTION( _("MMX motion compensation module") )
  48     ADD_CAPABILITY( MOTION, 150 )
  49     ADD_REQUIREMENT( MMX )
  50     ADD_SHORTCUT( "mmx" )
  51     ADD_SHORTCUT( "motionmmx" )
  52 MODULE_INIT_STOP
  53
  54 MODULE_ACTIVATE_START
  55     motion_getfunctions( &p_module->p_functions->motion );
  56 MODULE_ACTIVATE_STOP
  57
  58 MODULE_DEACTIVATE_START
  59 MODULE_DEACTIVATE_STOP
  60
  61 /*****************************************************************************
  62  * Motion compensation in MMX
  63  *****************************************************************************/
  64
  65 // some rounding constants
  66 mmx_t round1 = {0x0001000100010001LL};
  67 mmx_t round4 = {0x0002000200020002LL};
  68
  69 /*
  70  * This code should probably be compiled with loop unrolling
  71  * (ie, -funroll-loops in gcc)becuase some of the loops
  72  * use a small static number of iterations. This was written
  73  * with the assumption the compiler knows best about when
  74  * unrolling will help
  75  */
  76
  77 static inline void mmx_zero_reg ()
  78 {
  79     // load 0 into mm0
  80     pxor_r2r (mm0, mm0);
  81 }
  82
  83 static inline void mmx_average_2_U8 (yuv_data_t * dest,
  84                                      yuv_data_t * src1, yuv_data_t * src2)
  85 {
  86     //
  87     // *dest = (*src1 + *src2 + 1)/ 2;
  88     //
  89
  90     movq_m2r (*src1, mm1);        // load 8 src1 bytes
  91     movq_r2r (mm1, mm2);        // copy 8 src1 bytes
  92
  93     movq_m2r (*src2, mm3);        // load 8 src2 bytes
  94     movq_r2r (mm3, mm4);        // copy 8 src2 bytes
  95
  96     punpcklbw_r2r (mm0, mm1);        // unpack low src1 bytes
  97     punpckhbw_r2r (mm0, mm2);        // unpack high src1 bytes
  98
  99     punpcklbw_r2r (mm0, mm3);        // unpack low src2 bytes
 100     punpckhbw_r2r (mm0, mm4);        // unpack high src2 bytes
 101
 102     paddw_r2r (mm3, mm1);        // add lows to mm1
 103     paddw_m2r (round1, mm1);
 104     psraw_i2r (1, mm1);                // /2
 105
 106     paddw_r2r (mm4, mm2);        // add highs to mm2
 107     paddw_m2r (round1, mm2);
 108     psraw_i2r (1, mm2);                // /2
 109
 110     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 111     movq_r2m (mm1, *dest);        // store result in dest
 112 }
 113
 114 static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
 115                                             yuv_data_t * src1, yuv_data_t * src2)
 116 {
 117     //
 118     // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
 119     //
 120
 121     movq_m2r (*dest, mm1);        // load 8 dest bytes
 122     movq_r2r (mm1, mm2);        // copy 8 dest bytes
 123
 124     movq_m2r (*src1, mm3);        // load 8 src1 bytes
 125     movq_r2r (mm3, mm4);        // copy 8 src1 bytes
 126
 127     movq_m2r (*src2, mm5);        // load 8 src2 bytes
 128     movq_r2r (mm5, mm6);        // copy 8 src2 bytes
 129
 130     punpcklbw_r2r (mm0, mm1);        // unpack low dest bytes
 131     punpckhbw_r2r (mm0, mm2);        // unpack high dest bytes
 132
 133     punpcklbw_r2r (mm0, mm3);        // unpack low src1 bytes
 134     punpckhbw_r2r (mm0, mm4);        // unpack high src1 bytes
 135
 136     punpcklbw_r2r (mm0, mm5);        // unpack low src2 bytes
 137     punpckhbw_r2r (mm0, mm6);        // unpack high src2 bytes
 138
 139     paddw_r2r (mm5, mm3);        // add lows
 140     paddw_m2r (round1, mm3);
 141     psraw_i2r (1, mm3);                // /2
 142
 143     paddw_r2r (mm6, mm4);        // add highs
 144     paddw_m2r (round1, mm4);
 145     psraw_i2r (1, mm4);                // /2
 146
 147     paddw_r2r (mm3, mm1);        // add lows
 148     paddw_m2r (round1, mm1);
 149     psraw_i2r (1, mm1);                // /2
 150
 151     paddw_r2r (mm4, mm2);        // add highs
 152     paddw_m2r (round1, mm2);
 153     psraw_i2r (1, mm2);                // /2
 154
 155     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 156     movq_r2m (mm1, *dest);        // store result in dest
 157 }
 158
 159 static inline void mmx_average_4_U8 (yuv_data_t * dest,
 160                                      yuv_data_t * src1, yuv_data_t * src2,
 161                                      yuv_data_t * src3, yuv_data_t * src4)
 162 {
 163     //
 164     // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
 165     //
 166
 167     movq_m2r (*src1, mm1);        // load 8 src1 bytes
 168     movq_r2r (mm1, mm2);        // copy 8 src1 bytes
 169
 170     punpcklbw_r2r (mm0, mm1);        // unpack low src1 bytes
 171     punpckhbw_r2r (mm0, mm2);        // unpack high src1 bytes
 172
 173     movq_m2r (*src2, mm3);        // load 8 src2 bytes
 174     movq_r2r (mm3, mm4);        // copy 8 src2 bytes
 175
 176     punpcklbw_r2r (mm0, mm3);        // unpack low src2 bytes
 177     punpckhbw_r2r (mm0, mm4);        // unpack high src2 bytes
 178
 179     paddw_r2r (mm3, mm1);        // add lows
 180     paddw_r2r (mm4, mm2);        // add highs
 181
 182     // now have partials in mm1 and mm2
 183
 184     movq_m2r (*src3, mm3);        // load 8 src3 bytes
 185     movq_r2r (mm3, mm4);        // copy 8 src3 bytes
 186
 187     punpcklbw_r2r (mm0, mm3);        // unpack low src3 bytes
 188     punpckhbw_r2r (mm0, mm4);        // unpack high src3 bytes
 189
 190     paddw_r2r (mm3, mm1);        // add lows
 191     paddw_r2r (mm4, mm2);        // add highs
 192
 193     movq_m2r (*src4, mm5);        // load 8 src4 bytes
 194     movq_r2r (mm5, mm6);        // copy 8 src4 bytes
 195
 196     punpcklbw_r2r (mm0, mm5);        // unpack low src4 bytes
 197     punpckhbw_r2r (mm0, mm6);        // unpack high src4 bytes
 198
 199     paddw_r2r (mm5, mm1);        // add lows
 200     paddw_r2r (mm6, mm2);        // add highs
 201
 202     // now have subtotal in mm1 and mm2
 203
 204     paddw_m2r (round4, mm1);
 205     psraw_i2r (2, mm1);                // /4
 206     paddw_m2r (round4, mm2);
 207     psraw_i2r (2, mm2);                // /4
 208
 209     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 210     movq_r2m (mm1, *dest);        // store result in dest
 211 }
 212
 213 static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
 214                                             yuv_data_t * src1, yuv_data_t * src2,
 215                                             yuv_data_t * src3, yuv_data_t * src4)
 216 {
 217     //
 218     // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
 219     //
 220
 221     movq_m2r (*src1, mm1);        // load 8 src1 bytes
 222     movq_r2r (mm1, mm2);        // copy 8 src1 bytes
 223
 224     punpcklbw_r2r (mm0, mm1);        // unpack low src1 bytes
 225     punpckhbw_r2r (mm0, mm2);        // unpack high src1 bytes
 226
 227     movq_m2r (*src2, mm3);        // load 8 src2 bytes
 228     movq_r2r (mm3, mm4);        // copy 8 src2 bytes
 229
 230     punpcklbw_r2r (mm0, mm3);        // unpack low src2 bytes
 231     punpckhbw_r2r (mm0, mm4);        // unpack high src2 bytes
 232
 233     paddw_r2r (mm3, mm1);        // add lows
 234     paddw_r2r (mm4, mm2);        // add highs
 235
 236     // now have partials in mm1 and mm2
 237
 238     movq_m2r (*src3, mm3);        // load 8 src3 bytes
 239     movq_r2r (mm3, mm4);        // copy 8 src3 bytes
 240
 241     punpcklbw_r2r (mm0, mm3);        // unpack low src3 bytes
 242     punpckhbw_r2r (mm0, mm4);        // unpack high src3 bytes
 243
 244     paddw_r2r (mm3, mm1);        // add lows
 245     paddw_r2r (mm4, mm2);        // add highs
 246
 247     movq_m2r (*src4, mm5);        // load 8 src4 bytes
 248     movq_r2r (mm5, mm6);        // copy 8 src4 bytes
 249
 250     punpcklbw_r2r (mm0, mm5);        // unpack low src4 bytes
 251     punpckhbw_r2r (mm0, mm6);        // unpack high src4 bytes
 252
 253     paddw_r2r (mm5, mm1);        // add lows
 254     paddw_r2r (mm6, mm2);        // add highs
 255
 256     paddw_m2r (round4, mm1);
 257     psraw_i2r (2, mm1);                // /4
 258     paddw_m2r (round4, mm2);
 259     psraw_i2r (2, mm2);                // /4
 260
 261     // now have subtotal/4 in mm1 and mm2
 262
 263     movq_m2r (*dest, mm3);        // load 8 dest bytes
 264     movq_r2r (mm3, mm4);        // copy 8 dest bytes
 265
 266     punpcklbw_r2r (mm0, mm3);        // unpack low dest bytes
 267     punpckhbw_r2r (mm0, mm4);        // unpack high dest bytes
 268
 269     paddw_r2r (mm3, mm1);        // add lows
 270     paddw_r2r (mm4, mm2);        // add highs
 271
 272     paddw_m2r (round1, mm1);
 273     psraw_i2r (1, mm1);                // /2
 274     paddw_m2r (round1, mm2);
 275     psraw_i2r (1, mm2);                // /2
 276
 277     // now have end value in mm1 and mm2
 278
 279     packuswb_r2r (mm2, mm1);        // pack (w/ saturation)
 280     movq_r2m (mm1,*dest);        // store result in dest
 281 }
 282
 283 //-----------------------------------------------------------------------
 284
 285 static inline void MC_avg_mmx (int width, int height,
 286                                yuv_data_t * dest, yuv_data_t * ref, int stride)
 287 {
 288     mmx_zero_reg ();
 289
 290     do {
 291         mmx_average_2_U8 (dest, dest, ref);
 292
 293         if (width == 16)
 294             mmx_average_2_U8 (dest+8, dest+8, ref+8);
 295
 296         dest += stride;
 297         ref += stride;
 298     } while (--height);
 299 }
 300
 301 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 302                            int stride, int height)
 303 {
 304     MC_avg_mmx (16, height, dest, ref, stride);
 305 }
 306
 307 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 308                           int stride, int height)
 309 {
 310     MC_avg_mmx (8, height, dest, ref, stride);
 311 }
 312
 313 //-----------------------------------------------------------------------
 314
 315 static inline void MC_put_mmx (int width, int height,
 316                                yuv_data_t * dest, yuv_data_t * ref, int stride)
 317 {
 318     mmx_zero_reg ();
 319
 320     do {
 321         movq_m2r (* ref, mm1);        // load 8 ref bytes
 322         movq_r2m (mm1,* dest);        // store 8 bytes at curr
 323
 324         if (width == 16)
 325             {
 326                 movq_m2r (* (ref+8), mm1);        // load 8 ref bytes
 327                 movq_r2m (mm1,* (dest+8));        // store 8 bytes at curr
 328             }
 329
 330         dest += stride;
 331         ref += stride;
 332     } while (--height);
 333 }
 334
 335 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 336                            int stride, int height)
 337 {
 338     MC_put_mmx (16, height, dest, ref, stride);
 339 }
 340
 341 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 342                           int stride, int height)
 343 {
 344     MC_put_mmx (8, height, dest, ref, stride);
 345 }
 346
 347 //-----------------------------------------------------------------------
 348
 349 // Half pixel interpolation in the x direction
 350 static inline void MC_avg_x_mmx (int width, int height,
 351                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 352 {
 353     mmx_zero_reg ();
 354
 355     do {
 356         mmx_interp_average_2_U8 (dest, ref, ref+1);
 357
 358         if (width == 16)
 359             mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
 360
 361         dest += stride;
 362         ref += stride;
 363     } while (--height);
 364 }
 365
 366 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 367                             int stride, int height)
 368 {
 369     MC_avg_x_mmx (16, height, dest, ref, stride);
 370 }
 371
 372 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 373                            int stride, int height)
 374 {
 375     MC_avg_x_mmx (8, height, dest, ref, stride);
 376 }
 377
 378 //-----------------------------------------------------------------------
 379
 380 static inline void MC_put_x_mmx (int width, int height,
 381                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 382 {
 383     mmx_zero_reg ();
 384
 385     do {
 386         mmx_average_2_U8 (dest, ref, ref+1);
 387
 388         if (width == 16)
 389             mmx_average_2_U8 (dest+8, ref+8, ref+9);
 390
 391         dest += stride;
 392         ref += stride;
 393     } while (--height);
 394 }
 395
 396 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 397                             int stride, int height)
 398 {
 399     MC_put_x_mmx (16, height, dest, ref, stride);
 400 }
 401
 402 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 403                            int stride, int height)
 404 {
 405     MC_put_x_mmx (8, height, dest, ref, stride);
 406 }
 407
 408 //-----------------------------------------------------------------------
 409
 410 static inline void MC_avg_xy_mmx (int width, int height,
 411                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
 412 {
 413     yuv_data_t * ref_next = ref+stride;
 414
 415     mmx_zero_reg ();
 416
 417     do {
 418         mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
 419
 420         if (width == 16)
 421             mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
 422                                      ref_next+8, ref_next+9);
 423
 424         dest += stride;
 425         ref += stride;
 426         ref_next += stride;
 427     } while (--height);
 428 }
 429
 430 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 431                              int stride, int height)
 432 {
 433     MC_avg_xy_mmx (16, height, dest, ref, stride);
 434 }
 435
 436 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 437                             int stride, int height)
 438 {
 439     MC_avg_xy_mmx (8, height, dest, ref, stride);
 440 }
 441
 442 //-----------------------------------------------------------------------
 443
 444 static inline void MC_put_xy_mmx (int width, int height,
 445                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
 446 {
 447     yuv_data_t * ref_next = ref+stride;
 448
 449     mmx_zero_reg ();
 450
 451     do {
 452         mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
 453
 454         if (width == 16)
 455             mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
 456
 457         dest += stride;
 458         ref += stride;
 459         ref_next += stride;
 460     } while (--height);
 461 }
 462
 463 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 464                              int stride, int height)
 465 {
 466     MC_put_xy_mmx (16, height, dest, ref, stride);
 467 }
 468
 469 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 470                             int stride, int height)
 471 {
 472     MC_put_xy_mmx (8, height, dest, ref, stride);
 473 }
 474
 475 //-----------------------------------------------------------------------
 476
 477 static inline void MC_avg_y_mmx (int width, int height,
 478                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 479 {
 480     yuv_data_t * ref_next = ref+stride;
 481
 482     mmx_zero_reg ();
 483
 484     do {
 485         mmx_interp_average_2_U8 (dest, ref, ref_next);
 486
 487         if (width == 16)
 488             mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
 489
 490         dest += stride;
 491         ref += stride;
 492         ref_next += stride;
 493     } while (--height);
 494 }
 495
 496 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 497                             int stride, int height)
 498 {
 499     MC_avg_y_mmx (16, height, dest, ref, stride);
 500 }
 501
 502 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 503                            int stride, int height)
 504 {
 505     MC_avg_y_mmx (8, height, dest, ref, stride);
 506 }
 507
 508 //-----------------------------------------------------------------------
 509
 510 static inline void MC_put_y_mmx (int width, int height,
 511                                  yuv_data_t * dest, yuv_data_t * ref, int stride)
 512 {
 513     yuv_data_t * ref_next = ref+stride;
 514
 515     mmx_zero_reg ();
 516
 517     do {
 518         mmx_average_2_U8 (dest, ref, ref_next);
 519
 520         if (width == 16)
 521             mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
 522
 523         dest += stride;
 524         ref += stride;
 525         ref_next += stride;
 526     } while (--height);
 527 }
 528
 529 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
 530                             int stride, int height)
 531 {
 532     MC_put_y_mmx (16, height, dest, ref, stride);
 533 }
 534
 535 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
 536                            int stride, int height)
 537 {
 538     MC_put_y_mmx (8, height, dest, ref, stride);
 539 }
 540
 541
 542 /*****************************************************************************
 543  * Functions exported as capabilities. They are declared as static so that
 544  * we don't pollute the namespace too much.
 545  *****************************************************************************/
 546 static void motion_getfunctions( function_list_t * p_function_list )
 547 {
 548     static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
 549                                            int, int ) =
 550     {
 551         {
 552             /* Copying functions */
 553             {
 554                 /* Width == 16 */
 555                 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
 556             },
 557             {
 558                 /* Width == 8 */
 559                 MC_put_8_mmx,  MC_put_x8_mmx,  MC_put_y8_mmx, MC_put_xy8_mmx
 560             }
 561         },
 562         {
 563             /* Averaging functions */
 564             {
 565                 /* Width == 16 */
 566                 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
 567             },
 568             {
 569                 /* Width == 8 */
 570                 MC_avg_8_mmx,  MC_avg_x8_mmx,  MC_avg_y8_mmx,  MC_avg_xy8_mmx
 571             }
 572         }
 573     };
 574
 575 #define list p_function_list->functions.motion
 576     memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );
 577 #undef list
 578
 579     return;
 580 }
 581