1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.18 2002/06/01 12:32:00 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <stdlib.h> /* malloc(), free() */
35 /*****************************************************************************
36 * Local and extern prototypes.
37 *****************************************************************************/
38 static void motion_getfunctions( function_list_t * p_function_list );
40 /*****************************************************************************
41 * Build configuration tree.
42 *****************************************************************************/
47 SET_DESCRIPTION( _("MMX motion compensation module") )
48 ADD_CAPABILITY( MOTION, 150 )
49 ADD_REQUIREMENT( MMX )
54 motion_getfunctions( &p_module->p_functions->motion );
57 MODULE_DEACTIVATE_START
58 MODULE_DEACTIVATE_STOP
60 /*****************************************************************************
61 * Motion compensation in MMX
62 *****************************************************************************/
64 // some rounding constants
65 mmx_t round1 = {0x0001000100010001LL};
66 mmx_t round4 = {0x0002000200020002LL};
69 * This code should probably be compiled with loop unrolling
70 * (ie, -funroll-loops in gcc) because some of the loops
71 * use a small static number of iterations. This was written
72 * with the assumption the compiler knows best about when
76 static inline void mmx_zero_reg ()
82 static inline void mmx_average_2_U8 (yuv_data_t * dest,
83 yuv_data_t * src1, yuv_data_t * src2)
86 // *dest = (*src1 + *src2 + 1)/ 2;
89 movq_m2r (*src1, mm1); // load 8 src1 bytes
90 movq_r2r (mm1, mm2); // copy 8 src1 bytes
92 movq_m2r (*src2, mm3); // load 8 src2 bytes
93 movq_r2r (mm3, mm4); // copy 8 src2 bytes
95 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
96 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
98 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
99 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
101 paddw_r2r (mm3, mm1); // add lows to mm1
102 paddw_m2r (round1, mm1);
103 psraw_i2r (1, mm1); // /2
105 paddw_r2r (mm4, mm2); // add highs to mm2
106 paddw_m2r (round1, mm2);
107 psraw_i2r (1, mm2); // /2
109 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
110 movq_r2m (mm1, *dest); // store result in dest
113 static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
114 yuv_data_t * src1, yuv_data_t * src2)
117 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
120 movq_m2r (*dest, mm1); // load 8 dest bytes
121 movq_r2r (mm1, mm2); // copy 8 dest bytes
123 movq_m2r (*src1, mm3); // load 8 src1 bytes
124 movq_r2r (mm3, mm4); // copy 8 src1 bytes
126 movq_m2r (*src2, mm5); // load 8 src2 bytes
127 movq_r2r (mm5, mm6); // copy 8 src2 bytes
129 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
130 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
132 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
133 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
135 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
136 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
138 paddw_r2r (mm5, mm3); // add lows
139 paddw_m2r (round1, mm3);
140 psraw_i2r (1, mm3); // /2
142 paddw_r2r (mm6, mm4); // add highs
143 paddw_m2r (round1, mm4);
144 psraw_i2r (1, mm4); // /2
146 paddw_r2r (mm3, mm1); // add lows
147 paddw_m2r (round1, mm1);
148 psraw_i2r (1, mm1); // /2
150 paddw_r2r (mm4, mm2); // add highs
151 paddw_m2r (round1, mm2);
152 psraw_i2r (1, mm2); // /2
154 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
155 movq_r2m (mm1, *dest); // store result in dest
158 static inline void mmx_average_4_U8 (yuv_data_t * dest,
159 yuv_data_t * src1, yuv_data_t * src2,
160 yuv_data_t * src3, yuv_data_t * src4)
163 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
166 movq_m2r (*src1, mm1); // load 8 src1 bytes
167 movq_r2r (mm1, mm2); // copy 8 src1 bytes
169 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
170 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
172 movq_m2r (*src2, mm3); // load 8 src2 bytes
173 movq_r2r (mm3, mm4); // copy 8 src2 bytes
175 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
176 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
178 paddw_r2r (mm3, mm1); // add lows
179 paddw_r2r (mm4, mm2); // add highs
181 // now have partials in mm1 and mm2
183 movq_m2r (*src3, mm3); // load 8 src3 bytes
184 movq_r2r (mm3, mm4); // copy 8 src3 bytes
186 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
187 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
189 paddw_r2r (mm3, mm1); // add lows
190 paddw_r2r (mm4, mm2); // add highs
192 movq_m2r (*src4, mm5); // load 8 src4 bytes
193 movq_r2r (mm5, mm6); // copy 8 src4 bytes
195 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
196 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
198 paddw_r2r (mm5, mm1); // add lows
199 paddw_r2r (mm6, mm2); // add highs
201 // now have subtotal in mm1 and mm2
203 paddw_m2r (round4, mm1);
204 psraw_i2r (2, mm1); // /4
205 paddw_m2r (round4, mm2);
206 psraw_i2r (2, mm2); // /4
208 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
209 movq_r2m (mm1, *dest); // store result in dest
212 static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
213 yuv_data_t * src1, yuv_data_t * src2,
214 yuv_data_t * src3, yuv_data_t * src4)
217 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
220 movq_m2r (*src1, mm1); // load 8 src1 bytes
221 movq_r2r (mm1, mm2); // copy 8 src1 bytes
223 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
224 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
226 movq_m2r (*src2, mm3); // load 8 src2 bytes
227 movq_r2r (mm3, mm4); // copy 8 src2 bytes
229 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
230 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
232 paddw_r2r (mm3, mm1); // add lows
233 paddw_r2r (mm4, mm2); // add highs
235 // now have partials in mm1 and mm2
237 movq_m2r (*src3, mm3); // load 8 src3 bytes
238 movq_r2r (mm3, mm4); // copy 8 src3 bytes
240 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
241 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
243 paddw_r2r (mm3, mm1); // add lows
244 paddw_r2r (mm4, mm2); // add highs
246 movq_m2r (*src4, mm5); // load 8 src4 bytes
247 movq_r2r (mm5, mm6); // copy 8 src4 bytes
249 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
250 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
252 paddw_r2r (mm5, mm1); // add lows
253 paddw_r2r (mm6, mm2); // add highs
255 paddw_m2r (round4, mm1);
256 psraw_i2r (2, mm1); // /4
257 paddw_m2r (round4, mm2);
258 psraw_i2r (2, mm2); // /4
260 // now have subtotal/4 in mm1 and mm2
262 movq_m2r (*dest, mm3); // load 8 dest bytes
263 movq_r2r (mm3, mm4); // copy 8 dest bytes
265 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
266 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
268 paddw_r2r (mm3, mm1); // add lows
269 paddw_r2r (mm4, mm2); // add highs
271 paddw_m2r (round1, mm1);
272 psraw_i2r (1, mm1); // /2
273 paddw_m2r (round1, mm2);
274 psraw_i2r (1, mm2); // /2
276 // now have end value in mm1 and mm2
278 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
279 movq_r2m (mm1,*dest); // store result in dest
282 //-----------------------------------------------------------------------
284 static inline void MC_avg_mmx (int width, int height,
285 yuv_data_t * dest, yuv_data_t * ref, int stride)
290 mmx_average_2_U8 (dest, dest, ref);
293 mmx_average_2_U8 (dest+8, dest+8, ref+8);
300 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
301 int stride, int height)
303 MC_avg_mmx (16, height, dest, ref, stride);
306 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
307 int stride, int height)
309 MC_avg_mmx (8, height, dest, ref, stride);
312 //-----------------------------------------------------------------------
314 static inline void MC_put_mmx (int width, int height,
315 yuv_data_t * dest, yuv_data_t * ref, int stride)
320 movq_m2r (* ref, mm1); // load 8 ref bytes
321 movq_r2m (mm1,* dest); // store 8 bytes at curr
325 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
326 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
334 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
335 int stride, int height)
337 MC_put_mmx (16, height, dest, ref, stride);
340 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
341 int stride, int height)
343 MC_put_mmx (8, height, dest, ref, stride);
346 //-----------------------------------------------------------------------
348 // Half pixel interpolation in the x direction
349 static inline void MC_avg_x_mmx (int width, int height,
350 yuv_data_t * dest, yuv_data_t * ref, int stride)
355 mmx_interp_average_2_U8 (dest, ref, ref+1);
358 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
365 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
366 int stride, int height)
368 MC_avg_x_mmx (16, height, dest, ref, stride);
371 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
372 int stride, int height)
374 MC_avg_x_mmx (8, height, dest, ref, stride);
377 //-----------------------------------------------------------------------
379 static inline void MC_put_x_mmx (int width, int height,
380 yuv_data_t * dest, yuv_data_t * ref, int stride)
385 mmx_average_2_U8 (dest, ref, ref+1);
388 mmx_average_2_U8 (dest+8, ref+8, ref+9);
395 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
396 int stride, int height)
398 MC_put_x_mmx (16, height, dest, ref, stride);
401 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
402 int stride, int height)
404 MC_put_x_mmx (8, height, dest, ref, stride);
407 //-----------------------------------------------------------------------
409 static inline void MC_avg_xy_mmx (int width, int height,
410 yuv_data_t * dest, yuv_data_t * ref, int stride)
412 yuv_data_t * ref_next = ref+stride;
417 mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
420 mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
421 ref_next+8, ref_next+9);
429 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
430 int stride, int height)
432 MC_avg_xy_mmx (16, height, dest, ref, stride);
435 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
436 int stride, int height)
438 MC_avg_xy_mmx (8, height, dest, ref, stride);
441 //-----------------------------------------------------------------------
443 static inline void MC_put_xy_mmx (int width, int height,
444 yuv_data_t * dest, yuv_data_t * ref, int stride)
446 yuv_data_t * ref_next = ref+stride;
451 mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
454 mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
462 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
463 int stride, int height)
465 MC_put_xy_mmx (16, height, dest, ref, stride);
468 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
469 int stride, int height)
471 MC_put_xy_mmx (8, height, dest, ref, stride);
474 //-----------------------------------------------------------------------
476 static inline void MC_avg_y_mmx (int width, int height,
477 yuv_data_t * dest, yuv_data_t * ref, int stride)
479 yuv_data_t * ref_next = ref+stride;
484 mmx_interp_average_2_U8 (dest, ref, ref_next);
487 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
495 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
496 int stride, int height)
498 MC_avg_y_mmx (16, height, dest, ref, stride);
501 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
502 int stride, int height)
504 MC_avg_y_mmx (8, height, dest, ref, stride);
507 //-----------------------------------------------------------------------
509 static inline void MC_put_y_mmx (int width, int height,
510 yuv_data_t * dest, yuv_data_t * ref, int stride)
512 yuv_data_t * ref_next = ref+stride;
517 mmx_average_2_U8 (dest, ref, ref_next);
520 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
528 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
529 int stride, int height)
531 MC_put_y_mmx (16, height, dest, ref, stride);
534 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
535 int stride, int height)
537 MC_put_y_mmx (8, height, dest, ref, stride);
541 /*****************************************************************************
542 * Functions exported as capabilities. They are declared as static so that
543 * we don't pollute the namespace too much.
544 *****************************************************************************/
545 static void motion_getfunctions( function_list_t * p_function_list )
547 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
551 /* Copying functions */
554 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
558 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
562 /* Averaging functions */
565 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
569 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
574 #define list p_function_list->functions.motion
575 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );