/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 * $Id: vdec_motion_inner_mmx.c,v 1.1 2001/01/18 05:13:22 sam Exp $
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include "common.h"                                      /* u8, basic types */
#include "video.h"                                            /* yuv_data_t */

#include "mmx.h"                      /* movq_m2r() & co. MMX wrapper macros */
#include "attributes.h"
/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
/* Some rounding constants */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
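/* round1 biases each 16-bit lane so the ">> 1" below rounds to nearest
 * instead of truncating: (a + b + 1) >> 1, e.g. (3 + 4 + 1) >> 1 = 4.
 * round4 plays the same role for the four-pixel case: (a+b+c+d+2) >> 2. */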
static __inline__ void MMXZeroReg()
{
    /* mm0 = 0 -- mm0 is used as the zero register by the unpack steps */
    pxor_r2r(mm0,mm0);
}
static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
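/* The sources are widened from bytes to 16-bit words before the add so
 * the sums cannot wrap; packuswb saturates them back to u8 on the way
 * out. This routine is the single half-pel interpolation used by the
 * X_y and x_Y copy macros below. */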
static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows (src1 + src2)
    paddw_m2r(round1,mm3);      // round
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs (src1 + src2)
    paddw_m2r(round1,mm4);      // round
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows (dst + interp)
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs (dst + interp)
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
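/* Four-source average: the two-dimensional half-pel case, where the
 * predicted pixel sits between four reference pixels (used by the X_Y
 * macros below). */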
static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
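/* Note the two-step rounding: the four-pixel average is rounded to the
 * nearest integer first, then averaged with *dst and rounded again,
 * exactly as in the scalar expression above. */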
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/

#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest);
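/* pavgusb is the AMD 3DNow! packed byte average; these pavg_* wrappers
 * come along with the livid code and are not used by the plain-MMX
 * helpers above.
 *
 * Naming convention for the functions generated below:
 *   x / X  : integer / half-pel horizontal motion vector component
 *   y / Y  : integer / half-pel vertical motion vector component
 *   copy   : write the prediction over the destination
 *   avg    : average the prediction into the destination (second
 *            prediction of a bi-directional macroblock) */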
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );      /* load 8 ref bytes */                \
        if( width == 16 )                                                   \
            movq_m2r( *(p_src + 8), mm1 );                                  \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );     /* store 8 bytes at curr */           \
        if( width == 16 )                                                   \
            movq_r2m( mm1, *(p_dest + 8) );                                 \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_next_src );                           \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );           \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 );\
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_next_src + 8, p_next_src + 9 );                  \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_next_src );                     \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );     \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,            \
                           p_next_src + 1 );                                \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_next_src + 8, p_next_src + 9 );            \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponents(width,height)                                    \
__MotionComponent_x_y_copy(width,height)                                    \
__MotionComponent_X_y_copy(width,height)                                    \
__MotionComponent_x_Y_copy(width,height)                                    \
__MotionComponent_X_Y_copy(width,height)                                    \
__MotionComponent_x_y_avg(width,height)                                     \
__MotionComponent_X_y_avg(width,height)                                     \
__MotionComponent_x_Y_avg(width,height)                                     \
__MotionComponent_X_Y_avg(width,height)
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */

__MotionComponents (8,16)       /* 422 */
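/* Hypothetical call site (p_ref and p_out are illustrative names): for a
 * 16x16 block whose motion vector has a horizontal half-pel component,
 * the first prediction would be
 *
 *     MotionComponent_X_y_copy_16_16( p_ref, p_out, i_stride );
 *
 * and the second prediction of a bi-directional macroblock would then
 * call the matching MotionComponent_X_y_avg_16_16() on the same
 * destination. */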