/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 * $Id: vdec_motion_inner_mmx.c,v 1.2 2001/06/07 15:27:44 sam Exp $
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
#include "modules_inner.h"
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include "common.h"                 /* u8 (assumed, for the types used below) */
#include "video.h"                  /* yuv_data_t (assumed) */

#include "attributes.h"
#include "mmx.h"                    /* mmx_t and the MMX wrapper macros */
/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */

/* Some rounding constants */
mmx_t round1 = {0x0001000100010001LL};  /* four words of 1: round-to-nearest /2 */
mmx_t round4 = {0x0002000200020002LL};  /* four words of 2: round-to-nearest /4 */
static __inline__ void MMXZeroReg()
{
    /* load 0 into mm0, used as the zero operand by the unpack instructions */
    pxor_r2r(mm0,mm0);
}
static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);      // +1 for round-to-nearest
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);      // +1 for round-to-nearest
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
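/* For reference, a scalar sketch (not in the original file; ScalarAverage2
 * is a hypothetical name) of what MMXAverage2 computes. round1 supplies the
 * "+1" so the right shift rounds to nearest instead of truncating; one movq
 * covers 8 bytes per call. */
static __inline__ void ScalarAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    int i;
    for( i = 0; i < 8; i++ )
    {
        /* 255 + 255 + 1 = 511 still fits in a 16-bit word, so no clipping
         * is needed before the final store */
        dst[i] = (u8)((src1[i] + src2[i] + 1) >> 1);
    }
}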
static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
    //

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows
    paddw_m2r(round1,mm3);      // +1 for round-to-nearest
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs
    paddw_m2r(round1,mm4);      // +1 for round-to-nearest
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows
    paddw_m2r(round1,mm1);      // +1 for round-to-nearest
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs
    paddw_m2r(round1,mm2);      // +1 for round-to-nearest
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    //
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);      // +2 for round-to-nearest
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // +2 for round-to-nearest
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
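/* Scalar sketch of the above (illustrative only, hypothetical name): the
 * four-way average uses round4's "+2" bias before the divide by 4, and the
 * word-sized intermediate (at most 4 * 255 + 2 = 1022) cannot overflow. */
static __inline__ void ScalarAverage4( u8 *dst, u8 *src1, u8 *src2,
                                       u8 *src3, u8 *src4 )
{
    int i;
    for( i = 0; i < 8; i++ )
    {
        dst[i] = (u8)((src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2);
    }
}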
static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);      // +2 for round-to-nearest
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // +2 for round-to-nearest
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);      // +1 for round-to-nearest
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);      // +1 for round-to-nearest
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
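/* Scalar sketch of the above (illustrative only, hypothetical name): the
 * rounded four-way average is formed first, then averaged with the existing
 * prediction in dst; MMXInterpAverage2 is the two-source analogue of the
 * same two-stage rounding. */
static __inline__ void ScalarInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                             u8 *src3, u8 *src4 )
{
    int i;
    for( i = 0; i < 8; i++ )
    {
        int i_interp = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
        dst[i] = (u8)((dst[i] + i_interp + 1) >> 1);
    }
}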
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/

#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest);
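/* Note (an aside, not from the original file): pavgusb is the 3DNow!
 * single-instruction unsigned byte average with the same (a + b + 1) >> 1
 * rounding as MMXAverage2, so on CPUs that have it a rounded average
 * collapses to something like:
 *
 *     movq_m2r( *p_src, mm0 );
 *     pavg_m2r( *(p_src + 1), mm0 );
 *     movq_r2m( mm0, *p_dest );
 */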
#define __MotionComponent_x_y_copy(width,height)                           \
void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        movq_m2r( *p_src, mm0 );        /* load 8 ref bytes */             \
        if( width == 16 )                                                  \
            movq_m2r( *(p_src + 8), mm1 );                                 \
        p_src += i_stride;                                                 \
        movq_r2m( mm0, *p_dest );       /* store 8 bytes at curr */        \
        if( width == 16 )                                                  \
            movq_r2m( mm1, *(p_dest + 8) );                                \
        p_dest += i_stride;                                                \
    }                                                                      \
}
#define __MotionComponent_X_y_copy(width,height)                           \
void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage2() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage2( p_dest, p_src, p_src + 1 );                           \
        if( width == 16 )                                                  \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );               \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
    }                                                                      \
}
#define __MotionComponent_x_Y_copy(width,height)                           \
void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage2() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage2( p_dest, p_src, p_next_src );                          \
        if( width == 16 )                                                  \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );          \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponent_X_Y_copy(width,height)                           \
void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage4() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 );\
        if( width == 16 )                                                  \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                 \
                         p_next_src + 8, p_next_src + 9 );                 \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponent_x_y_avg(width,height)                            \
void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage2() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage2( p_dest, p_dest, p_src );                              \
        if( width == 16 )                                                  \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );              \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
    }                                                                      \
}
#define __MotionComponent_X_y_avg(width,height)                            \
void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    MMXZeroReg();                   /* mm0 = 0 for MMXInterpAverage2() */  \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                     \
        if( width == 16 )                                                  \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );         \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
    }                                                                      \
}
#define __MotionComponent_x_Y_avg(width,height)                            \
void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                   /* mm0 = 0 for MMXInterpAverage2() */  \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXInterpAverage2( p_dest, p_src, p_next_src );                    \
        if( width == 16 )                                                  \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );    \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponent_X_Y_avg(width,height)                            \
void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                   /* mm0 = 0 for MMXInterpAverage4() */  \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,           \
                           p_next_src + 1 );                               \
        if( width == 16 )                                                  \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,           \
                               p_next_src + 8, p_next_src + 9 );           \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponents(width,height)        \
__MotionComponent_x_y_copy(width,height)        \
__MotionComponent_X_y_copy(width,height)        \
__MotionComponent_x_Y_copy(width,height)        \
__MotionComponent_X_Y_copy(width,height)        \
__MotionComponent_x_y_avg(width,height)         \
__MotionComponent_X_y_avg(width,height)         \
__MotionComponent_x_Y_avg(width,height)         \
__MotionComponent_X_Y_avg(width,height)
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */

__MotionComponents (8,16)       /* 422 */
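/* Usage sketch (illustrative, not in the original file): each expansion of
 * __MotionComponents() above defines eight functions; for instance
 * __MotionComponents(16,16) provides _M(MotionComponent_X_Y_avg_16_16),
 * which a caller in the motion compensation driver might invoke as below
 * (p_ref, p_cur and i_offset are hypothetical names). Remember that MMX
 * code must execute emms() before any floating point code runs again.
 *
 *     _M(MotionComponent_X_Y_avg_16_16)( p_ref + i_offset,
 *                                        p_cur + i_offset,
 *                                        i_stride );
 */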