/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <sys/types.h>                        /* on BSD, uio.h needs types.h */
#include <sys/uio.h>                          /* for input.h */

#include "common.h"                           /* u8 */

#include "stream_control.h"
#include "input_ext-dec.h"

#include "video.h"                            /* yuv_data_t */
#include "video_output.h"

#include "vdec_idct.h"
#include "video_decoder.h"
#include "vdec_motion.h"

#include "vpar_blocks.h"
#include "vpar_headers.h"
#include "vpar_synchro.h"
#include "video_parser.h"
#include "video_fifo.h"

#include "mmx.h"                              /* mmx_t, movq_m2r() and friends */
/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */

/* Rounding constants: four packed 16-bit words, added before the
 * arithmetic right shift so that the divisions round to nearest
 * (+1 before >>1, +2 before >>2) */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
static __inline__ void MMXZeroReg()
{
    /* load 0 into mm0 */
    pxor_r2r(mm0,mm0);
}
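/* All helpers below assume mm0 has been zeroed with MMXZeroReg():
 * punpcklbw/punpckhbw against mm0 widen the unsigned bytes to 16-bit
 * words, leaving enough headroom for the rounded additions before the
 * results are packed back down. */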
static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
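/* The intermediate sums cannot overflow: 255 + 255 + 1 = 511 fits
 * comfortably in a signed 16-bit word, and packuswb saturates the final
 * words back into the u8 range. */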
static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
    //

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows
    paddw_m2r(round1,mm3);      // round
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs
    paddw_m2r(round1,mm4);      // round
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
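/* The two successive round-and-shift stages match the commented formula:
 * first the half-pel average of src1 and src2, then the average with the
 * existing prediction in *dst, each stage rounding to nearest (upward on
 * ties), as MPEG-2 integer division requires. */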
static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + *src3 + *src4 + 2)/4);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
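/* Worst case before the shift is 4 * 255 + 2 = 1022, still far below the
 * signed 16-bit limit, so the word-wise additions cannot wrap. */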
static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/
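/* Naming convention: MotionComponent_<h>_<v>_<mode>_<width>_<height>.
 * A lowercase x or y means the motion vector is full-pel in that
 * dimension; an uppercase X or Y means half-pel, i.e. the prediction is
 * interpolated between two neighbouring pels. "copy" overwrites the
 * destination block, while "avg" averages the new prediction with the
 * data already in place (used e.g. for the second half of a
 * bidirectional prediction). */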
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm1 );       /* load 8 ref bytes */               \
        movq_r2m( mm1, *p_dest );      /* store 8 bytes at curr */          \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );   /* load 8 ref bytes */         \
            movq_r2m( mm1, *(p_dest + 8) );  /* store 8 bytes at curr */    \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
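/* width is a compile-time constant in every instantiation, so the
 * if( width == 16 ) test above is folded away by the compiler: the
 * 8-pel-wide versions contain no branch at all. The same trick is used
 * in all of the macros below. */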
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_next_src );                           \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );           \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 );\
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_next_src + 8, p_next_src + 9 );                  \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_x, i_y;                                                           \
    unsigned int i_dummy;                                                   \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        for( i_x = 0; i_x < width; i_x++ )                                  \
        {                                                                   \
            i_dummy =                                                       \
                p_dest[i_x] + ((unsigned int)(p_src[i_x]                    \
                                              + p_src[i_x + i_stride]       \
                                              + 1) >> 1);                   \
            p_dest[i_x] = (i_dummy + 1) >> 1;                               \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
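/* Unlike its siblings, this variant is written in plain C. No explicit
 * saturation is needed: with 8-bit inputs, the half-pel average is at
 * most 255, so the final (255 + 255 + 1) >> 1 = 255 always fits back
 * into a u8. */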
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,            \
                           p_next_src + 1 );                                \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_next_src + 8, p_next_src + 9 );            \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponents(width,height)                                    \
__MotionComponent_x_y_copy(width,height)                                    \
__MotionComponent_X_y_copy(width,height)                                    \
__MotionComponent_x_Y_copy(width,height)                                    \
__MotionComponent_X_Y_copy(width,height)                                    \
__MotionComponent_x_y_avg(width,height)                                     \
__MotionComponent_X_y_avg(width,height)                                     \
__MotionComponent_x_Y_avg(width,height)                                     \
__MotionComponent_X_Y_avg(width,height)
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */

__MotionComponents (8,16)       /* 422 */
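/* Hypothetical usage from a motion compensation dispatcher (pointer
 * names are illustrative only): copy the forward full-pel prediction,
 * then average in a backward prediction that is half-pel in both
 * dimensions:
 *
 *     MotionComponent_x_y_copy_16_16( p_fwd_ref, p_dest, i_stride );
 *     MotionComponent_X_Y_avg_16_16( p_bwd_ref, p_dest, i_stride );
 */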