1 /*****************************************************************************
2 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
4 *****************************************************************************
5 * Copyright (C) 1999, 2000 VideoLAN
6 * $Id: vdec_motion_inner_mmx.c,v 1.8 2001/01/16 17:59:23 massiot Exp $
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9 * work done by the livid project <http://www.linuxvideo.org/>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 *****************************************************************************/
26 /*****************************************************************************
28 *****************************************************************************/
39 #include "stream_control.h"
40 #include "input_ext-dec.h"
43 #include "video_output.h"
45 #include "vdec_idct.h"
46 #include "video_decoder.h"
47 #include "vdec_motion.h"
49 #include "vpar_blocks.h"
50 #include "vpar_headers.h"
51 #include "vpar_synchro.h"
52 #include "video_parser.h"
53 #include "video_fifo.h"
55 #include "attributes.h"
58 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
60 /* Some rounding constants */
/* Four packed 16-bit words of 1: added before a "psraw 1" so that x/2
 * rounds half-values upward (see MMXAverage2 / MMXInterpAverage2). */
mmx_t round1 = {0x0001000100010001LL};
/* Four packed 16-bit words of 2: added before a "psraw 2" so that x/4
 * rounds correctly (see MMXAverage4 / MMXInterpAverage4). */
mmx_t round4 = {0x0002000200020002LL};
68 static __inline__ void MMXZeroReg()
74 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
77 // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
80 movq_m2r(*src1,mm1); // load 8 src1 bytes
81 movq_r2r(mm1,mm2); // copy 8 src1 bytes
83 movq_m2r(*src2,mm3); // load 8 src2 bytes
84 movq_r2r(mm3,mm4); // copy 8 src2 bytes
86 punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes
87 punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes
89 punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes
90 punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes
92 paddw_r2r(mm3,mm1); // add lows to mm1
93 paddw_m2r(round1,mm1);
94 psraw_i2r(1,mm1); // /2
96 paddw_r2r(mm4,mm2); // add highs to mm2
97 paddw_m2r(round1,mm2);
98 psraw_i2r(1,mm2); // /2
100 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
101 movq_r2m(mm1,*dst); // store result in dst
104 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
107 // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
110 movq_m2r(*dst,mm1); // load 8 dst bytes
111 movq_r2r(mm1,mm2); // copy 8 dst bytes
113 movq_m2r(*src1,mm3); // load 8 src1 bytes
114 movq_r2r(mm3,mm4); // copy 8 src1 bytes
116 movq_m2r(*src2,mm5); // load 8 src2 bytes
117 movq_r2r(mm5,mm6); // copy 8 src2 bytes
119 punpcklbw_r2r(mm0,mm1); // unpack low dst bytes
120 punpckhbw_r2r(mm0,mm2); // unpack high dst bytes
122 punpcklbw_r2r(mm0,mm3); // unpack low src1 bytes
123 punpckhbw_r2r(mm0,mm4); // unpack high src1 bytes
125 punpcklbw_r2r(mm0,mm5); // unpack low src2 bytes
126 punpckhbw_r2r(mm0,mm6); // unpack high src2 bytes
128 paddw_r2r(mm5,mm3); // add lows
129 paddw_m2r(round1,mm3);
130 psraw_i2r(1,mm3); // /2
132 paddw_r2r(mm6,mm4); // add highs
133 paddw_m2r(round1,mm4);
134 psraw_i2r(1,mm4); // /2
136 paddw_r2r(mm3,mm1); // add lows
137 paddw_m2r(round1,mm1);
138 psraw_i2r(1,mm1); // /2
140 paddw_r2r(mm4,mm2); // add highs
141 paddw_m2r(round1,mm2);
142 psraw_i2r(1,mm2); // /2
144 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
145 movq_r2m(mm1,*dst); // store result in dst
148 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
152 // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
155 movq_m2r(*src1,mm1); // load 8 src1 bytes
156 movq_r2r(mm1,mm2); // copy 8 src1 bytes
158 punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes
159 punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes
161 movq_m2r(*src2,mm3); // load 8 src2 bytes
162 movq_r2r(mm3,mm4); // copy 8 src2 bytes
164 punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes
165 punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes
167 paddw_r2r(mm3,mm1); // add lows
168 paddw_r2r(mm4,mm2); // add highs
170 // now have partials in mm1 and mm2
172 movq_m2r(*src3,mm3); // load 8 src3 bytes
173 movq_r2r(mm3,mm4); // copy 8 src3 bytes
175 punpcklbw_r2r(mm0,mm3); // unpack low src3 bytes
176 punpckhbw_r2r(mm0,mm4); // unpack high src3 bytes
178 paddw_r2r(mm3,mm1); // add lows
179 paddw_r2r(mm4,mm2); // add highs
181 movq_m2r(*src4,mm5); // load 8 src4 bytes
182 movq_r2r(mm5,mm6); // copy 8 src4 bytes
184 punpcklbw_r2r(mm0,mm5); // unpack low src4 bytes
185 punpckhbw_r2r(mm0,mm6); // unpack high src4 bytes
187 paddw_r2r(mm5,mm1); // add lows
188 paddw_r2r(mm6,mm2); // add highs
190 // now have subtotal in mm1 and mm2
192 paddw_m2r(round4,mm1);
193 psraw_i2r(2,mm1); // /4
194 paddw_m2r(round4,mm2);
195 psraw_i2r(2,mm2); // /4
197 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
198 movq_r2m(mm1,*dst); // store result in dst
201 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
205 // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
208 movq_m2r(*src1,mm1); // load 8 src1 bytes
209 movq_r2r(mm1,mm2); // copy 8 src1 bytes
211 punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes
212 punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes
214 movq_m2r(*src2,mm3); // load 8 src2 bytes
215 movq_r2r(mm3,mm4); // copy 8 src2 bytes
217 punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes
218 punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes
220 paddw_r2r(mm3,mm1); // add lows
221 paddw_r2r(mm4,mm2); // add highs
223 // now have partials in mm1 and mm2
225 movq_m2r(*src3,mm3); // load 8 src3 bytes
226 movq_r2r(mm3,mm4); // copy 8 src3 bytes
228 punpcklbw_r2r(mm0,mm3); // unpack low src3 bytes
229 punpckhbw_r2r(mm0,mm4); // unpack high src3 bytes
231 paddw_r2r(mm3,mm1); // add lows
232 paddw_r2r(mm4,mm2); // add highs
234 movq_m2r(*src4,mm5); // load 8 src4 bytes
235 movq_r2r(mm5,mm6); // copy 8 src4 bytes
237 punpcklbw_r2r(mm0,mm5); // unpack low src4 bytes
238 punpckhbw_r2r(mm0,mm6); // unpack high src4 bytes
240 paddw_r2r(mm5,mm1); // add lows
241 paddw_r2r(mm6,mm2); // add highs
243 paddw_m2r(round4,mm1);
244 psraw_i2r(2,mm1); // /4
245 paddw_m2r(round4,mm2);
246 psraw_i2r(2,mm2); // /4
248 // now have subtotal/4 in mm1 and mm2
250 movq_m2r(*dst,mm3); // load 8 dst bytes
251 movq_r2r(mm3,mm4); // copy 8 dst bytes
253 punpcklbw_r2r(mm0,mm3); // unpack low dst bytes
254 punpckhbw_r2r(mm0,mm4); // unpack high dst bytes
256 paddw_r2r(mm3,mm1); // add lows
257 paddw_r2r(mm4,mm2); // add highs
259 paddw_m2r(round1,mm1);
260 psraw_i2r(1,mm1); // /2
261 paddw_m2r(round1,mm2);
262 psraw_i2r(1,mm2); // /2
264 // now have end value in mm1 and mm2
266 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
267 movq_r2m(mm1,*dst); // store result in dst
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/
/* Aliases mapping the generic pavg_* names to the 3DNow! unsigned byte
 * average instruction (pavgusb: (a + b + 1) / 2 on packed bytes).
 * NOTE(review): these aliases are not referenced by the MMX routines
 * visible in this file -- presumably boilerplate shared with the 3DNow!
 * variant of these routines; confirm before removing. */
#define pavg_r2r(src,dest) pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest) pavgusb_m2r (src, dest);
/* MotionComponent_x_y_copy_<w>_<h>: copy a width x height reference block
 * from p_src to p_dest, no half-pel interpolation ("x_y" = integer-pel in
 * both directions).  Both areas use the same line stride i_stride. */
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );        /* load 8 ref bytes */              \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );                                  \
        }                                                                   \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );       /* store 8 bytes at curr */         \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_r2m( mm1, *(p_dest + 8) );                                 \
        }                                                                   \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
/* MotionComponent_X_y_copy_<w>_<h>: copy with horizontal half-pel
 * interpolation ("X") -- each output byte is the rounded average of two
 * horizontally adjacent reference bytes. */
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
/* MotionComponent_x_Y_copy_<w>_<h>: copy with vertical half-pel
 * interpolation ("Y") -- averages each byte with the byte one line below
 * (p_next_src tracks p_src + i_stride). */
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_next_src );                           \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );           \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* MotionComponent_X_Y_copy_<w>_<h>: copy with half-pel interpolation in
 * both directions -- four-tap average of the 2x2 neighbourhood (current
 * byte, right neighbour, and the same pair one line below). */
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src,                  \
                     p_next_src + 1 );                                      \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_next_src + 8, p_next_src + 9 );                  \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* MotionComponent_x_y_avg_<w>_<h>: integer-pel prediction averaged into
 * the existing destination ("avg" = bidirectional blend):
 * dest = (dest + src + 1) / 2. */
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
/* MotionComponent_X_y_avg_<w>_<h>: horizontal half-pel prediction
 * averaged into the existing destination. */
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
/* MotionComponent_x_Y_avg_<w>_<h>: vertical half-pel prediction averaged
 * into the existing destination (p_next_src tracks p_src + i_stride). */
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_next_src );                     \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );     \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* MotionComponent_X_Y_avg_<w>_<h>: half-pel prediction in both directions
 * (four-tap 2x2 average), averaged into the existing destination. */
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,            \
                           p_next_src + 1 );                                \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_next_src + 8, p_next_src + 9 );            \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* Expand all eight copy/avg x half-pel variants for one block size:
 * {integer, horizontal, vertical, both} half-pel x {copy, avg}. */
#define __MotionComponents(width,height)        \
__MotionComponent_x_y_copy(width,height)        \
__MotionComponent_X_y_copy(width,height)        \
__MotionComponent_x_Y_copy(width,height)        \
__MotionComponent_X_Y_copy(width,height)        \
__MotionComponent_x_y_avg(width,height)         \
__MotionComponent_X_y_avg(width,height)         \
__MotionComponent_x_Y_avg(width,height)         \
__MotionComponent_X_Y_avg(width,height)
/* Instantiate the motion-compensation routines for every block size
 * required by the supported chroma formats (4:4:4, 4:2:2, 4:2:0). */
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */
__MotionComponents (8,16)       /* 422 */