git.sesse.net Git - x264/blob - common/i386/mc-c.c

   1 /*****************************************************************************
   2  * mc.c: h264 encoder library (Motion Compensation)
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #ifdef HAVE_STDINT_H
  25 #include <stdint.h>
  26 #else
  27 #include <inttypes.h>
  28 #endif
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #include <stdarg.h>
  33
  34 #include "x264.h"   /* DECLARE_ALIGNED */
  35 #include "common/pixel.h"
  36 #include "common/mc.h"
  37 #include "common/clip1.h"
  38 #include "mc.h"
  39
/* NASM functions
 *
 * Prototypes for routines that are not defined in this file.  Argument
 * roles, as far as the callers in this file show them:
 *   x264_pixel_avg_wN_*        ( dst, dst stride, src1, src1 stride,
 *                                src2, src2 stride, height )
 *   x264_pixel_avg_weight_wN_* ( dst, dst stride, src, src stride,
 *                                dst weight, height )
 * x264_pixel_avg_weight_4x4_mmxext has no height argument.
 * The x264_mc_copy_wN_* routines are not called in this part of the file;
 * presumably their arguments mirror pf_mc_t -- TODO confirm against the
 * assembly source. */
extern void x264_pixel_avg_w4_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w8_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
  52
  53 #define AVG(W,H) \
  54 static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
  55 { \
  56     x264_pixel_avg_w ## W ## _mmxext( dst, i_dst, dst, i_dst, src, i_src, H ); \
  57 }
  58 AVG(16,16)
  59 AVG(16,8)
  60 AVG(8,16)
  61 AVG(8,8)
  62 AVG(8,4)
  63 AVG(4,8)
  64 AVG(4,4)
  65 AVG(4,2)
  66
  67 #define AVG_WEIGHT(W,H) \
  68 void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
  69 { \
  70     x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \
  71 }
  72 AVG_WEIGHT(16,16)
  73 AVG_WEIGHT(16,8)
  74 AVG_WEIGHT(8,16)
  75 AVG_WEIGHT(8,8)
  76 AVG_WEIGHT(8,4)
  77
  78 #if 0
  79
/* USED_UINT64( foo ) starts the definition of a static 64-bit constant whose
 * assembler-level name is forced to "foo" (via __asm__), so that inline asm
 * text can refer to it by name (MMX_INIT pastes such a name into a movq).
 * GCC >= 3.3 gets attribute "used"; older compilers get "unused". */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif

/* Four 16-bit words of 0x0010 (16): loaded into mm6 by the filters below and
 * added by MMX_FILTERTAP_P3 before its >>5, i.e. the rounding term. */
USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;
  89
  90
/* Clear MMX register MMZ by xor-ing it with itself.
 * MMZ is pasted into the asm text, so callers write it as e.g. %%mm7. */
#define MMX_ZERO( MMZ ) \
    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
  93
/* Load the 64-bit memory operand NAME into MMX register MMV.
 * NAME is stringified into the asm text unchanged, so it has to be a
 * symbol name the assembler can resolve (see USED_UINT64). */
#define MMX_INIT( MMV, NAME ) \
    asm volatile( "movq " #NAME ", " #MMV "\n" :: )
  96
/* Store 4 pixels: pack the 16-bit words of MMP to bytes (packuswb, unsigned
 * saturation; MMZ supplies the upper half of the packed result) and write
 * the low 4 bytes of MMP to dst.  MMP is overwritten by the packed value.
 * NOTE(review): the store is not described to the compiler (no output
 * operand, no "memory" clobber) -- relies on asm volatile ordering. */
#define MMX_SAVE_4P( MMP, MMZ, dst ) \
    asm volatile( "packuswb " #MMZ  "," #MMP "\n" \
                  "movd " #MMP ", (%0)" :: "r"(dst) )
 100
/* Load 4 pixels: read 4 bytes at pix into MMP and interleave them with the
 * bytes of MMZ (punpcklbw).  With MMZ == 0 this widens the 4 bytes to four
 * 16-bit words. */
#define MMX_LOAD_4P( MMP, MMZ, pix ) \
    asm volatile( "movd (%0), " #MMP "\n" \
                  "punpcklbw  " #MMZ ", " #MMP "\n" : : "r"(pix) )
 104
/* Load a 4x4 block: rows 0..3 of pix (row stride i_pix) go to MMP1..MMP4,
 * each via MMX_LOAD_4P.  Expands to four statements; the last one has no
 * trailing semicolon. */
#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
    MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
    MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )
 110
/* Load two 4-pixel rows: rows 0 and 1 of pix (row stride i_pix) go to MMP1
 * and MMP2, each via MMX_LOAD_4P. */
#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )
 114
/* Store 8 pixels: pack the words of MMP1 (low 4 bytes) and MMP2 (high 4
 * bytes) with unsigned saturation into MMP1 and write the 8 bytes to dst.
 * MMP1 is overwritten; the MMZ argument is not used by the expansion.
 * NOTE(review): the store is not described to the compiler (no "memory"
 * clobber). */
#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
    asm volatile( "packuswb " #MMP2  "," #MMP1 "\n" \
                  "movq " #MMP1 ", (%0)\n" :: "r"(dst) )
 118
 119
/* Load 8 pixels: read 8 bytes at pix, then interleave with MMZ so that
 * MMP1 receives bytes 0..3 and MMP2 bytes 4..7.  With MMZ == 0 both hold
 * four 16-bit words. */
#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
    asm volatile( "movq         (%0)   , " #MMP1 "\n" \
                  "movq       " #MMP1 ", " #MMP2 "\n" \
                  "punpcklbw  " #MMZ  ", " #MMP1 "\n" \
                  "punpckhbw  " #MMZ  ", " #MMP2 "\n" : : "r"(pix) )
 125
/* Load two 8-pixel rows: row 0 of pix goes to MMP1 (low half) / MMP2 (high
 * half), row 1 (pix + i_pix) to MMP3 / MMP4, each via MMX_LOAD_8P. */
#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
    MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )
 129
/* Word butterfly: t = a; a = low words of a and b interleaved;
 * t = high words of (old) a and b interleaved.  b is left unchanged. */
#define SBUTTERFLYwd(a,b,t )\
    asm volatile( "movq " #a ", " #t "        \n\t" \
                  "punpcklwd " #b ", " #a "   \n\t" \
                  "punpckhwd " #b ", " #t "   \n\t" :: )
 134
/* Doubleword butterfly: t = a; a = low dwords of a and b interleaved;
 * t = high dwords of (old) a and b interleaved.  b is left unchanged. */
#define SBUTTERFLYdq(a,b,t )\
    asm volatile( "movq " #a ", " #t "        \n\t" \
                  "punpckldq " #b ", " #a "   \n\t" \
                  "punpckhdq " #b ", " #t "   \n\t" :: )
 139
/* input ABCD output ADTC  ( or 0?31-2->0123 )
 *
 * 4x4 transpose of 16-bit words built from two word and two dword
 * butterflies.  Inputs are the rows in MMA, MMB, MMC, MMD; afterwards the
 * transposed rows 0..3 are found in MMA, MMD, MMT, MMC (in that order) and
 * MMB holds an intermediate value. */
#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
        SBUTTERFLYwd( MMA, MMB, MMT ); \
        SBUTTERFLYwd( MMC, MMD, MMB ); \
        SBUTTERFLYdq( MMA, MMC, MMD ); \
        SBUTTERFLYdq( MMT, MMB, MMC )
 146
/* first pass MM0 = MM0 -5*MM1
 *
 * Implemented as MMP0 -= MMP1; MMP1 <<= 2; MMP0 -= MMP1, so MMP1 is left
 * holding four times its original value.  All arithmetic is on 16-bit
 * words. */
#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
                  "psllw      $2,     " #MMP1 "\n" \
                  "psubw    " #MMP1 "," #MMP0 "\n" :: )
 152                                                    \
/* second pass MM0 = MM0 + 20*(MM2+MM3)
 *
 * MMP2 += MMP3, then MMP0 += 4*MMP2 and MMP0 += 16*MMP2 (two shifts by 2).
 * MMP2 is destroyed (ends up as 16 times the sum); MMP3 is unchanged. */
#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
    asm volatile( "paddw    " #MMP3 "," #MMP2 "\n" \
                                                 \
                  "psllw      $2,     " #MMP2 "\n" \
                  "paddw    " #MMP2 "," #MMP0 "\n" \
                  "psllw      $2,     " #MMP2 "\n" \
                  "paddw    " #MMP2 "," #MMP0 "\n" :: )
 161
/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5
 *
 * The shift is arithmetic (psraw), so negative intermediate words stay
 * negative.  MMP1 is left multiplied by 4; MMP2 and MMV are unchanged.
 * The MMZ argument is not used by the expansion. */
#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
                  "psllw      $2,     " #MMP1 "\n" \
                  "psubw    " #MMP1 "," #MMP0 "\n" \
                                                   \
                  "paddw    " #MMP2 "," #MMP0 "\n" \
                  "paddw    " #MMV  "," #MMP0 "\n" \
                  "psraw      $5,     " #MMP0 "\n" :: )
 171
/* Two interleaved copies of the first pass:
 *   MMP0 = MMP0 - 5*MMP1   and   MMP2 = MMP2 - 5*MMP3.
 * MMP1 and MMP3 are left multiplied by 4. */
#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
                  "psubw    " #MMP3 "," #MMP2 "\n" \
                  "psllw      $2,     " #MMP1 "\n" \
                  "psllw      $2,     " #MMP3 "\n" \
                  "psubw    " #MMP1 "," #MMP0 "\n" \
                  "psubw    " #MMP3 "," #MMP2 "\n" :: )
 179
/* second pass MM0 = MM0 + 20*(MM1+MM2)
 *
 * Two interleaved copies of the second pass:
 *   MMP0 += 20*(MMP1+MMP2)   and   MMP3 += 20*(MMP4+MMP5).
 * MMP1 and MMP4 are destroyed (16 times the respective sum); MMP2 and
 * MMP5 are unchanged. */
#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
    asm volatile( "paddw    " #MMP2 "," #MMP1 "\n" \
                  "paddw    " #MMP5 "," #MMP4 "\n" \
                                                 \
                  "psllw      $2,     " #MMP1 "\n" \
                  "psllw      $2,     " #MMP4 "\n" \
                  "paddw    " #MMP1 "," #MMP0 "\n" \
                  "paddw    " #MMP4 "," #MMP3 "\n" \
                  "psllw      $2,     " #MMP1 "\n" \
                  "psllw      $2,     " #MMP4 "\n" \
                  "paddw    " #MMP1 "," #MMP0 "\n" \
                  "paddw    " #MMP4 "," #MMP3 "\n" :: )
 193
/* Load one 8-byte row from the address `dst` (despite its name, this is the
 * address being read) into m1.  The expansion already ends in a semicolon,
 * and the trailing backslash continues the macro onto the following line,
 * which must therefore stay blank. */
#define MMX_LOAD_1r( m1, dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \
 196
/* Store the 8 bytes of m1 to dst.  The expansion already ends in a
 * semicolon, and the trailing backslash continues the macro onto the
 * following line, which must therefore stay blank. */
#define MMX_SAVE_1r( m1, dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \
 199
/* Load two 8-byte rows into m1 and m2.  `dst` is the base address being
 * read; i_dst is the distance between rows in bytes (the base is cast to
 * uint8_t* before indexing). */
#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )
 203
/* Store m1 and m2 as two 8-byte rows at dst and dst + i_dst (byte offsets;
 * dst is cast to uint8_t* before indexing). */
#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )
 207
/* Store m1..m4 as four 8-byte rows at dst + k*i_dst for k = 0..3 (byte
 * offsets; dst is cast to uint8_t* before indexing). */
#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
    asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
    asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )
 213
/* Load four 8-byte rows into m1..m4 from dst + k*i_dst for k = 0..3.
 * `dst` is the base address being read (byte offsets; it is cast to
 * uint8_t* before indexing). */
#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )
 219
 220
 221 static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
 222 {
 223     return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];
 224 }
 225 static inline int x264_tapfilter1( uint8_t *pix )
 226 {
 227     return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];
 228 }
 229
/* Pointer to a motion-compensation routine taking
 * ( source, source stride, destination, destination stride, height ). */
typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
 231
 232 /* Macro to define NxM functions */
 233 /* mc I+H */
 234 #define MC_IH( name, cpu, width, height, off )  \
 235 static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
 236 {                                                               \
 237     DECLARE_ALIGNED( uint8_t, tmp[width*height], width );       \
 238                                                                 \
 239     mc_hh_w##width( src, i_src_stride, tmp, width, i_height );  \
 240     x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride,         \
 241                                      src+(off), i_src_stride,   \
 242                                      tmp, width, i_height );    \
 243 }
 244
 245 /* mc I+V */
 246 #define MC_IV( name, cpu, width, height, off )  \
 247 static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
 248 {                                                               \
 249     DECLARE_ALIGNED( uint8_t, tmp[width*height], width );       \
 250                                                                 \
 251     mc_hv_w##width( src, i_src_stride, tmp, width, i_height );  \
 252     x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride,         \
 253                                      src+(off), i_src_stride,   \
 254                                      tmp, width, i_height );    \
 255 }
 256
 257 /* mc H+V */
 258 #define MC_HV( name, cpu, width, height, off1, off2 ) \
 259 static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
 260 {                                                               \
 261     DECLARE_ALIGNED( uint8_t, tmp1[width*height], width );      \
 262     DECLARE_ALIGNED( uint8_t, tmp2[width*height], width );      \
 263                                                                 \
 264     mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height );  \
 265     mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height );  \
 266     x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride,         \
 267                                      tmp1, width, tmp2, width,  \
 268                                      i_height );                \
 269 }
 270
 271 /* mc C+H */
 272 #define MC_CH( name, cpu, width, height, off ) \
 273 static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
 274 {                                                               \
 275     DECLARE_ALIGNED( uint8_t, tmp1[width*height], width );      \
 276     DECLARE_ALIGNED( uint8_t, tmp2[width*height], width );      \
 277                                                                 \
 278     mc_hc_w##width( src,       i_src_stride, tmp1, width, i_height );  \
 279     mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height );  \
 280     x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride,         \
 281                                      tmp1, width, tmp2, width,  \
 282                                      i_height );                \
 283 }
 284
 285 /* mc C+V */
 286 #define MC_CV( name, cpu, width, height, off ) \
 287 static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
 288 {                                                               \
 289     DECLARE_ALIGNED( uint8_t, tmp1[width*height], width );      \
 290     DECLARE_ALIGNED( uint8_t, tmp2[width*height], width );      \
 291                                                                 \
 292     mc_hc_w##width( src,       i_src_stride, tmp1, width, i_height );  \
 293     mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height );  \
 294     x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride,         \
 295                                      tmp1, width, tmp2, width,  \
 296                                      i_height );                \
 297 }
 298
 299
 300 /*****************************************************************************
 301  * MC with width == 4 (height <= 8)
 302  *****************************************************************************/
 303
/* Horizontal 6-tap filter for a 4-pixel-wide block (MMX inline asm).
 *
 * Every output pixel is
 *   ( s[-2] - 5*s[-1] + 20*s[0] + 20*s[1] - 5*s[2] + s[3] + 16 ) >> 5
 * taken along the row, and saturated to 0..255 by the final packuswb.
 *
 * src, i_src : source pixels and row stride.  Bytes at columns -2..+9 of
 *              each processed row are read.
 * dst, i_dst : destination and row stride; 4 bytes are written per row.
 * i_height   : rows are handled in groups of four (i_height / 4 groups), so
 *              the i_height % 4 remaining rows, if any, are not written.
 *
 * Method: four rows are widened to 16-bit words and transposed so that each
 * MMX register holds one source column for all four rows; the 6-tap filter
 * then runs as plain word arithmetic across those "column" registers, and
 * the four results are transposed back before being stored.
 *
 * NOTE(review): register contents (mm6, mm7 and the working registers) are
 * carried between separate asm statements without being declared to the
 * compiler, and no emms is issued in this function. */
static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    const int h4 = i_height / 4;    /* number of complete 4-row groups */
    uint8_t  srct[4*8*3];           /* transposed source: 12 entries of 8 bytes (four words each) */
    uint64_t tmp[4];                /* filtered result, one entry per output column */
    int y;

    src -= 2;                       /* the filter window starts two pixels to the left */

    MMX_ZERO( %%mm7 );              /* mm7 = 0: zero extension on load, upper half on pack */
    MMX_INIT( %%mm6, x264_w0x10 );  /* mm6 = rounding term, 16 in every word */

    for( y = 0; y < h4; y++ )
    {
        int i;

        /* Preload data and transpose them */
        /* columns 0..3 (relative to the adjusted src) -> srct entries 0..3 */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );

        /* columns 4..7 -> srct entries 4..7 */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );

        /* this load fetches more bytes than needed: columns 8..11 are read,
         * but only entries 8 and 9 are kept */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
        MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );

        /* tap filter */
        /* output column i uses srct entries i .. i+5 */
        for( i = 0; i < 4; i++ )
        {
            MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
            MMX_FILTERTAP_P1( %%mm0, %%mm1 );           /* e[i] - 5*e[i+1] */
            MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );    /* + 20*(e[i+2] + e[i+3]) */

            MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
            MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );  /* - 5*e[i+4] + e[i+5], round, >>5 */

            MMX_SAVE_1r( %%mm0, &tmp[i] );
        }

        /* transpose the four result columns back into rows and store 4 pixels per row */
        MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
        MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
        MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
        MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
        MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );

        src += 4 * i_src;
        dst += 4 * i_dst;
    }
}
/* Vertical 6-tap filter for a 4-pixel-wide block (MMX inline asm).
 *
 * Every output pixel is
 *   ( r[-2] - 5*r[-1] + 20*r[0] + 20*r[1] - 5*r[2] + r[3] + 16 ) >> 5
 * where r[k] is the pixel k rows below the current one in the same column;
 * the result is saturated to 0..255 when packed.
 *
 * src, i_src : source pixels and row stride; rows -2..+3 around each output
 *              row are read, 4 bytes per row.
 * dst, i_dst : destination and row stride; 4 bytes are written per row.
 * i_height   : number of output rows.
 *
 * NOTE(review): MMX register contents are carried between separate asm
 * statements without being declared to the compiler, and no emms is issued
 * in this function. */
static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    int y;

    src -= 2 * i_src;               /* the filter window starts two rows above */

    MMX_ZERO( %%mm7 );              /* mm7 = 0: zero extension on load, upper half on pack */
    MMX_INIT( %%mm6, x264_w0x10 );  /* mm6 = rounding term, 16 in every word */

    for( y = 0; y < i_height; y++ )
    {
        /* window rows 0..3 -> mm0..mm3 */
        MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );
        MMX_FILTERTAP_P1( %%mm0, %%mm1 );           /* row0 - 5*row1 */
        MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );    /* + 20*(row2 + row3) */

        /* window rows 4 and 5 -> mm4, mm5 */
        MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );
        MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );  /* - 5*row4 + row5, round, >>5 */
        MMX_SAVE_4P( %%mm0, %%mm7, dst );

        src += i_src;
        dst += i_dst;
    }
}
 381
 382 static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 383 {
 384     int i, x, y;
 385
 386     for( y = 0; y < i_height; y++ )
 387     {
 388         int16_t tap[5+4];
 389
 390         for( i = 0; i < 5+4; i++ )
 391         {
 392             tap[i] = x264_tapfilter( &src[-2+i], i_src_stride );
 393         }
 394
 395         for( x = 0; x < 4; x++ )
 396         {
 397             dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
 398         }
 399
 400         src += i_src_stride;
 401         dst += i_dst_stride;
 402     }
 403 }
 404
/* Width-4 instances.  Each line defines a static function named
 * <name>_w4_mmxext whose scratch buffers hold 4x8 bytes; the trailing
 * argument(s) are the source offsets used by the corresponding MC_* macro.
 * NOTE(review): the two digits in mc_xyNM presumably give the quarter-pel
 * x and y position -- confirm against the code that selects these
 * functions. */
MC_IH( mc_xy10, mmxext, 4, 8, 0 )
MC_IH( mc_xy30, mmxext, 4, 8, 1 )

MC_IV( mc_xy01, mmxext, 4, 8, 0 )
MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride )

MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 )
MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 )
MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride )
MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride )

MC_CH( mc_xy21, mmxext, 4, 8, 0 )
MC_CH( mc_xy23, mmxext, 4, 8, i_src_stride )

MC_CV( mc_xy12, mmxext, 4, 8, 0 )
MC_CV( mc_xy32, mmxext, 4, 8, 1 )
 421
 422 #if 0
 423 static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 424 {
 425     uint8_t tmp[4*8];
 426     mc_hh_w4( src, i_src_stride, tmp, 4, i_height );
 427     pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height );
 428 }
 429 static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 430 {
 431     uint8_t tmp[4*8];
 432     mc_hh_w4( src, i_src_stride, tmp, 4, i_height );
 433     pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, tmp, 4, i_height );
 434 }
 435
 436 static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 437 {
 438     uint8_t tmp[4*8];
 439     mc_hv_w4( src, i_src_stride, tmp, 4, i_height );
 440     pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height );
 441 }
 442 static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 443 {
 444     uint8_t tmp[4*8];
 445     mc_hv_w4( src, i_src_stride, tmp, 4, i_height );
 446     pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 4, i_height );
 447 }
 448
 449 static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 450 {
 451     uint8_t tmp1[4*8];
 452     uint8_t tmp2[4*8];
 453
 454     mc_hv_w4( src, i_src_stride, tmp1, 4, i_height );
 455     mc_hh_w4( src, i_src_stride, tmp2, 4, i_height );
 456     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 457 }
 458 static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 459 {
 460     uint8_t tmp1[4*8];
 461     uint8_t tmp2[4*8];
 462
 463     mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height );
 464     mc_hh_w4( src,   i_src_stride, tmp2, 4, i_height );
 465     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 466 }
 467 static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 468 {
 469     uint8_t tmp1[4*8];
 470     uint8_t tmp2[4*8];
 471
 472     mc_hv_w4( src,              i_src_stride, tmp1, 4, i_height );
 473     mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height );
 474     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 475 }
 476 static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 477 {
 478     uint8_t tmp1[4*8];
 479     uint8_t tmp2[4*8];
 480
 481     mc_hv_w4( src+1,            i_src_stride, tmp1, 4, i_height );
 482     mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height );
 483     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 484 }
 485
 486 static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 487 {
 488     uint8_t tmp1[4*8];
 489     uint8_t tmp2[4*8];
 490
 491     mc_hc_w4( src, i_src_stride, tmp1, 4, i_height );
 492     mc_hh_w4( src, i_src_stride, tmp2, 4, i_height );
 493     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 494 }
 495 static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 496 {
 497     uint8_t tmp1[4*8];
 498     uint8_t tmp2[4*8];
 499
 500     mc_hc_w4( src,              i_src_stride, tmp1, 4, i_height );
 501     mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height );
 502     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 503 }
 504
 505 static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 506 {
 507     uint8_t tmp1[4*8];
 508     uint8_t tmp2[4*8];
 509
 510     mc_hc_w4( src, i_src_stride, tmp1, 4, i_height );
 511     mc_hv_w4( src, i_src_stride, tmp2, 4, i_height );
 512     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 513 }
 514 static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 515 {
 516     uint8_t tmp1[4*8];
 517     uint8_t tmp2[4*8];
 518
 519     mc_hc_w4( src,   i_src_stride, tmp1, 4, i_height );
 520     mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height );
 521     pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
 522 }
 523 #endif
 524
 525 /*****************************************************************************
 526  * MC with width == 8 (height <= 16)
 527  *****************************************************************************/
 528
 529 static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
 530 {
 531     mc_hh_w4( &src[0], i_src, &dst[0], i_dst, i_height );
 532     mc_hh_w4( &src[4], i_src, &dst[4], i_dst, i_height );
 533 }
/* Vertical 6-tap filter for an 8-pixel-wide block (MMX inline asm).
 *
 * Every output pixel is
 *   ( r[-2] - 5*r[-1] + 20*r[0] + 20*r[1] - 5*r[2] + r[3] + 16 ) >> 5
 * where r[k] is the pixel k rows below the current one in the same column;
 * the result is saturated to 0..255 when packed.  Each 8-pixel row is split
 * into a low half (columns 0..3, accumulated in mm0) and a high half
 * (columns 4..7, accumulated in mm5).
 *
 * src, i_src : source pixels and row stride; rows -2..+3 around each output
 *              row are read, 8 bytes per row.
 * dst, i_dst : destination and row stride; 8 bytes are written per row.
 * i_height   : number of output rows.
 *
 * NOTE(review): MMX register contents are carried between separate asm
 * statements without being declared to the compiler, and no emms is issued
 * in this function. */
static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    int y;

    src -= 2 * i_src;               /* the filter window starts two rows above */

    MMX_ZERO( %%mm7 );              /* mm7 = 0 for zero extension on load */
    MMX_INIT( %%mm6, x264_w0x10 );  /* mm6 = rounding term, 16 in every word */

    for( y = 0; y < i_height; y++ )
    {
        /* window rows 0 and 1: row0 -> mm0/mm5, row1 -> mm1/mm2 (low/high) */
        MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7,  &src[0*i_src], i_src );
        MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 );    /* row0 - 5*row1 */


        /* window rows 2 and 3: row2 -> mm1/mm3, row3 -> mm2/mm4 */
        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[2*i_src], i_src );
        MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 );  /* + 20*(row2 + row3) */

        /* window rows 4 and 5: row4 -> mm1/mm3, row5 -> mm2/mm4 */
        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[4*i_src], i_src );
        MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );  /* low half:  - 5*row4 + row5, round, >>5 */
        MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 );  /* high half: same */

        MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst );

        src += i_src;
        dst += i_dst;
    }
}
 562
 563 static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 564 {
 565     int x, y;
 566
 567     asm volatile( "pxor %%mm7,        %%mm7\n" : : );
 568
 569     for( y = 0; y < i_height; y++ )
 570     {
 571         int16_t tap[5+8];
 572
 573         /* first 8 */
 574         asm volatile(
 575             "leal   (%0, %1),   %%eax\n"
 576
 577             "movq       (%0),   %%mm0\n"    /* load pix-2 */
 578             "movq       %%mm0,  %%mm2\n"
 579             "punpcklbw  %%mm7,  %%mm0\n"
 580             "punpckhbw  %%mm7,  %%mm2\n"
 581
 582             "movq       (%%eax),%%mm1\n"    /* load pix-1 */
 583             "movq       %%mm1,  %%mm3\n"
 584             "punpcklbw  %%mm7,  %%mm1\n"
 585             "punpckhbw  %%mm7,  %%mm3\n"
 586             "psubw      %%mm1,  %%mm0\n"
 587             "psllw      $2,     %%mm1\n"
 588             "psubw      %%mm1,  %%mm0\n"
 589             "psubw      %%mm3,  %%mm2\n"
 590             "psllw      $2,     %%mm3\n"
 591             "psubw      %%mm3,  %%mm2\n"
 592
 593             "movq       (%%eax,%1),%%mm1\n"  /* load pix */
 594             "movq       %%mm1,  %%mm3\n"
 595             "punpcklbw  %%mm7,  %%mm1\n"
 596             "punpckhbw  %%mm7,  %%mm3\n"
 597             "psllw      $2,     %%mm1\n"
 598             "paddw      %%mm1,  %%mm0\n"
 599             "psllw      $2,     %%mm1\n"
 600             "paddw      %%mm1,  %%mm0\n"
 601             "psllw      $2,     %%mm3\n"
 602             "paddw      %%mm3,  %%mm2\n"
 603             "psllw      $2,     %%mm3\n"
 604             "paddw      %%mm3,  %%mm2\n"
 605
 606             "movq       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
 607             "movq       %%mm1,  %%mm3\n"
 608             "punpcklbw  %%mm7,  %%mm1\n"
 609             "punpckhbw  %%mm7,  %%mm3\n"
 610             "psllw      $2,     %%mm1\n"
 611             "paddw      %%mm1,  %%mm0\n"
 612             "psllw      $2,     %%mm1\n"
 613             "paddw      %%mm1,  %%mm0\n"
 614             "psllw      $2,     %%mm3\n"
 615             "paddw      %%mm3,  %%mm2\n"
 616             "psllw      $2,     %%mm3\n"
 617             "paddw      %%mm3,  %%mm2\n"
 618
 619             "movq       (%0,%1,4),%%mm1\n"  /* load pix+2 */
 620             "movq       %%mm1,  %%mm3\n"
 621             "punpcklbw  %%mm7,  %%mm1\n"
 622             "punpckhbw  %%mm7,  %%mm3\n"
 623             "psubw      %%mm1,  %%mm0\n"
 624             "psllw      $2,     %%mm1\n"
 625             "psubw      %%mm1,  %%mm0\n"
 626             "psubw      %%mm3,  %%mm2\n"
 627             "psllw      $2,     %%mm3\n"
 628             "psubw      %%mm3,  %%mm2\n"
 629
 630             "movq       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
 631             "movq       %%mm1,  %%mm3\n"
 632             "punpcklbw  %%mm7,  %%mm1\n"
 633             "punpckhbw  %%mm7,  %%mm3\n"
 634             "paddw      %%mm1,  %%mm0\n"
 635             "paddw      %%mm3,  %%mm2\n"
 636
 637             "movq       %%mm0,   (%2)\n"
 638             "movq       %%mm2,  8(%2)\n"
 639
 640
 641             "addl   $8,         %%eax\n"
 642             "addl   $8,         %0\n"
 643
 644
 645             "movd       (%0),   %%mm0\n"    /* load pix-2 */
 646             "punpcklbw  %%mm7,  %%mm0\n"
 647
 648             "movd       (%%eax),%%mm1\n"    /* load pix-1 */
 649             "punpcklbw  %%mm7,  %%mm1\n"
 650             "psubw      %%mm1,  %%mm0\n"
 651             "psllw      $2,     %%mm1\n"
 652             "psubw      %%mm1,  %%mm0\n"
 653
 654             "movd       (%%eax,%1),%%mm1\n"  /* load pix */
 655             "punpcklbw  %%mm7,  %%mm1\n"
 656             "psllw      $2,     %%mm1\n"
 657             "paddw      %%mm1,  %%mm0\n"
 658             "psllw      $2,     %%mm1\n"
 659             "paddw      %%mm1,  %%mm0\n"
 660
 661             "movd       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
 662             "punpcklbw  %%mm7,  %%mm1\n"
 663             "psllw      $2,     %%mm1\n"
 664             "paddw      %%mm1,  %%mm0\n"
 665             "psllw      $2,     %%mm1\n"
 666             "paddw      %%mm1,  %%mm0\n"
 667
 668             "movd       (%0,%1,4),%%mm1\n"  /* load pix+2 */
 669             "punpcklbw  %%mm7,  %%mm1\n"
 670             "psubw      %%mm1,  %%mm0\n"
 671             "psllw      $2,     %%mm1\n"
 672             "psubw      %%mm1,  %%mm0\n"
 673
 674             "movd       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
 675             "punpcklbw  %%mm7,  %%mm1\n"
 676             "paddw      %%mm1,  %%mm0\n"
 677
 678             "movq       %%mm0,  16(%2)\n"
 679             : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax" );
 680
 681         /* last one */
 682         tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );
 683
 684         for( x = 0; x < 8; x++ )
 685         {
 686             dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
 687         }
 688
 689         src += i_src_stride;
 690         dst += i_dst_stride;
 691     }
 692 }
 693
/* Width-8 instances.  Each line defines a static function named
 * <name>_w8_mmxext whose scratch buffers hold 8x16 bytes; the trailing
 * argument(s) are the source offsets used by the corresponding MC_* macro.
 * NOTE(review): the two digits in mc_xyNM presumably give the quarter-pel
 * x and y position -- confirm against the code that selects these
 * functions. */
MC_IH( mc_xy10, mmxext, 8, 16, 0 )
MC_IH( mc_xy30, mmxext, 8, 16, 1 )

MC_IV( mc_xy01, mmxext, 8, 16, 0 )
MC_IV( mc_xy03, mmxext, 8, 16, i_src_stride )

MC_HV( mc_xy11, mmxext, 8, 16, 0, 0 )
MC_HV( mc_xy31, mmxext, 8, 16, 1, 0 )
MC_HV( mc_xy13, mmxext, 8, 16, 0, i_src_stride )
MC_HV( mc_xy33, mmxext, 8, 16, 1, i_src_stride )

MC_CH( mc_xy21, mmxext, 8, 16, 0 )
MC_CH( mc_xy23, mmxext, 8, 16, i_src_stride )

MC_CV( mc_xy12, mmxext, 8, 16, 0 )
MC_CV( mc_xy32, mmxext, 8, 16, 1 )
 710
/* Disabled (never compiled) hand-written 8-pixel-wide quarter-pel functions.
 * Every function fills one or two 8x16 temporaries with mc_hh_w8 / mc_hv_w8 /
 * mc_hc_w8 and then averages two predictions into dst with pixel_avg_w8.
 * The dispatch tables further down use the mc_xy*_w8_mmxext names instead,
 * so nothing in this region is referenced by live code. */
#if 0
/* mc I+H */
/* Average of the source pixels and the mc_hh_w8 output. */
static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];
    mc_hh_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );
}
/* Same as mc_xy10_w8, but averaged with the source shifted one pixel right. */
static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];
    mc_hh_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, tmp, 8, i_height );
}
/* mc I+V */
/* Average of the source pixels and the mc_hv_w8 output. */
static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];
    mc_hv_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );
}
/* Same as mc_xy01_w8, but averaged with the source shifted one row down. */
static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];
    mc_hv_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 8, i_height );
}
/* H+V */
/* The four variants below differ only in the offsets (+1 pixel for mc_hv_w8,
 * +1 row for mc_hh_w8) applied to src before filtering. */
static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src,   i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src,              i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src+1,            i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
/* mc_hc_w8 output averaged with mc_hh_w8 (xy21, xy23) or mc_hv_w8 (xy12, xy32). */
static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hv_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src,   i_src_stride, tmp1, 8, i_height );
    mc_hv_w8( src+1, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src,              i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}
#endif
 812
 813 /*****************************************************************************
 814  * MC with width == 16 (height <= 16)
 815  *****************************************************************************/
 816
 817 static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
 818 {
 819     mc_hh_w4( &src[ 0], i_src, &dst[ 0], i_dst, i_height );
 820     mc_hh_w4( &src[ 4], i_src, &dst[ 4], i_dst, i_height );
 821     mc_hh_w4( &src[ 8], i_src, &dst[ 8], i_dst, i_height );
 822     mc_hh_w4( &src[12], i_src, &dst[12], i_dst, i_height );
 823 }
 824 static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 825 {
 826     mc_hv_w8( src,     i_src_stride, dst,     i_dst_stride, i_height );
 827     mc_hv_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height );
 828 }
 829
 830 static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 831 {
 832     mc_hc_w8( src,     i_src_stride, dst,     i_dst_stride, i_height );
 833     mc_hc_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height );
 834 }
 835
/* 16-pixel-wide quarter-pel functions, instantiated twice: once with the
 * "mmxext" suffix and once with the "sse2" suffix.  The MC_* macros are
 * defined earlier in the file (not shown here); each invocation is presumably
 * what provides the static mc_xyXY_w16_<cpu>() that the pf_mc dispatch tables
 * reference, where X = mvx&3 and Y = mvy&3.
 * NOTE(review): the trailing argument(s) look like source offsets (0, one
 * pixel, or one row) -- confirm against the macro definitions. */

/* MMX avg/copy */
MC_IH( mc_xy10, mmxext, 16, 16, 0 )
MC_IH( mc_xy30, mmxext, 16, 16, 1 )

MC_IV( mc_xy01, mmxext, 16, 16, 0 )
MC_IV( mc_xy03, mmxext, 16, 16, i_src_stride )

MC_HV( mc_xy11, mmxext, 16, 16, 0, 0 )
MC_HV( mc_xy31, mmxext, 16, 16, 1, 0 )
MC_HV( mc_xy13, mmxext, 16, 16, 0, i_src_stride )
MC_HV( mc_xy33, mmxext, 16, 16, 1, i_src_stride )

MC_CH( mc_xy21, mmxext, 16, 16, 0 )
MC_CH( mc_xy23, mmxext, 16, 16, i_src_stride )

MC_CV( mc_xy12, mmxext, 16, 16, 0 )
MC_CV( mc_xy32, mmxext, 16, 16, 1 )

/* SSE2 avg/copy */
MC_IH( mc_xy10, sse2, 16, 16, 0 )
MC_IH( mc_xy30, sse2, 16, 16, 1 )

MC_IV( mc_xy01, sse2, 16, 16, 0 )
MC_IV( mc_xy03, sse2, 16, 16, i_src_stride )

MC_HV( mc_xy11, sse2, 16, 16, 0, 0 )
MC_HV( mc_xy31, sse2, 16, 16, 1, 0 )
MC_HV( mc_xy13, sse2, 16, 16, 0, i_src_stride )
MC_HV( mc_xy33, sse2, 16, 16, 1, i_src_stride )

MC_CH( mc_xy21, sse2, 16, 16, 0 )
MC_CH( mc_xy23, sse2, 16, 16, i_src_stride )

MC_CV( mc_xy12, sse2, 16, 16, 0 )
MC_CV( mc_xy32, sse2, 16, 16, 1 )
 871
 872
/* Disabled (never compiled) hand-written 16-pixel-wide quarter-pel functions.
 * Every function fills one or two 16x16 temporaries (declared through
 * DECLARE_ALIGNED) with mc_hh_w16 / mc_hv_w16 / mc_hc_w16 and then averages
 * two predictions into dst with pixel_avg_w16.  The dispatch tables further
 * down use the mc_xy*_w16_mmxext / mc_xy*_w16_sse2 names instead, so nothing
 * in this region is referenced by live code. */
#if 0
/* mc I+H */
/* Average of the source pixels and the mc_hh_w16 output. */
static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
    mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
}
/* Same as mc_xy10_w16, but averaged with the source shifted one pixel right. */
static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
    mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height );
}
/* mc I+V */
/* Average of the source pixels and the mc_hv_w16 output. */
static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
    mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
}
/* Same as mc_xy01_w16, but averaged with the source shifted one row down. */
static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
    mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height );
}
/* H+V */
/* The four variants below differ only in the offsets (+1 pixel for mc_hv_w16,
 * +1 row for mc_hh_w16) applied to src before filtering. */
static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
    mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
    mc_hh_w16( src,   i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hv_w16( src,              i_src_stride, tmp1, 16, i_height );
    mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hv_w16( src+1,            i_src_stride, tmp1, 16, i_height );
    mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
/* mc_hc_w16 output averaged with mc_hh_w16 (xy21, xy23) or mc_hv_w16 (xy12, xy32). */
static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
    mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
    mc_hv_w16( src, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hc_w16( src,   i_src_stride, tmp1, 16, i_height );
    mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
    DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);

    mc_hc_w16( src,              i_src_stride, tmp1, 16, i_height );
    mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
    pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
}
#endif
 974
 975 #define MOTION_COMPENSATION_LUMA \
 976     src += (mvy >> 2) * i_src_stride + (mvx >> 2);  \
 977     if( i_width == 4 )                              \
 978     {                                               \
 979         pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \
 980     }                                               \
 981     else if( i_width == 8 )                         \
 982     {                                               \
 983         pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \
 984     }                                               \
 985     else if( i_width == 16 )                        \
 986     {                                               \
 987         pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \
 988     }                                               \
 989     else                                            \
 990     {                                               \
 991         fprintf( stderr, "Error: motion_compensation_luma called with invalid width" ); \
 992     }
 993
 994 static void motion_compensation_luma_mmxext( uint8_t *src, int i_src_stride,
 995                                              uint8_t *dst, int i_dst_stride,
 996                                              int mvx,int mvy,
 997                                              int i_width, int i_height )
 998 {
 999     static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
1000     {
1001         {
1002             { x264_mc_copy_w4_mmxext,   mc_xy10_w4_mmxext,    mc_hh_w4,             mc_xy30_w4_mmxext },
1003             { mc_xy01_w4_mmxext,        mc_xy11_w4_mmxext,    mc_xy21_w4_mmxext,    mc_xy31_w4_mmxext },
1004             { mc_hv_w4,                 mc_xy12_w4_mmxext,    mc_hc_w4,             mc_xy32_w4_mmxext },
1005             { mc_xy03_w4_mmxext,        mc_xy13_w4_mmxext,    mc_xy23_w4_mmxext,    mc_xy33_w4_mmxext },
1006         },
1007         {
1008             { x264_mc_copy_w8_mmxext,   mc_xy10_w8_mmxext,    mc_hh_w8,             mc_xy30_w8_mmxext },
1009             { mc_xy01_w8_mmxext,        mc_xy11_w8_mmxext,    mc_xy21_w8_mmxext,    mc_xy31_w8_mmxext },
1010             { mc_hv_w8,                 mc_xy12_w8_mmxext,    mc_hc_w8,             mc_xy32_w8_mmxext },
1011             { mc_xy03_w8_mmxext,        mc_xy13_w8_mmxext,    mc_xy23_w8_mmxext,    mc_xy33_w8_mmxext },
1012         },
1013         {
1014             { x264_mc_copy_w16_mmxext,   mc_xy10_w16_mmxext,    mc_hh_w16,             mc_xy30_w16_mmxext },
1015             { mc_xy01_w16_mmxext,        mc_xy11_w16_mmxext,    mc_xy21_w16_mmxext,    mc_xy31_w16_mmxext },
1016             { mc_hv_w16,                 mc_xy12_w16_mmxext,    mc_hc_w16,             mc_xy32_w16_mmxext },
1017             { mc_xy03_w16_mmxext,        mc_xy13_w16_mmxext,    mc_xy23_w16_mmxext,    mc_xy33_w16_mmxext },
1018         }
1019     };
1020
1021     MOTION_COMPENSATION_LUMA
1022 }
1023
1024 static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride,
1025                                            uint8_t *dst, int i_dst_stride,
1026                                            int mvx,int mvy,
1027                                            int i_width, int i_height )
1028 {
1029     static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
1030     {
1031         {
1032             { x264_mc_copy_w4_mmxext,   mc_xy10_w4_mmxext,    mc_hh_w4,             mc_xy30_w4_mmxext },
1033             { mc_xy01_w4_mmxext,        mc_xy11_w4_mmxext,    mc_xy21_w4_mmxext,    mc_xy31_w4_mmxext },
1034             { mc_hv_w4,                 mc_xy12_w4_mmxext,    mc_hc_w4,             mc_xy32_w4_mmxext },
1035             { mc_xy03_w4_mmxext,        mc_xy13_w4_mmxext,    mc_xy23_w4_mmxext,    mc_xy33_w4_mmxext },
1036         },
1037         {
1038             { x264_mc_copy_w8_mmxext,   mc_xy10_w8_mmxext,    mc_hh_w8,             mc_xy30_w8_mmxext },
1039             { mc_xy01_w8_mmxext,        mc_xy11_w8_mmxext,    mc_xy21_w8_mmxext,    mc_xy31_w8_mmxext },
1040             { mc_hv_w8,                 mc_xy12_w8_mmxext,    mc_hc_w8,             mc_xy32_w8_mmxext },
1041             { mc_xy03_w8_mmxext,        mc_xy13_w8_mmxext,    mc_xy23_w8_mmxext,    mc_xy33_w8_mmxext },
1042         },
1043         {
1044             { x264_mc_copy_w16_sse2,   mc_xy10_w16_sse2,    mc_hh_w16,             mc_xy30_w16_sse2 },
1045             { mc_xy01_w16_sse2,        mc_xy11_w16_sse2,    mc_xy21_w16_sse2,    mc_xy31_w16_sse2 },
1046             { mc_hv_w16,                 mc_xy12_w16_sse2,    mc_hc_w16,             mc_xy32_w16_sse2 },
1047             { mc_xy03_w16_sse2,        mc_xy13_w16_sse2,    mc_xy23_w16_sse2,    mc_xy33_w16_sse2 },
1048         }
1049     };
1050     MOTION_COMPENSATION_LUMA
1051 }
1052
1053 #endif
1054
/* Luma motion compensation from pre-filtered planes.
 *
 * src[] holds four planes selected by (hx & 1) + ((hy & 1) << 1), where
 * (hx, hy) is the vector in half-sample units; the remaining bits of hx/hy
 * give the pixel offset inside the chosen plane.
 * NOTE(review): presumably src[0..3] are the full-pel, H, V and HV half-pel
 * planes -- confirm with the caller.
 *
 * - Both mvx and mvy even: the block is copied from the single matching
 *   plane into dst (widths 4, 8 and 16 only; other widths copy nothing).
 * - Otherwise: dst receives the average of two neighbouring half-sample
 *   blocks (width 4 and 8 have dedicated routines, every other width goes
 *   through the 16-wide one).
 */
void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
                  uint8_t *dst,    int i_dst_stride,
                  int mvx, int mvy,
                  int i_width, int i_height )
{
    /* 1 when both components are odd and their bit 1 differs; it moves the
     * vertical round-up from the first sample to the second one. */
    const int b_swap_round = (mvx&1) && (mvy&1) && ((mvx&2) ^ (mvy&2));

    /* first sample: x rounded down, y rounded up (unless swapped) */
    const int i_hx1 = mvx >> 1;
    const int i_hy1 = (mvy + 1 - b_swap_round) >> 1;
    uint8_t *p_ref1 = src[(i_hx1 & 1) + ((i_hy1 & 1) << 1)]
                    + (i_hy1 >> 1) * i_src_stride + (i_hx1 >> 1);

    if( ((mvx|mvy) & 1) == 0 )
    {
        /* half-pel (or full-pel) position: plain copy */
        if( i_width == 4 )
            x264_mc_copy_w4_mmxext( p_ref1, i_src_stride, dst, i_dst_stride, i_height );
        else if( i_width == 8 )
            x264_mc_copy_w8_mmxext( p_ref1, i_src_stride, dst, i_dst_stride, i_height );
        else if( i_width == 16 )
            x264_mc_copy_w16_mmxext( p_ref1, i_src_stride, dst, i_dst_stride, i_height );
    }
    else
    {
        /* quarter-pel position: second sample has x rounded up, y rounded
         * down (unless swapped), then both are averaged */
        const int i_hx2 = (mvx + 1) >> 1;
        const int i_hy2 = (mvy + b_swap_round) >> 1;
        uint8_t *p_ref2 = src[(i_hx2 & 1) + ((i_hy2 & 1) << 1)]
                        + (i_hy2 >> 1) * i_src_stride + (i_hx2 >> 1);

        if( i_width == 4 )
            x264_pixel_avg_w4_mmxext( dst, i_dst_stride, p_ref1, i_src_stride,
                                      p_ref2, i_src_stride, i_height );
        else if( i_width == 8 )
            x264_pixel_avg_w8_mmxext( dst, i_dst_stride, p_ref1, i_src_stride,
                                      p_ref2, i_src_stride, i_height );
        else
            x264_pixel_avg_w16_mmxext( dst, i_dst_stride, p_ref1, i_src_stride,
                                       p_ref2, i_src_stride, i_height );
    }
}
1107
/* Return a pointer to the reference block for vector (mvx, mvy).
 *
 * src[] holds four planes selected by (hx & 1) + ((hy & 1) << 1), where
 * (hx, hy) is the vector in half-sample units; the remaining bits of hx/hy
 * give the pixel offset inside the chosen plane.
 * NOTE(review): presumably src[0..3] are the full-pel, H, V and HV half-pel
 * planes -- confirm with the caller.
 *
 * - Both mvx and mvy even: nothing is written to dst; *i_dst_stride is set to
 *   i_src_stride and a pointer into the matching plane is returned.
 * - Otherwise: the average of two neighbouring half-sample blocks is written
 *   to dst using the caller's *i_dst_stride (left unchanged) and dst is
 *   returned.  Width 4 and 8 have dedicated routines, every other width goes
 *   through the 16-wide one.
 */
uint8_t *get_ref_mmx( uint8_t *src[4], int i_src_stride,
                      uint8_t *dst,   int *i_dst_stride,
                      int mvx, int mvy,
                      int i_width, int i_height )
{
    /* 1 when both components are odd and their bit 1 differs; it moves the
     * vertical round-up from the first sample to the second one. */
    const int b_swap_round = (mvx&1) && (mvy&1) && ((mvx&2) ^ (mvy&2));

    /* first sample: x rounded down, y rounded up (unless swapped) */
    const int i_hx1 = mvx >> 1;
    const int i_hy1 = (mvy + 1 - b_swap_round) >> 1;
    uint8_t *p_ref1 = src[(i_hx1 & 1) + ((i_hy1 & 1) << 1)]
                    + (i_hy1 >> 1) * i_src_stride + (i_hx1 >> 1);

    if( ((mvx|mvy) & 1) == 0 )
    {
        /* no interpolation needed: hand back the plane itself */
        *i_dst_stride = i_src_stride;
        return p_ref1;
    }
    else
    {
        /* quarter-pel position: second sample has x rounded up, y rounded
         * down (unless swapped), then both are averaged into dst */
        const int i_hx2 = (mvx + 1) >> 1;
        const int i_hy2 = (mvy + b_swap_round) >> 1;
        uint8_t *p_ref2 = src[(i_hx2 & 1) + ((i_hy2 & 1) << 1)]
                        + (i_hy2 >> 1) * i_src_stride + (i_hx2 >> 1);

        if( i_width == 4 )
            x264_pixel_avg_w4_mmxext( dst, *i_dst_stride, p_ref1, i_src_stride,
                                      p_ref2, i_src_stride, i_height );
        else if( i_width == 8 )
            x264_pixel_avg_w8_mmxext( dst, *i_dst_stride, p_ref1, i_src_stride,
                                      p_ref2, i_src_stride, i_height );
        else
            x264_pixel_avg_w16_mmxext( dst, *i_dst_stride, p_ref1, i_src_stride,
                                       p_ref2, i_src_stride, i_height );

        return dst;
    }
}
1153
1154
/* Install the MMX/MMXEXT motion-compensation routines into *pf.
 *
 * Sets mc_luma and get_ref, all eight avg[] entries listed below, and six of
 * the avg_weight[] entries.  avg_weight[PIXEL_4x8] and avg_weight[PIXEL_4x2]
 * are deliberately left as they were on entry (see the note at the end).
 * No other member of *pf is touched. */
void x264_mc_mmxext_init( x264_mc_functions_t *pf )
{
    pf->mc_luma   = mc_luma_mmx;
    pf->get_ref   = get_ref_mmx;

    /* plain two-source averaging, one routine per block size */
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmxext;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmxext;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmxext;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmxext;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmxext;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmxext;

    /* weighted averaging */
    pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
    pf->avg_weight[PIXEL_16x8]  = x264_pixel_avg_weight_16x8_mmxext;
    pf->avg_weight[PIXEL_8x16]  = x264_pixel_avg_weight_8x16_mmxext;
    pf->avg_weight[PIXEL_8x8]   = x264_pixel_avg_weight_8x8_mmxext;
    pf->avg_weight[PIXEL_8x4]   = x264_pixel_avg_weight_8x4_mmxext;
    pf->avg_weight[PIXEL_4x4]   = x264_pixel_avg_weight_4x4_mmxext;
    // avg_weight_4x8 is rare and 4x2 is not used
}
/* Install the motion-compensation routines for SSE2-capable CPUs into *pf.
 *
 * Currently this hooks up the same mc_luma_mmx / get_ref_mmx pair as
 * x264_mc_mmxext_init(); no SSE2-specific routine is installed here, and the
 * avg[] / avg_weight[] tables are not touched.
 * NOTE(review): presumably the caller runs x264_mc_mmxext_init() first so
 * those tables are populated -- confirm at the call site. */
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{
    /* todo: use sse2 */
    pf->mc_luma   = mc_luma_mmx;
    pf->get_ref   = get_ref_mmx;
}
1183
/* Disabled (never compiled) accessors that hand out the three 16-wide
 * routines mc_hh_w16, mc_hv_w16 and mc_hc_w16 through the int_h, int_v and
 * int_hv output pointers.  The "mmx" and "sse2" variants are identical:
 * both return the same three functions. */
#if 0
void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
{
    *int_h = mc_hh_w16;
    *int_v = mc_hv_w16;
    *int_hv = mc_hc_w16;
}

void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
{
    *int_h = mc_hh_w16;
    *int_v = mc_hv_w16;
    *int_hv = mc_hc_w16;
}
#endif