git.sesse.net Git - ffmpeg/blob - libavcodec/x86/hpeldsp_init.c

   1 /*
   2  * MMX optimized DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  *
  22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  23  */
  24
  25 #include "libavutil/cpu.h"
  26 #include "libavutil/x86/asm.h"
  27 #include "libavcodec/hpeldsp.h"
  28 #include "dsputil_mmx.h"
  29
  /*
   * Prototypes for half-pel routines that are not defined in this file.
   * They are only referenced from the HAVE_YASM init code further down, so
   * they are presumably the external-assembly implementations (confirm).
   * Every routine has the same signature: (block, pixels, line_size, h).
   */

  /* ---- MMXEXT variants ---- */
  void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
  void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h);
  void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                             const uint8_t *pixels,
                                             ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                             const uint8_t *pixels,
                                             ptrdiff_t line_size, int h);
  void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
  void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
  void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h);

  /* ---- 3DNow! variants ---- */
  void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
  void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
  void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                            const uint8_t *pixels,
                                            ptrdiff_t line_size, int h);
  void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                            const uint8_t *pixels,
                                            ptrdiff_t line_size, int h);
  void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
  void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
  void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
  void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
  76
  77
  78 #if HAVE_INLINE_ASM
  79
     /*
      * Small inline-asm helpers, each emitted as its own asm statement.
      * The MOVQ_* macros take a bare register name (e.g. mm6); the "%%"
      * prefix is added by the macro itself.
      *
      *   JUMPALIGN()      emits ".p2align 3" (pad to an 8-byte boundary)
      *   MOVQ_ZERO(regd)  clears regd by xor-ing it with itself
      */
  80 #define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
  81 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
  82
     /*
      * MOVQ_BFE(regd): fill every byte of regd with 0xfe.
      * pcmpeqd of a register with itself sets all bits; adding the register
      * to itself byte-wise then turns each 0xff into 0xfe.
      */
  83 #define MOVQ_BFE(regd)                                  \
  84     __asm__ volatile (                                  \
  85         "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
  86         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
  87
     /*
      * MOVQ_BONE / MOVQ_WTWO: non-PIC builds load the constants ff_bone and
      * ff_wtwo from memory; PIC builds synthesize a value in the register.
      * ff_bone and ff_wtwo are not defined in this file -- presumably they
      * hold the same values the PIC sequences produce (verify).
      */
  88 #ifndef PIC
  89 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
  90 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
  91 #else
  92 // for shared library it's better to use this way for accessing constants
  93 // pcmpeqd -> -1
     // MOVQ_BONE: all-ones, then each 16-bit word shifted right by 15 leaves
     // 1 per word; packuswb packs those words to bytes -> 0x01 in every byte.
  94 #define MOVQ_BONE(regd)                                 \
  95     __asm__ volatile (                                  \
  96         "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
  97         "psrlw          $15, %%"#regd"  \n\t"           \
  98         "packuswb %%"#regd", %%"#regd"  \n\t" ::)
  99
     // MOVQ_WTWO: all-ones, each word reduced to 1, then shifted left by one
     // -> 0x0002 in every 16-bit word.
 100 #define MOVQ_WTWO(regd)                                 \
 101     __asm__ volatile (                                  \
 102         "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
 103         "psrlw         $15, %%"#regd"   \n\t"           \
 104         "psllw          $1, %%"#regd"   \n\t"::)
 105
 106 #endif
 107
 108 // using regr as temporary and for the output result
 109 // first argument is unmodified and second is trashed
 110 // regfe is supposed to contain 0xfefefefefefefefe
     //
     // These macros expand to asm template text (not to a full asm statement)
     // and paste their arguments verbatim, so the caller supplies the complete
     // operand text for each register.
     //
     // PAVGB_MMX_NO_RND: byte-wise average, rounding down:
     //   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
     // The regfe mask clears bit 0 of every byte before the 64-bit psrlq, so
     // no bit is shifted from one byte into its neighbour.
 111 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
 112     "movq   "#rega", "#regr"            \n\t"                    \
 113     "pand   "#regb", "#regr"            \n\t"                    \
 114     "pxor   "#rega", "#regb"            \n\t"                    \
 115     "pand  "#regfe", "#regb"            \n\t"                    \
 116     "psrlq       $1, "#regb"            \n\t"                    \
 117     "paddb  "#regb", "#regr"            \n\t"
 118
     // PAVGB_MMX: byte-wise average, rounding up:
     //   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
     // Same operand roles as PAVGB_MMX_NO_RND.
 119 #define PAVGB_MMX(rega, regb, regr, regfe)                       \
 120     "movq   "#rega", "#regr"            \n\t"                    \
 121     "por    "#regb", "#regr"            \n\t"                    \
 122     "pxor   "#rega", "#regb"            \n\t"                    \
 123     "pand  "#regfe", "#regb"            \n\t"                    \
 124     "psrlq       $1, "#regb"            \n\t"                    \
 125     "psubb  "#regb", "#regr"            \n\t"
 126
 127 // mm6 is supposed to contain 0xfefefefefefefefe
     // Paired form of PAVGB_MMX_NO_RND: two independent round-down averages
     // with their instructions interleaved,
     //   regr = avg(rega, regb)   and   regp = avg(regc, regd).
     // rega and regc are left unmodified; regb and regd are trashed.
 128 #define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
 129     "movq  "#rega", "#regr"             \n\t"                    \
 130     "movq  "#regc", "#regp"             \n\t"                    \
 131     "pand  "#regb", "#regr"             \n\t"                    \
 132     "pand  "#regd", "#regp"             \n\t"                    \
 133     "pxor  "#rega", "#regb"             \n\t"                    \
 134     "pxor  "#regc", "#regd"             \n\t"                    \
 135     "pand    %%mm6, "#regb"             \n\t"                    \
 136     "pand    %%mm6, "#regd"             \n\t"                    \
 137     "psrlq      $1, "#regb"             \n\t"                    \
 138     "psrlq      $1, "#regd"             \n\t"                    \
 139     "paddb "#regb", "#regr"             \n\t"                    \
 140     "paddb "#regd", "#regp"             \n\t"
 141
     // Paired form of PAVGB_MMX (round-up); the mask is again taken from mm6.
     // Operand roles are the same as in PAVGBP_MMX_NO_RND.
 142 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
 143     "movq  "#rega", "#regr"             \n\t"                    \
 144     "movq  "#regc", "#regp"             \n\t"                    \
 145     "por   "#regb", "#regr"             \n\t"                    \
 146     "por   "#regd", "#regp"             \n\t"                    \
 147     "pxor  "#rega", "#regb"             \n\t"                    \
 148     "pxor  "#regc", "#regd"             \n\t"                    \
 149     "pand    %%mm6, "#regb"             \n\t"                    \
 150     "pand    %%mm6, "#regd"             \n\t"                    \
 151     "psrlq      $1, "#regd"             \n\t"                    \
 152     "psrlq      $1, "#regb"             \n\t"                    \
 153     "psubb "#regb", "#regr"             \n\t"                    \
 154     "psubb "#regd", "#regp"             \n\t"
 155
 156 /***********************************/
 157 /* MMX no rounding */
     /*
      * hpeldsp_rnd_template.c is included twice in this section, each time
      * with a different binding of the macros it uses.  First pass:
      *   DEF(x, y)      pastes to  x_no_rnd_y_mmx
      *   PAVGB/PAVGBP   map to the round-down averaging macros
      *   OP_AVG         maps to the round-up PAVGB_MMX; it is not #undef'd
      *                  after this pass, so the second pass sees the same
      *                  definition
      * NOTE(review): MOVQ_WONE is not defined in the visible part of this
      * file -- confirm that an included header provides it, or that the
      * template never expands SET_RND in this configuration.
      */
 158 #define NO_RND 1
 159 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
 160 #define SET_RND  MOVQ_WONE
 161 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
 162 #define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
 163 #define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
 164
 165 #include "hpeldsp_rnd_template.c"
 166
 167 #undef DEF
 168 #undef SET_RND
 169 #undef PAVGBP
 170 #undef PAVGB
 171 #undef NO_RND
 172 /***********************************/
 173 /* MMX rounding */
 174
     /*
      * Second pass: DEF(x, y) pastes to x_y_mmx, SET_RND is MOVQ_WTWO and the
      * averaging macros map to the round-up variants.  NO_RND is no longer
      * defined; OP_AVG is still the definition from the first pass and is
      * removed only after this include.
      */
 175 #define DEF(x, y) x ## _ ## y ## _mmx
 176 #define SET_RND  MOVQ_WTWO
 177 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
 178 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
 179
 180 #include "hpeldsp_rnd_template.c"
 181
 182 #undef DEF
 183 #undef SET_RND
 184 #undef PAVGBP
 185 #undef PAVGB
 186 #undef OP_AVG
 187
 188 #endif /* HAVE_INLINE_ASM */
 189
 190
 191 #if HAVE_YASM
     /*
      * From here on the name ff_put_pixels8_mmx is an alias for
      * ff_put_pixels8_mmxext.  The alias is not #undef'd in this file;
      * presumably hpeldsp_avg_template.c refers to ff_put_pixels8_mmx
      * (verify against the template).
      */
 192 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
 193
     /*
      * hpeldsp_avg_template.c is included twice; DEF(x) appends the
      * instruction-set suffix (_3dnow, then _mmxext) to each name it wraps.
      */
 194 /***********************************/
 195 /* 3Dnow specific */
 196
 197 #define DEF(x) x ## _3dnow
 198
 199 #include "hpeldsp_avg_template.c"
 200
 201 #undef DEF
 202
 203 /***********************************/
 204 /* MMXEXT specific */
 205
 206 #define DEF(x) x ## _mmxext
 207
 208 #include "hpeldsp_avg_template.c"
 209
 210 #undef DEF
 211
 212 #endif /* HAVE_YASM */
 213
 214
 215 #if HAVE_INLINE_ASM
 216 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
 217 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
 218 #define put_pixels16_mmxext put_pixels16_mmx
 219 #define put_pixels8_mmxext put_pixels8_mmx
 220 #define put_pixels4_mmxext put_pixels4_mmx
 221 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
 222 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
     /*
      * The defines above are pure name aliases: the no_rnd and _mmxext names
      * of the unshifted put functions all resolve to the plain MMX copy
      * routines defined below (the no_rnd _mmxext names get there through
      * the no_rnd _mmx alias).
      * NOTE(review): put_pixels4_mmx is not defined in the visible part of
      * this file -- confirm where it comes from, or whether the alias is
      * unused.
      */
 223
     /*
      * Copy an 8-byte-wide column of rows from pixels to block using MMX
      * loads and stores.
      *
      * block     - destination; advanced by line_size per row
      * pixels    - source; advanced by the same line_size per row
      * line_size - byte stride, shared by source and destination
      * h         - number of rows; four rows are copied per loop iteration
      *             and the loop exits only when h reaches exactly zero, so h
      *             must be a positive multiple of 4
      *
      * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
      * NOTE(review): mm0/mm1 are written but do not appear in the clobber
      * list.
      */
 224 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
 225                             ptrdiff_t line_size, int h)
 226 {
 227     __asm__ volatile (
             /* REG_a = 2 * line_size: the step for one pair of rows */
 228         "lea   (%3, %3), %%"REG_a"      \n\t"
 229         ".p2align     3                 \n\t"
 230         "1:                             \n\t"
             /* rows 0 and 1 of this iteration */
 231         "movq  (%1    ), %%mm0          \n\t"
 232         "movq  (%1, %3), %%mm1          \n\t"
 233         "movq     %%mm0, (%2)           \n\t"
 234         "movq     %%mm1, (%2, %3)       \n\t"
 235         "add  %%"REG_a", %1             \n\t"
 236         "add  %%"REG_a", %2             \n\t"
             /* rows 2 and 3 of this iteration */
 237         "movq  (%1    ), %%mm0          \n\t"
 238         "movq  (%1, %3), %%mm1          \n\t"
 239         "movq     %%mm0, (%2)           \n\t"
 240         "movq     %%mm1, (%2, %3)       \n\t"
 241         "add  %%"REG_a", %1             \n\t"
 242         "add  %%"REG_a", %2             \n\t"
             /* four rows done; loop until the row counter is zero */
 243         "subl        $4, %0             \n\t"
 244         "jnz         1b                 \n\t"
 245         : "+g"(h), "+r"(pixels),  "+r"(block)
 246         : "r"((x86_reg)line_size)
 247         : "%"REG_a, "memory"
 248         );
 249 }
 250
     /*
      * Copy a 16-byte-wide column of rows from pixels to block using MMX
      * loads and stores; each row is moved as two 8-byte halves (offsets 0
      * and 8).
      *
      * block     - destination; advanced by line_size per row
      * pixels    - source; advanced by the same line_size per row
      * line_size - byte stride, shared by source and destination
      * h         - number of rows; four rows are copied per loop iteration
      *             and the loop exits only when h reaches exactly zero, so h
      *             must be a positive multiple of 4
      *
      * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
      * NOTE(review): mm0/mm1/mm4/mm5 are written but do not appear in the
      * clobber list.
      */
 251 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
 252                              ptrdiff_t line_size, int h)
 253 {
 254     __asm__ volatile (
             /* REG_a = 2 * line_size: the step for one pair of rows */
 255         "lea   (%3, %3), %%"REG_a"      \n\t"
 256         ".p2align     3                 \n\t"
 257         "1:                             \n\t"
             /* rows 0 and 1: mm0/mm4 hold row 0, mm1/mm5 hold row 1 */
 258         "movq  (%1    ), %%mm0          \n\t"
 259         "movq 8(%1    ), %%mm4          \n\t"
 260         "movq  (%1, %3), %%mm1          \n\t"
 261         "movq 8(%1, %3), %%mm5          \n\t"
 262         "movq     %%mm0,  (%2)          \n\t"
 263         "movq     %%mm4, 8(%2)          \n\t"
 264         "movq     %%mm1,  (%2, %3)      \n\t"
 265         "movq     %%mm5, 8(%2, %3)      \n\t"
 266         "add  %%"REG_a", %1             \n\t"
 267         "add  %%"REG_a", %2             \n\t"
             /* rows 2 and 3, same register roles */
 268         "movq  (%1    ), %%mm0          \n\t"
 269         "movq 8(%1    ), %%mm4          \n\t"
 270         "movq  (%1, %3), %%mm1          \n\t"
 271         "movq 8(%1, %3), %%mm5          \n\t"
 272         "movq     %%mm0,  (%2)          \n\t"
 273         "movq     %%mm4, 8(%2)          \n\t"
 274         "movq     %%mm1,  (%2, %3)      \n\t"
 275         "movq     %%mm5, 8(%2, %3)      \n\t"
 276         "add  %%"REG_a", %1             \n\t"
 277         "add  %%"REG_a", %2             \n\t"
             /* four rows done; loop until the row counter is zero */
 278         "subl        $4, %0             \n\t"
 279         "jnz         1b                 \n\t"
 280         : "+g"(h), "+r"(pixels),  "+r"(block)
 281         : "r"((x86_reg)line_size)
 282         : "%"REG_a, "memory"
 283         );
 284 }
 285 #endif /* HAVE_INLINE_ASM */
 286
     /*
      * Fill the four consecutive entries [0]..[3] of one function table in
      * the context with the unsuffixed, _x2_, _y2_ and _xy2_ variants of a
      * function family.
      *
      *   PFX  - family prefix; also selects the table, c->PFX_pixels_tab
      *   IDX  - text placed between the table name and the entry index,
      *          e.g. [0] or [1]; may be left empty when the table has a
      *          single dimension
      *   SIZE - number pasted after "_pixels" in the function name
      *   CPU  - suffix pasted at the end of the function name
      *
      * The function names are built by token pasting:
      *   PFX_pixelsSIZE_CPU, PFX_pixelsSIZE_x2_CPU, ...
      * A variable named c must be in scope at the point of use.
      */
 287 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
 288     do {                                                                        \
 289         c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
 290         c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
 291         c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
 292         c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
 293     } while (0)
 294
 295 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
 296 {
 297 #if HAVE_INLINE_ASM
 298     SET_HPEL_FUNCS(put,        [0], 16, mmx);
 299     SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
 300     SET_HPEL_FUNCS(avg,        [0], 16, mmx);
 301     SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
 302     SET_HPEL_FUNCS(put,        [1],  8, mmx);
 303     SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
 304     SET_HPEL_FUNCS(avg,        [1],  8, mmx);
 305 #endif /* HAVE_INLINE_ASM */
 306 }
 307
 308 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
 309 {
 310 #if HAVE_YASM
 311     c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
 312     c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
 313
 314     c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
 315     c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
 316     c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
 317
 318     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
 319     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
 320
 321     c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
 322     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
 323     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
 324
 325     if (!(flags & CODEC_FLAG_BITEXACT)) {
 326         c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
 327         c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
 328         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
 329         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 330
 331         c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
 332         c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
 333     }
 334
 335     if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
 336         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
 337         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
 338     }
 339 #endif /* HAVE_YASM */
 340 }
 341
 342 static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
 343 {
 344 #if HAVE_YASM
 345     c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
 346     c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
 347
 348     c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
 349     c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
 350     c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
 351
 352     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
 353     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
 354
 355     c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
 356     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
 357     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
 358
 359     if (!(flags & CODEC_FLAG_BITEXACT)){
 360         c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
 361         c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
 362         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
 363         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 364
 365         c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
 366         c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
 367     }
 368
 369     if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
 370         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
 371         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
 372     }
 373 #endif /* HAVE_YASM */
 374 }
 375
 376 static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
 377 {
 378 #if HAVE_YASM
 379     if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
 380         // these functions are slower than mmx on AMD, but faster on Intel
 381         c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
 382         c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
 383         c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
 384     }
 385 #endif /* HAVE_YASM */
 386 }
 387
 388 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 389 {
 390     int mm_flags = av_get_cpu_flags();
 391
 392     if (HAVE_MMX && mm_flags & AV_CPU_FLAG_MMX)
 393         hpeldsp_init_mmx(c, flags, mm_flags);
 394
 395     if (mm_flags & AV_CPU_FLAG_MMXEXT)
 396         hpeldsp_init_mmxext(c, flags, mm_flags);
 397
 398     if (mm_flags & AV_CPU_FLAG_3DNOW)
 399         hpeldsp_init_3dnow(c, flags, mm_flags);
 400
 401     if (mm_flags & AV_CPU_FLAG_SSE2)
 402         hpeldsp_init_sse2(c, flags, mm_flags);
 403 }