git.sesse.net Git - ffmpeg/blob - libavcodec/x86/dsputilenc_mmx.c

   1 /*
   2  * MMX optimized DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/attributes.h"
  26 #include "libavutil/cpu.h"
  27 #include "libavutil/x86/asm.h"
  28 #include "libavutil/x86/cpu.h"
  29 #include "libavcodec/dct.h"
  30 #include "libavcodec/dsputil.h"
  31 #include "libavcodec/mpegvideo.h"
  32 #include "dsputil_x86.h"
  33
  34 void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
  35 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
  36 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
  37                         int stride);
  38 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
  39                          int stride);
  40 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
  41 int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
  42 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
  43 int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
  44 int ff_sum_abs_dctelem_mmx(int16_t *block);
  45 int ff_sum_abs_dctelem_mmxext(int16_t *block);
  46 int ff_sum_abs_dctelem_sse2(int16_t *block);
  47 int ff_sum_abs_dctelem_ssse3(int16_t *block);
  48
  49 #if HAVE_INLINE_ASM
  50
  51 static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  52                     int line_size, int h)
  53 {
  54     int tmp;
  55
  56     __asm__ volatile (
  57         "movl         %4, %%ecx          \n"
  58         "shr          $1, %%ecx          \n"
  59         "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
  60         "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
  61         "1:                              \n"
  62         "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
  63         "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
  64         "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
  65         "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
  66
  67         /* todo: mm1-mm2, mm3-mm4 */
  68         /* algo: subtract mm1 from mm2 with saturation and vice versa */
  69         /*       OR the results to get absolute difference */
  70         "movq      %%mm1, %%mm5          \n"
  71         "movq      %%mm3, %%mm6          \n"
  72         "psubusb   %%mm2, %%mm1          \n"
  73         "psubusb   %%mm4, %%mm3          \n"
  74         "psubusb   %%mm5, %%mm2          \n"
  75         "psubusb   %%mm6, %%mm4          \n"
  76
  77         "por       %%mm1, %%mm2          \n"
  78         "por       %%mm3, %%mm4          \n"
  79
  80         /* now convert to 16-bit vectors so we can square them */
  81         "movq      %%mm2, %%mm1          \n"
  82         "movq      %%mm4, %%mm3          \n"
  83
  84         "punpckhbw %%mm0, %%mm2          \n"
  85         "punpckhbw %%mm0, %%mm4          \n"
  86         "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
  87         "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
  88
  89         "pmaddwd   %%mm2, %%mm2          \n"
  90         "pmaddwd   %%mm4, %%mm4          \n"
  91         "pmaddwd   %%mm1, %%mm1          \n"
  92         "pmaddwd   %%mm3, %%mm3          \n"
  93
  94         "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * line_size */
  95         "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * line_size */
  96
  97         "paddd     %%mm2, %%mm1          \n"
  98         "paddd     %%mm4, %%mm3          \n"
  99         "paddd     %%mm1, %%mm7          \n"
 100         "paddd     %%mm3, %%mm7          \n"
 101
 102         "decl      %%ecx                 \n"
 103         "jnz       1b                    \n"
 104
 105         "movq      %%mm7, %%mm1          \n"
 106         "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
 107         "paddd     %%mm7, %%mm1          \n"
 108         "movd      %%mm1, %2             \n"
 109         : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 110         : "r" ((x86_reg) line_size), "m" (h)
 111         : "%ecx");
 112
 113     return tmp;
 114 }
 115
 116 static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 117                      int line_size, int h)
 118 {
 119     int tmp;
 120
 121     __asm__ volatile (
 122         "movl %4, %%ecx\n"
 123         "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
 124         "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
 125         "1:\n"
 126         "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
 127         "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
 128         "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
 129         "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
 130
 131         /* todo: mm1-mm2, mm3-mm4 */
 132         /* algo: subtract mm1 from mm2 with saturation and vice versa */
 133         /*       OR the results to get absolute difference */
 134         "movq %%mm1, %%mm5\n"
 135         "movq %%mm3, %%mm6\n"
 136         "psubusb %%mm2, %%mm1\n"
 137         "psubusb %%mm4, %%mm3\n"
 138         "psubusb %%mm5, %%mm2\n"
 139         "psubusb %%mm6, %%mm4\n"
 140
 141         "por %%mm1, %%mm2\n"
 142         "por %%mm3, %%mm4\n"
 143
 144         /* now convert to 16-bit vectors so we can square them */
 145         "movq %%mm2, %%mm1\n"
 146         "movq %%mm4, %%mm3\n"
 147
 148         "punpckhbw %%mm0, %%mm2\n"
 149         "punpckhbw %%mm0, %%mm4\n"
 150         "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
 151         "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
 152
 153         "pmaddwd %%mm2, %%mm2\n"
 154         "pmaddwd %%mm4, %%mm4\n"
 155         "pmaddwd %%mm1, %%mm1\n"
 156         "pmaddwd %%mm3, %%mm3\n"
 157
 158         "add %3, %0\n"
 159         "add %3, %1\n"
 160
 161         "paddd %%mm2, %%mm1\n"
 162         "paddd %%mm4, %%mm3\n"
 163         "paddd %%mm1, %%mm7\n"
 164         "paddd %%mm3, %%mm7\n"
 165
 166         "decl %%ecx\n"
 167         "jnz 1b\n"
 168
 169         "movq %%mm7, %%mm1\n"
 170         "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
 171         "paddd %%mm7, %%mm1\n"
 172         "movd %%mm1, %2\n"
 173         : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 174         : "r" ((x86_reg) line_size), "m" (h)
 175         : "%ecx");
 176
 177     return tmp;
 178 }
 179
 180 static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
 181 {
 182     int tmp;
 183
 184     __asm__ volatile (
 185         "movl %3, %%ecx\n"
 186         "pxor %%mm7, %%mm7\n"
 187         "pxor %%mm6, %%mm6\n"
 188
 189         "movq (%0), %%mm0\n"
 190         "movq %%mm0, %%mm1\n"
 191         "psllq $8, %%mm0\n"
 192         "psrlq $8, %%mm1\n"
 193         "psrlq $8, %%mm0\n"
 194         "movq %%mm0, %%mm2\n"
 195         "movq %%mm1, %%mm3\n"
 196         "punpcklbw %%mm7, %%mm0\n"
 197         "punpcklbw %%mm7, %%mm1\n"
 198         "punpckhbw %%mm7, %%mm2\n"
 199         "punpckhbw %%mm7, %%mm3\n"
 200         "psubw %%mm1, %%mm0\n"
 201         "psubw %%mm3, %%mm2\n"
 202
 203         "add %2, %0\n"
 204
 205         "movq (%0), %%mm4\n"
 206         "movq %%mm4, %%mm1\n"
 207         "psllq $8, %%mm4\n"
 208         "psrlq $8, %%mm1\n"
 209         "psrlq $8, %%mm4\n"
 210         "movq %%mm4, %%mm5\n"
 211         "movq %%mm1, %%mm3\n"
 212         "punpcklbw %%mm7, %%mm4\n"
 213         "punpcklbw %%mm7, %%mm1\n"
 214         "punpckhbw %%mm7, %%mm5\n"
 215         "punpckhbw %%mm7, %%mm3\n"
 216         "psubw %%mm1, %%mm4\n"
 217         "psubw %%mm3, %%mm5\n"
 218         "psubw %%mm4, %%mm0\n"
 219         "psubw %%mm5, %%mm2\n"
 220         "pxor %%mm3, %%mm3\n"
 221         "pxor %%mm1, %%mm1\n"
 222         "pcmpgtw %%mm0, %%mm3\n\t"
 223         "pcmpgtw %%mm2, %%mm1\n\t"
 224         "pxor %%mm3, %%mm0\n"
 225         "pxor %%mm1, %%mm2\n"
 226         "psubw %%mm3, %%mm0\n"
 227         "psubw %%mm1, %%mm2\n"
 228         "paddw %%mm0, %%mm2\n"
 229         "paddw %%mm2, %%mm6\n"
 230
 231         "add %2, %0\n"
 232         "1:\n"
 233
 234         "movq (%0), %%mm0\n"
 235         "movq %%mm0, %%mm1\n"
 236         "psllq $8, %%mm0\n"
 237         "psrlq $8, %%mm1\n"
 238         "psrlq $8, %%mm0\n"
 239         "movq %%mm0, %%mm2\n"
 240         "movq %%mm1, %%mm3\n"
 241         "punpcklbw %%mm7, %%mm0\n"
 242         "punpcklbw %%mm7, %%mm1\n"
 243         "punpckhbw %%mm7, %%mm2\n"
 244         "punpckhbw %%mm7, %%mm3\n"
 245         "psubw %%mm1, %%mm0\n"
 246         "psubw %%mm3, %%mm2\n"
 247         "psubw %%mm0, %%mm4\n"
 248         "psubw %%mm2, %%mm5\n"
 249         "pxor  %%mm3, %%mm3\n"
 250         "pxor  %%mm1, %%mm1\n"
 251         "pcmpgtw %%mm4, %%mm3\n\t"
 252         "pcmpgtw %%mm5, %%mm1\n\t"
 253         "pxor  %%mm3, %%mm4\n"
 254         "pxor  %%mm1, %%mm5\n"
 255         "psubw %%mm3, %%mm4\n"
 256         "psubw %%mm1, %%mm5\n"
 257         "paddw %%mm4, %%mm5\n"
 258         "paddw %%mm5, %%mm6\n"
 259
 260         "add %2, %0\n"
 261
 262         "movq (%0), %%mm4\n"
 263         "movq      %%mm4, %%mm1\n"
 264         "psllq $8, %%mm4\n"
 265         "psrlq $8, %%mm1\n"
 266         "psrlq $8, %%mm4\n"
 267         "movq      %%mm4, %%mm5\n"
 268         "movq      %%mm1, %%mm3\n"
 269         "punpcklbw %%mm7, %%mm4\n"
 270         "punpcklbw %%mm7, %%mm1\n"
 271         "punpckhbw %%mm7, %%mm5\n"
 272         "punpckhbw %%mm7, %%mm3\n"
 273         "psubw     %%mm1, %%mm4\n"
 274         "psubw     %%mm3, %%mm5\n"
 275         "psubw     %%mm4, %%mm0\n"
 276         "psubw     %%mm5, %%mm2\n"
 277         "pxor      %%mm3, %%mm3\n"
 278         "pxor      %%mm1, %%mm1\n"
 279         "pcmpgtw   %%mm0, %%mm3\n\t"
 280         "pcmpgtw   %%mm2, %%mm1\n\t"
 281         "pxor      %%mm3, %%mm0\n"
 282         "pxor      %%mm1, %%mm2\n"
 283         "psubw     %%mm3, %%mm0\n"
 284         "psubw     %%mm1, %%mm2\n"
 285         "paddw     %%mm0, %%mm2\n"
 286         "paddw     %%mm2, %%mm6\n"
 287
 288         "add  %2, %0\n"
 289         "subl $2, %%ecx\n"
 290         " jnz 1b\n"
 291
 292         "movq      %%mm6, %%mm0\n"
 293         "punpcklwd %%mm7, %%mm0\n"
 294         "punpckhwd %%mm7, %%mm6\n"
 295         "paddd     %%mm0, %%mm6\n"
 296
 297         "movq  %%mm6, %%mm0\n"
 298         "psrlq $32,   %%mm6\n"
 299         "paddd %%mm6, %%mm0\n"
 300         "movd  %%mm0, %1\n"
 301         : "+r" (pix1), "=r" (tmp)
 302         : "r" ((x86_reg) line_size), "g" (h - 2)
 303         : "%ecx");
 304
 305     return tmp;
 306 }
 307
 308 static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
 309 {
 310     int tmp;
 311     uint8_t *pix = pix1;
 312
 313     __asm__ volatile (
 314         "movl %3, %%ecx\n"
 315         "pxor %%mm7, %%mm7\n"
 316         "pxor %%mm6, %%mm6\n"
 317
 318         "movq (%0), %%mm0\n"
 319         "movq 1(%0), %%mm1\n"
 320         "movq %%mm0, %%mm2\n"
 321         "movq %%mm1, %%mm3\n"
 322         "punpcklbw %%mm7, %%mm0\n"
 323         "punpcklbw %%mm7, %%mm1\n"
 324         "punpckhbw %%mm7, %%mm2\n"
 325         "punpckhbw %%mm7, %%mm3\n"
 326         "psubw %%mm1, %%mm0\n"
 327         "psubw %%mm3, %%mm2\n"
 328
 329         "add %2, %0\n"
 330
 331         "movq (%0), %%mm4\n"
 332         "movq 1(%0), %%mm1\n"
 333         "movq %%mm4, %%mm5\n"
 334         "movq %%mm1, %%mm3\n"
 335         "punpcklbw %%mm7, %%mm4\n"
 336         "punpcklbw %%mm7, %%mm1\n"
 337         "punpckhbw %%mm7, %%mm5\n"
 338         "punpckhbw %%mm7, %%mm3\n"
 339         "psubw %%mm1, %%mm4\n"
 340         "psubw %%mm3, %%mm5\n"
 341         "psubw %%mm4, %%mm0\n"
 342         "psubw %%mm5, %%mm2\n"
 343         "pxor %%mm3, %%mm3\n"
 344         "pxor %%mm1, %%mm1\n"
 345         "pcmpgtw %%mm0, %%mm3\n\t"
 346         "pcmpgtw %%mm2, %%mm1\n\t"
 347         "pxor %%mm3, %%mm0\n"
 348         "pxor %%mm1, %%mm2\n"
 349         "psubw %%mm3, %%mm0\n"
 350         "psubw %%mm1, %%mm2\n"
 351         "paddw %%mm0, %%mm2\n"
 352         "paddw %%mm2, %%mm6\n"
 353
 354         "add %2, %0\n"
 355         "1:\n"
 356
 357         "movq (%0), %%mm0\n"
 358         "movq 1(%0), %%mm1\n"
 359         "movq %%mm0, %%mm2\n"
 360         "movq %%mm1, %%mm3\n"
 361         "punpcklbw %%mm7, %%mm0\n"
 362         "punpcklbw %%mm7, %%mm1\n"
 363         "punpckhbw %%mm7, %%mm2\n"
 364         "punpckhbw %%mm7, %%mm3\n"
 365         "psubw %%mm1, %%mm0\n"
 366         "psubw %%mm3, %%mm2\n"
 367         "psubw %%mm0, %%mm4\n"
 368         "psubw %%mm2, %%mm5\n"
 369         "pxor %%mm3, %%mm3\n"
 370         "pxor %%mm1, %%mm1\n"
 371         "pcmpgtw %%mm4, %%mm3\n\t"
 372         "pcmpgtw %%mm5, %%mm1\n\t"
 373         "pxor %%mm3, %%mm4\n"
 374         "pxor %%mm1, %%mm5\n"
 375         "psubw %%mm3, %%mm4\n"
 376         "psubw %%mm1, %%mm5\n"
 377         "paddw %%mm4, %%mm5\n"
 378         "paddw %%mm5, %%mm6\n"
 379
 380         "add %2, %0\n"
 381
 382         "movq (%0), %%mm4\n"
 383         "movq 1(%0), %%mm1\n"
 384         "movq %%mm4, %%mm5\n"
 385         "movq %%mm1, %%mm3\n"
 386         "punpcklbw %%mm7, %%mm4\n"
 387         "punpcklbw %%mm7, %%mm1\n"
 388         "punpckhbw %%mm7, %%mm5\n"
 389         "punpckhbw %%mm7, %%mm3\n"
 390         "psubw %%mm1, %%mm4\n"
 391         "psubw %%mm3, %%mm5\n"
 392         "psubw %%mm4, %%mm0\n"
 393         "psubw %%mm5, %%mm2\n"
 394         "pxor %%mm3, %%mm3\n"
 395         "pxor %%mm1, %%mm1\n"
 396         "pcmpgtw %%mm0, %%mm3\n\t"
 397         "pcmpgtw %%mm2, %%mm1\n\t"
 398         "pxor %%mm3, %%mm0\n"
 399         "pxor %%mm1, %%mm2\n"
 400         "psubw %%mm3, %%mm0\n"
 401         "psubw %%mm1, %%mm2\n"
 402         "paddw %%mm0, %%mm2\n"
 403         "paddw %%mm2, %%mm6\n"
 404
 405         "add %2, %0\n"
 406         "subl $2, %%ecx\n"
 407         " jnz 1b\n"
 408
 409         "movq %%mm6, %%mm0\n"
 410         "punpcklwd %%mm7, %%mm0\n"
 411         "punpckhwd %%mm7, %%mm6\n"
 412         "paddd %%mm0, %%mm6\n"
 413
 414         "movq %%mm6, %%mm0\n"
 415         "psrlq $32, %%mm6\n"
 416         "paddd %%mm6, %%mm0\n"
 417         "movd %%mm0, %1\n"
 418         : "+r" (pix1), "=r" (tmp)
 419         : "r" ((x86_reg) line_size), "g" (h - 2)
 420         : "%ecx");
 421
 422     return tmp + hf_noise8_mmx(pix + 8, line_size, h);
 423 }
 424
 425 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 426                       int line_size, int h)
 427 {
 428     int score1, score2;
 429
 430     if (c)
 431         score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
 432     else
 433         score1 = sse16_mmx(c, pix1, pix2, line_size, h);
 434     score2 = hf_noise16_mmx(pix1, line_size, h) -
 435              hf_noise16_mmx(pix2, line_size, h);
 436
 437     if (c)
 438         return score1 + FFABS(score2) * c->avctx->nsse_weight;
 439     else
 440         return score1 + FFABS(score2) * 8;
 441 }
 442
 443 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 444                      int line_size, int h)
 445 {
 446     int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
 447     int score2 = hf_noise8_mmx(pix1, line_size, h) -
 448                  hf_noise8_mmx(pix2, line_size, h);
 449
 450     if (c)
 451         return score1 + FFABS(score2) * c->avctx->nsse_weight;
 452     else
 453         return score1 + FFABS(score2) * 8;
 454 }
 455
 456 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 457                             int line_size, int h)
 458 {
 459     int tmp;
 460
 461     av_assert2((((int) pix) & 7) == 0);
 462     av_assert2((line_size & 7) == 0);
 463
 464 #define SUM(in0, in1, out0, out1)               \
 465     "movq (%0), %%mm2\n"                        \
 466     "movq 8(%0), %%mm3\n"                       \
 467     "add %2,%0\n"                               \
 468     "movq %%mm2, " #out0 "\n"                   \
 469     "movq %%mm3, " #out1 "\n"                   \
 470     "psubusb " #in0 ", %%mm2\n"                 \
 471     "psubusb " #in1 ", %%mm3\n"                 \
 472     "psubusb " #out0 ", " #in0 "\n"             \
 473     "psubusb " #out1 ", " #in1 "\n"             \
 474     "por %%mm2, " #in0 "\n"                     \
 475     "por %%mm3, " #in1 "\n"                     \
 476     "movq " #in0 ", %%mm2\n"                    \
 477     "movq " #in1 ", %%mm3\n"                    \
 478     "punpcklbw %%mm7, " #in0 "\n"               \
 479     "punpcklbw %%mm7, " #in1 "\n"               \
 480     "punpckhbw %%mm7, %%mm2\n"                  \
 481     "punpckhbw %%mm7, %%mm3\n"                  \
 482     "paddw " #in1 ", " #in0 "\n"                \
 483     "paddw %%mm3, %%mm2\n"                      \
 484     "paddw %%mm2, " #in0 "\n"                   \
 485     "paddw " #in0 ", %%mm6\n"
 486
 487
 488     __asm__ volatile (
 489         "movl    %3, %%ecx\n"
 490         "pxor %%mm6, %%mm6\n"
 491         "pxor %%mm7, %%mm7\n"
 492         "movq  (%0), %%mm0\n"
 493         "movq 8(%0), %%mm1\n"
 494         "add %2, %0\n"
 495         "jmp 2f\n"
 496         "1:\n"
 497
 498         SUM(%%mm4, %%mm5, %%mm0, %%mm1)
 499         "2:\n"
 500         SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 501
 502         "subl $2, %%ecx\n"
 503         "jnz 1b\n"
 504
 505         "movq  %%mm6, %%mm0\n"
 506         "psrlq $32,   %%mm6\n"
 507         "paddw %%mm6, %%mm0\n"
 508         "movq  %%mm0, %%mm6\n"
 509         "psrlq $16,   %%mm0\n"
 510         "paddw %%mm6, %%mm0\n"
 511         "movd  %%mm0, %1\n"
 512         : "+r" (pix), "=r" (tmp)
 513         : "r" ((x86_reg) line_size), "m" (h)
 514         : "%ecx");
 515
 516     return tmp & 0xFFFF;
 517 }
 518 #undef SUM
 519
 520 static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 521                                int line_size, int h)
 522 {
 523     int tmp;
 524
 525     av_assert2((((int) pix) & 7) == 0);
 526     av_assert2((line_size & 7) == 0);
 527
 528 #define SUM(in0, in1, out0, out1)               \
 529     "movq (%0), " #out0 "\n"                    \
 530     "movq 8(%0), " #out1 "\n"                   \
 531     "add %2, %0\n"                              \
 532     "psadbw " #out0 ", " #in0 "\n"              \
 533     "psadbw " #out1 ", " #in1 "\n"              \
 534     "paddw " #in1 ", " #in0 "\n"                \
 535     "paddw " #in0 ", %%mm6\n"
 536
 537     __asm__ volatile (
 538         "movl %3, %%ecx\n"
 539         "pxor %%mm6, %%mm6\n"
 540         "pxor %%mm7, %%mm7\n"
 541         "movq (%0), %%mm0\n"
 542         "movq 8(%0), %%mm1\n"
 543         "add %2, %0\n"
 544         "jmp 2f\n"
 545         "1:\n"
 546
 547         SUM(%%mm4, %%mm5, %%mm0, %%mm1)
 548         "2:\n"
 549         SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 550
 551         "subl $2, %%ecx\n"
 552         "jnz 1b\n"
 553
 554         "movd %%mm6, %1\n"
 555         : "+r" (pix), "=r" (tmp)
 556         : "r" ((x86_reg) line_size), "m" (h)
 557         : "%ecx");
 558
 559     return tmp;
 560 }
 561 #undef SUM
 562
 563 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 564                       int line_size, int h)
 565 {
 566     int tmp;
 567
 568     av_assert2((((int) pix1) & 7) == 0);
 569     av_assert2((((int) pix2) & 7) == 0);
 570     av_assert2((line_size & 7) == 0);
 571
 572 #define SUM(in0, in1, out0, out1)       \
 573     "movq (%0), %%mm2\n"                \
 574     "movq (%1), " #out0 "\n"            \
 575     "movq 8(%0), %%mm3\n"               \
 576     "movq 8(%1), " #out1 "\n"           \
 577     "add %3, %0\n"                      \
 578     "add %3, %1\n"                      \
 579     "psubb " #out0 ", %%mm2\n"          \
 580     "psubb " #out1 ", %%mm3\n"          \
 581     "pxor %%mm7, %%mm2\n"               \
 582     "pxor %%mm7, %%mm3\n"               \
 583     "movq %%mm2, " #out0 "\n"           \
 584     "movq %%mm3, " #out1 "\n"           \
 585     "psubusb " #in0 ", %%mm2\n"         \
 586     "psubusb " #in1 ", %%mm3\n"         \
 587     "psubusb " #out0 ", " #in0 "\n"     \
 588     "psubusb " #out1 ", " #in1 "\n"     \
 589     "por %%mm2, " #in0 "\n"             \
 590     "por %%mm3, " #in1 "\n"             \
 591     "movq " #in0 ", %%mm2\n"            \
 592     "movq " #in1 ", %%mm3\n"            \
 593     "punpcklbw %%mm7, " #in0 "\n"       \
 594     "punpcklbw %%mm7, " #in1 "\n"       \
 595     "punpckhbw %%mm7, %%mm2\n"          \
 596     "punpckhbw %%mm7, %%mm3\n"          \
 597     "paddw " #in1 ", " #in0 "\n"        \
 598     "paddw %%mm3, %%mm2\n"              \
 599     "paddw %%mm2, " #in0 "\n"           \
 600     "paddw " #in0 ", %%mm6\n"
 601
 602
 603     __asm__ volatile (
 604         "movl %4, %%ecx\n"
 605         "pxor %%mm6, %%mm6\n"
 606         "pcmpeqw %%mm7, %%mm7\n"
 607         "psllw $15, %%mm7\n"
 608         "packsswb %%mm7, %%mm7\n"
 609         "movq (%0), %%mm0\n"
 610         "movq (%1), %%mm2\n"
 611         "movq 8(%0), %%mm1\n"
 612         "movq 8(%1), %%mm3\n"
 613         "add %3, %0\n"
 614         "add %3, %1\n"
 615         "psubb %%mm2, %%mm0\n"
 616         "psubb %%mm3, %%mm1\n"
 617         "pxor %%mm7, %%mm0\n"
 618         "pxor %%mm7, %%mm1\n"
 619         "jmp 2f\n"
 620         "1:\n"
 621
 622         SUM(%%mm4, %%mm5, %%mm0, %%mm1)
 623         "2:\n"
 624         SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 625
 626         "subl $2, %%ecx\n"
 627         "jnz 1b\n"
 628
 629         "movq %%mm6, %%mm0\n"
 630         "psrlq $32, %%mm6\n"
 631         "paddw %%mm6, %%mm0\n"
 632         "movq %%mm0, %%mm6\n"
 633         "psrlq $16, %%mm0\n"
 634         "paddw %%mm6, %%mm0\n"
 635         "movd %%mm0, %2\n"
 636         : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 637         : "r" ((x86_reg) line_size), "m" (h)
 638         : "%ecx");
 639
 640     return tmp & 0x7FFF;
 641 }
 642 #undef SUM
 643
 644 static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 645                          int line_size, int h)
 646 {
 647     int tmp;
 648
 649     av_assert2((((int) pix1) & 7) == 0);
 650     av_assert2((((int) pix2) & 7) == 0);
 651     av_assert2((line_size & 7) == 0);
 652
 653 #define SUM(in0, in1, out0, out1)               \
 654     "movq (%0), " #out0 "\n"                    \
 655     "movq (%1), %%mm2\n"                        \
 656     "movq 8(%0), " #out1 "\n"                   \
 657     "movq 8(%1), %%mm3\n"                       \
 658     "add %3, %0\n"                              \
 659     "add %3, %1\n"                              \
 660     "psubb %%mm2, " #out0 "\n"                  \
 661     "psubb %%mm3, " #out1 "\n"                  \
 662     "pxor %%mm7, " #out0 "\n"                   \
 663     "pxor %%mm7, " #out1 "\n"                   \
 664     "psadbw " #out0 ", " #in0 "\n"              \
 665     "psadbw " #out1 ", " #in1 "\n"              \
 666     "paddw " #in1 ", " #in0 "\n"                \
 667     "paddw " #in0 ", %%mm6\n    "
 668
 669     __asm__ volatile (
 670         "movl %4, %%ecx\n"
 671         "pxor %%mm6, %%mm6\n"
 672         "pcmpeqw %%mm7, %%mm7\n"
 673         "psllw $15, %%mm7\n"
 674         "packsswb %%mm7, %%mm7\n"
 675         "movq (%0), %%mm0\n"
 676         "movq (%1), %%mm2\n"
 677         "movq 8(%0), %%mm1\n"
 678         "movq 8(%1), %%mm3\n"
 679         "add %3, %0\n"
 680         "add %3, %1\n"
 681         "psubb %%mm2, %%mm0\n"
 682         "psubb %%mm3, %%mm1\n"
 683         "pxor %%mm7, %%mm0\n"
 684         "pxor %%mm7, %%mm1\n"
 685         "jmp 2f\n"
 686         "1:\n"
 687
 688         SUM(%%mm4, %%mm5, %%mm0, %%mm1)
 689         "2:\n"
 690         SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 691
 692         "subl $2, %%ecx\n"
 693         "jnz 1b\n"
 694
 695         "movd %%mm6, %2\n"
 696         : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 697         : "r" ((x86_reg) line_size), "m" (h)
 698         : "%ecx");
 699
 700     return tmp;
 701 }
 702 #undef SUM
 703
 704
 705 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
 706                                  int size)
 707 {
 708     int sum;
 709     x86_reg i = size;
 710
 711     __asm__ volatile (
 712         "pxor %%mm4, %%mm4 \n"
 713         "1: \n"
 714         "sub $8, %0 \n"
 715         "movq (%2, %0), %%mm2 \n"
 716         "movq (%3, %0, 2), %%mm0 \n"
 717         "movq 8(%3, %0, 2), %%mm1 \n"
 718         "punpckhbw %%mm2, %%mm3 \n"
 719         "punpcklbw %%mm2, %%mm2 \n"
 720         "psraw $8, %%mm3 \n"
 721         "psraw $8, %%mm2 \n"
 722         "psubw %%mm3, %%mm1 \n"
 723         "psubw %%mm2, %%mm0 \n"
 724         "pmaddwd %%mm1, %%mm1 \n"
 725         "pmaddwd %%mm0, %%mm0 \n"
 726         "paddd %%mm1, %%mm4 \n"
 727         "paddd %%mm0, %%mm4 \n"
 728         "jg 1b \n"
 729         "movq %%mm4, %%mm3 \n"
 730         "psrlq $32, %%mm3 \n"
 731         "paddd %%mm3, %%mm4 \n"
 732         "movd %%mm4, %1 \n"
 733         : "+r" (i), "=r" (sum)
 734         : "r" (pix1), "r" (pix2));
 735
 736     return sum;
 737 }
 738
 739 #define PHADDD(a, t)                            \
 740     "movq  " #a ", " #t "               \n\t"   \
 741     "psrlq    $32, " #a "               \n\t"   \
 742     "paddd " #t ", " #a "               \n\t"
 743
 744 /*
 745  * pmulhw:   dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 746  * pmulhrw:  dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 747  * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 748  */
 749 #define PMULHRW(x, y, s, o)                     \
 750     "pmulhw " #s ", " #x "              \n\t"   \
 751     "pmulhw " #s ", " #y "              \n\t"   \
 752     "paddw  " #o ", " #x "              \n\t"   \
 753     "paddw  " #o ", " #y "              \n\t"   \
 754     "psraw      $1, " #x "              \n\t"   \
 755     "psraw      $1, " #y "              \n\t"
 756 #define DEF(x) x ## _mmx
 757 #define SET_RND MOVQ_WONE
 758 #define SCALE_OFFSET 1
 759
 760 #include "dsputil_qns_template.c"
 761
 762 #undef DEF
 763 #undef SET_RND
 764 #undef SCALE_OFFSET
 765 #undef PMULHRW
 766
 767 #define DEF(x) x ## _3dnow
 768 #define SET_RND(x)
 769 #define SCALE_OFFSET 0
 770 #define PMULHRW(x, y, s, o)                     \
 771     "pmulhrw " #s ", " #x "             \n\t"   \
 772     "pmulhrw " #s ", " #y "             \n\t"
 773
 774 #include "dsputil_qns_template.c"
 775
 776 #undef DEF
 777 #undef SET_RND
 778 #undef SCALE_OFFSET
 779 #undef PMULHRW
 780
 781 #if HAVE_SSSE3_INLINE
 782 #undef PHADDD
 783 #define DEF(x) x ## _ssse3
 784 #define SET_RND(x)
 785 #define SCALE_OFFSET -1
 786
 787 #define PHADDD(a, t)                            \
 788     "pshufw $0x0E, " #a ", " #t "       \n\t"   \
 789     /* faster than phaddd on core2 */           \
 790     "paddd " #t ", " #a "               \n\t"
 791
 792 #define PMULHRW(x, y, s, o)                     \
 793     "pmulhrsw " #s ", " #x "            \n\t"   \
 794     "pmulhrsw " #s ", " #y "            \n\t"
 795
 796 #include "dsputil_qns_template.c"
 797
 798 #undef DEF
 799 #undef SET_RND
 800 #undef SCALE_OFFSET
 801 #undef PMULHRW
 802 #undef PHADDD
 803 #endif /* HAVE_SSSE3_INLINE */
 804
 805 #endif /* HAVE_INLINE_ASM */
 806
 807 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 808                   int line_size, int h);
 809
 810 #define hadamard_func(cpu)                                              \
 811     int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
 812                                   uint8_t *src2, int stride, int h);    \
 813     int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
 814                                     uint8_t *src2, int stride, int h);
 815
 816 hadamard_func(mmx)
 817 hadamard_func(mmxext)
 818 hadamard_func(sse2)
 819 hadamard_func(ssse3)
 820
 821 av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
 822                                     unsigned high_bit_depth)
 823 {
 824     int cpu_flags = av_get_cpu_flags();
 825     const int dct_algo = avctx->dct_algo;
 826
 827     if (EXTERNAL_MMX(cpu_flags)) {
 828         if (!high_bit_depth)
 829             c->get_pixels = ff_get_pixels_mmx;
 830         c->diff_pixels = ff_diff_pixels_mmx;
 831         c->pix_sum     = ff_pix_sum16_mmx;
 832         c->pix_norm1   = ff_pix_norm1_mmx;
 833     }
 834
 835     if (EXTERNAL_SSE2(cpu_flags))
 836         if (!high_bit_depth)
 837             c->get_pixels = ff_get_pixels_sse2;
 838
 839 #if HAVE_INLINE_ASM
 840     if (INLINE_MMX(cpu_flags)) {
 841         if (!high_bit_depth &&
 842             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
 843             c->fdct = ff_fdct_mmx;
 844
 845         c->sse[0]  = sse16_mmx;
 846         c->sse[1]  = sse8_mmx;
 847         c->vsad[4] = vsad_intra16_mmx;
 848
 849         c->nsse[0] = nsse16_mmx;
 850         c->nsse[1] = nsse8_mmx;
 851         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
 852             c->vsad[0]      = vsad16_mmx;
 853             c->try_8x8basis = try_8x8basis_mmx;
 854         }
 855         c->add_8x8basis = add_8x8basis_mmx;
 856
 857         c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
 858     }
 859
 860     if (INLINE_AMD3DNOW(cpu_flags)) {
 861         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
 862             c->try_8x8basis = try_8x8basis_3dnow;
 863         }
 864         c->add_8x8basis = add_8x8basis_3dnow;
 865     }
 866
 867     if (INLINE_MMXEXT(cpu_flags)) {
 868         if (!high_bit_depth &&
 869             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
 870             c->fdct = ff_fdct_mmxext;
 871
 872         c->vsad[4]         = vsad_intra16_mmxext;
 873
 874         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
 875             c->vsad[0] = vsad16_mmxext;
 876         }
 877     }
 878
 879     if (INLINE_SSE2(cpu_flags)) {
 880         if (!high_bit_depth &&
 881             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
 882             c->fdct = ff_fdct_sse2;
 883     }
 884
 885 #if HAVE_SSSE3_INLINE
 886     if (INLINE_SSSE3(cpu_flags)) {
 887         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
 888             c->try_8x8basis = try_8x8basis_ssse3;
 889         }
 890         c->add_8x8basis    = add_8x8basis_ssse3;
 891     }
 892 #endif
 893 #endif /* HAVE_INLINE_ASM */
 894
 895     if (EXTERNAL_MMX(cpu_flags)) {
 896         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
 897         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
 898         c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
 899     }
 900
 901     if (EXTERNAL_MMXEXT(cpu_flags)) {
 902         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
 903         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
 904         c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
 905     }
 906
 907     if (EXTERNAL_SSE2(cpu_flags)) {
 908         c->sse[0] = ff_sse16_sse2;
 909         c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
 910         c->diff_pixels = ff_diff_pixels_sse2;
 911         c->pix_sum     = ff_pix_sum16_sse2;
 912         c->pix_norm1   = ff_pix_norm1_sse2;
 913
 914 #if HAVE_ALIGNED_STACK
 915         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
 916         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
 917 #endif
 918     }
 919
 920     if (EXTERNAL_SSSE3(cpu_flags)) {
 921         c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
 922 #if HAVE_ALIGNED_STACK
 923         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
 924         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
 925 #endif
 926     }
 927
 928     ff_dsputil_init_pix_mmx(c, avctx);
 929 }