git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/attributes.h"
  31 #include "libavutil/imgutils.h"
  32 #include "libavutil/internal.h"
  33 #include "avcodec.h"
  34 #include "copy_block.h"
  35 #include "dct.h"
  36 #include "dsputil.h"
  37 #include "simple_idct.h"
  38 #include "faandct.h"
  39 #include "faanidct.h"
  40 #include "imgconvert.h"
  41 #include "mathops.h"
  42 #include "mpegvideo.h"
  43 #include "config.h"
  44 #include "diracdsp.h"
  45
  46 uint32_t ff_square_tab[512] = { 0, };
  47
/* Include the template source twice, once per BIT_DEPTH value.
 * Presumably the template derives the names and pixel types it generates
 * from BIT_DEPTH -- TODO confirm against dsputil_template.c. */
#define BIT_DEPTH 16
#include "dsputil_template.c"
#undef BIT_DEPTH

/* NOTE(review): BIT_DEPTH stays defined as 8 after this second include
 * (there is no matching #undef); verify whether later code depends on it. */
#define BIT_DEPTH 8
#include "dsputil_template.c"
  54
  55 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  56 #define pb_7f (~0UL / 255 * 0x7f)
  57 #define pb_80 (~0UL / 255 * 0x80)
  58
  59 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  60  * specification, we interleave the fields */
  61 const uint8_t ff_zigzag248_direct[64] = {
  62      0,  8,  1,  9, 16, 24,  2, 10,
  63     17, 25, 32, 40, 48, 56, 33, 41,
  64     18, 26,  3, 11,  4, 12, 19, 27,
  65     34, 42, 49, 57, 50, 58, 35, 43,
  66     20, 28,  5, 13,  6, 14, 21, 29,
  67     36, 44, 51, 59, 52, 60, 37, 45,
  68     22, 30,  7, 15, 23, 31, 38, 46,
  69     53, 61, 54, 62, 39, 47, 55, 63,
  70 };
  71
  72 const uint8_t ff_alternate_horizontal_scan[64] = {
  73      0,  1,  2,  3,  8,  9, 16, 17,
  74     10, 11,  4,  5,  6,  7, 15, 14,
  75     13, 12, 19, 18, 24, 25, 32, 33,
  76     26, 27, 20, 21, 22, 23, 28, 29,
  77     30, 31, 34, 35, 40, 41, 48, 49,
  78     42, 43, 36, 37, 38, 39, 44, 45,
  79     46, 47, 50, 51, 56, 57, 58, 59,
  80     52, 53, 54, 55, 60, 61, 62, 63,
  81 };
  82
  83 const uint8_t ff_alternate_vertical_scan[64] = {
  84      0,  8, 16, 24,  1,  9,  2, 10,
  85     17, 25, 32, 40, 48, 56, 57, 49,
  86     41, 33, 26, 18,  3, 11,  4, 12,
  87     19, 27, 34, 42, 50, 58, 35, 43,
  88     51, 59, 20, 28,  5, 13,  6, 14,
  89     21, 29, 36, 44, 52, 60, 37, 45,
  90     53, 61, 22, 30,  7, 15, 23, 31,
  91     38, 46, 54, 62, 39, 47, 55, 63,
  92 };
  93
  94 /* Input permutation for the simple_idct_mmx */
  95 static const uint8_t simple_mmx_permutation[64] = {
  96     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  97     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
  98     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
  99     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 100     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 101     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 102     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 103     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 104 };
 105
 106 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 107
 108 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
 109                                const uint8_t *src_scantable)
 110 {
 111     int i, end;
 112
 113     st->scantable = src_scantable;
 114
 115     for (i = 0; i < 64; i++) {
 116         int j = src_scantable[i];
 117         st->permutated[i] = permutation[j];
 118     }
 119
 120     end = -1;
 121     for (i = 0; i < 64; i++) {
 122         int j = st->permutated[i];
 123         if (j > end)
 124             end = j;
 125         st->raster_end[i] = end;
 126     }
 127 }
 128
 129 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
 130                                            int idct_permutation_type)
 131 {
 132     int i;
 133
 134     switch (idct_permutation_type) {
 135     case FF_NO_IDCT_PERM:
 136         for (i = 0; i < 64; i++)
 137             idct_permutation[i] = i;
 138         break;
 139     case FF_LIBMPEG2_IDCT_PERM:
 140         for (i = 0; i < 64; i++)
 141             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 142         break;
 143     case FF_SIMPLE_IDCT_PERM:
 144         for (i = 0; i < 64; i++)
 145             idct_permutation[i] = simple_mmx_permutation[i];
 146         break;
 147     case FF_TRANSPOSE_IDCT_PERM:
 148         for (i = 0; i < 64; i++)
 149             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
 150         break;
 151     case FF_PARTTRANS_IDCT_PERM:
 152         for (i = 0; i < 64; i++)
 153             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
 154         break;
 155     case FF_SSE2_IDCT_PERM:
 156         for (i = 0; i < 64; i++)
 157             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
 158         break;
 159     default:
 160         av_log(NULL, AV_LOG_ERROR,
 161                "Internal error, IDCT permutation not set\n");
 162     }
 163 }
 164
 165 static int pix_sum_c(uint8_t *pix, int line_size)
 166 {
 167     int s = 0, i, j;
 168
 169     for (i = 0; i < 16; i++) {
 170         for (j = 0; j < 16; j += 8) {
 171             s   += pix[0];
 172             s   += pix[1];
 173             s   += pix[2];
 174             s   += pix[3];
 175             s   += pix[4];
 176             s   += pix[5];
 177             s   += pix[6];
 178             s   += pix[7];
 179             pix += 8;
 180         }
 181         pix += line_size - 16;
 182     }
 183     return s;
 184 }
 185
 186 static int pix_norm1_c(uint8_t *pix, int line_size)
 187 {
 188     int s = 0, i, j;
 189     uint32_t *sq = ff_square_tab + 256;
 190
 191     for (i = 0; i < 16; i++) {
 192         for (j = 0; j < 16; j += 8) {
 193 #if 0
 194             s += sq[pix[0]];
 195             s += sq[pix[1]];
 196             s += sq[pix[2]];
 197             s += sq[pix[3]];
 198             s += sq[pix[4]];
 199             s += sq[pix[5]];
 200             s += sq[pix[6]];
 201             s += sq[pix[7]];
 202 #else
 203 #if HAVE_FAST_64BIT
 204             register uint64_t x = *(uint64_t *) pix;
 205             s += sq[x         & 0xff];
 206             s += sq[(x >>  8) & 0xff];
 207             s += sq[(x >> 16) & 0xff];
 208             s += sq[(x >> 24) & 0xff];
 209             s += sq[(x >> 32) & 0xff];
 210             s += sq[(x >> 40) & 0xff];
 211             s += sq[(x >> 48) & 0xff];
 212             s += sq[(x >> 56) & 0xff];
 213 #else
 214             register uint32_t x = *(uint32_t *) pix;
 215             s += sq[x         & 0xff];
 216             s += sq[(x >>  8) & 0xff];
 217             s += sq[(x >> 16) & 0xff];
 218             s += sq[(x >> 24) & 0xff];
 219             x  = *(uint32_t *) (pix + 4);
 220             s += sq[x         & 0xff];
 221             s += sq[(x >>  8) & 0xff];
 222             s += sq[(x >> 16) & 0xff];
 223             s += sq[(x >> 24) & 0xff];
 224 #endif
 225 #endif
 226             pix += 8;
 227         }
 228         pix += line_size - 16;
 229     }
 230     return s;
 231 }
 232
 233 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
 234 {
 235     int i;
 236
 237     for (i = 0; i + 8 <= w; i += 8) {
 238         dst[i + 0] = av_bswap32(src[i + 0]);
 239         dst[i + 1] = av_bswap32(src[i + 1]);
 240         dst[i + 2] = av_bswap32(src[i + 2]);
 241         dst[i + 3] = av_bswap32(src[i + 3]);
 242         dst[i + 4] = av_bswap32(src[i + 4]);
 243         dst[i + 5] = av_bswap32(src[i + 5]);
 244         dst[i + 6] = av_bswap32(src[i + 6]);
 245         dst[i + 7] = av_bswap32(src[i + 7]);
 246     }
 247     for (; i < w; i++)
 248         dst[i + 0] = av_bswap32(src[i + 0]);
 249 }
 250
 251 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 252 {
 253     while (len--)
 254         *dst++ = av_bswap16(*src++);
 255 }
 256
 257 static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 258 {
 259     int s = 0, i;
 260     uint32_t *sq = ff_square_tab + 256;
 261
 262     for (i = 0; i < h; i++) {
 263         s    += sq[pix1[0] - pix2[0]];
 264         s    += sq[pix1[1] - pix2[1]];
 265         s    += sq[pix1[2] - pix2[2]];
 266         s    += sq[pix1[3] - pix2[3]];
 267         pix1 += line_size;
 268         pix2 += line_size;
 269     }
 270     return s;
 271 }
 272
 273 static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 274 {
 275     int s = 0, i;
 276     uint32_t *sq = ff_square_tab + 256;
 277
 278     for (i = 0; i < h; i++) {
 279         s    += sq[pix1[0] - pix2[0]];
 280         s    += sq[pix1[1] - pix2[1]];
 281         s    += sq[pix1[2] - pix2[2]];
 282         s    += sq[pix1[3] - pix2[3]];
 283         s    += sq[pix1[4] - pix2[4]];
 284         s    += sq[pix1[5] - pix2[5]];
 285         s    += sq[pix1[6] - pix2[6]];
 286         s    += sq[pix1[7] - pix2[7]];
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 294 {
 295     int s = 0, i;
 296     uint32_t *sq = ff_square_tab + 256;
 297
 298     for (i = 0; i < h; i++) {
 299         s += sq[pix1[0]  - pix2[0]];
 300         s += sq[pix1[1]  - pix2[1]];
 301         s += sq[pix1[2]  - pix2[2]];
 302         s += sq[pix1[3]  - pix2[3]];
 303         s += sq[pix1[4]  - pix2[4]];
 304         s += sq[pix1[5]  - pix2[5]];
 305         s += sq[pix1[6]  - pix2[6]];
 306         s += sq[pix1[7]  - pix2[7]];
 307         s += sq[pix1[8]  - pix2[8]];
 308         s += sq[pix1[9]  - pix2[9]];
 309         s += sq[pix1[10] - pix2[10]];
 310         s += sq[pix1[11] - pix2[11]];
 311         s += sq[pix1[12] - pix2[12]];
 312         s += sq[pix1[13] - pix2[13]];
 313         s += sq[pix1[14] - pix2[14]];
 314         s += sq[pix1[15] - pix2[15]];
 315
 316         pix1 += line_size;
 317         pix2 += line_size;
 318     }
 319     return s;
 320 }
 321
 322 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
 323                           const uint8_t *s2, int stride)
 324 {
 325     int i;
 326
 327     /* read the pixels */
 328     for (i = 0; i < 8; i++) {
 329         block[0] = s1[0] - s2[0];
 330         block[1] = s1[1] - s2[1];
 331         block[2] = s1[2] - s2[2];
 332         block[3] = s1[3] - s2[3];
 333         block[4] = s1[4] - s2[4];
 334         block[5] = s1[5] - s2[5];
 335         block[6] = s1[6] - s2[6];
 336         block[7] = s1[7] - s2[7];
 337         s1      += stride;
 338         s2      += stride;
 339         block   += 8;
 340     }
 341 }
 342
 343 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
 344                                  int line_size)
 345 {
 346     int i;
 347
 348     /* read the pixels */
 349     for (i = 0; i < 8; i++) {
 350         pixels[0] = av_clip_uint8(block[0]);
 351         pixels[1] = av_clip_uint8(block[1]);
 352         pixels[2] = av_clip_uint8(block[2]);
 353         pixels[3] = av_clip_uint8(block[3]);
 354         pixels[4] = av_clip_uint8(block[4]);
 355         pixels[5] = av_clip_uint8(block[5]);
 356         pixels[6] = av_clip_uint8(block[6]);
 357         pixels[7] = av_clip_uint8(block[7]);
 358
 359         pixels += line_size;
 360         block  += 8;
 361     }
 362 }
 363
 364 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
 365                                  int line_size)
 366 {
 367     int i;
 368
 369     /* read the pixels */
 370     for(i=0;i<4;i++) {
 371         pixels[0] = av_clip_uint8(block[0]);
 372         pixels[1] = av_clip_uint8(block[1]);
 373         pixels[2] = av_clip_uint8(block[2]);
 374         pixels[3] = av_clip_uint8(block[3]);
 375
 376         pixels += line_size;
 377         block += 8;
 378     }
 379 }
 380
 381 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
 382                                  int line_size)
 383 {
 384     int i;
 385
 386     /* read the pixels */
 387     for(i=0;i<2;i++) {
 388         pixels[0] = av_clip_uint8(block[0]);
 389         pixels[1] = av_clip_uint8(block[1]);
 390
 391         pixels += line_size;
 392         block += 8;
 393     }
 394 }
 395
 396 static void put_signed_pixels_clamped_c(const int16_t *block,
 397                                         uint8_t *av_restrict pixels,
 398                                         int line_size)
 399 {
 400     int i, j;
 401
 402     for (i = 0; i < 8; i++) {
 403         for (j = 0; j < 8; j++) {
 404             if (*block < -128)
 405                 *pixels = 0;
 406             else if (*block > 127)
 407                 *pixels = 255;
 408             else
 409                 *pixels = (uint8_t) (*block + 128);
 410             block++;
 411             pixels++;
 412         }
 413         pixels += (line_size - 8);
 414     }
 415 }
 416
 417 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
 418                           int line_size)
 419 {
 420     int i;
 421
 422     for (i = 0; i < 8; i++) {
 423         pixels[0] += block[0];
 424         pixels[1] += block[1];
 425         pixels[2] += block[2];
 426         pixels[3] += block[3];
 427         pixels[4] += block[4];
 428         pixels[5] += block[5];
 429         pixels[6] += block[6];
 430         pixels[7] += block[7];
 431         pixels    += line_size;
 432         block     += 8;
 433     }
 434 }
 435
 436 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
 437                                  int line_size)
 438 {
 439     int i;
 440
 441     /* read the pixels */
 442     for (i = 0; i < 8; i++) {
 443         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 444         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 445         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 446         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 447         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 448         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 449         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 450         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 451         pixels   += line_size;
 452         block    += 8;
 453     }
 454 }
 455
 456 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
 457                           int line_size)
 458 {
 459     int i;
 460
 461     /* read the pixels */
 462     for(i=0;i<4;i++) {
 463         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 464         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 465         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 466         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 467         pixels += line_size;
 468         block += 8;
 469     }
 470 }
 471
 472 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
 473                           int line_size)
 474 {
 475     int i;
 476
 477     /* read the pixels */
 478     for(i=0;i<2;i++) {
 479         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 480         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static int sum_abs_dctelem_c(int16_t *block)
 487 {
 488     int sum = 0, i;
 489
 490     for (i = 0; i < 64; i++)
 491         sum += FFABS(block[i]);
 492     return sum;
 493 }
 494
 495 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 496 {
 497     int i;
 498
 499     for (i = 0; i < h; i++) {
 500         memset(block, value, 16);
 501         block += line_size;
 502     }
 503 }
 504
 505 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 506 {
 507     int i;
 508
 509     for (i = 0; i < h; i++) {
 510         memset(block, value, 8);
 511         block += line_size;
 512     }
 513 }
 514
 515 #define avg2(a, b) ((a + b + 1) >> 1)
 516 #define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
 517
 518 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
 519                    int x16, int y16, int rounder)
 520 {
 521     const int A = (16 - x16) * (16 - y16);
 522     const int B = (x16)      * (16 - y16);
 523     const int C = (16 - x16) * (y16);
 524     const int D = (x16)      * (y16);
 525     int i;
 526
 527     for (i = 0; i < h; i++) {
 528         dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
 529         dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
 530         dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
 531         dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
 532         dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
 533         dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
 534         dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
 535         dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
 536         dst   += stride;
 537         src   += stride;
 538     }
 539 }
 540
 541 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 542               int dxx, int dxy, int dyx, int dyy, int shift, int r,
 543               int width, int height)
 544 {
 545     int y, vx, vy;
 546     const int s = 1 << shift;
 547
 548     width--;
 549     height--;
 550
 551     for (y = 0; y < h; y++) {
 552         int x;
 553
 554         vx = ox;
 555         vy = oy;
 556         for (x = 0; x < 8; x++) { // FIXME: optimize
 557             int index;
 558             int src_x  = vx >> 16;
 559             int src_y  = vy >> 16;
 560             int frac_x = src_x & (s - 1);
 561             int frac_y = src_y & (s - 1);
 562
 563             src_x >>= shift;
 564             src_y >>= shift;
 565
 566             if ((unsigned) src_x < width) {
 567                 if ((unsigned) src_y < height) {
 568                     index = src_x + src_y * stride;
 569                     dst[y * stride + x] =
 570                         ((src[index]                        * (s - frac_x) +
 571                           src[index + 1]          * frac_x) * (s - frac_y) +
 572                          (src[index + stride]               * (s - frac_x) +
 573                           src[index + stride + 1] * frac_x) *      frac_y  +
 574                          r) >> (shift * 2);
 575                 } else {
 576                     index = src_x + av_clip(src_y, 0, height) * stride;
 577                     dst[y * stride + x] =
 578                         ((src[index]               * (s - frac_x) +
 579                           src[index + 1] * frac_x) *  s           +
 580                          r) >> (shift * 2);
 581                 }
 582             } else {
 583                 if ((unsigned) src_y < height) {
 584                     index = av_clip(src_x, 0, width) + src_y * stride;
 585                     dst[y * stride + x] =
 586                         ((src[index]                    * (s - frac_y) +
 587                           src[index + stride] * frac_y) *  s           +
 588                          r) >> (shift * 2);
 589                 } else {
 590                     index = av_clip(src_x, 0, width) +
 591                             av_clip(src_y, 0, height) * stride;
 592                     dst[y * stride + x] = src[index];
 593                 }
 594             }
 595
 596             vx += dxx;
 597             vy += dyx;
 598         }
 599         ox += dxy;
 600         oy += dyy;
 601     }
 602 }
 603
 604 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
 605                                           int stride, int width, int height)
 606 {
 607     switch (width) {
 608     case 2:
 609         put_pixels2_8_c(dst, src, stride, height);
 610         break;
 611     case 4:
 612         put_pixels4_8_c(dst, src, stride, height);
 613         break;
 614     case 8:
 615         put_pixels8_8_c(dst, src, stride, height);
 616         break;
 617     case 16:
 618         put_pixels16_8_c(dst, src, stride, height);
 619         break;
 620     }
 621 }
 622
 623 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
 624                                           int stride, int width, int height)
 625 {
 626     int i, j;
 627
 628     for (i = 0; i < height; i++) {
 629         for (j = 0; j < width; j++)
 630             dst[j] = ((2 * src[j] + src[j + 1] + 1) *
 631                       683) >> 11;
 632         src += stride;
 633         dst += stride;
 634     }
 635 }
 636
 637 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
 638                                           int stride, int width, int height)
 639 {
 640     int i, j;
 641
 642     for (i = 0; i < height; i++) {
 643         for (j = 0; j < width; j++)
 644             dst[j] = ((src[j] + 2 * src[j + 1] + 1) *
 645                       683) >> 11;
 646         src += stride;
 647         dst += stride;
 648     }
 649 }
 650
 651 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
 652                                           int stride, int width, int height)
 653 {
 654     int i, j;
 655
 656     for (i = 0; i < height; i++) {
 657         for (j = 0; j < width; j++)
 658             dst[j] = ((2 * src[j] + src[j + stride] + 1) *
 659                       683) >> 11;
 660         src += stride;
 661         dst += stride;
 662     }
 663 }
 664
 665 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
 666                                           int stride, int width, int height)
 667 {
 668     int i, j;
 669
 670     for (i = 0; i < height; i++) {
 671         for (j = 0; j < width; j++)
 672             dst[j] = ((4 * src[j]          + 3 * src[j + 1] +
 673                        3 * src[j + stride] + 2 * src[j + stride + 1] + 6) *
 674                       2731) >> 15;
 675         src += stride;
 676         dst += stride;
 677     }
 678 }
 679
 680 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
 681                                           int stride, int width, int height)
 682 {
 683     int i, j;
 684
 685     for (i = 0; i < height; i++) {
 686         for (j = 0; j < width; j++)
 687             dst[j] = ((3 * src[j]          + 2 * src[j + 1] +
 688                        4 * src[j + stride] + 3 * src[j + stride + 1] + 6) *
 689                       2731) >> 15;
 690         src += stride;
 691         dst += stride;
 692     }
 693 }
 694
 695 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
 696                                           int stride, int width, int height)
 697 {
 698     int i, j;
 699
 700     for (i = 0; i < height; i++) {
 701         for (j = 0; j < width; j++)
 702             dst[j] = ((src[j] + 2 * src[j + stride] + 1) *
 703                       683) >> 11;
 704         src += stride;
 705         dst += stride;
 706     }
 707 }
 708
 709 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
 710                                           int stride, int width, int height)
 711 {
 712     int i, j;
 713
 714     for (i = 0; i < height; i++) {
 715         for (j = 0; j < width; j++)
 716             dst[j] = ((3 * src[j]          + 4 * src[j + 1] +
 717                        2 * src[j + stride] + 3 * src[j + stride + 1] + 6) *
 718                       2731) >> 15;
 719         src += stride;
 720         dst += stride;
 721     }
 722 }
 723
 724 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
 725                                           int stride, int width, int height)
 726 {
 727     int i, j;
 728
 729     for (i = 0; i < height; i++) {
 730         for (j = 0; j < width; j++)
 731             dst[j] = ((2 * src[j]          + 3 * src[j + 1] +
 732                        3 * src[j + stride] + 4 * src[j + stride + 1] + 6) *
 733                       2731) >> 15;
 734         src += stride;
 735         dst += stride;
 736     }
 737 }
 738
 739 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
 740                                           int stride, int width, int height)
 741 {
 742     switch (width) {
 743     case 2:
 744         avg_pixels2_8_c(dst, src, stride, height);
 745         break;
 746     case 4:
 747         avg_pixels4_8_c(dst, src, stride, height);
 748         break;
 749     case 8:
 750         avg_pixels8_8_c(dst, src, stride, height);
 751         break;
 752     case 16:
 753         avg_pixels16_8_c(dst, src, stride, height);
 754         break;
 755     }
 756 }
 757
 758 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
 759                                           int stride, int width, int height)
 760 {
 761     int i, j;
 762
 763     for (i = 0; i < height; i++) {
 764         for (j = 0; j < width; j++)
 765             dst[j] = (dst[j] +
 766                       (((2 * src[j] + src[j + 1] + 1) *
 767                         683) >> 11) + 1) >> 1;
 768         src += stride;
 769         dst += stride;
 770     }
 771 }
 772
 773 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
 774                                           int stride, int width, int height)
 775 {
 776     int i, j;
 777
 778     for (i = 0; i < height; i++) {
 779         for (j = 0; j < width; j++)
 780             dst[j] = (dst[j] +
 781                       (((src[j] + 2 * src[j + 1] + 1) *
 782                         683) >> 11) + 1) >> 1;
 783         src += stride;
 784         dst += stride;
 785     }
 786 }
 787
 788 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
 789                                           int stride, int width, int height)
 790 {
 791     int i, j;
 792
 793     for (i = 0; i < height; i++) {
 794         for (j = 0; j < width; j++)
 795             dst[j] = (dst[j] +
 796                       (((2 * src[j] + src[j + stride] + 1) *
 797                         683) >> 11) + 1) >> 1;
 798         src += stride;
 799         dst += stride;
 800     }
 801 }
 802
 803 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
 804                                           int stride, int width, int height)
 805 {
 806     int i, j;
 807
 808     for (i = 0; i < height; i++) {
 809         for (j = 0; j < width; j++)
 810             dst[j] = (dst[j] +
 811                       (((4 * src[j]          + 3 * src[j + 1] +
 812                          3 * src[j + stride] + 2 * src[j + stride + 1] + 6) *
 813                         2731) >> 15) + 1) >> 1;
 814         src += stride;
 815         dst += stride;
 816     }
 817 }
 818
 819 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
 820                                           int stride, int width, int height)
 821 {
 822     int i, j;
 823
 824     for (i = 0; i < height; i++) {
 825         for (j = 0; j < width; j++)
 826             dst[j] = (dst[j] +
 827                       (((3 * src[j]          + 2 * src[j + 1] +
 828                          4 * src[j + stride] + 3 * src[j + stride + 1] + 6) *
 829                         2731) >> 15) + 1) >> 1;
 830         src += stride;
 831         dst += stride;
 832     }
 833 }
 834
 835 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
 836                                           int stride, int width, int height)
 837 {
 838     int i, j;
 839
 840     for (i = 0; i < height; i++) {
 841         for (j = 0; j < width; j++)
 842             dst[j] = (dst[j] +
 843                       (((src[j] + 2 * src[j + stride] + 1) *
 844                         683) >> 11) + 1) >> 1;
 845         src += stride;
 846         dst += stride;
 847     }
 848 }
 849
 850 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
 851                                           int stride, int width, int height)
 852 {
 853     int i, j;
 854
 855     for (i = 0; i < height; i++) {
 856         for (j = 0; j < width; j++)
 857             dst[j] = (dst[j] +
 858                       (((3 * src[j]          + 4 * src[j + 1] +
 859                          2 * src[j + stride] + 3 * src[j + stride + 1] + 6) *
 860                         2731) >> 15) + 1) >> 1;
 861         src += stride;
 862         dst += stride;
 863     }
 864 }
 865
 866 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
 867                                           int stride, int width, int height)
 868 {
 869     int i, j;
 870
 871     for (i = 0; i < height; i++) {
 872         for (j = 0; j < width; j++)
 873             dst[j] = (dst[j] +
 874                       (((2 * src[j]          + 3 * src[j + 1] +
 875                          3 * src[j + stride] + 4 * src[j + stride + 1] + 6) *
 876                         2731) >> 15) + 1) >> 1;
 877         src += stride;
 878         dst += stride;
 879     }
 880 }
 881
 882 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
 883 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
 884                                             int dstStride, int srcStride,     \
 885                                             int h)                            \
 886 {   /* Horizontal low-pass filter for h rows of 8 output pixels.              \
          * Each row reads the nine samples src[0..8] and issues OP(dst[k], sum)   \
          * for k = 0..7, where sum applies the symmetric weights                  \
          * (-1, 3, -6, 20, 20, -6, 3, -1), which add up to 32.  dst[3] and        \
          * dst[4] use eight distinct samples; closer to either end of the row     \
          * the outer taps reuse samples inside src[0..8] instead of reading       \
          * past it.  dst and src then advance by dstStride / srcStride.           \
          * OP is a macro argument of QPEL_MC(); what it does with dst and the     \
          * sum is not visible in this chunk.  cm is never named in this body;     \
          * presumably the OP expansion refers to it (TODO confirm).               \
          */                                                                       \
 887     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
 888     int i;                                                                    \
 889                                                                               \
 890     for (i = 0; i < h; i++) {                                                 \
 891         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
 892         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
 893         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
 894         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
 895         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
 896         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
 897         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
 898         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
 899         dst += dstStride;                                                     \
 900         src += srcStride;                                                     \
 901     }                                                                         \
 902 }                                                                             \
 903                                                                               \
 904 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
 905                                             int dstStride, int srcStride)     \
 906 {   /* Vertical low-pass filter for a block 8 columns wide.                   \
          * Each column reads the nine samples src[k * srcStride], k = 0..8,       \
          * into locals and issues OP(dst[k * dstStride], sum) for k = 0..7        \
          * with the symmetric weights (-1, 3, -6, 20, 20, -6, 3, -1), which       \
          * add up to 32.  Rows 3 and 4 use eight distinct samples; closer to      \
          * the first and last rows the outer taps reuse samples of src0..src8.    \
          * dst and src then advance by one element; w is fixed at 8 columns.      \
          * OP is a macro argument of QPEL_MC(); what it does with dst and the     \
          * sum is not visible in this chunk.  cm is never named in this body;     \
          * presumably the OP expansion refers to it (TODO confirm).               \
          */                                                                       \
 907     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
 908     const int w = 8;                                                          \
 909     int i;                                                                    \
 910                                                                               \
 911     for (i = 0; i < w; i++) {                                                 \
 912         const int src0 = src[0 * srcStride];                                  \
 913         const int src1 = src[1 * srcStride];                                  \
 914         const int src2 = src[2 * srcStride];                                  \
 915         const int src3 = src[3 * srcStride];                                  \
 916         const int src4 = src[4 * srcStride];                                  \
 917         const int src5 = src[5 * srcStride];                                  \
 918         const int src6 = src[6 * srcStride];                                  \
 919         const int src7 = src[7 * srcStride];                                  \
 920         const int src8 = src[8 * srcStride];                                  \
 921         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
 922         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
 923         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
 924         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
 925         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
 926         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
 927         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
 928         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
 929         dst++;                                                                \
 930         src++;                                                                \
 931     }                                                                         \
 932 }                                                                             \
 933                                                                               \
 934 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
 935                                              int dstStride, int srcStride,    \
 936                                              int h)                           \
 937 {   /* Horizontal low-pass filter for h rows of 16 output pixels.             \
          * Each row reads the 17 samples src[0..16] and issues OP(dst[k], sum)    \
          * for k = 0..15, where sum applies the symmetric weights                 \
          * (-1, 3, -6, 20, 20, -6, 3, -1), which add up to 32.  dst[3] to         \
          * dst[12] use eight distinct samples; closer to either end of the row    \
          * the outer taps reuse samples inside src[0..16] instead of reading      \
          * past it.  dst and src then advance by dstStride / srcStride.           \
          * OP is a macro argument of QPEL_MC(); what it does with dst and the     \
          * sum is not visible in this chunk.  cm is never named in this body;     \
          * presumably the OP expansion refers to it (TODO confirm).               \
          */                                                                       \
 938     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
 939     int i;                                                                    \
 940                                                                               \
 941     for (i = 0; i < h; i++) {                                                 \
 942         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
 943         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
 944         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
 945         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
 946         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
 947         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
 948         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
 949         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
 950         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
 951         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
 952         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
 953         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
 954         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
 955         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
 956         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
 957         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
 958         dst += dstStride;                                                     \
 959         src += srcStride;                                                     \
 960     }                                                                         \
 961 }                                                                             \
 962                                                                               \
 963 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
 964                                              int dstStride, int srcStride)    \
 965 {   /* Vertical low-pass filter for a block 16 columns wide.                  \
          * Each column reads the 17 samples src[k * srcStride], k = 0..16,        \
          * into locals and issues OP(dst[k * dstStride], sum) for k = 0..15       \
          * with the symmetric weights (-1, 3, -6, 20, 20, -6, 3, -1), which       \
          * add up to 32.  Rows 3 to 12 use eight distinct samples; closer to      \
          * the first and last rows the outer taps reuse samples of src0..src16.   \
          * dst and src then advance by one element; w is fixed at 16 columns.     \
          * OP is a macro argument of QPEL_MC(); what it does with dst and the     \
          * sum is not visible in this chunk.  cm is never named in this body;     \
          * presumably the OP expansion refers to it (TODO confirm).               \
          */                                                                       \
 966     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
 967     const int w = 16;                                                         \
 968     int i;                                                                    \
 969                                                                               \
 970     for (i = 0; i < w; i++) {                                                 \
 971         const int src0  = src[0  * srcStride];                                \
 972         const int src1  = src[1  * srcStride];                                \
 973         const int src2  = src[2  * srcStride];                                \
 974         const int src3  = src[3  * srcStride];                                \
 975         const int src4  = src[4  * srcStride];                                \
 976         const int src5  = src[5  * srcStride];                                \
 977         const int src6  = src[6  * srcStride];                                \
 978         const int src7  = src[7  * srcStride];                                \
 979         const int src8  = src[8  * srcStride];                                \
 980         const int src9  = src[9  * srcStride];                                \
 981         const int src10 = src[10 * srcStride];                                \
 982         const int src11 = src[11 * srcStride];                                \
 983         const int src12 = src[12 * srcStride];                                \
 984         const int src13 = src[13 * srcStride];                                \
 985         const int src14 = src[14 * srcStride];                                \
 986         const int src15 = src[15 * srcStride];                                \
 987         const int src16 = src[16 * srcStride];                                \
 988         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
 989         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
 990         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
 991         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
 992         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
 993         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
 994         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
 995         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
 996         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
 997         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
 998         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
 999         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
1000         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
1001         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
1002         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
1003         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
1004         dst++;                                                                \
1005         src++;                                                                \
1006     }                                                                         \
1007 }                                                                             \
1008                                                                               \
1009 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
1010                                    ptrdiff_t stride)                          \
1011 {                                                                             \
         /* Low-pass 8 rows of src horizontally into the packed 8x8 scratch lp,    \
          * then hand dst, src and lp to the OPNAME flavour of pixels8_l2_8.       \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1012     uint8_t lp[8 * 8];                                                        \
1013                                                                               \
1014     put ## RND ## mpeg4_qpel8_h_lowpass(lp, src, 8, stride, 8);               \
1015     OPNAME ## pixels8_l2_8(dst, src, lp, stride, stride, 8, 8);               \
1016 }                                                                             \
1017                                                                               \
1018 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
1019                                    ptrdiff_t stride)                          \
1020 {   /* Forwards to the OPNAME flavour of mpeg4_qpel8_h_lowpass for 8 rows,    \
          * writing into dst directly; stride is passed as the row pitch of        \
          * both dst and src.  No scratch buffer is involved.                      \
          */                                                                       \
1021     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
1022 }                                                                             \
1023                                                                               \
1024 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
1025                                    ptrdiff_t stride)                          \
1026 {                                                                             \
         /* Low-pass 8 rows of src horizontally into the packed 8x8 scratch lp,    \
          * then hand dst, src + 1 and lp to the OPNAME flavour of pixels8_l2_8.   \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1027     uint8_t lp[8 * 8];                                                        \
1028                                                                               \
1029     put ## RND ## mpeg4_qpel8_h_lowpass(lp, src, 8, stride, 8);               \
1030     OPNAME ## pixels8_l2_8(dst, src + 1, lp, stride, stride, 8, 8);           \
1031 }                                                                             \
1032                                                                               \
1033 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
1034                                    ptrdiff_t stride)                          \
1035 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * blk is low-passed vertically into the packed 8x8 scratch lp, then      \
          * dst, blk and lp go to the OPNAME flavour of pixels8_l2_8.              \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1036     uint8_t blk[16 * 9];                                                      \
1037     uint8_t lp[8 * 8];                                                        \
1038                                                                               \
1039     copy_block9(blk, src, 16, stride, 9);                                     \
1040     put ## RND ## mpeg4_qpel8_v_lowpass(lp, blk, 8, 16);                      \
1041     OPNAME ## pixels8_l2_8(dst, blk, lp, stride, 16, 8, 8);                   \
1042 }                                                                             \
1043                                                                               \
1044 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
1045                                    ptrdiff_t stride)                          \
1046 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * The OPNAME vertical low-pass then runs from blk straight into dst. */  \
1047     uint8_t blk[16 * 9];                                                      \
1048                                                                               \
1049     copy_block9(blk, src, 16, stride, 9);                                     \
1050     OPNAME ## mpeg4_qpel8_v_lowpass(dst, blk, stride, 16);                    \
1051 }                                                                             \
1052                                                                               \
1053 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
1054                                    ptrdiff_t stride)                          \
1055 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * blk is low-passed vertically into the packed 8x8 scratch lp, then      \
          * dst, blk + 16 and lp go to the OPNAME flavour of pixels8_l2_8.         \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1056     uint8_t blk[16 * 9];                                                      \
1057     uint8_t lp[8 * 8];                                                        \
1058                                                                               \
1059     copy_block9(blk, src, 16, stride, 9);                                     \
1060     put ## RND ## mpeg4_qpel8_v_lowpass(lp, blk, 8, 16);                      \
1061     OPNAME ## pixels8_l2_8(dst, blk + 16, lp, stride, 16, 8, 8);              \
1062 }                                                                             \
1063                                                                               \
1064 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
1065                                        ptrdiff_t stride)                      \
1066 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows), lpV = vertical low-pass     \
          * of blk, lpHV = vertical low-pass of lpH.  dst, blk, lpH, lpV and       \
          * lpHV are handed to the OPNAME flavour of pixels8_l4_8.                 \
          * pixels8_l4_8 is not in this chunk; presumably a 4-input average. */    \
1067     uint8_t blk[16 * 9];                                                      \
1068     uint8_t lpH[8 * 9];                                                       \
1069     uint8_t lpV[8 * 8];                                                       \
1070     uint8_t lpHV[8 * 8];                                                      \
1071                                                                               \
1072     copy_block9(blk, src, 16, stride, 9);                                     \
1073     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1074     put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk, 8, 16);                     \
1075     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1076     OPNAME ## pixels8_l4_8(dst, blk, lpH, lpV, lpHV,                          \
1077                            stride, 16, 8, 8, 8, 8);                           \
1078 }                                                                             \
1079                                                                               \
1080 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
1081                                    ptrdiff_t stride)                          \
1082 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows); the put flavour of          \
          * pixels8_l2_8 then rewrites lpH in place from lpH and blk.              \
          * lpHV = vertical low-pass of lpH; dst, lpH and lpHV finally go to       \
          * the OPNAME flavour of pixels8_l2_8.                                    \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1083     uint8_t blk[16 * 9];                                                      \
1084     uint8_t lpH[8 * 9];                                                       \
1085     uint8_t lpHV[8 * 8];                                                      \
1086                                                                               \
1087     copy_block9(blk, src, 16, stride, 9);                                     \
1088     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1089     put ## RND ## pixels8_l2_8(lpH, lpH, blk, 8, 8, 16, 9);                   \
1090     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1091     OPNAME ## pixels8_l2_8(dst, lpH, lpHV, stride, 8, 8, 8);                  \
1092 }                                                                             \
1093                                                                               \
1094 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
1095                                        ptrdiff_t stride)                      \
1096 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows), lpV = vertical low-pass     \
          * of blk + 1, lpHV = vertical low-pass of lpH.  dst, blk + 1, lpH,       \
          * lpV and lpHV are handed to the OPNAME flavour of pixels8_l4_8.         \
          * pixels8_l4_8 is not in this chunk; presumably a 4-input average. */    \
1097     uint8_t blk[16 * 9];                                                      \
1098     uint8_t lpH[8 * 9];                                                       \
1099     uint8_t lpV[8 * 8];                                                       \
1100     uint8_t lpHV[8 * 8];                                                      \
1101                                                                               \
1102     copy_block9(blk, src, 16, stride, 9);                                     \
1103     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1104     put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk + 1, 8, 16);                 \
1105     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1106     OPNAME ## pixels8_l4_8(dst, blk + 1, lpH, lpV, lpHV,                      \
1107                            stride, 16, 8, 8, 8, 8);                           \
1108 }                                                                             \
1109                                                                               \
1110 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
1111                                    ptrdiff_t stride)                          \
1112 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows); the put flavour of          \
          * pixels8_l2_8 then rewrites lpH in place from lpH and blk + 1.          \
          * lpHV = vertical low-pass of lpH; dst, lpH and lpHV finally go to       \
          * the OPNAME flavour of pixels8_l2_8.                                    \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1113     uint8_t blk[16 * 9];                                                      \
1114     uint8_t lpH[8 * 9];                                                       \
1115     uint8_t lpHV[8 * 8];                                                      \
1116                                                                               \
1117     copy_block9(blk, src, 16, stride, 9);                                     \
1118     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1119     put ## RND ## pixels8_l2_8(lpH, lpH, blk + 1, 8, 8, 16, 9);               \
1120     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1121     OPNAME ## pixels8_l2_8(dst, lpH, lpHV, stride, 8, 8, 8);                  \
1122 }                                                                             \
1123                                                                               \
1124 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
1125                                        ptrdiff_t stride)                      \
1126 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows), lpV = vertical low-pass     \
          * of blk, lpHV = vertical low-pass of lpH.  dst, blk + 16, lpH + 8,      \
          * lpV and lpHV are handed to the OPNAME flavour of pixels8_l4_8.         \
          * pixels8_l4_8 is not in this chunk; presumably a 4-input average. */    \
1127     uint8_t blk[16 * 9];                                                      \
1128     uint8_t lpH[8 * 9];                                                       \
1129     uint8_t lpV[8 * 8];                                                       \
1130     uint8_t lpHV[8 * 8];                                                      \
1131                                                                               \
1132     copy_block9(blk, src, 16, stride, 9);                                     \
1133     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1134     put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk, 8, 16);                     \
1135     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1136     OPNAME ## pixels8_l4_8(dst, blk + 16, lpH + 8, lpV, lpHV,                 \
1137                            stride, 16, 8, 8, 8, 8);                           \
1138 }                                                                             \
1139                                                                               \
1140 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
1141                                    ptrdiff_t stride)                          \
1142 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows); the put flavour of          \
          * pixels8_l2_8 then rewrites lpH in place from lpH and blk.              \
          * lpHV = vertical low-pass of lpH; dst, lpH + 8 and lpHV finally go      \
          * to the OPNAME flavour of pixels8_l2_8.                                 \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1143     uint8_t blk[16 * 9];                                                      \
1144     uint8_t lpH[8 * 9];                                                       \
1145     uint8_t lpHV[8 * 8];                                                      \
1146                                                                               \
1147     copy_block9(blk, src, 16, stride, 9);                                     \
1148     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1149     put ## RND ## pixels8_l2_8(lpH, lpH, blk, 8, 8, 16, 9);                   \
1150     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1151     OPNAME ## pixels8_l2_8(dst, lpH + 8, lpHV, stride, 8, 8, 8);              \
1152 }                                                                             \
1153                                                                               \
1154 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
1155                                        ptrdiff_t stride)                      \
1156 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows), lpV = vertical low-pass     \
          * of blk + 1, lpHV = vertical low-pass of lpH.  dst, blk + 17,           \
          * lpH + 8, lpV and lpHV are handed to the OPNAME flavour of              \
          * pixels8_l4_8 (not in this chunk; presumably a 4-input average). */     \
1157     uint8_t blk[16 * 9];                                                      \
1158     uint8_t lpH[8 * 9];                                                       \
1159     uint8_t lpV[8 * 8];                                                       \
1160     uint8_t lpHV[8 * 8];                                                      \
1161                                                                               \
1162     copy_block9(blk, src, 16, stride, 9);                                     \
1163     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1164     put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk + 1, 8, 16);                 \
1165     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1166     OPNAME ## pixels8_l4_8(dst, blk + 17, lpH + 8, lpV, lpHV,                 \
1167                            stride, 16, 8, 8, 8, 8);                           \
1168 }                                                                             \
1169                                                                               \
1170 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
1171                                    ptrdiff_t stride)                          \
1172 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows); the put flavour of          \
          * pixels8_l2_8 then rewrites lpH in place from lpH and blk + 1.          \
          * lpHV = vertical low-pass of lpH; dst, lpH + 8 and lpHV finally go      \
          * to the OPNAME flavour of pixels8_l2_8.                                 \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1173     uint8_t blk[16 * 9];                                                      \
1174     uint8_t lpH[8 * 9];                                                       \
1175     uint8_t lpHV[8 * 8];                                                      \
1176                                                                               \
1177     copy_block9(blk, src, 16, stride, 9);                                     \
1178     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1179     put ## RND ## pixels8_l2_8(lpH, lpH, blk + 1, 8, 8, 16, 9);               \
1180     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1181     OPNAME ## pixels8_l2_8(dst, lpH + 8, lpHV, stride, 8, 8, 8);              \
1182 }                                                                             \
1183                                                                               \
1184 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
1185                                    ptrdiff_t stride)                          \
1186 {                                                                             \
         /* lpH = horizontal low-pass of 9 rows of src, packed with pitch 8;       \
          * lpHV = vertical low-pass of lpH; dst, lpH and lpHV finally go to       \
          * the OPNAME flavour of pixels8_l2_8.                                    \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1187     uint8_t lpH[8 * 9];                                                       \
1188     uint8_t lpHV[8 * 8];                                                      \
1189                                                                               \
1190     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);              \
1191     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1192     OPNAME ## pixels8_l2_8(dst, lpH, lpHV, stride, 8, 8, 8);                  \
1193 }                                                                             \
1194                                                                               \
1195 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
1196                                    ptrdiff_t stride)                          \
1197 {                                                                             \
         /* lpH = horizontal low-pass of 9 rows of src, packed with pitch 8;       \
          * lpHV = vertical low-pass of lpH; dst, lpH + 8 and lpHV finally go      \
          * to the OPNAME flavour of pixels8_l2_8.                                 \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1198     uint8_t lpH[8 * 9];                                                       \
1199     uint8_t lpHV[8 * 8];                                                      \
1200                                                                               \
1201     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);              \
1202     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1203     OPNAME ## pixels8_l2_8(dst, lpH + 8, lpHV, stride, 8, 8, 8);              \
1204 }                                                                             \
1205                                                                               \
1206 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
1207                                        ptrdiff_t stride)                      \
1208 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows), lpV = vertical low-pass     \
          * of blk, lpHV = vertical low-pass of lpH.  dst, lpV and lpHV are        \
          * handed to the OPNAME flavour of pixels8_l2_8.                          \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1209     uint8_t blk[16 * 9];                                                      \
1210     uint8_t lpH[8 * 9];                                                       \
1211     uint8_t lpV[8 * 8];                                                       \
1212     uint8_t lpHV[8 * 8];                                                      \
1213                                                                               \
1214     copy_block9(blk, src, 16, stride, 9);                                     \
1215     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1216     put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk, 8, 16);                     \
1217     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1218     OPNAME ## pixels8_l2_8(dst, lpV, lpHV, stride, 8, 8, 8);                  \
1219 }                                                                             \
1220                                                                               \
1221 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
1222                                    ptrdiff_t stride)                          \
1223 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows); the put flavour of          \
          * pixels8_l2_8 then rewrites lpH in place from lpH and blk.              \
          * The OPNAME vertical low-pass then runs from lpH straight into dst.     \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1224     uint8_t blk[16 * 9];                                                      \
1225     uint8_t lpH[8 * 9];                                                       \
1226                                                                               \
1227     copy_block9(blk, src, 16, stride, 9);                                     \
1228     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1229     put ## RND ## pixels8_l2_8(lpH, lpH, blk, 8, 8, 16, 9);                   \
1230     OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);                     \
1231 }                                                                             \
1232                                                                               \
1233 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
1234                                        ptrdiff_t stride)                      \
1235 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows), lpV = vertical low-pass     \
          * of blk + 1, lpHV = vertical low-pass of lpH.  dst, lpV and lpHV        \
          * are handed to the OPNAME flavour of pixels8_l2_8.                      \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1236     uint8_t blk[16 * 9];                                                      \
1237     uint8_t lpH[8 * 9];                                                       \
1238     uint8_t lpV[8 * 8];                                                       \
1239     uint8_t lpHV[8 * 8];                                                      \
1240                                                                               \
1241     copy_block9(blk, src, 16, stride, 9);                                     \
1242     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1243     put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk + 1, 8, 16);                 \
1244     put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);                     \
1245     OPNAME ## pixels8_l2_8(dst, lpV, lpHV, stride, 8, 8, 8);                  \
1246 }                                                                             \
1247                                                                               \
1248 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
1249                                    ptrdiff_t stride)                          \
1250 {                                                                             \
         /* copy_block9 (not in this chunk) presumably copies 9 rows of src        \
          * into blk, which the filters below read with a row pitch of 16.         \
          * lpH = horizontal low-pass of blk (9 rows); the put flavour of          \
          * pixels8_l2_8 then rewrites lpH in place from lpH and blk + 1.          \
          * The OPNAME vertical low-pass then runs from lpH straight into dst.     \
          * pixels8_l2_8 is not in this chunk; presumably a 2-input average. */    \
1251     uint8_t blk[16 * 9];                                                      \
1252     uint8_t lpH[8 * 9];                                                       \
1253                                                                               \
1254     copy_block9(blk, src, 16, stride, 9);                                     \
1255     put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);                  \
1256     put ## RND ## pixels8_l2_8(lpH, lpH, blk + 1, 8, 8, 16, 9);               \
1257     OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);                     \
1258 }                                                                             \
1259                                                                               \
1260 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
1261                                    ptrdiff_t stride)                          \
1262 {                                                                             \
1263     uint8_t halfH[72];                                                        \
1264                                                                               \
1265     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
1266     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
1267 }                                                                             \
1268                                                                               \
1269 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
1270                                     ptrdiff_t stride)                         \
1271 {                                                                             \
1272     uint8_t half[256];                                                        \
1273                                                                               \
1274     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1275     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
1276 }                                                                             \
1277                                                                               \
1278 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
1279                                     ptrdiff_t stride)                         \
1280 {                                                                             \
1281     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
1282 }                                                                             \
1283                                                                               \
1284 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
1285                                     ptrdiff_t stride)                         \
1286 {                                                                             \
1287     uint8_t half[256];                                                        \
1288                                                                               \
1289     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1290     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
1291 }                                                                             \
1292                                                                               \
1293 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
1294                                     ptrdiff_t stride)                         \
1295 {                                                                             \
1296     uint8_t full[24 * 17];                                                    \
1297     uint8_t half[256];                                                        \
1298                                                                               \
1299     copy_block17(full, src, 24, stride, 17);                                  \
1300     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1301     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
1302 }                                                                             \
1303                                                                               \
1304 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
1305                                     ptrdiff_t stride)                         \
1306 {                                                                             \
1307     uint8_t full[24 * 17];                                                    \
1308                                                                               \
1309     copy_block17(full, src, 24, stride, 17);                                  \
1310     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
1311 }                                                                             \
1312                                                                               \
1313 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
1314                                     ptrdiff_t stride)                         \
1315 {                                                                             \
1316     uint8_t full[24 * 17];                                                    \
1317     uint8_t half[256];                                                        \
1318                                                                               \
1319     copy_block17(full, src, 24, stride, 17);                                  \
1320     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1321     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
1322 }                                                                             \
1323                                                                               \
1324 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
1325                                         ptrdiff_t stride)                     \
1326 {                                                                             \
1327     uint8_t full[24 * 17];                                                    \
1328     uint8_t halfH[272];                                                       \
1329     uint8_t halfV[256];                                                       \
1330     uint8_t halfHV[256];                                                      \
1331                                                                               \
1332     copy_block17(full, src, 24, stride, 17);                                  \
1333     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1334     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1335     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1336     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1337                             stride, 24, 16, 16, 16, 16);                      \
1338 }                                                                             \
1339                                                                               \
1340 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1341                                     ptrdiff_t stride)                         \
1342 {                                                                             \
1343     uint8_t full[24 * 17];                                                    \
1344     uint8_t halfH[272];                                                       \
1345     uint8_t halfHV[256];                                                      \
1346                                                                               \
1347     copy_block17(full, src, 24, stride, 17);                                  \
1348     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1349     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1350     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1351     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1352 }                                                                             \
1353                                                                               \
1354 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1355                                         ptrdiff_t stride)                     \
1356 {                                                                             \
1357     uint8_t full[24 * 17];                                                    \
1358     uint8_t halfH[272];                                                       \
1359     uint8_t halfV[256];                                                       \
1360     uint8_t halfHV[256];                                                      \
1361                                                                               \
1362     copy_block17(full, src, 24, stride, 17);                                  \
1363     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1364     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1365     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1366     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1367                             stride, 24, 16, 16, 16, 16);                      \
1368 }                                                                             \
1369                                                                               \
1370 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1371                                     ptrdiff_t stride)                         \
1372 {                                                                             \
1373     uint8_t full[24 * 17];                                                    \
1374     uint8_t halfH[272];                                                       \
1375     uint8_t halfHV[256];                                                      \
1376                                                                               \
1377     copy_block17(full, src, 24, stride, 17);                                  \
1378     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1379     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1380     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1381     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1382 }                                                                             \
1383                                                                               \
1384 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1385                                         ptrdiff_t stride)                     \
1386 {                                                                             \
1387     uint8_t full[24 * 17];                                                    \
1388     uint8_t halfH[272];                                                       \
1389     uint8_t halfV[256];                                                       \
1390     uint8_t halfHV[256];                                                      \
1391                                                                               \
1392     copy_block17(full, src, 24, stride, 17);                                  \
1393     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1394     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1395     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1396     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1397                             stride, 24, 16, 16, 16, 16);                      \
1398 }                                                                             \
1399                                                                               \
1400 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1401                                     ptrdiff_t stride)                         \
1402 {                                                                             \
1403     uint8_t full[24 * 17];                                                    \
1404     uint8_t halfH[272];                                                       \
1405     uint8_t halfHV[256];                                                      \
1406                                                                               \
1407     copy_block17(full, src, 24, stride, 17);                                  \
1408     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1409     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1410     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1411     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1412 }                                                                             \
1413                                                                               \
1414 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1415                                         ptrdiff_t stride)                     \
1416 {                                                                             \
1417     uint8_t full[24 * 17];                                                    \
1418     uint8_t halfH[272];                                                       \
1419     uint8_t halfV[256];                                                       \
1420     uint8_t halfHV[256];                                                      \
1421                                                                               \
1422     copy_block17(full, src, 24, stride, 17);                                  \
1423     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1424     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1425     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1426     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1427                             stride, 24, 16, 16, 16, 16);                      \
1428 }                                                                             \
1429                                                                               \
1430 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1431                                     ptrdiff_t stride)                         \
1432 {                                                                             \
1433     uint8_t full[24 * 17];                                                    \
1434     uint8_t halfH[272];                                                       \
1435     uint8_t halfHV[256];                                                      \
1436                                                                               \
1437     copy_block17(full, src, 24, stride, 17);                                  \
1438     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1439     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1440     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1441     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1442 }                                                                             \
1443                                                                               \
1444 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1445                                     ptrdiff_t stride)                         \
1446 {                                                                             \
1447     uint8_t halfH[272];                                                       \
1448     uint8_t halfHV[256];                                                      \
1449                                                                               \
1450     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1451     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1452     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1453 }                                                                             \
1454                                                                               \
1455 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1456                                     ptrdiff_t stride)                         \
1457 {                                                                             \
1458     uint8_t halfH[272];                                                       \
1459     uint8_t halfHV[256];                                                      \
1460                                                                               \
1461     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1462     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1463     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1464 }                                                                             \
1465                                                                               \
1466 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1467                                         ptrdiff_t stride)                     \
1468 {                                                                             \
1469     uint8_t full[24 * 17];                                                    \
1470     uint8_t halfH[272];                                                       \
1471     uint8_t halfV[256];                                                       \
1472     uint8_t halfHV[256];                                                      \
1473                                                                               \
1474     copy_block17(full, src, 24, stride, 17);                                  \
1475     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1476     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1477     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1478     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1479 }                                                                             \
1480                                                                               \
1481 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1482                                     ptrdiff_t stride)                         \
1483 {                                                                             \
1484     uint8_t full[24 * 17];                                                    \
1485     uint8_t halfH[272];                                                       \
1486                                                                               \
1487     copy_block17(full, src, 24, stride, 17);                                  \
1488     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1489     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1490     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1491 }                                                                             \
1492                                                                               \
1493 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1494                                         ptrdiff_t stride)                     \
1495 {                                                                             \
1496     uint8_t full[24 * 17];                                                    \
1497     uint8_t halfH[272];                                                       \
1498     uint8_t halfV[256];                                                       \
1499     uint8_t halfHV[256];                                                      \
1500                                                                               \
1501     copy_block17(full, src, 24, stride, 17);                                  \
1502     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1503     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1504     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1505     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1506 }                                                                             \
1507                                                                               \
1508 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1509                                     ptrdiff_t stride)                         \
1510 {                                                                             \
1511     uint8_t full[24 * 17];                                                    \
1512     uint8_t halfH[272];                                                       \
1513                                                                               \
1514     copy_block17(full, src, 24, stride, 17);                                  \
1515     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1516     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1517     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1518 }                                                                             \
1519                                                                               \
1520 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1521                                     ptrdiff_t stride)                         \
1522 {                                                                             \
1523     uint8_t halfH[272];                                                       \
1524                                                                               \
1525     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1526     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1527 }
1528
1529 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1530 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1531 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1532 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1533
1534 QPEL_MC(0, put_, _, op_put)
1535 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1536 QPEL_MC(0, avg_, _, op_avg)
1537
1538 #undef op_avg
1539 #undef op_put
1540 #undef op_put_no_rnd
1541
/**
 * Fixed-size entry point around put_pixels8_8_c().
 *
 * dst, src and stride are forwarded unchanged; the helper's last argument
 * is pinned to 8 (presumably the row count, giving an 8x8 block -- confirm
 * against put_pixels8_8_c()).
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_8_c(dst, src, stride, rows);
}
1546
/**
 * Fixed-size entry point around avg_pixels8_8_c().
 *
 * dst, src and stride are forwarded unchanged; the helper's last argument
 * is pinned to 8 (presumably the row count, giving an 8x8 block -- confirm
 * against avg_pixels8_8_c()).
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_8_c(dst, src, stride, rows);
}
1551
/**
 * Fixed-size entry point around put_pixels16_8_c().
 *
 * dst, src and stride are forwarded unchanged; the helper's last argument
 * is pinned to 16 (presumably the row count, giving a 16x16 block -- confirm
 * against put_pixels16_8_c()).
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_8_c(dst, src, stride, rows);
}
1556
/**
 * Fixed-size entry point around avg_pixels16_8_c().
 *
 * dst, src and stride are forwarded unchanged; the helper's last argument
 * is pinned to 16 (presumably the row count, giving a 16x16 block -- confirm
 * against avg_pixels16_8_c()).
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_8_c(dst, src, stride, rows);
}
1561
/*
 * The qpel "mc00" names get no implementation of their own: each one is an
 * alias for the fixed-size put/avg wrapper of the matching block size.
 * The put_no_rnd names resolve to the same functions as the plain put names.
 */
/* 8x8 blocks */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
/* 16x16 blocks */
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1568
1569 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1570                                   int dstStride, int srcStride, int h)
1571 {
1572     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1573     int i;
1574
1575     for (i = 0; i < h; i++) {
1576         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1577         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1578         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1579         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1580         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1581         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1582         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1583         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1584         dst   += dstStride;
1585         src   += srcStride;
1586     }
1587 }
1588
1589 #if CONFIG_RV40_DECODER
/**
 * RV40 16-wide "mc33" put: implemented by put_pixels16_xy2_8_c() with its
 * last argument fixed to 16 (presumably the row count -- confirm against
 * the helper).
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_xy2_8_c(dst, src, stride, rows);
}
1594
/**
 * RV40 16-wide "mc33" avg: implemented by avg_pixels16_xy2_8_c() with its
 * last argument fixed to 16 (presumably the row count -- confirm against
 * the helper).
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}
1599
/**
 * RV40 8-wide "mc33" put: implemented by put_pixels8_xy2_8_c() with its
 * last argument fixed to 8 (presumably the row count -- confirm against
 * the helper).
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_xy2_8_c(dst, src, stride, rows);
}
1604
/**
 * RV40 8-wide "mc33" avg: implemented by avg_pixels8_xy2_8_c() with its
 * last argument fixed to 8 (presumably the row count -- confirm against
 * the helper).
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
1609 #endif /* CONFIG_RV40_DECODER */
1610
#if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME) emits nine functions,
 *     ff_<OPNAME>_dirac_pixels{8,16,32}_c
 *     ff_<OPNAME>_dirac_pixels{8,16,32}_l2_c
 *     ff_<OPNAME>_dirac_pixels{8,16,32}_l4_c
 * each taking an array of source pointers and forwarding to the matching
 * <OPNAME>_pixels* helper. The plain variants use src[0], the _l2 variants
 * src[0..1] and the _l4 variants src[0..3]; dst and every source share the
 * one stride. The 32-wide variants call the 16-wide helper twice, the
 * second time with dst and every source pointer advanced by 16.
 */
#define DIRAC_MC(OPNAME) \
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, \
                                       const uint8_t *src[5], \
                                       int stride, int h) \
{ \
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, \
                                        const uint8_t *src[5], \
                                        int stride, int h) \
{ \
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, \
                                        const uint8_t *src[5], \
                                        int stride, int h) \
{ \
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h); \
    OPNAME ## _pixels16_8_c(dst + 16, src[0] + 16, stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, \
                                          const uint8_t *src[5], \
                                          int stride, int h) \
{ \
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], \
                            stride, stride, stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, \
                                           const uint8_t *src[5], \
                                           int stride, int h) \
{ \
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], \
                             stride, stride, stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, \
                                           const uint8_t *src[5], \
                                           int stride, int h) \
{ \
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], \
                             stride, stride, stride, h); \
    OPNAME ## _pixels16_l2_8(dst + 16, src[0] + 16, src[1] + 16, \
                             stride, stride, stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, \
                                          const uint8_t *src[5], \
                                          int stride, int h) \
{ \
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], \
                            stride, stride, stride, stride, stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, \
                                           const uint8_t *src[5], \
                                           int stride, int h) \
{ \
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], \
                             stride, stride, stride, stride, stride, h); \
} \
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, \
                                           const uint8_t *src[5], \
                                           int stride, int h) \
{ \
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], \
                             stride, stride, stride, stride, stride, h); \
    OPNAME ## _pixels16_l4_8(dst + 16, src[0] + 16, src[1] + 16, \
                             src[2] + 16, src[3] + 16, \
                             stride, stride, stride, stride, stride, h); \
}
DIRAC_MC(put)
DIRAC_MC(avg)
#endif /* CONFIG_DIRAC_DECODER */
1655
1656 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1657                                   int dstStride, int srcStride, int w)
1658 {
1659     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1660     int i;
1661
1662     for (i = 0; i < w; i++) {
1663         const int src_1 = src[-srcStride];
1664         const int src0  = src[0];
1665         const int src1  = src[srcStride];
1666         const int src2  = src[2 * srcStride];
1667         const int src3  = src[3 * srcStride];
1668         const int src4  = src[4 * srcStride];
1669         const int src5  = src[5 * srcStride];
1670         const int src6  = src[6 * srcStride];
1671         const int src7  = src[7 * srcStride];
1672         const int src8  = src[8 * srcStride];
1673         const int src9  = src[9 * srcStride];
1674         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1675         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1676         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1677         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1678         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1679         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1680         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1681         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1682         src++;
1683         dst++;
1684     }
1685 }
1686
/**
 * mspel "mc10": run wmv2_mspel8_h_lowpass() over 8 rows of src into a
 * packed 8x8 scratch block, then hand src and that block to
 * put_pixels8_l2_8() to produce dst (presumably an average of the two
 * inputs -- confirm against the helper).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1694
/**
 * mspel "mc20": the horizontal low-pass output is the result itself;
 * 8 rows are filtered straight from src into dst, both using stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1699
/**
 * mspel "mc30": same as the mc10 case except that the unfiltered input
 * handed to put_pixels8_l2_8() starts one pixel to the right (src + 1).
 * The horizontally filtered 8x8 scratch block is still computed from src.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1707
/**
 * mspel "mc02": the vertical low-pass output is the result itself;
 * 8 columns are filtered straight from src into dst, both using stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
1712
/**
 * mspel "mc12": combine a vertically filtered block with a block filtered
 * in both directions.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical pass that follows reads rows -1..9 around its start row;
 * that is why it is started at hpass + 8 (the second row of the packed,
 * 8-byte-wide scratch buffer). put_pixels8_l2_8() then merges the two
 * 8x8 results into dst (presumably by averaging -- confirm against the
 * helper).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hpass[8 * 11];
    uint8_t vpass[8 * 8];
    uint8_t hvpass[8 * 8];

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);
}
1724
/**
 * mspel "mc32": same as the mc12 case except that the purely vertical
 * pass starts one pixel to the right (src + 1).
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical pass over it reads rows -1..9 around its start row; that is
 * why it is started at hpass + 8 (the second row of the packed,
 * 8-byte-wide scratch buffer). put_pixels8_l2_8() then merges the two
 * 8x8 results into dst (presumably by averaging -- confirm against the
 * helper).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hpass[8 * 11];
    uint8_t vpass[8 * 8];
    uint8_t hvpass[8 * 8];

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);
}
1736
/**
 * mspel "mc22": horizontal low-pass followed by vertical low-pass, written
 * directly to dst.
 *
 * The horizontal pass covers 11 rows starting one row above src so that the
 * vertical pass, started at the second row of the packed 8-byte-wide
 * scratch buffer, has the rows -1..9 it reads.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hpass[8 * 11];

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hpass + 8, stride, 8, 8);
}
1744
/**
 * Sum of absolute differences over a block 16 pixels wide and h rows high.
 *
 * @param v         unused
 * @param pix1      first row of the first block
 * @param pix2      first row of the second block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return sum over all rows of |pix1[x] - pix2[x]| for x in 0..15
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2,
                              int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1772
/**
 * Sum of absolute differences between a 16-wide block and a horizontally
 * interpolated reference.
 *
 * Each reference sample is avg2() of two horizontally adjacent pix2 pixels,
 * so 17 columns of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block (17 columns are accessed)
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1800
/**
 * Sum of absolute differences between a 16-wide block and a vertically
 * interpolated reference.
 *
 * Each reference sample is avg2() of a pix2 pixel and the pixel directly
 * below it, so h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block (h + 1 rows are accessed)
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1830
/**
 * Sum of absolute differences between a 16-wide block and a reference
 * interpolated in both directions.
 *
 * Each reference sample is avg4() of a 2x2 neighbourhood of pix2, so
 * 17 columns and h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block (17 columns, h + 1 rows are accessed)
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                           int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *top = pix2;
        const uint8_t *bot = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(top[col], top[col + 1],
                                        bot[col], bot[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1860
/**
 * Sum of absolute differences over an 8-pixel-wide block.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return sum over all h rows and 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1880
/**
 * Sum of absolute differences between an 8-wide block and a horizontally
 * interpolated reference.
 *
 * Each reference sample is avg2() of two horizontally adjacent pix2 pixels,
 * so 9 columns of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block (9 columns are accessed)
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1900
/**
 * Sum of absolute differences between an 8-wide block and a vertically
 * interpolated reference.
 *
 * Each reference sample is avg2() of a pix2 pixel and the pixel directly
 * below it, so h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block (h + 1 rows are accessed)
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1922
/**
 * Sum of absolute differences between an 8-wide block and a reference
 * interpolated in both directions.
 *
 * Each reference sample is avg4() of a 2x2 neighbourhood of pix2, so
 * 9 columns and h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block (9 columns, h + 1 rows are accessed)
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *top = pix2;
        const uint8_t *bot = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(top[col], top[col + 1],
                                        bot[col], bot[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1944
1945 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
1946 {
1947     MpegEncContext *c = v;
1948     int score1 = 0, score2 = 0, x, y;
1949
1950     for (y = 0; y < h; y++) {
1951         for (x = 0; x < 16; x++)
1952             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1953         if (y + 1 < h) {
1954             for (x = 0; x < 15; x++)
1955                 score2 += FFABS(s1[x]     - s1[x + stride] -
1956                                 s1[x + 1] + s1[x + stride + 1]) -
1957                           FFABS(s2[x]     - s2[x + stride] -
1958                                 s2[x + 1] + s2[x + stride + 1]);
1959         }
1960         s1 += stride;
1961         s2 += stride;
1962     }
1963
1964     if (c)
1965         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1966     else
1967         return score1 + FFABS(score2) * 8;
1968 }
1969
1970 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
1971 {
1972     MpegEncContext *c = v;
1973     int score1 = 0, score2 = 0, x, y;
1974
1975     for (y = 0; y < h; y++) {
1976         for (x = 0; x < 8; x++)
1977             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1978         if (y + 1 < h) {
1979             for (x = 0; x < 7; x++)
1980                 score2 += FFABS(s1[x]     - s1[x + stride] -
1981                                 s1[x + 1] + s1[x + stride + 1]) -
1982                           FFABS(s2[x]     - s2[x + stride] -
1983                                 s2[x + 1] + s2[x + stride + 1]);
1984         }
1985         s1 += stride;
1986         s2 += stride;
1987     }
1988
1989     if (c)
1990         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1991     else
1992         return score1 + FFABS(score2) * 8;
1993 }
1994
1995 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1996                           int16_t basis[64], int scale)
1997 {
1998     int i;
1999     unsigned int sum = 0;
2000
2001     for (i = 0; i < 8 * 8; i++) {
2002         int b = rem[i] + ((basis[i] * scale +
2003                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
2004                           (BASIS_SHIFT - RECON_SHIFT));
2005         int w = weight[i];
2006         b >>= RECON_SHIFT;
2007         av_assert2(-512 < b && b < 512);
2008
2009         sum += (w * b) * (w * b) >> 4;
2010     }
2011     return sum >> 2;
2012 }
2013
2014 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
2015 {
2016     int i;
2017
2018     for (i = 0; i < 8 * 8; i++)
2019         rem[i] += (basis[i] * scale +
2020                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
2021                   (BASIS_SHIFT - RECON_SHIFT);
2022 }
2023
/**
 * Comparison function that ignores all of its arguments.
 *
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    /* No argument is inspected: every pair of blocks scores 0. */
    return 0;
}
2028
2029 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
2030 {
2031     int i;
2032
2033     memset(cmp, 0, sizeof(void *) * 6);
2034
2035     for (i = 0; i < 6; i++) {
2036         switch (type & 0xFF) {
2037         case FF_CMP_SAD:
2038             cmp[i] = c->sad[i];
2039             break;
2040         case FF_CMP_SATD:
2041             cmp[i] = c->hadamard8_diff[i];
2042             break;
2043         case FF_CMP_SSE:
2044             cmp[i] = c->sse[i];
2045             break;
2046         case FF_CMP_DCT:
2047             cmp[i] = c->dct_sad[i];
2048             break;
2049         case FF_CMP_DCT264:
2050             cmp[i] = c->dct264_sad[i];
2051             break;
2052         case FF_CMP_DCTMAX:
2053             cmp[i] = c->dct_max[i];
2054             break;
2055         case FF_CMP_PSNR:
2056             cmp[i] = c->quant_psnr[i];
2057             break;
2058         case FF_CMP_BIT:
2059             cmp[i] = c->bit[i];
2060             break;
2061         case FF_CMP_RD:
2062             cmp[i] = c->rd[i];
2063             break;
2064         case FF_CMP_VSAD:
2065             cmp[i] = c->vsad[i];
2066             break;
2067         case FF_CMP_VSSE:
2068             cmp[i] = c->vsse[i];
2069             break;
2070         case FF_CMP_ZERO:
2071             cmp[i] = zero_cmp;
2072             break;
2073         case FF_CMP_NSSE:
2074             cmp[i] = c->nsse[i];
2075             break;
2076 #if CONFIG_DWT
2077         case FF_CMP_W53:
2078             cmp[i]= c->w53[i];
2079             break;
2080         case FF_CMP_W97:
2081             cmp[i]= c->w97[i];
2082             break;
2083 #endif
2084         default:
2085             av_log(NULL, AV_LOG_ERROR,
2086                    "internal error in cmp function selection\n");
2087         }
2088     }
2089 }
2090
/**
 * Byte-wise addition dst[i] += src[i] (modulo 256) for i in [0, w).
 *
 * The bulk of the data is processed one machine word at a time: the low
 * seven bits of every byte are added without carrying into the next byte,
 * and the top bit of every byte is then fixed up with an XOR. Remaining
 * bytes are handled one by one.
 *
 * Words are loaded and stored through memcpy() instead of casting the byte
 * pointers to long *, so the function does not depend on the buffers being
 * aligned and does not access uint8_t storage through an incompatible type.
 *
 * @param dst buffer updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    /* 0x7f and 0x80 replicated into every byte of an unsigned long
     * (same values as the file-level pb_7f / pb_80 constants). */
    const unsigned long low7 = ~0UL / 255 * 0x7f;
    const unsigned long msb  = ~0UL / 255 * 0x80;
    long i;

    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        unsigned long a, b, sum;

        memcpy(&a, src + i, sizeof(a));
        memcpy(&b, dst + i, sizeof(b));
        sum = ((a & low7) + (b & low7)) ^ ((a ^ b) & msb);
        memcpy(dst + i, &sum, sizeof(sum));
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
2103
/**
 * Byte-wise subtraction dst[i] = src1[i] - src2[i] (modulo 256) for
 * i in [0, w).
 *
 * The bulk of the data is processed one machine word at a time: the top bit
 * of every minuend byte is forced on so that no borrow crosses a byte
 * boundary, and the top bit of every result byte is then fixed up with an
 * XOR. When HAVE_FAST_UNALIGNED is not set and src2 is not word aligned,
 * a plain byte loop in chunks of eight is used instead. Remaining bytes are
 * handled one by one.
 *
 * Words are loaded and stored through memcpy() instead of casting the byte
 * pointers to long *, so no buffer has to be aligned and uint8_t storage is
 * never accessed through an incompatible type. The alignment test uses
 * uintptr_t so the pointer value is not truncated where long is narrower
 * than a pointer.
 *
 * @param dst  output buffer
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
{
    /* 0x7f and 0x80 replicated into every byte of an unsigned long
     * (same values as the file-level pb_7f / pb_80 constants). */
    const unsigned long low7 = ~0UL / 255 * 0x7f;
    const unsigned long msb  = ~0UL / 255 * 0x80;
    long i;

#if !HAVE_FAST_UNALIGNED
    if ((uintptr_t) src2 & (sizeof(long) - 1)) {
        for (i = 0; i + 7 < w; i += 8) {
            int k;

            for (k = 0; k < 8; k++)
                dst[i + k] = src1[i + k] - src2[i + k];
        }
    } else
#endif
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        unsigned long a, b, diff;

        memcpy(&a, src1 + i, sizeof(a));
        memcpy(&b, src2 + i, sizeof(b));
        diff = ((a | msb) - (b & low7)) ^ ((a ^ b ^ msb) & msb);
        memcpy(dst + i, &diff, sizeof(diff));
    }
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2131
/**
 * Reconstruct a row from residuals using median prediction.
 *
 * For each position the predictor is mid_pred() of the previous output
 * byte, the src1 byte at the same position, and (previous + src1 -
 * previous src1) masked to 8 bits. The residual diff[i] is added to the
 * predictor and the 8-bit result is stored in dst.
 *
 * @param dst      reconstructed output row
 * @param src1     row used as the second prediction input
 * @param diff     residual bytes
 * @param w        number of bytes to process
 * @param left     in: initial "previous output" value; out: last output
 * @param left_top in: initial "previous src1" value; out: last src1 byte
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t top = src1[i];

        /* The assignment to an 8-bit variable wraps the sum modulo 256. */
        prev     = mid_pred(prev, top, (prev + top - prev_top) & 0xFF) + diff[i];
        prev_top = top;
        dst[i]   = prev;
    }

    *left     = prev;
    *left_top = prev_top;
}
2151
/**
 * Compute median-prediction residuals for a row.
 *
 * For each position the predictor is mid_pred() of the previous src2 byte,
 * the src1 byte at the same position, and (previous + src1 - previous
 * src1) masked to 8 bits. dst receives src2[i] minus that predictor.
 *
 * @param dst      output residual row
 * @param src1     row used as the second prediction input
 * @param src2     row being predicted
 * @param w        number of bytes to process
 * @param left     in: initial "previous src2" value; out: last src2 byte
 * @param left_top in: initial "previous src1" value; out: last src1 byte
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t top = src1[i];
        const int pred    = mid_pred(prev, top, (prev + top - prev_top) & 0xFF);

        prev_top = top;
        prev     = src2[i];
        dst[i]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_top;
}
2172
/**
 * Running-sum ("left") prediction: dst[i] is the low 8 bits of
 * acc + src[0] + ... + src[i].
 *
 * @param dst output row
 * @param src residual bytes
 * @param w   number of bytes to process
 * @param acc initial accumulator value
 * @return the accumulator after the last byte (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int i;

    for (i = 0; i < w; i++) {
        acc   += src[i];
        dst[i] = acc; /* stored truncated to 8 bits */
    }

    return acc;
}
2193
/* Byte offsets of the four components inside one 4-byte pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum ("left") prediction on 4-byte pixels, one accumulator per
 * component.
 *
 * For each pixel the four source bytes are added to their accumulators and
 * the low 8 bits of each accumulator are stored at the same byte offsets
 * in dst.
 *
 * @param dst   output row, 4 bytes per pixel
 * @param src   residual row, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: final accumulator (not masked)
 * @param green same, for the G component
 * @param blue  same, for the B component
 * @param alpha same, for the A component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in = src + 4 * px;
        uint8_t *out      = dst + 4 * px;

        /* Read the whole pixel before writing any of it. */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2232
/*
 * Butterfly helpers.
 *
 * BUTTERFLY2 and BUTTERFLY1 are wrapped in do { } while (0) so that each
 * expands to exactly one statement and stays correct inside an unbraced
 * if/else. They must be followed by a semicolon.
 */

/* o1 = i1 + i2; o2 = i1 - i2. Note that i1 and i2 are evaluated twice. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place butterfly on two int lvalues: (x, y) <- (x + y, x - y). */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bf_sum_in_ = (x);             \
        const int bf_dif_in_ = (y);             \
        (x) = bf_sum_in_ + bf_dif_in_;          \
        (y) = bf_sum_in_ - bf_dif_in_;          \
    } while (0)

/* |x + y| + |x - y|, i.e. a final butterfly stage folded into a sum. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
2247
/**
 * 8x8 butterfly-transform cost of the difference between two blocks.
 *
 * The pixel difference src - dst is run through three butterfly stages
 * along each row, then three stages along each column; the last column
 * stage is folded into BUTTERFLYA, which adds the absolute values of its
 * two outputs to the result.
 *
 * @param s      unused here
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (checked with av_assert2)
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/* MpegEncContext */ void *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    av_assert2(h == 8);

    /* Row pass. */
    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        const uint8_t *p_src = src + stride * i;
        const uint8_t *p_dst = dst + stride * i;
        int *row             = temp + 8 * i;

        BUTTERFLY2(row[0], row[1], p_src[0] - p_dst[0], p_src[1] - p_dst[1]);
        BUTTERFLY2(row[2], row[3], p_src[2] - p_dst[2], p_src[3] - p_dst[3]);
        BUTTERFLY2(row[4], row[5], p_src[4] - p_dst[4], p_src[5] - p_dst[5]);
        BUTTERFLY2(row[6], row[7], p_src[6] - p_dst[6], p_src[7] - p_dst[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Column pass; the third stage is accumulated instead of stored. */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4]) +
               BUTTERFLYA(col[8 * 1], col[8 * 5]) +
               BUTTERFLYA(col[8 * 2], col[8 * 6]) +
               BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }
    return sum;
}
2299
/**
 * 8x8 butterfly-transform cost of a single block.
 *
 * The pixels of src are run through three butterfly stages along each row,
 * then three stages along each column; the last column stage is folded
 * into BUTTERFLYA, which adds the absolute values of its two outputs to
 * the result. Finally the |temp[0] + temp[32]| term, which is the "sum"
 * half of the first BUTTERFLYA of column 0, is subtracted again.
 *
 * @param s      unused here
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (checked with av_assert2)
 * @return sum of absolute transformed values without the term above
 */
static int hadamard8_intra8x8_c(/* MpegEncContext */ void *s, uint8_t *src,
                                uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    av_assert2(h == 8);

    /* Row pass. */
    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        const uint8_t *line = src + stride * i;
        int *row            = temp + 8 * i;

        BUTTERFLY2(row[0], row[1], line[0], line[1]);
        BUTTERFLY2(row[2], row[3], line[2], line[3]);
        BUTTERFLY2(row[4], row[5], line[4], line[5]);
        BUTTERFLY2(row[6], row[7], line[6], line[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Column pass; the third stage is accumulated instead of stored. */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4]) +
               BUTTERFLYA(col[8 * 1], col[8 * 5]) +
               BUTTERFLYA(col[8 * 2], col[8 * 6]) +
               BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
2351
2352 static int dct_sad8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2353                         uint8_t *src2, int stride, int h)
2354 {
2355     MpegEncContext *const s = (MpegEncContext *) c;
2356     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2357
2358     av_assert2(h == 8);
2359
2360     s->dsp.diff_pixels(temp, src1, src2, stride);
2361     s->dsp.fdct(temp);
2362     return s->dsp.sum_abs_dctelem(temp);
2363 }
2364
#if CONFIG_GPL
/*
 * One 8-point integer transform. The user supplies SRC(x) to read input x
 * and DST(x, v) to consume output x. All eight inputs are read into
 * constants before the first DST is issued, so SRC and DST may refer to
 * the same storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Cost of the difference between two 8x8 blocks under the DCT8_1D integer
 * transform.
 *
 * The pixel difference is transformed row by row in place, then column by
 * column; the absolute values of the column-pass outputs are summed.
 *
 * @param c      MpegEncContext supplying dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    MpegEncContext *const s = c;
    /* NOTE(review): the other diff_pixels() callers in this file use a
     * LOCAL_ALIGNED_16 buffer; confirm that a plain array is acceptable. */
    int16_t dct[8][8];
    int sum = 0;
    int i;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* Row pass: transform each row in place. */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Column pass: outputs are not stored, only their magnitudes summed. */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
2419
2420 static int dct_max8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2421                         uint8_t *src2, int stride, int h)
2422 {
2423     MpegEncContext *const s = (MpegEncContext *) c;
2424     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2425     int sum = 0, i;
2426
2427     av_assert2(h == 8);
2428
2429     s->dsp.diff_pixels(temp, src1, src2, stride);
2430     s->dsp.fdct(temp);
2431
2432     for (i = 0; i < 64; i++)
2433         sum = FFMAX(sum, FFABS(temp[i]));
2434
2435     return sum;
2436 }
2437
2438 static int quant_psnr8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2439                            uint8_t *src2, int stride, int h)
2440 {
2441     MpegEncContext *const s = c;
2442     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2443     int16_t *const bak = temp + 64;
2444     int sum = 0, i;
2445
2446     av_assert2(h == 8);
2447     s->mb_intra = 0;
2448
2449     s->dsp.diff_pixels(temp, src1, src2, stride);
2450
2451     memcpy(bak, temp, 64 * sizeof(int16_t));
2452
2453     s->block_last_index[0 /* FIXME */] =
2454         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2455     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2456     ff_simple_idct_8(temp); // FIXME
2457
2458     for (i = 0; i < 64; i++)
2459         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2460
2461     return sum;
2462 }
2463
2464 static int rd8x8_c(/* MpegEncContext */ void *c, uint8_t *src1, uint8_t *src2,
2465                    int stride, int h)
2466 {
2467     MpegEncContext *const s  = (MpegEncContext *) c;
2468     const uint8_t *scantable = s->intra_scantable.permutated;
2469     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2470     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2471     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2472     int i, last, run, bits, level, distortion, start_i;
2473     const int esc_length = s->ac_esc_length;
2474     uint8_t *length, *last_length;
2475
2476     av_assert2(h == 8);
2477
2478     copy_block8(lsrc1, src1, 8, stride, 8);
2479     copy_block8(lsrc2, src2, 8, stride, 8);
2480
2481     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2482
2483     s->block_last_index[0 /* FIXME */] =
2484     last                               =
2485         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2486
2487     bits = 0;
2488
2489     if (s->mb_intra) {
2490         start_i     = 1;
2491         length      = s->intra_ac_vlc_length;
2492         last_length = s->intra_ac_vlc_last_length;
2493         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2494     } else {
2495         start_i     = 0;
2496         length      = s->inter_ac_vlc_length;
2497         last_length = s->inter_ac_vlc_last_length;
2498     }
2499
2500     if (last >= start_i) {
2501         run = 0;
2502         for (i = start_i; i < last; i++) {
2503             int j = scantable[i];
2504             level = temp[j];
2505
2506             if (level) {
2507                 level += 64;
2508                 if ((level & (~127)) == 0)
2509                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2510                 else
2511                     bits += esc_length;
2512                 run = 0;
2513             } else
2514                 run++;
2515         }
2516         i = scantable[last];
2517
2518         level = temp[i] + 64;
2519
2520         av_assert2(level - 64);
2521
2522         if ((level & (~127)) == 0) {
2523             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2524         } else
2525             bits += esc_length;
2526     }
2527
2528     if (last >= 0) {
2529         if (s->mb_intra)
2530             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2531         else
2532             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2533     }
2534
2535     s->dsp.idct_add(lsrc2, 8, temp);
2536
2537     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2538
2539     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2540 }
2541
2542 static int bit8x8_c(/* MpegEncContext */ void *c, uint8_t *src1, uint8_t *src2,
2543                     int stride, int h)
2544 {
2545     MpegEncContext *const s  = (MpegEncContext *) c;
2546     const uint8_t *scantable = s->intra_scantable.permutated;
2547     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2548     int i, last, run, bits, level, start_i;
2549     const int esc_length = s->ac_esc_length;
2550     uint8_t *length, *last_length;
2551
2552     av_assert2(h == 8);
2553
2554     s->dsp.diff_pixels(temp, src1, src2, stride);
2555
2556     s->block_last_index[0 /* FIXME */] =
2557     last                               =
2558         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2559
2560     bits = 0;
2561
2562     if (s->mb_intra) {
2563         start_i     = 1;
2564         length      = s->intra_ac_vlc_length;
2565         last_length = s->intra_ac_vlc_last_length;
2566         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2567     } else {
2568         start_i     = 0;
2569         length      = s->inter_ac_vlc_length;
2570         last_length = s->inter_ac_vlc_last_length;
2571     }
2572
2573     if (last >= start_i) {
2574         run = 0;
2575         for (i = start_i; i < last; i++) {
2576             int j = scantable[i];
2577             level = temp[j];
2578
2579             if (level) {
2580                 level += 64;
2581                 if ((level & (~127)) == 0)
2582                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2583                 else
2584                     bits += esc_length;
2585                 run = 0;
2586             } else
2587                 run++;
2588         }
2589         i = scantable[last];
2590
2591         level = temp[i] + 64;
2592
2593         av_assert2(level - 64);
2594
2595         if ((level & (~127)) == 0)
2596             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2597         else
2598             bits += esc_length;
2599     }
2600
2601     return bits;
2602 }
2603
/*
 * Generates vsad_intra<size>_c(): the sum, over rows 0 .. h - 2 and the
 * first <size> columns, of the absolute difference between each pixel of s
 * and the pixel directly below it. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size)                                                \
static int vsad_intra ## size ## _c(/* MpegEncContext */ void *c,       \
                                    uint8_t *s, uint8_t *dummy,         \
                                    int stride, int h)                  \
{                                                                       \
    int score = 0, x, y;                                                \
                                                                        \
    for (y = 1; y < h; y++, s += stride)                                \
        for (x = 0; x < size; x++)                                      \
            score += FFABS(s[x] - s[x + stride]);                       \
                                                                        \
    return score;                                                       \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2625
/*
 * Vertical SAD of the difference signal s1 - s2 over a 16-pixel-wide block:
 * for each of the h - 1 pairs of adjacent rows, sum
 * |(s1 - s2)[row] - (s1 - s2)[row + 1]| across 16 columns.
 * The context pointer is not used.
 */
static int vsad16_c(/* MpegEncContext */ void *c, uint8_t *s1, uint8_t *s2,
                    int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *next1 = s1 + stride;
        uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - next1[col] + next2[col];

            total += FFABS(delta);
        }
        s1 = next1;
        s2 = next2;
    }

    return total;
}
2640
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a) * (a))

/*
 * Generate vsse_intra<size>_c(): for every pair of vertically adjacent rows
 * (h - 1 pairs) add up the squared difference between each of the first
 * `size` pixels of the upper row and the pixel `stride` bytes after it.
 * The context pointer and the second pixel pointer are not used.
 */
#define VSSE_INTRA(size)                                                \
static int vsse_intra ## size ## _c(/* MpegEncContext */ void *c,       \
                                    uint8_t *s, uint8_t *dummy,         \
                                    int stride, int h)                  \
{                                                                       \
    int total = 0;                                                      \
    int row, col;                                                       \
                                                                        \
    for (row = 1; row < h; row++) {                                     \
        uint8_t *below = s + stride;                                    \
                                                                        \
        for (col = 0; col < size; col++)                                \
            total += SQ(s[col] - below[col]);                           \
        s = below;                                                      \
    }                                                                   \
                                                                        \
    return total;                                                       \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2663
/*
 * Vertical SSE of the difference signal s1 - s2 over a 16-pixel-wide block:
 * for each of the h - 1 pairs of adjacent rows, sum the square of
 * (s1 - s2)[row] - (s1 - s2)[row + 1] across 16 columns.
 * The context pointer is not used.
 */
static int vsse16_c(/* MpegEncContext */ void *c, uint8_t *s1, uint8_t *s2,
                    int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *next1 = s1 + stride;
        uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - next1[col] + next2[col];

            total += delta * delta;
        }
        s1 = next1;
        s2 = next2;
    }

    return total;
}
2678
/*
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector, `size` elements
 * @param pix2 second vector, `size` elements
 * @param size number of elements to compare; nothing is read when size <= 0
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int n;

    for (n = 0; n < size; n++) {
        const int delta = pix1[n] - pix2[n];

        total += delta * delta;
    }

    return total;
}
2688
/*
 * Build a 16-pixel-wide comparison function `name16` from an 8x8 one.
 * The 8x8 function is called on the left and right halves of the top eight
 * rows; when h == 16 it is called again on the two halves eight rows further
 * down. The four partial scores are summed. Calls are issued in the order
 * top-left, top-right, bottom-left, bottom-right.
 */
#define WRAPPER8_16_SQ(name8, name16)                                   \
static int name16(void /*MpegEncContext*/ *s,                           \
                  uint8_t *dst, uint8_t *src,                           \
                  int stride, int h)                                    \
{                                                                       \
    int total;                                                          \
                                                                        \
    total  = name8(s, dst,     src,     stride, 8);                     \
    total += name8(s, dst + 8, src + 8, stride, 8);                     \
    if (h != 16)                                                        \
        return total;                                                   \
                                                                        \
    dst   += 8 * stride;                                                \
    src   += 8 * stride;                                                \
    total += name8(s, dst,     src,     stride, 8);                     \
    total += name8(s, dst + 8, src + 8, stride, 8);                     \
                                                                        \
    return total;                                                       \
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2717
/*
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when a with bit 31 flipped exceeds maxisign
 * @param maxisign maxi with bit 31 flipped (the caller in this file passes
 *                 maxi ^ (1U << 31))
 * @return mini, maxi or a unchanged, checked in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t top_bit = 1U << 31;

    if (a > mini)
        return mini;

    return ((a ^ top_bit) > maxisign) ? maxi : a;
}
2728
/*
 * Clip a float vector by running clipf_c_one() on the raw 32-bit patterns.
 * The only caller in this file, vector_clipf_c(), uses it when
 * *min < 0 and *max > 0.
 *
 * Elements are handled in groups of eight, so when len is not a multiple of
 * 8 the last group reads and writes past index len - 1.
 *
 * NOTE(review): dst, src, min and max are accessed through uint32_t pointers;
 * this relies on the compiler tolerating that kind of type punning.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t lo_bits    = *(uint32_t *) min;
    const uint32_t hi_bits    = *(uint32_t *) max;
    const uint32_t hi_flipped = hi_bits ^ (1U << 31);
    uint32_t *out             = (uint32_t *) dst;
    const uint32_t *in        = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits,
                                        hi_flipped);
}
2750
/*
 * Clip every element of src into [min, max] and store the result in dst.
 *
 * When min < 0 and max > 0 the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Either way elements are processed in groups of eight, so when len
 * is not a multiple of 8 the last group extends past index len - 1.
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2771
/**
 * Dot product of two int16_t vectors.
 *
 * Each individual product fits in an int, but the running sum can exceed
 * INT_MAX for long or large-valued vectors. The sum is therefore kept in an
 * unsigned accumulator, where wrap-around is well defined, instead of a
 * signed one, where overflow is undefined behaviour.
 *
 * @param v1    first vector, `order` elements
 * @param v2    second vector, `order` elements
 * @param order number of elements; 0 yields 0
 * @return the sum of v1[i] * v2[i], reduced modulo 2^32 and reinterpreted
 *         as int32_t
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    unsigned res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return (int32_t) res;
}
2782
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[i] += mul * v3[i].
 *
 * The dot product uses the value of v1[i] from before the update. The sum is
 * kept in an unsigned accumulator so that wrap-around on large inputs is well
 * defined rather than a signed overflow.
 *
 * @param v1    first vector, `order` elements; modified in place
 * @param v2    second vector, `order` elements
 * @param v3    vector scaled by mul and added to v1, `order` elements
 * @param order number of elements; 0 yields 0 and leaves v1 untouched
 * @param mul   scale factor applied to v3
 * @return the sum of (old) v1[i] * v2[i], reduced modulo 2^32 and
 *         reinterpreted as int32_t
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    unsigned res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }

    return (int32_t) res;
}
2795
/*
 * Clip every element of src into [min, max] with av_clip() and store the
 * result in dst.
 *
 * Eight elements are processed before len is examined, and len is unsigned:
 * the loop only terminates cleanly when len is a non-zero multiple of 8
 * (any other value wraps around when 8 is subtracted).
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
2811
/*
 * Transform block with ff_j_rev_dct(), then pass it to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);

    put_pixels_clamped_c(block, dest, line_size);
}
2817
/*
 * Transform block with ff_j_rev_dct(), then pass it to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);

    add_pixels_clamped_c(block, dest, line_size);
}
2823
/*
 * Transform block with ff_j_rev_dct4(), then pass it to
 * put_pixels_clamped4_c() together with dest and line_size.
 * Installed by ff_dsputil_init() as idct_put when lowres == 1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);

    put_pixels_clamped4_c(block, dest, line_size);
}
/*
 * Transform block with ff_j_rev_dct4(), then pass it to
 * add_pixels_clamped4_c() together with dest and line_size.
 * Installed by ff_dsputil_init() as idct_add when lowres == 1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);

    add_pixels_clamped4_c(block, dest, line_size);
}
2834
/*
 * Transform block with ff_j_rev_dct2(), then pass it to
 * put_pixels_clamped2_c() together with dest and line_size.
 * Installed by ff_dsputil_init() as idct_put when lowres == 2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);

    put_pixels_clamped2_c(block, dest, line_size);
}
/*
 * Transform block with ff_j_rev_dct2(), then pass it to
 * add_pixels_clamped2_c() together with dest and line_size.
 * Installed by ff_dsputil_init() as idct_add when lowres == 2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);

    add_pixels_clamped2_c(block, dest, line_size);
}
2845
/*
 * Single-pixel variant: store (block[0] + 4) >> 3, clipped with
 * av_clip_uint8(), into dest[0]. line_size is not used.
 * Installed by ff_dsputil_init() as idct_put when lowres == 3.
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dc);
}
/*
 * Single-pixel variant: add (block[0] + 4) >> 3 to dest[0] and store the
 * sum clipped with av_clip_uint8(). line_size is not used.
 * Installed by ff_dsputil_init() as idct_add when lowres == 3.
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dest[0] + dc);
}
2854
2855 /* init static data */
2856 av_cold void ff_dsputil_static_init(void)
2857 {
2858     int i;
2859
2860     for (i = 0; i < 512; i++)
2861         ff_square_tab[i] = (i - 256) * (i - 256);
2862 }
2863
2864 int ff_check_alignment(void)
2865 {
2866     static int did_fail = 0;
2867     LOCAL_ALIGNED_16(int, aligned, [4]);
2868
2869     if ((intptr_t)aligned & 15) {
2870         if (!did_fail) {
2871 #if HAVE_MMX || HAVE_ALTIVEC
2872             av_log(NULL, AV_LOG_ERROR,
2873                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2874                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2875                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2876                 "Do not report crashes to FFmpeg developers.\n");
2877 #endif
2878             did_fail=1;
2879         }
2880         return -1;
2881     }
2882     return 0;
2883 }
2884
2885 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2886 {
2887     ff_check_alignment();
2888
2889 #if CONFIG_ENCODERS
2890     if (avctx->bits_per_raw_sample == 10) {
2891         c->fdct    = ff_jpeg_fdct_islow_10;
2892         c->fdct248 = ff_fdct248_islow_10;
2893     } else {
2894         if (avctx->dct_algo == FF_DCT_FASTINT) {
2895             c->fdct    = ff_fdct_ifast;
2896             c->fdct248 = ff_fdct_ifast248;
2897         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2898             c->fdct    = ff_faandct;
2899             c->fdct248 = ff_faandct248;
2900         } else {
2901             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2902             c->fdct248 = ff_fdct248_islow_8;
2903         }
2904     }
2905 #endif /* CONFIG_ENCODERS */
2906
2907     if (avctx->lowres==1) {
2908         c->idct_put              = ff_jref_idct4_put;
2909         c->idct_add              = ff_jref_idct4_add;
2910         c->idct                  = ff_j_rev_dct4;
2911         c->idct_permutation_type = FF_NO_IDCT_PERM;
2912     } else if (avctx->lowres==2) {
2913         c->idct_put              =  ff_jref_idct2_put;
2914         c->idct_add              =  ff_jref_idct2_add;
2915         c->idct                  =  ff_j_rev_dct2;
2916         c->idct_permutation_type = FF_NO_IDCT_PERM;
2917     } else if (avctx->lowres==3) {
2918         c->idct_put              =  ff_jref_idct1_put;
2919         c->idct_add              =  ff_jref_idct1_add;
2920         c->idct                  =  ff_j_rev_dct1;
2921         c->idct_permutation_type = FF_NO_IDCT_PERM;
2922     } else {
2923         if (avctx->bits_per_raw_sample == 10) {
2924             c->idct_put              = ff_simple_idct_put_10;
2925             c->idct_add              = ff_simple_idct_add_10;
2926             c->idct                  = ff_simple_idct_10;
2927             c->idct_permutation_type = FF_NO_IDCT_PERM;
2928         } else if (avctx->bits_per_raw_sample == 12) {
2929             c->idct_put              = ff_simple_idct_put_12;
2930             c->idct_add              = ff_simple_idct_add_12;
2931             c->idct                  = ff_simple_idct_12;
2932             c->idct_permutation_type = FF_NO_IDCT_PERM;
2933         } else {
2934         if (avctx->idct_algo == FF_IDCT_INT) {
2935             c->idct_put              = jref_idct_put;
2936             c->idct_add              = jref_idct_add;
2937             c->idct                  = ff_j_rev_dct;
2938             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2939         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2940             c->idct_put              = ff_faanidct_put;
2941             c->idct_add              = ff_faanidct_add;
2942             c->idct                  = ff_faanidct;
2943             c->idct_permutation_type = FF_NO_IDCT_PERM;
2944         } else { // accurate/default
2945             c->idct_put              = ff_simple_idct_put_8;
2946             c->idct_add              = ff_simple_idct_add_8;
2947             c->idct                  = ff_simple_idct_8;
2948             c->idct_permutation_type = FF_NO_IDCT_PERM;
2949         }
2950         }
2951     }
2952
2953     c->diff_pixels = diff_pixels_c;
2954
2955     c->put_pixels_clamped        = put_pixels_clamped_c;
2956     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2957     c->add_pixels_clamped        = add_pixels_clamped_c;
2958
2959     c->sum_abs_dctelem = sum_abs_dctelem_c;
2960
2961     c->gmc1 = gmc1_c;
2962     c->gmc  = ff_gmc_c;
2963
2964     c->pix_sum   = pix_sum_c;
2965     c->pix_norm1 = pix_norm1_c;
2966
2967     c->fill_block_tab[0] = fill_block16_c;
2968     c->fill_block_tab[1] = fill_block8_c;
2969
2970     /* TODO [0] 16  [1] 8 */
2971     c->pix_abs[0][0] = pix_abs16_c;
2972     c->pix_abs[0][1] = pix_abs16_x2_c;
2973     c->pix_abs[0][2] = pix_abs16_y2_c;
2974     c->pix_abs[0][3] = pix_abs16_xy2_c;
2975     c->pix_abs[1][0] = pix_abs8_c;
2976     c->pix_abs[1][1] = pix_abs8_x2_c;
2977     c->pix_abs[1][2] = pix_abs8_y2_c;
2978     c->pix_abs[1][3] = pix_abs8_xy2_c;
2979
2980     c->put_tpel_pixels_tab[0]  = put_tpel_pixels_mc00_c;
2981     c->put_tpel_pixels_tab[1]  = put_tpel_pixels_mc10_c;
2982     c->put_tpel_pixels_tab[2]  = put_tpel_pixels_mc20_c;
2983     c->put_tpel_pixels_tab[4]  = put_tpel_pixels_mc01_c;
2984     c->put_tpel_pixels_tab[5]  = put_tpel_pixels_mc11_c;
2985     c->put_tpel_pixels_tab[6]  = put_tpel_pixels_mc21_c;
2986     c->put_tpel_pixels_tab[8]  = put_tpel_pixels_mc02_c;
2987     c->put_tpel_pixels_tab[9]  = put_tpel_pixels_mc12_c;
2988     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2989
2990     c->avg_tpel_pixels_tab[0]  = avg_tpel_pixels_mc00_c;
2991     c->avg_tpel_pixels_tab[1]  = avg_tpel_pixels_mc10_c;
2992     c->avg_tpel_pixels_tab[2]  = avg_tpel_pixels_mc20_c;
2993     c->avg_tpel_pixels_tab[4]  = avg_tpel_pixels_mc01_c;
2994     c->avg_tpel_pixels_tab[5]  = avg_tpel_pixels_mc11_c;
2995     c->avg_tpel_pixels_tab[6]  = avg_tpel_pixels_mc21_c;
2996     c->avg_tpel_pixels_tab[8]  = avg_tpel_pixels_mc02_c;
2997     c->avg_tpel_pixels_tab[9]  = avg_tpel_pixels_mc12_c;
2998     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2999
3000 #define dspfunc(PFX, IDX, NUM)                              \
3001     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
3002     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
3003     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
3004     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
3005     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
3006     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
3007     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
3008     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
3009     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
3010     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
3011     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3012     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3013     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3014     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3015     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3016     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3017
3018     dspfunc(put_qpel, 0, 16);
3019     dspfunc(put_qpel, 1, 8);
3020
3021     dspfunc(put_no_rnd_qpel, 0, 16);
3022     dspfunc(put_no_rnd_qpel, 1, 8);
3023
3024     dspfunc(avg_qpel, 0, 16);
3025     dspfunc(avg_qpel, 1, 8);
3026
3027 #undef dspfunc
3028
3029     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
3030     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
3031     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
3032     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
3033     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
3034     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
3035     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
3036     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
3037
3038 #define SET_CMP_FUNC(name)                      \
3039     c->name[0] = name ## 16_c;                  \
3040     c->name[1] = name ## 8x8_c;
3041
3042     SET_CMP_FUNC(hadamard8_diff)
3043     c->hadamard8_diff[4] = hadamard8_intra16_c;
3044     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
3045     SET_CMP_FUNC(dct_sad)
3046     SET_CMP_FUNC(dct_max)
3047 #if CONFIG_GPL
3048     SET_CMP_FUNC(dct264_sad)
3049 #endif
3050     c->sad[0] = pix_abs16_c;
3051     c->sad[1] = pix_abs8_c;
3052     c->sse[0] = sse16_c;
3053     c->sse[1] = sse8_c;
3054     c->sse[2] = sse4_c;
3055     SET_CMP_FUNC(quant_psnr)
3056     SET_CMP_FUNC(rd)
3057     SET_CMP_FUNC(bit)
3058     c->vsad[0] = vsad16_c;
3059     c->vsad[4] = vsad_intra16_c;
3060     c->vsad[5] = vsad_intra8_c;
3061     c->vsse[0] = vsse16_c;
3062     c->vsse[4] = vsse_intra16_c;
3063     c->vsse[5] = vsse_intra8_c;
3064     c->nsse[0] = nsse16_c;
3065     c->nsse[1] = nsse8_c;
3066 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
3067     ff_dsputil_init_dwt(c);
3068 #endif
3069
3070     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3071
3072     c->add_bytes                      = add_bytes_c;
3073     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
3074     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
3075     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3076
3077     c->diff_bytes                 = diff_bytes_c;
3078     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
3079
3080     c->bswap_buf   = bswap_buf;
3081     c->bswap16_buf = bswap16_buf;
3082
3083     c->try_8x8basis = try_8x8basis_c;
3084     c->add_8x8basis = add_8x8basis_c;
3085
3086     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3087
3088     c->scalarproduct_int16 = scalarproduct_int16_c;
3089     c->vector_clip_int32   = vector_clip_int32_c;
3090     c->vector_clipf        = vector_clipf_c;
3091
3092     c->shrink[0] = av_image_copy_plane;
3093     c->shrink[1] = ff_shrink22;
3094     c->shrink[2] = ff_shrink44;
3095     c->shrink[3] = ff_shrink88;
3096
3097     c->add_pixels8 = add_pixels8_c;
3098
3099 #undef FUNC
3100 #undef FUNCC
3101 #define FUNC(f,  depth) f ## _ ## depth
3102 #define FUNCC(f, depth) f ## _ ## depth ## _c
3103
3104     c->draw_edges = FUNCC(draw_edges, 8);
3105
3106     c->clear_block  = FUNCC(clear_block, 8);
3107     c->clear_blocks = FUNCC(clear_blocks, 8);
3108
3109 #define BIT_DEPTH_FUNCS(depth)                  \
3110     c->get_pixels = FUNCC(get_pixels, depth);
3111
3112     switch (avctx->bits_per_raw_sample) {
3113     case 9:
3114     case 10:
3115     case 12:
3116     case 14:
3117         BIT_DEPTH_FUNCS(16);
3118         break;
3119     default:
3120         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
3121             BIT_DEPTH_FUNCS(8);
3122         }
3123         break;
3124     }
3125
3126
3127     if (ARCH_ALPHA)
3128         ff_dsputil_init_alpha(c, avctx);
3129     if (ARCH_ARM)
3130         ff_dsputil_init_arm(c, avctx);
3131     if (ARCH_BFIN)
3132         ff_dsputil_init_bfin(c, avctx);
3133     if (ARCH_PPC)
3134         ff_dsputil_init_ppc(c, avctx);
3135     if (ARCH_X86)
3136         ff_dsputil_init_x86(c, avctx);
3137
3138     ff_init_scantable_permutation(c->idct_permutation,
3139                                   c->idct_permutation_type);
3140 }
3141
3142 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3143 {
3144     ff_dsputil_init(c, avctx);
3145 }
3146
3147 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
3148 {
3149     ff_dsputil_init(c, avctx);
3150 }