git.sesse.net Git - ffmpeg/blob - libavcodec/vp8dsp.c

   1 /*
   2  * Copyright (C) 2010 David Conrad
   3  * Copyright (C) 2010 Ronald S. Bultje
   4  * Copyright (C) 2014 Peter Ross
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
/**
 * @file
 * VP7/VP8 compatible video decoder: C implementations of the DSP routines
 * (inverse transforms, loop filters and motion compensation)
 */
  27
  28 #include "libavutil/common.h"
  29 #include "libavutil/intreadwrite.h"
  30
  31 #include "mathops.h"
  32 #include "vp8dsp.h"
  33
  34 #define MK_IDCT_DC_ADD4_C(name) \
  35 static void name ## _idct_dc_add4uv_c(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)\
  36 {\
  37     name ## _idct_dc_add_c(dst + stride * 0 + 0, block[0], stride);\
  38     name ## _idct_dc_add_c(dst + stride * 0 + 4, block[1], stride);\
  39     name ## _idct_dc_add_c(dst + stride * 4 + 0, block[2], stride);\
  40     name ## _idct_dc_add_c(dst + stride * 4 + 4, block[3], stride);\
  41 }\
  42 \
  43 static void name ## _idct_dc_add4y_c(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)\
  44 {\
  45     name ## _idct_dc_add_c(dst + 0, block[0], stride);\
  46     name ## _idct_dc_add_c(dst + 4, block[1], stride);\
  47     name ## _idct_dc_add_c(dst + 8, block[2], stride);\
  48     name ## _idct_dc_add_c(dst + 12, block[3], stride);\
  49 }
  50
  51 #if CONFIG_VP7_DECODER
  52 static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
  53 {
  54     int i, a1, b1, c1, d1;
  55     int16_t tmp[16];
  56
  57     for (i = 0; i < 4; i++) {
  58         a1 = (dc[i * 4 + 0] + dc[i * 4 + 2]) * 23170;
  59         b1 = (dc[i * 4 + 0] - dc[i * 4 + 2]) * 23170;
  60         c1 = dc[i * 4 + 1] * 12540 - dc[i * 4 + 3] * 30274;
  61         d1 = dc[i * 4 + 1] * 30274 + dc[i * 4 + 3] * 12540;
  62         tmp[i * 4 + 0] = (a1 + d1) >> 14;
  63         tmp[i * 4 + 3] = (a1 - d1) >> 14;
  64         tmp[i * 4 + 1] = (b1 + c1) >> 14;
  65         tmp[i * 4 + 2] = (b1 - c1) >> 14;
  66     }
  67
  68     for (i = 0; i < 4; i++) {
  69         a1 = (tmp[i + 0] + tmp[i + 8]) * 23170;
  70         b1 = (tmp[i + 0] - tmp[i + 8]) * 23170;
  71         c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274;
  72         d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540;
  73         AV_ZERO64(dc + i * 4);
  74         block[0][i][0] = (a1 + d1 + 0x20000) >> 18;
  75         block[3][i][0] = (a1 - d1 + 0x20000) >> 18;
  76         block[1][i][0] = (b1 + c1 + 0x20000) >> 18;
  77         block[2][i][0] = (b1 - c1 + 0x20000) >> 18;
  78     }
  79 }
  80
  81 static void vp7_luma_dc_wht_dc_c(int16_t block[4][4][16], int16_t dc[16])
  82 {
  83     int i, val = (23170 * (23170 * dc[0] >> 14) + 0x20000) >> 18;
  84     dc[0] = 0;
  85
  86     for (i = 0; i < 4; i++) {
  87         block[i][0][0] = val;
  88         block[i][1][0] = val;
  89         block[i][2][0] = val;
  90         block[i][3][0] = val;
  91     }
  92 }
  93
  94 static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
  95 {
  96     int i, a1, b1, c1, d1;
  97     int16_t tmp[16];
  98
  99     for (i = 0; i < 4; i++) {
 100         a1 = (block[i * 4 + 0] + block[i * 4 + 2]) * 23170;
 101         b1 = (block[i * 4 + 0] - block[i * 4 + 2]) * 23170;
 102         c1 = block[i * 4 + 1] * 12540 - block[i * 4 + 3] * 30274;
 103         d1 = block[i * 4 + 1] * 30274 + block[i * 4 + 3] * 12540;
 104         AV_ZERO64(block + i * 4);
 105         tmp[i * 4 + 0] = (a1 + d1) >> 14;
 106         tmp[i * 4 + 3] = (a1 - d1) >> 14;
 107         tmp[i * 4 + 1] = (b1 + c1) >> 14;
 108         tmp[i * 4 + 2] = (b1 - c1) >> 14;
 109     }
 110
 111     for (i = 0; i < 4; i++) {
 112         a1 = (tmp[i + 0] + tmp[i + 8]) * 23170;
 113         b1 = (tmp[i + 0] - tmp[i + 8]) * 23170;
 114         c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274;
 115         d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540;
 116         dst[0 * stride + i] = av_clip_uint8(dst[0 * stride + i] + ((a1 + d1 + 0x20000) >> 18));
 117         dst[3 * stride + i] = av_clip_uint8(dst[3 * stride + i] + ((a1 - d1 + 0x20000) >> 18));
 118         dst[1 * stride + i] = av_clip_uint8(dst[1 * stride + i] + ((b1 + c1 + 0x20000) >> 18));
 119         dst[2 * stride + i] = av_clip_uint8(dst[2 * stride + i] + ((b1 - c1 + 0x20000) >> 18));
 120     }
 121 }
 122
 123 static void vp7_idct_dc_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 124 {
 125     int i, dc = (23170 * (23170 * block[0] >> 14) + 0x20000) >> 18;
 126     block[0] = 0;
 127
 128     for (i = 0; i < 4; i++) {
 129         dst[0] = av_clip_uint8(dst[0] + dc);
 130         dst[1] = av_clip_uint8(dst[1] + dc);
 131         dst[2] = av_clip_uint8(dst[2] + dc);
 132         dst[3] = av_clip_uint8(dst[3] + dc);
 133         dst += stride;
 134     }
 135 }
 136
 137 MK_IDCT_DC_ADD4_C(vp7)
 138 #endif
 139
 140 // TODO: Maybe add dequant
 141 #if CONFIG_VP8_DECODER
 142 static void vp8_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
 143 {
 144     int i, t0, t1, t2, t3;
 145
 146     for (i = 0; i < 4; i++) {
 147         t0 = dc[0 * 4 + i] + dc[3 * 4 + i];
 148         t1 = dc[1 * 4 + i] + dc[2 * 4 + i];
 149         t2 = dc[1 * 4 + i] - dc[2 * 4 + i];
 150         t3 = dc[0 * 4 + i] - dc[3 * 4 + i];
 151
 152         dc[0 * 4 + i] = t0 + t1;
 153         dc[1 * 4 + i] = t3 + t2;
 154         dc[2 * 4 + i] = t0 - t1;
 155         dc[3 * 4 + i] = t3 - t2;
 156     }
 157
 158     for (i = 0; i < 4; i++) {
 159         t0 = dc[i * 4 + 0] + dc[i * 4 + 3] + 3; // rounding
 160         t1 = dc[i * 4 + 1] + dc[i * 4 + 2];
 161         t2 = dc[i * 4 + 1] - dc[i * 4 + 2];
 162         t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3; // rounding
 163         AV_ZERO64(dc + i * 4);
 164
 165         block[i][0][0] = (t0 + t1) >> 3;
 166         block[i][1][0] = (t3 + t2) >> 3;
 167         block[i][2][0] = (t0 - t1) >> 3;
 168         block[i][3][0] = (t3 - t2) >> 3;
 169     }
 170 }
 171
 172 static void vp8_luma_dc_wht_dc_c(int16_t block[4][4][16], int16_t dc[16])
 173 {
 174     int i, val = (dc[0] + 3) >> 3;
 175     dc[0] = 0;
 176
 177     for (i = 0; i < 4; i++) {
 178         block[i][0][0] = val;
 179         block[i][1][0] = val;
 180         block[i][2][0] = val;
 181         block[i][3][0] = val;
 182     }
 183 }
 184
 185 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
 186 #define MUL_35468(a)  (((a) * 35468) >> 16)
 187
 188 static void vp8_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 189 {
 190     int i, t0, t1, t2, t3;
 191     int16_t tmp[16];
 192
 193     for (i = 0; i < 4; i++) {
 194         t0 = block[0 * 4 + i] + block[2 * 4 + i];
 195         t1 = block[0 * 4 + i] - block[2 * 4 + i];
 196         t2 = MUL_35468(block[1 * 4 + i]) - MUL_20091(block[3 * 4 + i]);
 197         t3 = MUL_20091(block[1 * 4 + i]) + MUL_35468(block[3 * 4 + i]);
 198         block[0 * 4 + i] = 0;
 199         block[1 * 4 + i] = 0;
 200         block[2 * 4 + i] = 0;
 201         block[3 * 4 + i] = 0;
 202
 203         tmp[i * 4 + 0] = t0 + t3;
 204         tmp[i * 4 + 1] = t1 + t2;
 205         tmp[i * 4 + 2] = t1 - t2;
 206         tmp[i * 4 + 3] = t0 - t3;
 207     }
 208
 209     for (i = 0; i < 4; i++) {
 210         t0 = tmp[0 * 4 + i] + tmp[2 * 4 + i];
 211         t1 = tmp[0 * 4 + i] - tmp[2 * 4 + i];
 212         t2 = MUL_35468(tmp[1 * 4 + i]) - MUL_20091(tmp[3 * 4 + i]);
 213         t3 = MUL_20091(tmp[1 * 4 + i]) + MUL_35468(tmp[3 * 4 + i]);
 214
 215         dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
 216         dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
 217         dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
 218         dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
 219         dst   += stride;
 220     }
 221 }
 222
 223 static void vp8_idct_dc_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 224 {
 225     int i, dc = (block[0] + 4) >> 3;
 226     block[0] = 0;
 227
 228     for (i = 0; i < 4; i++) {
 229         dst[0] = av_clip_uint8(dst[0] + dc);
 230         dst[1] = av_clip_uint8(dst[1] + dc);
 231         dst[2] = av_clip_uint8(dst[2] + dc);
 232         dst[3] = av_clip_uint8(dst[3] + dc);
 233         dst   += stride;
 234     }
 235 }
 236
 237 MK_IDCT_DC_ADD4_C(vp8)
 238 #endif
 239
 240 // because I like only having two parameters to pass functions...
 241 #define LOAD_PIXELS                                                           \
 242     int av_unused p3 = p[-4 * stride];                                        \
 243     int av_unused p2 = p[-3 * stride];                                        \
 244     int av_unused p1 = p[-2 * stride];                                        \
 245     int av_unused p0 = p[-1 * stride];                                        \
 246     int av_unused q0 = p[ 0 * stride];                                        \
 247     int av_unused q1 = p[ 1 * stride];                                        \
 248     int av_unused q2 = p[ 2 * stride];                                        \
 249     int av_unused q3 = p[ 3 * stride];
 250
 251 #define clip_int8(n) (cm[n + 0x80] - 0x80)
 252
 253 static av_always_inline void filter_common(uint8_t *p, ptrdiff_t stride,
 254                                            int is4tap, int vpn)
 255 {
 256     LOAD_PIXELS
 257     int a, f1, f2;
 258     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
 259
 260     a = 3 * (q0 - p0);
 261
 262     if (is4tap)
 263         a += clip_int8(p1 - q1);
 264
 265     a = clip_int8(a);
 266
 267     // We deviate from the spec here with c(a+3) >> 3
 268     // since that's what libvpx does.
 269     f1 = FFMIN(a + 4, 127) >> 3;
 270
 271     if (vpn == 7)
 272         f2 = f1 - ((a & 7) == 4);
 273     else
 274         f2 = FFMIN(a + 3, 127) >> 3;
 275
 276     // Despite what the spec says, we do need to clamp here to
 277     // be bitexact with libvpx.
 278     p[-1 * stride] = cm[p0 + f2];
 279     p[ 0 * stride] = cm[q0 - f1];
 280
 281     // only used for _inner on blocks without high edge variance
 282     if (!is4tap) {
 283         a = (f1 + 1) >> 1;
 284         p[-2 * stride] = cm[p1 + a];
 285         p[ 1 * stride] = cm[q1 - a];
 286     }
 287 }
 288
 289 static av_always_inline int vp7_simple_limit(uint8_t *p, ptrdiff_t stride, int flim)
 290 {
 291     LOAD_PIXELS
 292     return FFABS(p0-q0) <= flim;
 293 }
 294
 295 static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride, int flim)
 296 {
 297     LOAD_PIXELS
 298     return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
 299 }
 300
 301 /**
 302  * E - limit at the macroblock edge
 303  * I - limit for interior difference
 304  */
 305 #define NORMAL_LIMIT(vpn) \
 306 static av_always_inline int vp ## vpn ## _normal_limit(uint8_t *p, ptrdiff_t stride, \
 307                                                        int E, int I)\
 308 {                                                      \
 309     LOAD_PIXELS                                        \
 310     return vp ## vpn ## _simple_limit(p, stride, E) && \
 311            FFABS(p3 - p2) <= I &&                      \
 312            FFABS(p2 - p1) <= I &&                      \
 313            FFABS(p1 - p0) <= I &&                      \
 314            FFABS(q3 - q2) <= I &&                      \
 315            FFABS(q2 - q1) <= I &&                      \
 316            FFABS(q1 - q0) <= I;                        \
 317 }
 318
 319 NORMAL_LIMIT(7)
 320 NORMAL_LIMIT(8)
 321
 322 // high edge variance
 323 static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
 324 {
 325     LOAD_PIXELS
 326     return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
 327 }
 328
 329 static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
 330 {
 331     int a0, a1, a2, w;
 332     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
 333
 334     LOAD_PIXELS
 335
 336     w = clip_int8(p1 - q1);
 337     w = clip_int8(w + 3 * (q0 - p0));
 338
 339     a0 = (27 * w + 63) >> 7;
 340     a1 = (18 * w + 63) >> 7;
 341     a2 =  (9 * w + 63) >> 7;
 342
 343     p[-3 * stride] = cm[p2 + a2];
 344     p[-2 * stride] = cm[p1 + a1];
 345     p[-1 * stride] = cm[p0 + a0];
 346     p[ 0 * stride] = cm[q0 - a0];
 347     p[ 1 * stride] = cm[q1 - a1];
 348     p[ 2 * stride] = cm[q2 - a2];
 349 }
 350
 351 #define LOOP_FILTER(vpn, dir, size, stridea, strideb, maybe_inline)           \
 352 static maybe_inline                                                           \
 353 void vp ## vpn ## _ ## dir ## _loop_filter ## size ## _c(uint8_t *dst,        \
 354                                                ptrdiff_t stride,              \
 355                                                int flim_E, int flim_I,        \
 356                                                int hev_thresh)                \
 357 {                                                                             \
 358     int i;                                                                    \
 359     for (i = 0; i < size; i++)                                                \
 360         if (vp ## vpn ## _normal_limit(dst + i * stridea, strideb, flim_E, flim_I)) {       \
 361             if (hev(dst + i * stridea, strideb, hev_thresh))                  \
 362                 filter_common(dst + i * stridea, strideb, 1, vpn);            \
 363             else                                                              \
 364                 filter_mbedge(dst + i * stridea, strideb);                    \
 365         }                                                                     \
 366 }                                                                             \
 367                                                                               \
 368 static maybe_inline                                                           \
 369 void vp ## vpn ## _ ## dir ## _loop_filter ## size ## _inner_c(uint8_t *dst,  \
 370                                                      ptrdiff_t stride,        \
 371                                                      int flim_E, int flim_I,  \
 372                                                      int hev_thresh)          \
 373 {                                                                             \
 374     int i;                                                                    \
 375     for (i = 0; i < size; i++)                                                \
 376         if (vp ## vpn ## _normal_limit(dst + i * stridea, strideb, flim_E, flim_I)) {       \
 377             int hv = hev(dst + i * stridea, strideb, hev_thresh);             \
 378             if (hv)                                                           \
 379                 filter_common(dst + i * stridea, strideb, 1, vpn);            \
 380             else                                                              \
 381                 filter_common(dst + i * stridea, strideb, 0, vpn);            \
 382         }                                                                     \
 383 }
 384
/*
 * Generate the chroma loop filters for codec vpn and direction dir.
 * First instantiates the 8-position LOOP_FILTER pair as av_always_inline
 * helpers, then defines two wrappers that run the same filter, with the
 * same limits, on the U plane and then on the V plane:
 *   vpN_DIR_loop_filter8uv_c       - uses vpN_DIR_loop_filter8_c
 *   vpN_DIR_loop_filter8uv_inner_c - uses vpN_DIR_loop_filter8_inner_c
 */
#define UV_LOOP_FILTER(vpn, dir, stridea, strideb) \
LOOP_FILTER(vpn, dir, 8, stridea, strideb, av_always_inline) \
static void vp ## vpn ## _ ## dir ## _loop_filter8uv_c(uint8_t *dstU, uint8_t *dstV, \
                                                       ptrdiff_t stride, int fE, \
                                                       int fI, int hev_thresh)\
{\
  vp ## vpn ## _ ## dir ## _loop_filter8_c(dstU, stride, fE, fI, hev_thresh);\
  vp ## vpn ## _ ## dir ## _loop_filter8_c(dstV, stride, fE, fI, hev_thresh);\
}\
static void vp ## vpn ## _ ## dir ## _loop_filter8uv_inner_c(uint8_t *dstU, \
                                                             uint8_t *dstV, \
                                                             ptrdiff_t stride, int fE, \
                                                             int fI, int hev_thresh) \
{\
  vp ## vpn ## _ ## dir ## _loop_filter8_inner_c(dstU, stride, fE, fI, hev_thresh);\
  vp ## vpn ## _ ## dir ## _loop_filter8_inner_c(dstV, stride, fE, fI, hev_thresh);\
}
 402
 403 #define LOOP_FILTER_SIMPLE(vpn) \
 404 static void vp ## vpn ## _v_loop_filter_simple_c(uint8_t *dst, ptrdiff_t stride, int flim)\
 405 {\
 406     int i;\
 407 \
 408     for (i = 0; i < 16; i++)\
 409         if (vp ## vpn ## _simple_limit(dst + i, stride, flim))\
 410             filter_common(dst + i, stride, 1, vpn);\
 411 }\
 412 \
 413 static void vp ## vpn ## _h_loop_filter_simple_c(uint8_t *dst, ptrdiff_t stride, int flim)\
 414 {\
 415     int i;\
 416 \
 417     for (i = 0; i < 16; i++)\
 418         if (vp ## vpn ## _simple_limit(dst + i * stride, 1, flim))\
 419             filter_common(dst + i * stride, 1, 1, vpn);\
 420 }
 421
/*
 * Loop filter instantiations, one set per enabled decoder:
 * 16-position luma filters, 8-position chroma (U+V) filters and the simple
 * filters. For the "v" variants successive positions are 1 byte apart and
 * the samples across the edge are `stride` apart; the "h" variants swap the
 * two steps.
 */
#if CONFIG_VP7_DECODER
LOOP_FILTER(7, v, 16, 1, stride,)
LOOP_FILTER(7, h, 16, stride, 1,)
UV_LOOP_FILTER(7, v, 1, stride)
UV_LOOP_FILTER(7, h, stride, 1)
LOOP_FILTER_SIMPLE(7)
#endif

#if CONFIG_VP8_DECODER
LOOP_FILTER(8, v, 16, 1, stride,)
LOOP_FILTER(8, h, 16, stride, 1,)
UV_LOOP_FILTER(8, v, 1, stride)
UV_LOOP_FILTER(8, h, stride, 1)
LOOP_FILTER_SIMPLE(8)
#endif
 437
/*
 * Sub-pixel interpolation taps, one row per fractional position; the epel
 * functions index it with mx - 1 / my - 1. The values are magnitudes:
 * FILTER_6TAP / FILTER_4TAP subtract the products of taps 1 and 4 and add
 * the others. With those signs every row sums to 128, matching the
 * (+ 64) >> 7 normalisation in the FILTER macros. FILTER_4TAP ignores taps
 * 0 and 5.
 */
static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};
 447
 448 #define PUT_PIXELS(WIDTH)                                                     \
 449 static void put_vp8_pixels ## WIDTH ## _c(uint8_t *dst, ptrdiff_t dststride,  \
 450                                           uint8_t *src, ptrdiff_t srcstride,  \
 451                                           int h, int x, int y)                \
 452 {                                                                             \
 453     int i;                                                                    \
 454     for (i = 0; i < h; i++, dst += dststride, src += srcstride)               \
 455         memcpy(dst, src, WIDTH);                                              \
 456 }
 457
 458 PUT_PIXELS(16)
 459 PUT_PIXELS(8)
 460 PUT_PIXELS(4)
 461
 462 #define FILTER_6TAP(src, F, stride)                                           \
 463     cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
 464         F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
 465         F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
 466
 467 #define FILTER_4TAP(src, F, stride)                                           \
 468     cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
 469         F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
 470
 471 #define VP8_EPEL_H(SIZE, TAPS)                                                \
 472 static void put_vp8_epel ## SIZE ## _h ## TAPS ## _c(uint8_t *dst,            \
 473                                                      ptrdiff_t dststride,     \
 474                                                      uint8_t *src,            \
 475                                                      ptrdiff_t srcstride,     \
 476                                                      int h, int mx, int my)   \
 477 {                                                                             \
 478     const uint8_t *filter = subpel_filters[mx - 1];                           \
 479     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;                       \
 480     int x, y;                                                                 \
 481     for (y = 0; y < h; y++) {                                                 \
 482         for (x = 0; x < SIZE; x++)                                            \
 483             dst[x] = FILTER_ ## TAPS ## TAP(src, filter, 1);                  \
 484         dst += dststride;                                                     \
 485         src += srcstride;                                                     \
 486     }                                                                         \
 487 }
 488
 489 #define VP8_EPEL_V(SIZE, TAPS)                                                \
 490 static void put_vp8_epel ## SIZE ## _v ## TAPS ## _c(uint8_t *dst,            \
 491                                                      ptrdiff_t dststride,     \
 492                                                      uint8_t *src,            \
 493                                                      ptrdiff_t srcstride,     \
 494                                                      int h, int mx, int my)   \
 495 {                                                                             \
 496     const uint8_t *filter = subpel_filters[my - 1];                           \
 497     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;                       \
 498     int x, y;                                                                 \
 499     for (y = 0; y < h; y++) {                                                 \
 500         for (x = 0; x < SIZE; x++)                                            \
 501             dst[x] = FILTER_ ## TAPS ## TAP(src, filter, srcstride);          \
 502         dst += dststride;                                                     \
 503         src += srcstride;                                                     \
 504     }                                                                         \
 505 }
 506
 507 #define VP8_EPEL_HV(SIZE, HTAPS, VTAPS)                                       \
 508 static void                                                                   \
 509 put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst,         \
 510                                                         ptrdiff_t dststride,  \
 511                                                         uint8_t *src,         \
 512                                                         ptrdiff_t srcstride,  \
 513                                                         int h, int mx,        \
 514                                                         int my)               \
 515 {                                                                             \
 516     const uint8_t *filter = subpel_filters[mx - 1];                           \
 517     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;                       \
 518     int x, y;                                                                 \
 519     uint8_t tmp_array[(2 * SIZE + VTAPS - 1) * SIZE];                         \
 520     uint8_t *tmp = tmp_array;                                                 \
 521     src -= (2 - (VTAPS == 4)) * srcstride;                                    \
 522                                                                               \
 523     for (y = 0; y < h + VTAPS - 1; y++) {                                     \
 524         for (x = 0; x < SIZE; x++)                                            \
 525             tmp[x] = FILTER_ ## HTAPS ## TAP(src, filter, 1);                 \
 526         tmp += SIZE;                                                          \
 527         src += srcstride;                                                     \
 528     }                                                                         \
 529     tmp    = tmp_array + (2 - (VTAPS == 4)) * SIZE;                           \
 530     filter = subpel_filters[my - 1];                                          \
 531                                                                               \
 532     for (y = 0; y < h; y++) {                                                 \
 533         for (x = 0; x < SIZE; x++)                                            \
 534             dst[x] = FILTER_ ## VTAPS ## TAP(tmp, filter, SIZE);              \
 535         dst += dststride;                                                     \
 536         tmp += SIZE;                                                          \
 537     }                                                                         \
 538 }
 539
/*
 * Instantiate the sub-pixel interpolation functions for block widths 16, 8
 * and 4: horizontal-only and vertical-only with 4 and 6 taps, then every
 * combination of 4/6-tap horizontal with 4/6-tap vertical filtering.
 */
VP8_EPEL_H(16, 4)
VP8_EPEL_H(8,  4)
VP8_EPEL_H(4,  4)
VP8_EPEL_H(16, 6)
VP8_EPEL_H(8,  6)
VP8_EPEL_H(4,  6)
VP8_EPEL_V(16, 4)
VP8_EPEL_V(8,  4)
VP8_EPEL_V(4,  4)
VP8_EPEL_V(16, 6)
VP8_EPEL_V(8,  6)
VP8_EPEL_V(4,  6)

VP8_EPEL_HV(16, 4, 4)
VP8_EPEL_HV(8,  4, 4)
VP8_EPEL_HV(4,  4, 4)
VP8_EPEL_HV(16, 4, 6)
VP8_EPEL_HV(8,  4, 6)
VP8_EPEL_HV(4,  4, 6)
VP8_EPEL_HV(16, 6, 4)
VP8_EPEL_HV(8,  6, 4)
VP8_EPEL_HV(4,  6, 4)
VP8_EPEL_HV(16, 6, 6)
VP8_EPEL_HV(8,  6, 6)
VP8_EPEL_HV(4,  6, 6)
 565
 566 #define VP8_BILINEAR(SIZE)                                                    \
 567 static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t dstride, \
 568                                              uint8_t *src, ptrdiff_t sstride, \
 569                                              int h, int mx, int my)           \
 570 {                                                                             \
 571     int a = 8 - mx, b = mx;                                                   \
 572     int x, y;                                                                 \
 573     for (y = 0; y < h; y++) {                                                 \
 574         for (x = 0; x < SIZE; x++)                                            \
 575             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;                  \
 576         dst += dstride;                                                       \
 577         src += sstride;                                                       \
 578     }                                                                         \
 579 }                                                                             \
 580                                                                               \
 581 static void put_vp8_bilinear ## SIZE ## _v_c(uint8_t *dst, ptrdiff_t dstride, \
 582                                              uint8_t *src, ptrdiff_t sstride, \
 583                                              int h, int mx, int my)           \
 584 {                                                                             \
 585     int c = 8 - my, d = my;                                                   \
 586     int x, y;                                                                 \
 587     for (y = 0; y < h; y++) {                                                 \
 588         for (x = 0; x < SIZE; x++)                                            \
 589             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;            \
 590         dst += dstride;                                                       \
 591         src += sstride;                                                       \
 592     }                                                                         \
 593 }                                                                             \
 594                                                                               \
 595 static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst,                   \
 596                                               ptrdiff_t dstride,              \
 597                                               uint8_t *src,                   \
 598                                               ptrdiff_t sstride,              \
 599                                               int h, int mx, int my)          \
 600 {                                                                             \
 601     int a = 8 - mx, b = mx;                                                   \
 602     int c = 8 - my, d = my;                                                   \
 603     int x, y;                                                                 \
 604     uint8_t tmp_array[(2 * SIZE + 1) * SIZE];                                 \
 605     uint8_t *tmp = tmp_array;                                                 \
 606     for (y = 0; y < h + 1; y++) {                                             \
 607         for (x = 0; x < SIZE; x++)                                            \
 608             tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;                  \
 609         tmp += SIZE;                                                          \
 610         src += sstride;                                                       \
 611     }                                                                         \
 612     tmp = tmp_array;                                                          \
 613     for (y = 0; y < h; y++) {                                                 \
 614         for (x = 0; x < SIZE; x++)                                            \
 615             dst[x] = (c * tmp[x] + d * tmp[x + SIZE] + 4) >> 3;               \
 616         dst += dstride;                                                       \
 617         tmp += SIZE;                                                          \
 618     }                                                                         \
 619 }
 620
 621 VP8_BILINEAR(16)
 622 VP8_BILINEAR(8)
 623 VP8_BILINEAR(4)
 624
/*
 * Fill dsp->put_vp8_epel_pixels_tab[IDX] with the SIZE-wide C functions.
 * The second index selects the vertical filter and the third the horizontal
 * filter: 0 = none, 1 = four-tap, 2 = six-tap. Entry [0][0] is the plain
 * copy. `dsp` must be in scope where the macro is expanded.
 */
#define VP8_MC_FUNC(IDX, SIZE)                                                \
    dsp->put_vp8_epel_pixels_tab[IDX][0][0] = put_vp8_pixels ## SIZE ## _c;   \
    dsp->put_vp8_epel_pixels_tab[IDX][0][1] = put_vp8_epel ## SIZE ## _h4_c;  \
    dsp->put_vp8_epel_pixels_tab[IDX][0][2] = put_vp8_epel ## SIZE ## _h6_c;  \
    dsp->put_vp8_epel_pixels_tab[IDX][1][0] = put_vp8_epel ## SIZE ## _v4_c;  \
    dsp->put_vp8_epel_pixels_tab[IDX][1][1] = put_vp8_epel ## SIZE ## _h4v4_c; \
    dsp->put_vp8_epel_pixels_tab[IDX][1][2] = put_vp8_epel ## SIZE ## _h6v4_c; \
    dsp->put_vp8_epel_pixels_tab[IDX][2][0] = put_vp8_epel ## SIZE ## _v6_c;  \
    dsp->put_vp8_epel_pixels_tab[IDX][2][1] = put_vp8_epel ## SIZE ## _h4v6_c; \
    dsp->put_vp8_epel_pixels_tab[IDX][2][2] = put_vp8_epel ## SIZE ## _h6v6_c

/*
 * Fill dsp->put_vp8_bilinear_pixels_tab[IDX] with the SIZE-wide C
 * functions, using the same table shape as the epel table. The bilinear
 * code has a single filter per direction, so indices 1 and 2 of each
 * dimension map to the same function.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE)                                       \
    dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] = put_vp8_pixels ## SIZE ## _c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] = put_vp8_bilinear ## SIZE ## _h_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] = put_vp8_bilinear ## SIZE ## _h_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] = put_vp8_bilinear ## SIZE ## _v_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] = put_vp8_bilinear ## SIZE ## _hv_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] = put_vp8_bilinear ## SIZE ## _hv_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] = put_vp8_bilinear ## SIZE ## _v_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \
    dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c
 646
/**
 * Fill a VP8DSPContext with the C implementations defined in this file and
 * then call the architecture-specific initialisers for the enabled
 * architectures (presumably so they can replace entries with optimised
 * versions - their bodies are not part of this file).
 *
 * @param dsp context whose function pointers are set
 * @param vp7 when both decoders are compiled in, non-zero selects the vp7_*
 *            transform and loop filter functions and zero the vp8_* ones;
 *            with only one decoder compiled in, the choice is made at
 *            compile time. The motion compensation tables are the same in
 *            either case.
 */
av_cold void ff_vp8dsp_init(VP8DSPContext *dsp, int vp7)
{
/* VPX(f) resolves to the vp7_ or vp8_ flavour of function f */
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(f) vp7 ? vp7_ ## f : vp8_ ## f
#elif CONFIG_VP7_DECODER
#define VPX(f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(f) vp8_ ## f
#endif

    /* inverse transforms */
    dsp->vp8_luma_dc_wht    = VPX(luma_dc_wht_c);
    dsp->vp8_luma_dc_wht_dc = VPX(luma_dc_wht_dc_c);
    dsp->vp8_idct_add       = VPX(idct_add_c);
    dsp->vp8_idct_dc_add    = VPX(idct_dc_add_c);
    dsp->vp8_idct_dc_add4y  = VPX(idct_dc_add4y_c);
    dsp->vp8_idct_dc_add4uv = VPX(idct_dc_add4uv_c);

    /* normal loop filters: edge variants */
    dsp->vp8_v_loop_filter16y = VPX(v_loop_filter16_c);
    dsp->vp8_h_loop_filter16y = VPX(h_loop_filter16_c);
    dsp->vp8_v_loop_filter8uv = VPX(v_loop_filter8uv_c);
    dsp->vp8_h_loop_filter8uv = VPX(h_loop_filter8uv_c);

    /* normal loop filters: inner variants */
    dsp->vp8_v_loop_filter16y_inner = VPX(v_loop_filter16_inner_c);
    dsp->vp8_h_loop_filter16y_inner = VPX(h_loop_filter16_inner_c);
    dsp->vp8_v_loop_filter8uv_inner = VPX(v_loop_filter8uv_inner_c);
    dsp->vp8_h_loop_filter8uv_inner = VPX(h_loop_filter8uv_inner_c);

    /* simple loop filters */
    dsp->vp8_v_loop_filter_simple = VPX(v_loop_filter_simple_c);
    dsp->vp8_h_loop_filter_simple = VPX(h_loop_filter_simple_c);

    /* motion compensation tables: index 0/1/2 = block width 16/8/4 */
    VP8_MC_FUNC(0, 16);
    VP8_MC_FUNC(1, 8);
    VP8_MC_FUNC(2, 4);

    VP8_BILINEAR_MC_FUNC(0, 16);
    VP8_BILINEAR_MC_FUNC(1, 8);
    VP8_BILINEAR_MC_FUNC(2, 4);

    /* NOTE(review): the PPC initialiser is not told whether this is VP7 -
     * confirm it only installs functions that are valid for both codecs. */
    if (ARCH_ARM)
        ff_vp8dsp_init_arm(dsp, vp7);
    if (ARCH_PPC)
        ff_vp8dsp_init_ppc(dsp);
    if (ARCH_X86)
        ff_vp8dsp_init_x86(dsp, vp7);
}