2 * VP8 compatible video decoder
4 * Copyright (C) 2010 David Conrad
5 * Copyright (C) 2010 Ronald S. Bultje
6 * Copyright (C) 2010 Jason Garrett-Glaser
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/imgutils.h"
29 #include "rectangle.h"
/* Release all reference frames and free per-frame context buffers.
 * Called on flush and before (re)allocating for new dimensions.
 * NOTE(review): this listing elides some lines (opening brace, `int i;`). */
35 static void vp8_decode_flush(AVCodecContext *avctx)
37 VP8Context *s = avctx->priv_data;
/* release the up-to-4 internal AVFrames still holding buffers */
40 for (i = 0; i < 4; i++)
41 if (s->frames[i].data[0])
42 avctx->release_buffer(avctx, &s->frames[i]);
43 memset(s->framep, 0, sizeof(s->framep));
45 av_freep(&s->macroblocks_base);
46 av_freep(&s->filter_strength);
47 av_freep(&s->intra4x4_pred_mode_top);
48 av_freep(&s->top_nnz);
49 av_freep(&s->edge_emu_buffer);
50 av_freep(&s->top_border);
51 av_freep(&s->segmentation_map);
/* macroblocks points into macroblocks_base (+1); clear it after the free */
53 s->macroblocks = NULL;
/* (Re)allocate all per-row/per-frame scratch buffers for a new coded size.
 * Frees everything first via vp8_decode_flush(), then allocates based on
 * the macroblock grid. Returns 0 on success, AVERROR on bad size / OOM. */
56 static int update_dimensions(VP8Context *s, int width, int height)
58 if (av_image_check_size(width, height, 0, s->avctx))
59 return AVERROR_INVALIDDATA;
61 vp8_decode_flush(s->avctx);
63 avcodec_set_dimensions(s->avctx, width, height);
/* round up to whole 16x16 macroblocks */
65 s->mb_width = (s->avctx->coded_width +15) / 16;
66 s->mb_height = (s->avctx->coded_height+15) / 16;
/* macroblocks_base holds one row plus left/top edge entries; the +1 and
   the later `macroblocks_base + 1` give a valid left-neighbour slot */
68 s->macroblocks_base = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
69 s->filter_strength = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
70 s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
71 s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
72 s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
73 s->segmentation_map = av_mallocz(s->mb_width*s->mb_height);
/* all-or-nothing: any failed allocation aborts; flush() frees the rest later */
75 if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
76 !s->top_nnz || !s->top_border || !s->segmentation_map)
77 return AVERROR(ENOMEM);
79 s->macroblocks = s->macroblocks_base + 1;
/* Parse segmentation header (VP8 spec 9.3): per-segment quantizer and
 * filter-level deltas plus the tree probabilities for the segment map. */
84 static void parse_segment_info(VP8Context *s)
86 VP56RangeCoder *c = &s->c;
89 s->segmentation.update_map = vp8_rac_get(c);
91 if (vp8_rac_get(c)) { // update segment feature data
/* absolute_vals: values replace the base quant/filter instead of adding */
92 s->segmentation.absolute_vals = vp8_rac_get(c);
94 for (i = 0; i < 4; i++)
95 s->segmentation.base_quant[i] = vp8_rac_get_sint(c, 7);
97 for (i = 0; i < 4; i++)
98 s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
/* 255 means "use default probability" for that segment-id tree node */
100 if (s->segmentation.update_map)
101 for (i = 0; i < 3; i++)
102 s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
/* Read per-reference-frame and per-mode loop-filter level deltas (spec 9.4). */
105 static void update_lf_deltas(VP8Context *s)
107 VP56RangeCoder *c = &s->c;
110 for (i = 0; i < 4; i++)
111 s->lf_delta.ref[i] = vp8_rac_get_sint(c, 6);
113 for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
114 s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
/* Set up the 1/2/4/8 DCT coefficient partitions (spec 9.5). The sizes of
 * the first n-1 partitions are stored as 24-bit LE values at the start of
 * buf; the last partition takes whatever remains. Returns nonzero if a
 * partition size exceeds the available data. */
117 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
119 const uint8_t *sizes = buf;
122 s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
/* skip past the size table itself */
124 buf += 3*(s->num_coeff_partitions-1);
125 buf_size -= 3*(s->num_coeff_partitions-1);
129 for (i = 0; i < s->num_coeff_partitions-1; i++) {
130 int size = AV_RL24(sizes + 3*i);
/* reject a declared size larger than the remaining buffer */
131 if (buf_size - size < 0)
134 ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
/* last partition: everything left over */
138 ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
/* Read the quantizer header (spec 9.6) and build dequant tables for each
 * of the 4 segments: a base AC index plus signed 4-bit deltas for luma DC,
 * second-order (Y2) DC/AC and chroma DC/AC. */
143 static void get_quants(VP8Context *s)
145 VP56RangeCoder *c = &s->c;
148 int yac_qi = vp8_rac_get_uint(c, 7);
149 int ydc_delta = vp8_rac_get_sint(c, 4);
150 int y2dc_delta = vp8_rac_get_sint(c, 4);
151 int y2ac_delta = vp8_rac_get_sint(c, 4);
152 int uvdc_delta = vp8_rac_get_sint(c, 4);
153 int uvac_delta = vp8_rac_get_sint(c, 4);
155 for (i = 0; i < 4; i++) {
156 if (s->segmentation.enabled) {
157 base_qi = s->segmentation.base_quant[i];
/* relative mode: segment value is a delta on top of yac_qi
   (the addition itself is on a line elided from this listing) */
158 if (!s->segmentation.absolute_vals)
/* indexes are clamped to the 0..127 lookup-table range */
163 s->qmat[i].luma_qmul[0] = vp8_dc_qlookup[av_clip(base_qi + ydc_delta , 0, 127)];
164 s->qmat[i].luma_qmul[1] = vp8_ac_qlookup[av_clip(base_qi , 0, 127)];
165 s->qmat[i].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip(base_qi + y2dc_delta, 0, 127)];
166 s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip(base_qi + y2ac_delta, 0, 127)] / 100;
167 s->qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip(base_qi + uvdc_delta, 0, 127)];
168 s->qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip(base_qi + uvac_delta, 0, 127)];
/* spec-mandated floors/ceilings for Y2 AC and chroma DC */
170 s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
171 s->qmat[i].chroma_qmul[0] = FFMIN(s->qmat[i].chroma_qmul[0], 132);
176 * Determine which buffers golden and altref should be updated with after this frame.
177 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
179 * Intra frames update all 3 references
180 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
181 * If the update (golden|altref) flag is set, it's updated with the current frame
182 * if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
183 * If the flag is not set, the number read means:
185 * 1: VP56_FRAME_PREVIOUS
186 * 2: update golden with altref, or update altref with golden
188 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
190 VP56RangeCoder *c = &s->c;
/* keyframe / explicit-update case (condition line elided in this listing) */
193 return VP56_FRAME_CURRENT;
/* otherwise a 2-bit code selects the source, per the comment above */
195 switch (vp8_rac_get_uint(c, 2)) {
197 return VP56_FRAME_PREVIOUS;
/* cross-copy: golden <- altref or altref <- golden */
199 return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
201 return VP56_FRAME_NONE;
/* Read the golden/altref update flags and resolve what each reference
 * will be replaced with after this frame (see ref_to_update above). */
204 static void update_refs(VP8Context *s)
206 VP56RangeCoder *c = &s->c;
208 int update_golden = vp8_rac_get(c);
209 int update_altref = vp8_rac_get(c);
211 s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
212 s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
/* Parse the uncompressed frame tag plus the compressed frame header
 * (VP8 spec chapter 9): keyframe info, dimensions, segmentation, loop
 * filter, partitions, quantizers, reference updates and all probability
 * updates. Returns 0 on success or a negative AVERROR code. */
215 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
217 VP56RangeCoder *c = &s->c;
218 int header_size, hscale, vscale, i, j, k, l, m, ret;
219 int width = s->avctx->width;
220 int height = s->avctx->height;
/* 3-byte uncompressed frame tag */
222 s->keyframe = !(buf[0] & 1);
223 s->profile = (buf[0]>>1) & 7;
224 s->invisible = !(buf[0] & 0x10);
225 header_size = AV_RL24(buf) >> 5;
230 av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
/* profile 0 uses the 6-tap epel filter, others bilinear */
233 memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
234 else // profile 1-3 use bilinear, 4+ aren't defined so whatever
235 memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
/* keyframes carry 7 extra bytes (start code + dimensions) before data */
237 if (header_size > buf_size - 7*s->keyframe) {
238 av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
239 return AVERROR_INVALIDDATA;
/* keyframe start code 0x9d012a (read little-endian) */
243 if (AV_RL24(buf) != 0x2a019d) {
244 av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
245 return AVERROR_INVALIDDATA;
247 width = AV_RL16(buf+3) & 0x3fff;
248 height = AV_RL16(buf+5) & 0x3fff;
249 hscale = buf[4] >> 6;
250 vscale = buf[6] >> 6;
254 if (hscale || vscale)
255 av_log_missing_feature(s->avctx, "Upscaling", 1);
/* keyframe: reset all probabilities to spec defaults */
257 s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
258 for (i = 0; i < 4; i++)
259 for (j = 0; j < 16; j++)
260 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
261 sizeof(s->prob->token[i][j]));
262 memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
263 memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
264 memcpy(s->prob->mvc , vp8_mv_default_prob , sizeof(s->prob->mvc));
265 memset(&s->segmentation, 0, sizeof(s->segmentation));
268 if (!s->macroblocks_base || /* first frame */
269 width != s->avctx->width || height != s->avctx->height) {
/* BUG(review): misplaced parenthesis — this assigns the result of
   `update_dimensions(...) < 0` (0 or 1) to ret, not the return value.
   Should read: if ((ret = update_dimensions(s, width, height)) < 0) */
270 if ((ret = update_dimensions(s, width, height) < 0))
274 ff_vp56_init_range_decoder(c, buf, header_size);
276 buf_size -= header_size;
280 av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
281 vp8_rac_get(c); // whether we can skip clamping in dsp functions
284 if ((s->segmentation.enabled = vp8_rac_get(c)))
285 parse_segment_info(s);
287 s->segmentation.update_map = 0; // FIXME: move this to some init function?
289 s->filter.simple = vp8_rac_get(c);
290 s->filter.level = vp8_rac_get_uint(c, 6);
291 s->filter.sharpness = vp8_rac_get_uint(c, 3);
293 if ((s->lf_delta.enabled = vp8_rac_get(c)))
297 if (setup_partitions(s, buf, buf_size)) {
298 av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
299 return AVERROR_INVALIDDATA;
306 s->sign_bias[VP56_FRAME_GOLDEN] = vp8_rac_get(c);
307 s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
310 // if we aren't saving this frame's probabilities for future frames,
311 // make a copy of the current probabilities
312 if (!(s->update_probabilities = vp8_rac_get(c)))
313 s->prob[1] = s->prob[0];
315 s->update_last = s->keyframe || vp8_rac_get(c);
/* 13.4: token probability updates, fanned out over coefficient bands */
317 for (i = 0; i < 4; i++)
318 for (j = 0; j < 8; j++)
319 for (k = 0; k < 3; k++)
320 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
321 if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
322 int prob = vp8_rac_get_uint(c, 8);
323 for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
324 s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
327 if ((s->mbskip_enabled = vp8_rac_get(c)))
328 s->prob->mbskip = vp8_rac_get_uint(c, 8);
/* inter frames only: intra/last/golden mode probabilities */
331 s->prob->intra = vp8_rac_get_uint(c, 8);
332 s->prob->last = vp8_rac_get_uint(c, 8);
333 s->prob->golden = vp8_rac_get_uint(c, 8);
336 for (i = 0; i < 4; i++)
337 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
339 for (i = 0; i < 3; i++)
340 s->prob->pred8x8c[i] = vp8_rac_get_uint(c, 8);
342 // 17.2 MV probability update
343 for (i = 0; i < 2; i++)
344 for (j = 0; j < 19; j++)
345 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
346 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
/* Clamp a motion vector into the legal range for the current macroblock
 * (mv_min/mv_max are maintained per-row by the caller). */
352 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
354 dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
355 dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
359 * Motion vector coding, 17.1.
361 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
/* "large" mv: magnitude coded bitwise against probs p[9..] */
365 if (vp56_rac_get_prob_branchy(c, p[0])) {
368 for (i = 0; i < 3; i++)
369 x += vp56_rac_get_prob(c, p[9 + i]) << i;
/* high bits are read top-down */
370 for (i = 9; i > 3; i--)
371 x += vp56_rac_get_prob(c, p[9 + i]) << i;
/* bit 3 is implicit unless all higher bits are clear */
372 if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
/* "small" mv: value 0..7 decoded with a tree over probs p[2..] */
376 const uint8_t *ps = p+2;
377 bit = vp56_rac_get_prob(c, *ps);
380 bit = vp56_rac_get_prob(c, *ps);
383 x += vp56_rac_get_prob(c, *ps);
/* sign bit, only coded for nonzero magnitudes */
386 return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
/* Pick the sub-mv probability table based on whether the left/top
 * neighbouring sub-block mvs are zero (left/top are raw 32-bit mv words).
 * NOTE(review): the branching conditions are elided from this listing. */
389 static av_always_inline
390 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
393 return vp8_submv_prob[4-!!left];
395 return vp8_submv_prob[2];
396 return vp8_submv_prob[1-!!left];
400 * Split motion vector prediction, 16.4.
401 * @returns the number of motion vectors parsed (2, 4 or 16)
403 static av_always_inline
404 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
/* mb[2] / mb[-1] address the top / left neighbour macroblocks in the
   ring layout set up by update_dimensions */
408 VP8Macroblock *top_mb = &mb[2];
409 VP8Macroblock *left_mb = &mb[-1];
410 const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
411 *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
412 *mbsplits_cur, *firstidx;
413 VP56mv *top_mv = top_mb->bmv;
414 VP56mv *left_mv = left_mb->bmv;
415 VP56mv *cur_mv = mb->bmv;
/* partition type tree: 16x8 / 8x16 / 8x8 / 4x4 */
417 if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
418 if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
419 part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
421 part_idx = VP8_SPLITMVMODE_8x8;
424 part_idx = VP8_SPLITMVMODE_4x4;
427 num = vp8_mbsplit_count[part_idx];
428 mbsplits_cur = vp8_mbsplits[part_idx],
429 firstidx = vp8_mbfirstidx[part_idx];
430 mb->partitioning = part_idx;
432 for (n = 0; n < num; n++) {
434 uint32_t left, above;
435 const uint8_t *submv_prob;
/* left/above context: neighbour MB's sub-mv on the edge, else a prior
   sub-mv of this MB (conditions for first column/row elided here) */
438 left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
440 left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
442 above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
444 above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
446 submv_prob = get_submv_prob(left, above);
/* sub-mv mode tree: NEW (read components) / ZERO / TOP / LEFT */
448 if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
449 if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
450 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
451 mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
452 mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
454 AV_ZERO32(&mb->bmv[n]);
457 AV_WN32A(&mb->bmv[n], above);
460 AV_WN32A(&mb->bmv[n], left);
/* Inter macroblock mv decoding, spec 16.2/16.3: build the near/nearest
 * candidate list from the top, left and top-left neighbours, then decode
 * the mv mode (ZERO/NEAREST/NEAR/NEW/SPLIT) and the mv itself. */
467 static av_always_inline
468 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
470 VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
472 mb + 1 /* top-left */ };
473 enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
474 enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
476 int cur_sign_bias = s->sign_bias[mb->ref_frame];
477 int8_t *sign_bias = s->sign_bias;
479 uint8_t cnt[4] = { 0 };
480 VP56RangeCoder *c = &s->c;
482 AV_ZERO32(&near_mv[0]);
483 AV_ZERO32(&near_mv[1]);
485 /* Process MB on top, left and top-left */
486 #define MV_EDGE_CHECK(n)\
488 VP8Macroblock *edge = mb_edge[n];\
489 int edge_ref = edge->ref_frame;\
490 if (edge_ref != VP56_FRAME_CURRENT) {\
491 uint32_t mv = AV_RN32A(&edge->mv);\
493 if (cur_sign_bias != sign_bias[edge_ref]) {\
494 /* SWAR negate of the values in mv. */\
496 mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
498 if (!n || mv != AV_RN32A(&near_mv[idx]))\
499 AV_WN32A(&near_mv[++idx], mv);\
500 cnt[idx] += 1 + (n != 2);\
502 cnt[CNT_ZERO] += 1 + (n != 2);\
510 mb->partitioning = VP8_SPLITMVMODE_NONE;
511 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
512 mb->mode = VP8_MVMODE_MV;
514 /* If we have three distinct MVs, merge first and last if they're the same */
515 if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
516 cnt[CNT_NEAREST] += 1;
518 /* Swap near and nearest if necessary */
519 if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
520 FFSWAP(uint8_t, cnt[CNT_NEAREST], cnt[CNT_NEAR]);
521 FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
524 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
525 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
527 /* Choose the best mv out of 0,0 and the nearest mv */
528 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
/* splitmv context: how many neighbours themselves used SPLIT */
529 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode == VP8_MVMODE_SPLIT) +
530 (mb_edge[VP8_EDGE_TOP]->mode == VP8_MVMODE_SPLIT)) * 2 +
531 (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
533 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
534 mb->mode = VP8_MVMODE_SPLIT;
/* mb->mv becomes the last sub-mv, used for context of later MBs */
535 mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
537 mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
538 mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
542 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
546 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
550 mb->mode = VP8_MVMODE_ZERO;
/* Decode the 16 4x4 intra prediction modes of an I4x4 macroblock.
 * Keyframes use context (top/left neighbour modes, spec 11.5); inter
 * frames use a single fixed probability table. */
556 static av_always_inline
557 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
558 int mb_x, int keyframe)
560 uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
563 uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
564 uint8_t* const left = s->intra4x4_pred_mode_left;
565 for (y = 0; y < 4; y++) {
566 for (x = 0; x < 4; x++) {
568 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
569 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
/* update prediction context for the next block/row */
570 left[y] = top[x] = *intra4x4;
/* inter-frame path: no context, flat probability table */
576 for (i = 0; i < 16; i++)
577 intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
/* Decode per-macroblock mode info: segment id, skip flag, intra/inter
 * decision, prediction modes and (for inter) reference frame + mvs. */
581 static av_always_inline
582 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment)
584 VP56RangeCoder *c = &s->c;
586 if (s->segmentation.update_map)
587 *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
588 s->segment = *segment;
590 mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
/* keyframe path: fixed intra probability tables */
593 mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
595 if (mb->mode == MODE_I4x4) {
596 decode_intra4x4_modes(s, c, mb_x, 1);
/* non-I4x4: replicate the 16x16 mode into the 4x4 context arrays */
598 const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
599 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
600 AV_WN32A(s->intra4x4_pred_mode_left, modes);
603 s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
604 mb->ref_frame = VP56_FRAME_CURRENT;
/* inter macroblock: choose reference then decode mvs */
605 } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
607 if (vp56_rac_get_prob_branchy(c, s->prob->last))
608 mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
609 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
611 mb->ref_frame = VP56_FRAME_PREVIOUS;
/* ref_count feeds the prefetch heuristic in prefetch_motion() */
612 s->ref_count[mb->ref_frame-1]++;
614 // motion vectors, 16.3
615 decode_mvs(s, mb, mb_x, mb_y);
/* intra macroblock in an inter frame: adaptive probability tables */
618 mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
620 if (mb->mode == MODE_I4x4)
621 decode_intra4x4_modes(s, c, mb_x, 0);
623 s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
624 mb->ref_frame = VP56_FRAME_CURRENT;
625 mb->partitioning = VP8_SPLITMVMODE_NONE;
626 AV_ZERO32(&mb->bmv[0]);
630 #ifndef decode_block_coeffs_internal
632 * @param c arithmetic bitstream reader context
633 * @param block destination for block coefficients
634 * @param probs probabilities to use when reading trees from the bitstream
635 * @param i initial coeff index, 0 unless a separate DC block is coded
636 * @param zero_nhood the initial prediction context for number of surrounding
637 * all-zero blocks (only left/top, so 0-2)
638 * @param qmul array holding the dc/ac dequant factor at position 0/1
639 * @return 0 if no coeffs were decoded
640 * otherwise, the index of the last coeff decoded plus one
642 static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
643 uint8_t probs[16][3][NUM_DCT_TOKENS-1],
644 int i, uint8_t *token_prob, int16_t qmul[2])
/* token tree walk (spec 13.2); loop structure partially elided here */
649 if (!vp56_rac_get_prob_branchy(c, token_prob[0])) // DCT_EOB
653 if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
655 return i; // invalid input; blocks should end with EOB
/* context 0 after a zero coefficient */
656 token_prob = probs[i][0];
660 if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
662 token_prob = probs[i+1][1];
664 if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
665 coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
667 coeff += vp56_rac_get_prob(c, token_prob[5]);
671 if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
672 if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
673 coeff = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
676 coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
677 coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
679 } else { // DCT_CAT3 and up
680 int a = vp56_rac_get_prob(c, token_prob[8]);
681 int b = vp56_rac_get_prob(c, token_prob[9+a]);
682 int cat = (a<<1) + b;
683 coeff = 3 + (8<<cat);
684 coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
/* context 2 after a coefficient > 1 */
687 token_prob = probs[i+1][2];
/* sign bit, dequantize (qmul[0] for DC at i==0, qmul[1] for AC), store
   in zigzag order */
689 block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
/* Thin wrapper: handle the common all-EOB case inline (first branch taken
 * means "no coefficients") before calling the full token decoder. */
696 static av_always_inline
697 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
698 uint8_t probs[16][3][NUM_DCT_TOKENS-1],
699 int i, int zero_nhood, int16_t qmul[2])
701 uint8_t *token_prob = probs[i][zero_nhood];
702 if (!vp56_rac_get_prob_branchy(c, token_prob[0])) // DCT_EOB
704 return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
/* Decode all coefficient blocks of one macroblock: optional Y2 (second-
 * order luma DC) block + inverse WHT, 16 luma 4x4 blocks, 8 chroma 4x4
 * blocks. t_nnz/l_nnz carry the top/left non-zero contexts. */
707 static av_always_inline
708 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
709 uint8_t t_nnz[9], uint8_t l_nnz[9])
711 int i, x, y, luma_start = 0, luma_ctx = 3;
712 int nnz_pred, nnz, nnz_total = 0;
713 int segment = s->segment;
/* modes other than I4x4/SPLIT code a separate Y2 DC block */
716 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
717 nnz_pred = t_nnz[8] + l_nnz[8];
719 // decode DC values and do hadamard
720 nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
721 s->qmat[segment].luma_dc_qmul);
722 l_nnz[8] = t_nnz[8] = !!nnz;
/* DC-only vs full inverse Walsh-Hadamard (selection condition elided) */
727 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
729 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
/* luma: with a Y2 block, AC decoding starts at coeff 1 and uses plane
   type 0 (luma_start/luma_ctx are updated on lines elided here) */
736 for (y = 0; y < 4; y++)
737 for (x = 0; x < 4; x++) {
738 nnz_pred = l_nnz[y] + t_nnz[x];
739 nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
740 nnz_pred, s->qmat[segment].luma_qmul);
741 // nnz+block_dc may be one more than the actual last index, but we don't care
742 s->non_zero_count_cache[y][x] = nnz + block_dc;
743 t_nnz[x] = l_nnz[y] = !!nnz;
748 // TODO: what to do about dimensions? 2nd dim for luma is x,
749 // but for chroma it's (y<<1)|x
750 for (i = 4; i < 6; i++)
751 for (y = 0; y < 2; y++)
752 for (x = 0; x < 2; x++) {
753 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
754 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
755 nnz_pred, s->qmat[segment].chroma_qmul);
756 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
757 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
761 // if there were no coded coeffs despite the macroblock not being marked skip,
762 // we MUST not do the inner loop filter and should not do IDCT
763 // Since skip isn't used for bitstream prediction, just manually set it.
/* Save the bottom pixel row of this macroblock (16 luma + 8+8 chroma
 * bytes) into top_border, for use as the top edge of the row below.
 * Simple loop filter skips the chroma copy (branch line elided here). */
768 static av_always_inline
769 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
770 int linesize, int uvlinesize, int simple)
772 AV_COPY128(top_border, src_y + 15*linesize);
774 AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
775 AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
/* Swap (xchg=1) or copy (xchg=0) the saved top-border row with the pixels
 * above the current macroblock, so intra prediction sees pre-loop-filter
 * edge pixels. Layout: 32 bytes per MB in top_border (16 Y, 8 Cb, 8 Cr). */
779 static av_always_inline
780 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
781 int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
782 int simple, int xchg)
784 uint8_t *top_border_m1 = top_border-32; // for TL prediction
786 src_cb -= uvlinesize;
787 src_cr -= uvlinesize;
789 #define XCHG(a,b,xchg) do { \
790 if (xchg) AV_SWAP64(b,a); \
791 else AV_COPY64(b,a); \
794 XCHG(top_border_m1+8, src_y-8, xchg);
795 XCHG(top_border, src_y, xchg);
/* right half / top-right are always swapped: needed by 4x4 TR prediction */
796 XCHG(top_border+8, src_y+8, 1);
797 if (mb_x < mb_width-1)
798 XCHG(top_border+32, src_y+16, 1);
800 // only copy chroma for normal loop filter
801 // or to initialize the top row to 127
802 if (!simple || !mb_y) {
803 XCHG(top_border_m1+16, src_cb-8, xchg);
804 XCHG(top_border_m1+24, src_cr-8, xchg);
805 XCHG(top_border+16, src_cb, 1);
806 XCHG(top_border+24, src_cr, 1);
/* Map DC_PRED8x8 to the edge-aware variant when top/left neighbours are
 * missing (frame border). NOTE(review): the mb_x guard lines are elided
 * from this listing — the two returns belong to different branches. */
810 static av_always_inline
811 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
814 return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
816 return mb_y ? mode : LEFT_DC_PRED8x8;
/* Edge-aware fallback for TM (PLANE) 8x8/16x16 prediction at the frame
 * border. NOTE(review): the mb_x guard lines are elided from this listing. */
820 static av_always_inline
821 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
824 return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
826 return mb_y ? mode : HOR_PRED8x8;
/* Non-emu-edge variant: only DC prediction needs border special-casing
 * (other modes can read the padded picture edges directly). */
830 static av_always_inline
831 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
833 if (mode == DC_PRED8x8) {
834 return check_dc_pred8x8_mode(mode, mb_x, mb_y);
/* CODEC_FLAG_EMU_EDGE variant: with no padded edges, every mode reading
 * above/left pixels needs a border fallback, not just DC.
 * NOTE(review): the switch/case labels for the first branches are elided. */
840 static av_always_inline
841 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
845 return check_dc_pred8x8_mode(mode, mb_x, mb_y);
847 return !mb_y ? DC_127_PRED8x8 : mode;
849 return !mb_x ? DC_129_PRED8x8 : mode;
850 case PLANE_PRED8x8 /*TM*/:
851 return check_tm_pred8x8_mode(mode, mb_x, mb_y);
/* 4x4 analogue of check_tm_pred8x8_mode: TM fallback at frame borders.
 * NOTE(review): the mb_x guard lines are elided from this listing. */
856 static av_always_inline
857 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
860 return mb_y ? VERT_VP8_PRED : DC_129_PRED;
862 return mb_y ? mode : HOR_VP8_PRED;
/* Emu-edge border handling for 4x4 intra modes. Modes that need pixels a
 * plain predictor cannot safely fetch set *copy_buf, telling the caller
 * to predict into a stack buffer with manually assembled edges.
 * NOTE(review): several case labels and *copy_buf assignments are elided. */
866 static av_always_inline
867 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
876 case DIAG_DOWN_LEFT_PRED:
878 return !mb_y ? DC_127_PRED : mode;
886 return !mb_x ? DC_129_PRED : mode;
888 return check_tm_pred4x4_mode(mode, mb_x, mb_y);
889 case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
890 case DIAG_DOWN_RIGHT_PRED:
891 case VERT_RIGHT_PRED:
/* Perform intra prediction + IDCT reconstruction for one macroblock:
 * either a single 16x16 luma prediction, or 16 individual 4x4 predictions
 * (each followed by its residual add), then 8x8 chroma prediction.
 * Uses xchg_mb_border() to expose pre-loop-filter edge pixels. */
900 static av_always_inline
901 void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
904 AVCodecContext *avctx = s->avctx;
905 int x, y, mode, nnz, tr;
907 // for the first row, we need to run xchg_mb_border to init the top edge to 127
908 // otherwise, skip it if we aren't going to deblock
909 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
910 xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
911 s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
912 s->filter.simple, 1);
914 if (mb->mode < MODE_I4x4) {
915 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
916 mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
918 mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
920 s->hpc.pred16x16[mode](dst[0], s->linesize);
/* I4x4 path: per-subblock prediction with top-right handling */
922 uint8_t *ptr = dst[0];
923 uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
924 uint8_t tr_top[4] = { 127, 127, 127, 127 };
926 // all blocks on the right edge of the macroblock use bottom edge
927 // the top macroblock for their topright edge
928 uint8_t *tr_right = ptr - s->linesize + 16;
930 // if we're on the right edge of the frame, said edge is extended
931 // from the top macroblock
932 if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
933 mb_x == s->mb_width-1) {
934 tr = tr_right[-1]*0x01010101;
935 tr_right = (uint8_t *)&tr;
939 AV_ZERO128(s->non_zero_count_cache);
941 for (y = 0; y < 4; y++) {
942 uint8_t *topright = ptr + 4 - s->linesize;
943 for (x = 0; x < 4; x++) {
944 int copy = 0, linesize = s->linesize;
945 uint8_t *dst = ptr+4*x;
/* 5x8 scratch: 1 edge row + 4 prediction rows, 8-byte stride */
946 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
948 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
953 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
954 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
/* copy path: build the edge pixels into copy_dst by hand */
960 AV_WN32A(copy_dst+4, 127U * 0x01010101U);
962 AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
966 copy_dst[3] = ptr[4*x-s->linesize-1];
975 copy_dst[11] = ptr[4*x -1];
976 copy_dst[19] = ptr[4*x+s->linesize -1];
977 copy_dst[27] = ptr[4*x+s->linesize*2-1];
978 copy_dst[35] = ptr[4*x+s->linesize*3-1];
984 s->hpc.pred4x4[mode](dst, topright, linesize);
/* copy the 4 predicted rows back out of the scratch buffer */
986 AV_COPY32(ptr+4*x , copy_dst+12);
987 AV_COPY32(ptr+4*x+s->linesize , copy_dst+20);
988 AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
989 AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
/* add residual: DC-only fast path vs full IDCT */
992 nnz = s->non_zero_count_cache[y][x];
995 s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
997 s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1002 ptr += 4*s->linesize;
/* chroma 8x8 prediction, same mode for both planes */
1007 if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1008 mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1010 mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1012 s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1013 s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
/* swap the border pixels back */
1015 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1016 xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1017 s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1018 s->filter.simple, 0);
/* Per-subpel-phase MC edge requirements, indexed by the 3-bit fractional
 * mv component (row 0 doubles as the mc_func pointer index). */
1021 static const uint8_t subpel_idx[3][8] = {
1022 { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
1023 // also function pointer index
1024 { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
1025 { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
1029 * Generic MC function.
1031 * @param s VP8 decoding context
1032 * @param luma 1 for luma (Y) planes, 0 for chroma (Cb/Cr) planes
1033 * @param dst target buffer for block data at block position
1034 * @param src reference picture buffer at origin (0, 0)
1035 * @param mv motion vector (relative to block position) to get pixel data from
1036 * @param x_off horizontal position of block from origin (0, 0)
1037 * @param y_off vertical position of block from origin (0, 0)
1038 * @param block_w width of block (16, 8 or 4)
1039 * @param block_h height of block (always same as block_w)
1040 * @param width width of src/dst plane data
1041 * @param height height of src/dst plane data
1042 * @param linesize size of a single line of plane data, including padding
1043 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1045 static av_always_inline
1046 void vp8_mc_luma(VP8Context *s, uint8_t *dst, uint8_t *src, const VP56mv *mv,
1047 int x_off, int y_off, int block_w, int block_h,
1048 int width, int height, int linesize,
1049 vp8_mc_func mc_func[3][3])
/* luma mvs are in quarter-pel; <<1 converts to the 1/8-pel phase grid */
1053 int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1054 int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1056 x_off += mv->x >> 2;
1057 y_off += mv->y >> 2;
1060 src += y_off * linesize + x_off;
/* fall back to emulated_edge_mc if the filter footprint leaves the plane */
1061 if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] ||
1062 y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1063 s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1064 block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1065 x_off - mx_idx, y_off - my_idx, width, height);
1066 src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1068 mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
/* full-pel mv: plain copy path (else-branch line elided in this listing) */
1070 mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
/* Chroma MC: same scheme as vp8_mc_luma but operates on both Cb and Cr
 * at once; chroma mvs are already in 1/8-pel units (no <<1). */
1073 static av_always_inline
1074 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, uint8_t *src1,
1075 uint8_t *src2, const VP56mv *mv, int x_off, int y_off,
1076 int block_w, int block_h, int width, int height, int linesize,
1077 vp8_mc_func mc_func[3][3])
1080 int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1081 int my = mv->y&7, my_idx = subpel_idx[0][my];
1083 x_off += mv->x >> 3;
1084 y_off += mv->y >> 3;
1087 src1 += y_off * linesize + x_off;
1088 src2 += y_off * linesize + x_off;
/* out-of-plane footprint: edge-emulate each plane separately, reusing
   the single edge_emu_buffer (hence MC of plane 1 before emulating 2) */
1089 if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] ||
1090 y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1091 s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1092 block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1093 x_off - mx_idx, y_off - my_idx, width, height);
1094 src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1095 mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1097 s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1098 block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1099 x_off - mx_idx, y_off - my_idx, width, height);
1100 src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1101 mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1103 mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1104 mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
/* full-pel path */
1107 mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1108 mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
/* MC for one rectangular partition of a macroblock: luma block at
 * (bx_off, by_off) plus the corresponding chroma blocks. The chroma mv
 * (uvmv, derived from *mv on lines elided here) is used at half
 * resolution; profile 3 rounds chroma mvs to full-pel. */
1112 static av_always_inline
1113 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1114 AVFrame *ref_frame, int x_off, int y_off,
1115 int bx_off, int by_off,
1116 int block_w, int block_h,
1117 int width, int height, VP56mv *mv)
1122 vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1123 ref_frame->data[0], mv, x_off + bx_off, y_off + by_off,
1124 block_w, block_h, width, height, s->linesize,
1125 s->put_pixels_tab[block_w == 8]);
1128 if (s->profile == 3) {
/* chroma plane is half-size in both dimensions (4:2:0) */
1132 x_off >>= 1; y_off >>= 1;
1133 bx_off >>= 1; by_off >>= 1;
1134 width >>= 1; height >>= 1;
1135 block_w >>= 1; block_h >>= 1;
1136 vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1137 dst[2] + by_off * s->uvlinesize + bx_off, ref_frame->data[1],
1138 ref_frame->data[2], &uvmv, x_off + bx_off, y_off + by_off,
1139 block_w, block_h, width, height, s->uvlinesize,
1140 s->put_pixels_tab[1 + (block_w == 4)]);
1143 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1144 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
/* ref is a VP56_FRAME_* index (PREVIOUS/GOLDEN/GOLDEN2); s->ref_count[]
 * is indexed by ref-1 and counts how many MBs of this frame referenced
 * that picture so far. */
1145 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1147 /* Don't prefetch refs that haven't been used very often this frame. */
/* Threshold mb_xy>>5 grows as we advance through the frame, so rarely
 * used references stop being prefetched. */
1148 if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1149 int x_off = mb_x << 4, y_off = mb_y << 4;
/* mv.x/y are quarter-pel; >>2 converts to whole pixels. The +8 centers
 * the luma prefetch horizontally within the 16-wide MB. */
1150 int mx = (mb->mv.x>>2) + x_off + 8;
1151 int my = (mb->mv.y>>2) + y_off;
1152 uint8_t **src= s->framep[ref]->data;
/* (mb_x&3)*4 staggers rows across consecutive MBs to cover the block
 * height over 4 iterations; +64 looks one cache line ahead. */
1153 int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1154 s->dsp.prefetch(src[0]+off, s->linesize, 4);
/* Chroma: halved coordinates; src[2]-src[1] as "stride" fetches both
 * U and V planes with a single call. */
1155 off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1156 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1161 * Apply motion vectors to prediction buffer, chapter 18.
/* Dispatch on the MB's MV partitioning mode and motion-compensate every
 * partition into dst[] (Y/U/V). For split modes the per-block MVs come
 * from mb->bmv[]; `break`s between cases are elided in this extract. */
1163 static av_always_inline
1164 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1167 int x_off = mb_x << 4, y_off = mb_y << 4;
1168 int width = 16*s->mb_width, height = 16*s->mb_height;
1169 AVFrame *ref = s->framep[mb->ref_frame];
1170 VP56mv *bmv = mb->bmv;
1172 switch (mb->partitioning) {
/* Whole 16x16 MB predicted with a single MV. */
1173 case VP8_SPLITMVMODE_NONE:
1174 vp8_mc_part(s, dst, ref, x_off, y_off,
1175 0, 0, 16, 16, width, height, &mb->mv);
/* Sixteen 4x4 luma blocks, each with its own MV. */
1177 case VP8_SPLITMVMODE_4x4: {
1182 for (y = 0; y < 4; y++) {
1183 for (x = 0; x < 4; x++) {
1184 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1185 ref->data[0], &bmv[4*y + x],
1186 4*x + x_off, 4*y + y_off, 4, 4,
1187 width, height, s->linesize,
1188 s->put_pixels_tab[2]);
/* Chroma: four 4x4 blocks; each chroma MV is the rounded average of
 * the four co-located luma MVs. */
1193 x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1194 for (y = 0; y < 2; y++) {
1195 for (x = 0; x < 2; x++) {
1196 uvmv.x = mb->bmv[ 2*y * 4 + 2*x ].x +
1197 mb->bmv[ 2*y * 4 + 2*x+1].x +
1198 mb->bmv[(2*y+1) * 4 + 2*x ].x +
1199 mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1200 uvmv.y = mb->bmv[ 2*y * 4 + 2*x ].y +
1201 mb->bmv[ 2*y * 4 + 2*x+1].y +
1202 mb->bmv[(2*y+1) * 4 + 2*x ].y +
1203 mb->bmv[(2*y+1) * 4 + 2*x+1].y;
/* Divide the 4-MV sum by 4 with sign-aware rounding: the >>(INT_BIT-1)
 * term is -1 for negative sums, 0 otherwise, so negative values round
 * symmetrically to positive ones. */
1204 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1205 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
/* Profile 3 chroma-MV adjustment (body elided in this extract). */
1206 if (s->profile == 3) {
1210 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1211 dst[2] + 4*y*s->uvlinesize + x*4,
1212 ref->data[1], ref->data[2], &uvmv,
1213 4*x + x_off, 4*y + y_off, 4, 4,
1214 width, height, s->uvlinesize,
1215 s->put_pixels_tab[2]);
/* Two 16x8 halves, top then bottom. */
1220 case VP8_SPLITMVMODE_16x8:
1221 vp8_mc_part(s, dst, ref, x_off, y_off,
1222 0, 0, 16, 8, width, height, &bmv[0]);
1223 vp8_mc_part(s, dst, ref, x_off, y_off,
1224 0, 8, 16, 8, width, height, &bmv[1]);
/* Two 8x16 halves, left then right. */
1226 case VP8_SPLITMVMODE_8x16:
1227 vp8_mc_part(s, dst, ref, x_off, y_off,
1228 0, 0, 8, 16, width, height, &bmv[0]);
1229 vp8_mc_part(s, dst, ref, x_off, y_off,
1230 8, 0, 8, 16, width, height, &bmv[1]);
/* Four 8x8 quadrants, raster order. */
1232 case VP8_SPLITMVMODE_8x8:
1233 vp8_mc_part(s, dst, ref, x_off, y_off,
1234 0, 0, 8, 8, width, height, &bmv[0]);
1235 vp8_mc_part(s, dst, ref, x_off, y_off,
1236 8, 0, 8, 8, width, height, &bmv[1]);
1237 vp8_mc_part(s, dst, ref, x_off, y_off,
1238 0, 8, 8, 8, width, height, &bmv[2]);
1239 vp8_mc_part(s, dst, ref, x_off, y_off,
1240 8, 8, 8, 8, width, height, &bmv[3]);
/* Add the inverse-transformed residual to the predicted macroblock.
 * non_zero_count_cache packs one per-4x4-block coefficient count per
 * byte; AV_RL32 loads a row of 4 counts at once so a single mask test
 * selects between the per-block path and the DC-only add4 fast path. */
1245 static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
/* I4x4 blocks are reconstructed during intra prediction, so only other
 * modes need the full-MB residual pass here. */
1249 if (mb->mode != MODE_I4x4) {
1250 uint8_t *y_dst = dst[0];
1251 for (y = 0; y < 4; y++) {
1252 uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
/* Some block in this row has more than a DC coefficient (count > 1):
 * handle each of the four 4x4 blocks individually. */
1254 if (nnz4&~0x01010101) {
1255 for (x = 0; x < 4; x++) {
/* (uint8_t)nnz4 examines one count per iteration; the per-iteration
 * nnz4 >>= 8 shift is elided in this extract. */
1256 if ((uint8_t)nnz4 == 1)
1257 s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1258 else if((uint8_t)nnz4 > 1)
1259 s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
/* All four blocks are DC-only: one combined call (the nnz4 != 0 guard
 * preceding this is elided in the extract). */
1265 s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1268 y_dst += 4*s->linesize;
/* Same scheme for the two chroma planes (2x2 blocks each). */
1272 for (ch = 0; ch < 2; ch++) {
1273 uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1275 uint8_t *ch_dst = dst[1+ch];
1276 if (nnz4&~0x01010101) {
1277 for (y = 0; y < 2; y++) {
1278 for (x = 0; x < 2; x++) {
1279 if ((uint8_t)nnz4 == 1)
1280 s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1281 else if((uint8_t)nnz4 > 1)
1282 s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1285 goto chroma_idct_end;
1287 ch_dst += 4*s->uvlinesize;
1290 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
/* Compute the loop-filter strength for one macroblock (VP8 spec ch. 15):
 * base level from segmentation or the frame header, adjusted by
 * per-reference / per-mode deltas, clipped to [0,63], then used to
 * derive the interior (inner-edge) limit. Results go into *f. */
1297 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1299 int interior_limit, filter_level;
/* Segment-based level; absolute_vals selects replace-vs-delta semantics. */
1301 if (s->segmentation.enabled) {
1302 filter_level = s->segmentation.filter_level[s->segment];
1303 if (!s->segmentation.absolute_vals)
1304 filter_level += s->filter.level;
1306 filter_level = s->filter.level;
/* Loop-filter delta adjustment by reference frame and prediction mode. */
1308 if (s->lf_delta.enabled) {
1309 filter_level += s->lf_delta.ref[mb->ref_frame];
1310 filter_level += s->lf_delta.mode[mb->mode];
1313 /* Like av_clip for inputs 0 and max, where max is equal to (2^n-1) */
/* NOTE: the macro expansion deliberately ends with ';', so the use below
 * yields an extra empty statement -- harmless. */
1314 #define POW2CLIP(x,max) (((x) & ~max) ? (-(x))>>31 & max : (x));
1315 filter_level = POW2CLIP(filter_level, 63);
/* Sharpness shrinks the interior limit but it never drops below 1. */
1317 interior_limit = filter_level;
1318 if (s->filter.sharpness) {
1319 interior_limit >>= (s->filter.sharpness + 3) >> 2;
1320 interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1322 interior_limit = FFMAX(interior_limit, 1);
1324 f->filter_level = filter_level;
1325 f->inner_limit = interior_limit;
/* Inner (sub-block) edges are filtered unless the MB was skipped and is
 * neither I4x4 nor split-MV (those always have sub-block edges). */
1326 f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
/* Normal (non-simple) loop filter for one macroblock: horizontal passes
 * over the left MB edge and interior columns, then vertical passes over
 * the top MB edge and interior rows, for both luma and chroma. The
 * mb_x/mb_y guards and the filter_level==0 early-out are elided in this
 * extract. */
1329 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1331 int mbedge_lim, bedge_lim, hev_thresh;
1332 int filter_level = f->filter_level;
1333 int inner_limit = f->inner_limit;
1334 int inner_filter = f->inner_filter;
1335 int linesize = s->linesize;
1336 int uvlinesize = s->uvlinesize;
/* High-edge-variance threshold, indexed [keyframe][filter_level];
 * rows are 64 entries (tail values elided in this extract). */
1337 static const uint8_t hev_thresh_lut[2][64] = {
1338 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1339 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1340 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1342 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1343 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1344 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* Edge limits per the spec: MB edges get a +4 stronger threshold than
 * interior (sub-block) edges. */
1351 bedge_lim = 2*filter_level + inner_limit;
1352 mbedge_lim = bedge_lim + 4;
1354 hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
/* Left MB edge (horizontal filter), luma + chroma. */
1357 s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
1358 mbedge_lim, inner_limit, hev_thresh);
1359 s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
1360 mbedge_lim, inner_limit, hev_thresh);
/* Interior vertical edges at x = 4, 8, 12 (luma) and x = 4 (chroma). */
1364 s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1365 inner_limit, hev_thresh);
1366 s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1367 inner_limit, hev_thresh);
1368 s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1369 inner_limit, hev_thresh);
1370 s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1371 uvlinesize, bedge_lim,
1372 inner_limit, hev_thresh);
/* Top MB edge (vertical filter), luma + chroma. */
1376 s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
1377 mbedge_lim, inner_limit, hev_thresh);
1378 s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
1379 mbedge_lim, inner_limit, hev_thresh);
/* Interior horizontal edges at y = 4, 8, 12 (luma) and y = 4 (chroma). */
1383 s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1384 linesize, bedge_lim,
1385 inner_limit, hev_thresh);
1386 s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1387 linesize, bedge_lim,
1388 inner_limit, hev_thresh);
1389 s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1390 linesize, bedge_lim,
1391 inner_limit, hev_thresh);
1392 s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1393 dst[2] + 4 * uvlinesize,
1394 uvlinesize, bedge_lim,
1395 inner_limit, hev_thresh);
/* Simple loop filter for one macroblock: luma only, no high-edge-variance
 * handling. The mb_x/mb_y guards and filter_level==0 early-out are elided
 * in this extract. */
1399 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1401 int mbedge_lim, bedge_lim;
1402 int filter_level = f->filter_level;
1403 int inner_limit = f->inner_limit;
1404 int inner_filter = f->inner_filter;
1405 int linesize = s->linesize;
/* MB edges get a +4 stronger threshold than interior sub-block edges. */
1410 bedge_lim = 2*filter_level + inner_limit;
1411 mbedge_lim = bedge_lim + 4;
/* Left MB edge, then interior vertical edges at x = 4, 8, 12. */
1414 s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1416 s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1417 s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1418 s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
/* Top MB edge, then interior horizontal edges at y = 4, 8, 12. */
1422 s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1424 s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1425 s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1426 s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
/* Run the normal loop filter over one macroblock row of the current
 * frame, using the per-MB strengths precomputed in s->filter_strength.
 * Before filtering each MB, its bottom pixels are saved to top_border
 * for intra prediction of the row below. */
1430 static void filter_mb_row(VP8Context *s, int mb_y)
1432 VP8FilterStrength *f = s->filter_strength;
/* dst[] initializer: Y/U/V pointers at the top of row mb_y (the
 * surrounding declaration lines are elided in this extract). */
1434 s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize,
1435 s->framep[VP56_FRAME_CURRENT]->data[1] + 8*mb_y*s->uvlinesize,
1436 s->framep[VP56_FRAME_CURRENT]->data[2] + 8*mb_y*s->uvlinesize
1440 for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1441 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1442 filter_mb(s, dst, f++, mb_x, mb_y);
/* Simple-filter counterpart of filter_mb_row(): luma plane only, so a
 * single dst pointer and no chroma border backup. */
1449 static void filter_mb_row_simple(VP8Context *s, int mb_y)
1451 VP8FilterStrength *f = s->filter_strength;
1452 uint8_t *dst = s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize;
1455 for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1456 backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1457 filter_mb_simple(s, dst, f++, mb_x, mb_y);
/* Top-level frame decode entry point (AVCodec.decode callback).
 * Parses the frame header, picks/acquires an output AVFrame, decodes all
 * macroblock rows (mode, coefficients, prediction, IDCT, loop filter),
 * then updates the last/golden/altref reference slots and returns the
 * frame to the caller unless it is marked invisible. */
1462 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1465 VP8Context *s = avctx->priv_data;
1466 int ret, mb_x, mb_y, i, y, referenced;
1467 enum AVDiscard skip_thresh;
1468 AVFrame *av_uninit(curframe);
1470 if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
/* A frame is "referenced" if any reference slot will point at it. */
1473 referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1474 || s->update_altref == VP56_FRAME_CURRENT;
/* Map frame importance to the discard level at which it may be skipped. */
1476 skip_thresh = !referenced ? AVDISCARD_NONREF :
1477 !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1479 if (avctx->skip_frame >= skip_thresh) {
1483 s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
/* Pick a frame slot not currently serving as a reference to decode into. */
1485 for (i = 0; i < 4; i++)
1486 if (&s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1487 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1488 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1489 curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1492 if (curframe->data[0])
1493 avctx->release_buffer(avctx, curframe);
1495 curframe->key_frame = s->keyframe;
1496 curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1497 curframe->reference = referenced ? 3 : 0;
1498 if ((ret = avctx->get_buffer(avctx, curframe))) {
1499 av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1503 // Given that arithmetic probabilities are updated every frame, it's quite likely
1504 // that the values we have on a random interframe are complete junk if we didn't
1505 // start decode on a keyframe. So just don't display anything rather than junk.
1506 if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1507 !s->framep[VP56_FRAME_GOLDEN] ||
1508 !s->framep[VP56_FRAME_GOLDEN2])) {
1509 av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1510 return AVERROR_INVALIDDATA;
1513 s->linesize = curframe->linesize[0];
1514 s->uvlinesize = curframe->linesize[1];
/* Lazily allocated: 21 lines is enough for a 16-tall block plus the
 * subpel filter taps used by emulated_edge_mc. */
1516 if (!s->edge_emu_buffer)
1517 s->edge_emu_buffer = av_malloc(21*s->linesize);
1519 memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1521 /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1522 memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1524 // top edge of 127 for intra prediction
1525 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1526 s->top_border[0][15] = s->top_border[0][23] = 127;
1527 memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1529 memset(s->ref_count, 0, sizeof(s->ref_count));
1531 memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
/* MV clamping range: MARGIN extra units beyond the frame on each side. */
1533 #define MARGIN (16 << 2)
1534 s->mv_min.y = -MARGIN;
1535 s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1537 for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
/* Rows cycle through the coefficient partitions. */
1538 VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1539 VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1540 int mb_xy = mb_y*s->mb_width;
/* dst[] initializer: Y/U/V pointers at the top of this MB row (the
 * surrounding declaration lines are elided in this extract). */
1542 curframe->data[0] + 16*mb_y*s->linesize,
1543 curframe->data[1] + 8*mb_y*s->uvlinesize,
1544 curframe->data[2] + 8*mb_y*s->uvlinesize
/* Reset left-neighbor prediction context at the start of each row. */
1547 memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1548 memset(s->left_nnz, 0, sizeof(s->left_nnz));
1549 AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1551 // left edge of 129 for intra prediction
1552 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1553 for (i = 0; i < 3; i++)
1554 for (y = 0; y < 16>>!!i; y++)
1555 dst[i][y*curframe->linesize[i]-1] = 129;
1556 if (mb_y == 1) // top left edge is also 129
1557 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1560 s->mv_min.x = -MARGIN;
1561 s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1563 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1564 /* Prefetch the current frame, 4 MBs ahead */
1565 s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1566 s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1568 decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy);
1570 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1573 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
/* Intra modes sort below MODE_I4x4 in the mode enum. */
1575 if (mb->mode <= MODE_I4x4)
1576 intra_predict(s, dst, mb, mb_x, mb_y);
1578 inter_predict(s, dst, mb, mb_x, mb_y);
1580 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1583 idct_mb(s, dst, mb);
/* Skipped MB: clear the nnz context instead (else-branch pairing with
 * the idct_mb call; the intervening lines are elided in this extract). */
1585 AV_ZERO64(s->left_nnz);
1586 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
1588 // Reset DC block predictors if they would exist if the mb had coefficients
1589 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1591 s->top_nnz[mb_x][8] = 0;
1595 if (s->deblock_filter)
1596 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1598 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
/* Loop-filter the finished row. */
1606 if (s->deblock_filter) {
1607 if (s->filter.simple)
1608 filter_mb_row_simple(s, mb_y);
1610 filter_mb_row(s, mb_y);
1617 // if future frames don't use the updated probabilities,
1618 // reset them to the values we saved
1619 if (!s->update_probabilities)
1620 s->prob[0] = s->prob[1];
1622 // check if golden and altref are swapped
1623 if (s->update_altref == VP56_FRAME_GOLDEN &&
1624 s->update_golden == VP56_FRAME_GOLDEN2)
1625 FFSWAP(AVFrame *, s->framep[VP56_FRAME_GOLDEN], s->framep[VP56_FRAME_GOLDEN2]);
1627 if (s->update_altref != VP56_FRAME_NONE)
1628 s->framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
1630 if (s->update_golden != VP56_FRAME_NONE)
1631 s->framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
1634 if (s->update_last) // move cur->prev
1635 s->framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_CURRENT];
1637 // release no longer referenced frames
1638 for (i = 0; i < 4; i++)
1639 if (s->frames[i].data[0] &&
1640 &s->frames[i] != s->framep[VP56_FRAME_CURRENT] &&
1641 &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1642 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1643 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1644 avctx->release_buffer(avctx, &s->frames[i]);
/* Invisible frames update references but are not output to the caller. */
1646 if (!s->invisible) {
1647 *(AVFrame*)data = *s->framep[VP56_FRAME_CURRENT];
1648 *data_size = sizeof(AVFrame);
/* Decoder init (AVCodec.init callback): fixes the output pixel format to
 * YUV 4:2:0 and initializes the DSP, H.264-shared intra prediction, and
 * VP8 DSP function tables. */
1654 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1656 VP8Context *s = avctx->priv_data;
1659 avctx->pix_fmt = PIX_FMT_YUV420P;
1661 dsputil_init(&s->dsp, avctx);
/* VP8 reuses the H.264 intra prediction code, keyed by codec id. */
1662 ff_h264_pred_init(&s->hpc, CODEC_ID_VP8);
1663 ff_vp8dsp_init(&s->vp8dsp);
/* Decoder teardown (AVCodec.close callback): vp8_decode_flush() releases
 * all frame buffers and frees the per-dimension context allocations. */
1668 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1670 vp8_decode_flush(avctx);
1674 AVCodec ff_vp8_decoder = {
1684 .flush = vp8_decode_flush,
1685 .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),