git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "internal.h"
  28 #include "vp8.h"
  29 #include "vp8data.h"
  30 #include "rectangle.h"
  31 #include "thread.h"
  32
  33 #if ARCH_ARM
  34 #   include "arm/vp8.h"
  35 #endif
  36
  37 static void free_buffers(VP8Context *s)
  38 {
  39     av_freep(&s->macroblocks_base);
  40     av_freep(&s->filter_strength);
  41     av_freep(&s->intra4x4_pred_mode_top);
  42     av_freep(&s->top_nnz);
  43     av_freep(&s->edge_emu_buffer);
  44     av_freep(&s->top_border);
  45
  46     s->macroblocks = NULL;
  47 }
  48
  49 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  50 {
  51     int ret;
  52     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  53         return ret;
  54     if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
  55         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  56     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  57         ff_thread_release_buffer(s->avctx, f);
  58         return AVERROR(ENOMEM);
  59     }
  60     return 0;
  61 }
  62
  63 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  64 {
  65     if (f->ref_index[0]) {
  66         if (prefer_delayed_free) {
  67             /* Upon a size change, we want to free the maps but other threads may still
  68              * be using them, so queue them. Upon a seek, all threads are inactive so
  69              * we want to cache one to prevent re-allocation in the next decoding
  70              * iteration, but the rest we can free directly. */
  71             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  72             if (s->num_maps_to_be_freed < max_queued_maps) {
  73                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  74             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  75                 av_free(f->ref_index[0]);
  76             } /* else: MEMLEAK (should never happen, but better that than crash) */
  77             f->ref_index[0] = NULL;
  78         } else /* vp8_decode_free() */ {
  79             av_free(f->ref_index[0]);
  80         }
  81     }
  82     ff_thread_release_buffer(s->avctx, f);
  83 }
  84
  85 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  86                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  87 {
  88     VP8Context *s = avctx->priv_data;
  89     int i;
  90
  91     if (!avctx->internal->is_copy) {
  92         for (i = 0; i < 5; i++)
  93             if (s->frames[i].data[0])
  94                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
  95     }
  96     memset(s->framep, 0, sizeof(s->framep));
  97
  98     if (free_mem) {
  99         free_buffers(s);
 100         s->maps_are_invalid = 1;
 101     }
 102 }
 103
 104 static void vp8_decode_flush(AVCodecContext *avctx)
 105 {
 106     vp8_decode_flush_impl(avctx, 1, 1, 0);
 107 }
 108
 109 static int update_dimensions(VP8Context *s, int width, int height)
 110 {
 111     if (width  != s->avctx->width ||
 112         height != s->avctx->height) {
 113         if (av_image_check_size(width, height, 0, s->avctx))
 114             return AVERROR_INVALIDDATA;
 115
 116         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 117
 118         avcodec_set_dimensions(s->avctx, width, height);
 119     }
 120
 121     s->mb_width  = (s->avctx->coded_width +15) / 16;
 122     s->mb_height = (s->avctx->coded_height+15) / 16;
 123
 124     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 125     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
 126     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
 127     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 128     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 129
 130     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
 131         !s->top_nnz || !s->top_border)
 132         return AVERROR(ENOMEM);
 133
 134     s->macroblocks        = s->macroblocks_base + 1;
 135
 136     return 0;
 137 }
 138
 139 static void parse_segment_info(VP8Context *s)
 140 {
 141     VP56RangeCoder *c = &s->c;
 142     int i;
 143
 144     s->segmentation.update_map = vp8_rac_get(c);
 145
 146     if (vp8_rac_get(c)) { // update segment feature data
 147         s->segmentation.absolute_vals = vp8_rac_get(c);
 148
 149         for (i = 0; i < 4; i++)
 150             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 151
 152         for (i = 0; i < 4; i++)
 153             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 154     }
 155     if (s->segmentation.update_map)
 156         for (i = 0; i < 3; i++)
 157             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 158 }
 159
 160 static void update_lf_deltas(VP8Context *s)
 161 {
 162     VP56RangeCoder *c = &s->c;
 163     int i;
 164
 165     for (i = 0; i < 4; i++) {
 166         if (vp8_rac_get(c)) {
 167             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 168
 169             if (vp8_rac_get(c))
 170                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 171         }
 172     }
 173
 174     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 175         if (vp8_rac_get(c)) {
 176             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 177
 178             if (vp8_rac_get(c))
 179                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 180         }
 181     }
 182 }
 183
 184 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 185 {
 186     const uint8_t *sizes = buf;
 187     int i;
 188
 189     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 190
 191     buf      += 3*(s->num_coeff_partitions-1);
 192     buf_size -= 3*(s->num_coeff_partitions-1);
 193     if (buf_size < 0)
 194         return -1;
 195
 196     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 197         int size = AV_RL24(sizes + 3*i);
 198         if (buf_size - size < 0)
 199             return -1;
 200
 201         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 202         buf      += size;
 203         buf_size -= size;
 204     }
 205     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 206
 207     return 0;
 208 }
 209
 210 static void get_quants(VP8Context *s)
 211 {
 212     VP56RangeCoder *c = &s->c;
 213     int i, base_qi;
 214
 215     int yac_qi     = vp8_rac_get_uint(c, 7);
 216     int ydc_delta  = vp8_rac_get_sint(c, 4);
 217     int y2dc_delta = vp8_rac_get_sint(c, 4);
 218     int y2ac_delta = vp8_rac_get_sint(c, 4);
 219     int uvdc_delta = vp8_rac_get_sint(c, 4);
 220     int uvac_delta = vp8_rac_get_sint(c, 4);
 221
 222     for (i = 0; i < 4; i++) {
 223         if (s->segmentation.enabled) {
 224             base_qi = s->segmentation.base_quant[i];
 225             if (!s->segmentation.absolute_vals)
 226                 base_qi += yac_qi;
 227         } else
 228             base_qi = yac_qi;
 229
 230         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 231         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 232         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 233         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 234         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 235         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 236
 237         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 238         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 239     }
 240 }
 241
 242 /**
 243  * Determine which buffers golden and altref should be updated with after this frame.
 244  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 245  *
 246  * Intra frames update all 3 references
 247  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 248  * If the update (golden|altref) flag is set, it's updated with the current frame
 249  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 250  * If the flag is not set, the number read means:
 251  *      0: no update
 252  *      1: VP56_FRAME_PREVIOUS
 253  *      2: update golden with altref, or update altref with golden
 254  */
 255 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 256 {
 257     VP56RangeCoder *c = &s->c;
 258
 259     if (update)
 260         return VP56_FRAME_CURRENT;
 261
 262     switch (vp8_rac_get_uint(c, 2)) {
 263     case 1:
 264         return VP56_FRAME_PREVIOUS;
 265     case 2:
 266         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 267     }
 268     return VP56_FRAME_NONE;
 269 }
 270
 271 static void update_refs(VP8Context *s)
 272 {
 273     VP56RangeCoder *c = &s->c;
 274
 275     int update_golden = vp8_rac_get(c);
 276     int update_altref = vp8_rac_get(c);
 277
 278     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 279     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 280 }
 281
 282 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 283 {
 284     VP56RangeCoder *c = &s->c;
 285     int header_size, hscale, vscale, i, j, k, l, m, ret;
 286     int width  = s->avctx->width;
 287     int height = s->avctx->height;
 288
 289     s->keyframe  = !(buf[0] & 1);
 290     s->profile   =  (buf[0]>>1) & 7;
 291     s->invisible = !(buf[0] & 0x10);
 292     header_size  = AV_RL24(buf) >> 5;
 293     buf      += 3;
 294     buf_size -= 3;
 295
 296     if (s->profile > 3)
 297         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 298
 299     if (!s->profile)
 300         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 301     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 302         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 303
 304     if (header_size > buf_size - 7*s->keyframe) {
 305         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 306         return AVERROR_INVALIDDATA;
 307     }
 308
 309     if (s->keyframe) {
 310         if (AV_RL24(buf) != 0x2a019d) {
 311             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 312             return AVERROR_INVALIDDATA;
 313         }
 314         width  = AV_RL16(buf+3) & 0x3fff;
 315         height = AV_RL16(buf+5) & 0x3fff;
 316         hscale = buf[4] >> 6;
 317         vscale = buf[6] >> 6;
 318         buf      += 7;
 319         buf_size -= 7;
 320
 321         if (hscale || vscale)
 322             av_log_missing_feature(s->avctx, "Upscaling", 1);
 323
 324         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 325         for (i = 0; i < 4; i++)
 326             for (j = 0; j < 16; j++)
 327                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 328                        sizeof(s->prob->token[i][j]));
 329         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 330         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 331         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 332         memset(&s->segmentation, 0, sizeof(s->segmentation));
 333     }
 334
 335     if (!s->macroblocks_base || /* first frame */
 336         width != s->avctx->width || height != s->avctx->height) {
 337         if ((ret = update_dimensions(s, width, height)) < 0)
 338             return ret;
 339     }
 340
 341     ff_vp56_init_range_decoder(c, buf, header_size);
 342     buf      += header_size;
 343     buf_size -= header_size;
 344
 345     if (s->keyframe) {
 346         if (vp8_rac_get(c))
 347             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 348         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 349     }
 350
 351     if ((s->segmentation.enabled = vp8_rac_get(c)))
 352         parse_segment_info(s);
 353     else
 354         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 355
 356     s->filter.simple    = vp8_rac_get(c);
 357     s->filter.level     = vp8_rac_get_uint(c, 6);
 358     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 359
 360     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 361         if (vp8_rac_get(c))
 362             update_lf_deltas(s);
 363
 364     if (setup_partitions(s, buf, buf_size)) {
 365         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 366         return AVERROR_INVALIDDATA;
 367     }
 368
 369     get_quants(s);
 370
 371     if (!s->keyframe) {
 372         update_refs(s);
 373         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 374         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 375     }
 376
 377     // if we aren't saving this frame's probabilities for future frames,
 378     // make a copy of the current probabilities
 379     if (!(s->update_probabilities = vp8_rac_get(c)))
 380         s->prob[1] = s->prob[0];
 381
 382     s->update_last = s->keyframe || vp8_rac_get(c);
 383
 384     for (i = 0; i < 4; i++)
 385         for (j = 0; j < 8; j++)
 386             for (k = 0; k < 3; k++)
 387                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 388                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 389                         int prob = vp8_rac_get_uint(c, 8);
 390                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 391                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 392                     }
 393
 394     if ((s->mbskip_enabled = vp8_rac_get(c)))
 395         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 396
 397     if (!s->keyframe) {
 398         s->prob->intra  = vp8_rac_get_uint(c, 8);
 399         s->prob->last   = vp8_rac_get_uint(c, 8);
 400         s->prob->golden = vp8_rac_get_uint(c, 8);
 401
 402         if (vp8_rac_get(c))
 403             for (i = 0; i < 4; i++)
 404                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 405         if (vp8_rac_get(c))
 406             for (i = 0; i < 3; i++)
 407                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 408
 409         // 17.2 MV probability update
 410         for (i = 0; i < 2; i++)
 411             for (j = 0; j < 19; j++)
 412                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 413                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 414     }
 415
 416     return 0;
 417 }
 418
 419 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 420 {
 421     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 422     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 423 }
 424
 425 /**
 426  * Motion vector coding, 17.1.
 427  */
 428 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 429 {
 430     int bit, x = 0;
 431
 432     if (vp56_rac_get_prob_branchy(c, p[0])) {
 433         int i;
 434
 435         for (i = 0; i < 3; i++)
 436             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 437         for (i = 9; i > 3; i--)
 438             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 439         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 440             x += 8;
 441     } else {
 442         // small_mvtree
 443         const uint8_t *ps = p+2;
 444         bit = vp56_rac_get_prob(c, *ps);
 445         ps += 1 + 3*bit;
 446         x  += 4*bit;
 447         bit = vp56_rac_get_prob(c, *ps);
 448         ps += 1 + bit;
 449         x  += 2*bit;
 450         x  += vp56_rac_get_prob(c, *ps);
 451     }
 452
 453     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 454 }
 455
 456 static av_always_inline
 457 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 458 {
 459     if (left == top)
 460         return vp8_submv_prob[4-!!left];
 461     if (!top)
 462         return vp8_submv_prob[2];
 463     return vp8_submv_prob[1-!!left];
 464 }
 465
 466 /**
 467  * Split motion vector prediction, 16.4.
 468  * @returns the number of motion vectors parsed (2, 4 or 16)
 469  */
 470 static av_always_inline
 471 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
 472 {
 473     int part_idx;
 474     int n, num;
 475     VP8Macroblock *top_mb  = &mb[2];
 476     VP8Macroblock *left_mb = &mb[-1];
 477     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 478                   *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
 479                   *mbsplits_cur, *firstidx;
 480     VP56mv *top_mv  = top_mb->bmv;
 481     VP56mv *left_mv = left_mb->bmv;
 482     VP56mv *cur_mv  = mb->bmv;
 483
 484     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 485         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 486             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 487         } else {
 488             part_idx = VP8_SPLITMVMODE_8x8;
 489         }
 490     } else {
 491         part_idx = VP8_SPLITMVMODE_4x4;
 492     }
 493
 494     num = vp8_mbsplit_count[part_idx];
 495     mbsplits_cur = vp8_mbsplits[part_idx],
 496     firstidx = vp8_mbfirstidx[part_idx];
 497     mb->partitioning = part_idx;
 498
 499     for (n = 0; n < num; n++) {
 500         int k = firstidx[n];
 501         uint32_t left, above;
 502         const uint8_t *submv_prob;
 503
 504         if (!(k & 3))
 505             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 506         else
 507             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 508         if (k <= 3)
 509             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 510         else
 511             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 512
 513         submv_prob = get_submv_prob(left, above);
 514
 515         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 516             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 517                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 518                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 519                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 520                 } else {
 521                     AV_ZERO32(&mb->bmv[n]);
 522                 }
 523             } else {
 524                 AV_WN32A(&mb->bmv[n], above);
 525             }
 526         } else {
 527             AV_WN32A(&mb->bmv[n], left);
 528         }
 529     }
 530
 531     return num;
 532 }
 533
 534 static av_always_inline
 535 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 536 {
 537     VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
 538                                   mb - 1 /* left */,
 539                                   mb + 1 /* top-left */ };
 540     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 541     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 542     int idx = CNT_ZERO;
 543     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 544     int8_t *sign_bias = s->sign_bias;
 545     VP56mv near_mv[4];
 546     uint8_t cnt[4] = { 0 };
 547     VP56RangeCoder *c = &s->c;
 548
 549     AV_ZERO32(&near_mv[0]);
 550     AV_ZERO32(&near_mv[1]);
 551     AV_ZERO32(&near_mv[2]);
 552
 553     /* Process MB on top, left and top-left */
 554     #define MV_EDGE_CHECK(n)\
 555     {\
 556         VP8Macroblock *edge = mb_edge[n];\
 557         int edge_ref = edge->ref_frame;\
 558         if (edge_ref != VP56_FRAME_CURRENT) {\
 559             uint32_t mv = AV_RN32A(&edge->mv);\
 560             if (mv) {\
 561                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 562                     /* SWAR negate of the values in mv. */\
 563                     mv = ~mv;\
 564                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 565                 }\
 566                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 567                     AV_WN32A(&near_mv[++idx], mv);\
 568                 cnt[idx]      += 1 + (n != 2);\
 569             } else\
 570                 cnt[CNT_ZERO] += 1 + (n != 2);\
 571         }\
 572     }
 573
 574     MV_EDGE_CHECK(0)
 575     MV_EDGE_CHECK(1)
 576     MV_EDGE_CHECK(2)
 577
 578     mb->partitioning = VP8_SPLITMVMODE_NONE;
 579     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 580         mb->mode = VP8_MVMODE_MV;
 581
 582         /* If we have three distinct MVs, merge first and last if they're the same */
 583         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 584             cnt[CNT_NEAREST] += 1;
 585
 586         /* Swap near and nearest if necessary */
 587         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 588             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 589             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 590         }
 591
 592         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 593             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 594
 595                 /* Choose the best mv out of 0,0 and the nearest mv */
 596                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 597                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 598                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 599                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 600
 601                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 602                     mb->mode = VP8_MVMODE_SPLIT;
 603                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
 604                 } else {
 605                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 606                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 607                     mb->bmv[0] = mb->mv;
 608                 }
 609             } else {
 610                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 611                 mb->bmv[0] = mb->mv;
 612             }
 613         } else {
 614             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 615             mb->bmv[0] = mb->mv;
 616         }
 617     } else {
 618         mb->mode = VP8_MVMODE_ZERO;
 619         AV_ZERO32(&mb->mv);
 620         mb->bmv[0] = mb->mv;
 621     }
 622 }
 623
 624 static av_always_inline
 625 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 626                            int mb_x, int keyframe)
 627 {
 628     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 629     if (keyframe) {
 630         int x, y;
 631         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 632         uint8_t* const left = s->intra4x4_pred_mode_left;
 633         for (y = 0; y < 4; y++) {
 634             for (x = 0; x < 4; x++) {
 635                 const uint8_t *ctx;
 636                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 637                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 638                 left[y] = top[x] = *intra4x4;
 639                 intra4x4++;
 640             }
 641         }
 642     } else {
 643         int i;
 644         for (i = 0; i < 16; i++)
 645             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 646     }
 647 }
 648
 649 static av_always_inline
 650 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
 651 {
 652     VP56RangeCoder *c = &s->c;
 653
 654     if (s->segmentation.update_map)
 655         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 656     else if (s->segmentation.enabled)
 657         *segment = ref ? *ref : *segment;
 658     s->segment = *segment;
 659
 660     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 661
 662     if (s->keyframe) {
 663         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 664
 665         if (mb->mode == MODE_I4x4) {
 666             decode_intra4x4_modes(s, c, mb_x, 1);
 667         } else {
 668             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 669             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 670             AV_WN32A(s->intra4x4_pred_mode_left, modes);
 671         }
 672
 673         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 674         mb->ref_frame = VP56_FRAME_CURRENT;
 675     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 676         // inter MB, 16.2
 677         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 678             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 679                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 680         else
 681             mb->ref_frame = VP56_FRAME_PREVIOUS;
 682         s->ref_count[mb->ref_frame-1]++;
 683
 684         // motion vectors, 16.3
 685         decode_mvs(s, mb, mb_x, mb_y);
 686     } else {
 687         // intra MB, 16.1
 688         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 689
 690         if (mb->mode == MODE_I4x4)
 691             decode_intra4x4_modes(s, c, mb_x, 0);
 692
 693         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 694         mb->ref_frame = VP56_FRAME_CURRENT;
 695         mb->partitioning = VP8_SPLITMVMODE_NONE;
 696         AV_ZERO32(&mb->bmv[0]);
 697     }
 698 }
 699
 700 #ifndef decode_block_coeffs_internal
 701 /**
 702  * @param c arithmetic bitstream reader context
 703  * @param block destination for block coefficients
 704  * @param probs probabilities to use when reading trees from the bitstream
 705  * @param i initial coeff index, 0 unless a separate DC block is coded
 706  * @param qmul array holding the dc/ac dequant factor at position 0/1
 707  * @return 0 if no coeffs were decoded
 708  *         otherwise, the index of the last coeff decoded plus one
 709  */
 710 static int decode_block_coeffs_internal(VP56RangeCoder *r, DCTELEM block[16],
 711                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 712                                         int i, uint8_t *token_prob, int16_t qmul[2])
 713 {
 714     VP56RangeCoder c = *r;
 715     goto skip_eob;
 716     do {
 717         int coeff;
 718         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
 719             break;
 720
 721 skip_eob:
 722         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
 723             if (++i == 16)
 724                 break; // invalid input; blocks should end with EOB
 725             token_prob = probs[i][0];
 726             goto skip_eob;
 727         }
 728
 729         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
 730             coeff = 1;
 731             token_prob = probs[i+1][1];
 732         } else {
 733             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
 734                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
 735                 if (coeff)
 736                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
 737                 coeff += 2;
 738             } else {
 739                 // DCT_CAT*
 740                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
 741                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
 742                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
 743                     } else {                                    // DCT_CAT2
 744                         coeff  = 7;
 745                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
 746                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
 747                     }
 748                 } else {    // DCT_CAT3 and up
 749                     int a = vp56_rac_get_prob(&c, token_prob[8]);
 750                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
 751                     int cat = (a<<1) + b;
 752                     coeff  = 3 + (8<<cat);
 753                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
 754                 }
 755             }
 756             token_prob = probs[i+1][2];
 757         }
 758         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
 759     } while (++i < 16);
 760
 761     *r = c;
 762     return i;
 763 }
 764 #endif
 765
 766 /**
 767  * @param c arithmetic bitstream reader context
 768  * @param block destination for block coefficients
 769  * @param probs probabilities to use when reading trees from the bitstream
 770  * @param i initial coeff index, 0 unless a separate DC block is coded
 771  * @param zero_nhood the initial prediction context for number of surrounding
 772  *                   all-zero blocks (only left/top, so 0-2)
 773  * @param qmul array holding the dc/ac dequant factor at position 0/1
 774  * @return 0 if no coeffs were decoded
 775  *         otherwise, the index of the last coeff decoded plus one
 776  */
 777 static av_always_inline
 778 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 779                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 780                         int i, int zero_nhood, int16_t qmul[2])
 781 {
 782     uint8_t *token_prob = probs[i][zero_nhood];
 783     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 784         return 0;
 785     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 786 }
 787
 788 static av_always_inline
 789 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 790                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 791 {
 792     int i, x, y, luma_start = 0, luma_ctx = 3;
 793     int nnz_pred, nnz, nnz_total = 0;
 794     int segment = s->segment;
 795     int block_dc = 0;
 796
 797     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 798         nnz_pred = t_nnz[8] + l_nnz[8];
 799
 800         // decode DC values and do hadamard
 801         nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
 802                                   s->qmat[segment].luma_dc_qmul);
 803         l_nnz[8] = t_nnz[8] = !!nnz;
 804         if (nnz) {
 805             nnz_total += nnz;
 806             block_dc = 1;
 807             if (nnz == 1)
 808                 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
 809             else
 810                 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
 811         }
 812         luma_start = 1;
 813         luma_ctx = 0;
 814     }
 815
 816     // luma blocks
 817     for (y = 0; y < 4; y++)
 818         for (x = 0; x < 4; x++) {
 819             nnz_pred = l_nnz[y] + t_nnz[x];
 820             nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
 821                                       nnz_pred, s->qmat[segment].luma_qmul);
 822             // nnz+block_dc may be one more than the actual last index, but we don't care
 823             s->non_zero_count_cache[y][x] = nnz + block_dc;
 824             t_nnz[x] = l_nnz[y] = !!nnz;
 825             nnz_total += nnz;
 826         }
 827
 828     // chroma blocks
 829     // TODO: what to do about dimensions? 2nd dim for luma is x,
 830     // but for chroma it's (y<<1)|x
 831     for (i = 4; i < 6; i++)
 832         for (y = 0; y < 2; y++)
 833             for (x = 0; x < 2; x++) {
 834                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 835                 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
 836                                           nnz_pred, s->qmat[segment].chroma_qmul);
 837                 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
 838                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 839                 nnz_total += nnz;
 840             }
 841
 842     // if there were no coded coeffs despite the macroblock not being marked skip,
 843     // we MUST not do the inner loop filter and should not do IDCT
 844     // Since skip isn't used for bitstream prediction, just manually set it.
 845     if (!nnz_total)
 846         mb->skip = 1;
 847 }
 848
 849 static av_always_inline
 850 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 851                       int linesize, int uvlinesize, int simple)
 852 {
 853     AV_COPY128(top_border, src_y + 15*linesize);
 854     if (!simple) {
 855         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 856         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 857     }
 858 }
 859
 860 static av_always_inline
 861 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 862                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 863                     int simple, int xchg)
 864 {
 865     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 866     src_y  -=   linesize;
 867     src_cb -= uvlinesize;
 868     src_cr -= uvlinesize;
 869
 870 #define XCHG(a,b,xchg) do {                     \
 871         if (xchg) AV_SWAP64(b,a);               \
 872         else      AV_COPY64(b,a);               \
 873     } while (0)
 874
 875     XCHG(top_border_m1+8, src_y-8, xchg);
 876     XCHG(top_border,      src_y,   xchg);
 877     XCHG(top_border+8,    src_y+8, 1);
 878     if (mb_x < mb_width-1)
 879         XCHG(top_border+32, src_y+16, 1);
 880
 881     // only copy chroma for normal loop filter
 882     // or to initialize the top row to 127
 883     if (!simple || !mb_y) {
 884         XCHG(top_border_m1+16, src_cb-8, xchg);
 885         XCHG(top_border_m1+24, src_cr-8, xchg);
 886         XCHG(top_border+16,    src_cb, 1);
 887         XCHG(top_border+24,    src_cr, 1);
 888     }
 889 }
 890
 891 static av_always_inline
 892 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 893 {
 894     if (!mb_x) {
 895         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 896     } else {
 897         return mb_y ? mode : LEFT_DC_PRED8x8;
 898     }
 899 }
 900
 901 static av_always_inline
 902 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 903 {
 904     if (!mb_x) {
 905         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 906     } else {
 907         return mb_y ? mode : HOR_PRED8x8;
 908     }
 909 }
 910
 911 static av_always_inline
 912 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 913 {
 914     if (mode == DC_PRED8x8) {
 915         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 916     } else {
 917         return mode;
 918     }
 919 }
 920
 921 static av_always_inline
 922 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 923 {
 924     switch (mode) {
 925     case DC_PRED8x8:
 926         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 927     case VERT_PRED8x8:
 928         return !mb_y ? DC_127_PRED8x8 : mode;
 929     case HOR_PRED8x8:
 930         return !mb_x ? DC_129_PRED8x8 : mode;
 931     case PLANE_PRED8x8 /*TM*/:
 932         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 933     }
 934     return mode;
 935 }
 936
 937 static av_always_inline
 938 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 939 {
 940     if (!mb_x) {
 941         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 942     } else {
 943         return mb_y ? mode : HOR_VP8_PRED;
 944     }
 945 }
 946
 947 static av_always_inline
 948 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 949 {
 950     switch (mode) {
 951     case VERT_PRED:
 952         if (!mb_x && mb_y) {
 953             *copy_buf = 1;
 954             return mode;
 955         }
 956         /* fall-through */
 957     case DIAG_DOWN_LEFT_PRED:
 958     case VERT_LEFT_PRED:
 959         return !mb_y ? DC_127_PRED : mode;
 960     case HOR_PRED:
 961         if (!mb_y) {
 962             *copy_buf = 1;
 963             return mode;
 964         }
 965         /* fall-through */
 966     case HOR_UP_PRED:
 967         return !mb_x ? DC_129_PRED : mode;
 968     case TM_VP8_PRED:
 969         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 970     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 971     case DIAG_DOWN_RIGHT_PRED:
 972     case VERT_RIGHT_PRED:
 973     case HOR_DOWN_PRED:
 974         if (!mb_y || !mb_x)
 975             *copy_buf = 1;
 976         return mode;
 977     }
 978     return mode;
 979 }
 980
 981 static av_always_inline
 982 void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
 983                    int mb_x, int mb_y)
 984 {
 985     AVCodecContext *avctx = s->avctx;
 986     int x, y, mode, nnz;
 987     uint32_t tr;
 988
 989     // for the first row, we need to run xchg_mb_border to init the top edge to 127
 990     // otherwise, skip it if we aren't going to deblock
 991     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
 992         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
 993                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
 994                        s->filter.simple, 1);
 995
 996     if (mb->mode < MODE_I4x4) {
 997         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
 998             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
 999         } else {
1000             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1001         }
1002         s->hpc.pred16x16[mode](dst[0], s->linesize);
1003     } else {
1004         uint8_t *ptr = dst[0];
1005         uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
1006         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1007
1008         // all blocks on the right edge of the macroblock use bottom edge
1009         // the top macroblock for their topright edge
1010         uint8_t *tr_right = ptr - s->linesize + 16;
1011
1012         // if we're on the right edge of the frame, said edge is extended
1013         // from the top macroblock
1014         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1015             mb_x == s->mb_width-1) {
1016             tr = tr_right[-1]*0x01010101u;
1017             tr_right = (uint8_t *)&tr;
1018         }
1019
1020         if (mb->skip)
1021             AV_ZERO128(s->non_zero_count_cache);
1022
1023         for (y = 0; y < 4; y++) {
1024             uint8_t *topright = ptr + 4 - s->linesize;
1025             for (x = 0; x < 4; x++) {
1026                 int copy = 0, linesize = s->linesize;
1027                 uint8_t *dst = ptr+4*x;
1028                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1029
1030                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1031                     topright = tr_top;
1032                 } else if (x == 3)
1033                     topright = tr_right;
1034
1035                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1036                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1037                     if (copy) {
1038                         dst = copy_dst + 12;
1039                         linesize = 8;
1040                         if (!(mb_y + y)) {
1041                             copy_dst[3] = 127U;
1042                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1043                         } else {
1044                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1045                             if (!(mb_x + x)) {
1046                                 copy_dst[3] = 129U;
1047                             } else {
1048                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1049                             }
1050                         }
1051                         if (!(mb_x + x)) {
1052                             copy_dst[11] =
1053                             copy_dst[19] =
1054                             copy_dst[27] =
1055                             copy_dst[35] = 129U;
1056                         } else {
1057                             copy_dst[11] = ptr[4*x              -1];
1058                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1059                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1060                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1061                         }
1062                     }
1063                 } else {
1064                     mode = intra4x4[x];
1065                 }
1066                 s->hpc.pred4x4[mode](dst, topright, linesize);
1067                 if (copy) {
1068                     AV_COPY32(ptr+4*x              , copy_dst+12);
1069                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1070                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1071                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1072                 }
1073
1074                 nnz = s->non_zero_count_cache[y][x];
1075                 if (nnz) {
1076                     if (nnz == 1)
1077                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
1078                     else
1079                         s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1080                 }
1081                 topright += 4;
1082             }
1083
1084             ptr   += 4*s->linesize;
1085             intra4x4 += 4;
1086         }
1087     }
1088
1089     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1090         mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1091     } else {
1092         mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1093     }
1094     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1095     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1096
1097     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1098         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1099                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1100                        s->filter.simple, 0);
1101 }
1102
/* Per-fraction lookup used by the motion compensation functions,
 * indexed as subpel_idx[row][fractional position 0..7]. */
static const uint8_t subpel_idx[3][8] = {
    /* [0] nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1] nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2] nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1109
1110 /**
1111  * luma MC function
1112  *
1113  * @param s VP8 decoding context
1114  * @param dst target buffer for block data at block position
1115  * @param ref reference picture buffer at origin (0, 0)
1116  * @param mv motion vector (relative to block position) to get pixel data from
1117  * @param x_off horizontal position of block from origin (0, 0)
1118  * @param y_off vertical position of block from origin (0, 0)
1119  * @param block_w width of block (16, 8 or 4)
1120  * @param block_h height of block (always same as block_w)
1121  * @param width width of src/dst plane data
1122  * @param height height of src/dst plane data
1123  * @param linesize size of a single line of plane data, including padding
1124  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1125  */
1126 static av_always_inline
1127 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1128                  int x_off, int y_off, int block_w, int block_h,
1129                  int width, int height, int linesize,
1130                  vp8_mc_func mc_func[3][3])
1131 {
1132     uint8_t *src = ref->data[0];
1133
1134     if (AV_RN32A(mv)) {
1135
1136         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1137         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1138
1139         x_off += mv->x >> 2;
1140         y_off += mv->y >> 2;
1141
1142         // edge emulation
1143         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1144         src += y_off * linesize + x_off;
1145         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1146             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1147             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1148                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1149                                     x_off - mx_idx, y_off - my_idx, width, height);
1150             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1151         }
1152         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1153     } else {
1154         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1155         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1156     }
1157 }
1158
1159 /**
1160  * chroma MC function
1161  *
1162  * @param s VP8 decoding context
1163  * @param dst1 target buffer for block data at block position (U plane)
1164  * @param dst2 target buffer for block data at block position (V plane)
1165  * @param ref reference picture buffer at origin (0, 0)
1166  * @param mv motion vector (relative to block position) to get pixel data from
1167  * @param x_off horizontal position of block from origin (0, 0)
1168  * @param y_off vertical position of block from origin (0, 0)
1169  * @param block_w width of block (16, 8 or 4)
1170  * @param block_h height of block (always same as block_w)
1171  * @param width width of src/dst plane data
1172  * @param height height of src/dst plane data
1173  * @param linesize size of a single line of plane data, including padding
1174  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1175  */
1176 static av_always_inline
1177 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1178                    const VP56mv *mv, int x_off, int y_off,
1179                    int block_w, int block_h, int width, int height, int linesize,
1180                    vp8_mc_func mc_func[3][3])
1181 {
1182     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1183
1184     if (AV_RN32A(mv)) {
1185         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1186         int my = mv->y&7, my_idx = subpel_idx[0][my];
1187
1188         x_off += mv->x >> 3;
1189         y_off += mv->y >> 3;
1190
1191         // edge emulation
1192         src1 += y_off * linesize + x_off;
1193         src2 += y_off * linesize + x_off;
1194         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1195         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1196             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1197             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1198                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1199                                     x_off - mx_idx, y_off - my_idx, width, height);
1200             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1201             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1202
1203             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1204                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1205                                     x_off - mx_idx, y_off - my_idx, width, height);
1206             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1207             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1208         } else {
1209             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1210             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1211         }
1212     } else {
1213         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1214         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1215         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1216     }
1217 }
1218
1219 static av_always_inline
1220 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1221                  AVFrame *ref_frame, int x_off, int y_off,
1222                  int bx_off, int by_off,
1223                  int block_w, int block_h,
1224                  int width, int height, VP56mv *mv)
1225 {
1226     VP56mv uvmv = *mv;
1227
1228     /* Y */
1229     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1230                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1231                 block_w, block_h, width, height, s->linesize,
1232                 s->put_pixels_tab[block_w == 8]);
1233
1234     /* U/V */
1235     if (s->profile == 3) {
1236         uvmv.x &= ~7;
1237         uvmv.y &= ~7;
1238     }
1239     x_off   >>= 1; y_off   >>= 1;
1240     bx_off  >>= 1; by_off  >>= 1;
1241     width   >>= 1; height  >>= 1;
1242     block_w >>= 1; block_h >>= 1;
1243     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1244                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1245                   &uvmv, x_off + bx_off, y_off + by_off,
1246                   block_w, block_h, width, height, s->uvlinesize,
1247                   s->put_pixels_tab[1 + (block_w == 4)]);
1248 }
1249
1250 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1251  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1252 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1253 {
1254     /* Don't prefetch refs that haven't been used very often this frame. */
1255     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1256         int x_off = mb_x << 4, y_off = mb_y << 4;
1257         int mx = (mb->mv.x>>2) + x_off + 8;
1258         int my = (mb->mv.y>>2) + y_off;
1259         uint8_t **src= s->framep[ref]->data;
1260         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1261         /* For threading, a ff_thread_await_progress here might be useful, but
1262          * it actually slows down the decoder. Since a bad prefetch doesn't
1263          * generate bad decoder output, we don't run it here. */
1264         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1265         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1266         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1267     }
1268 }
1269
1270 /**
1271  * Apply motion vectors to prediction buffer, chapter 18.
1272  */
1273 static av_always_inline
1274 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1275                    int mb_x, int mb_y)
1276 {
1277     int x_off = mb_x << 4, y_off = mb_y << 4;
1278     int width = 16*s->mb_width, height = 16*s->mb_height;
1279     AVFrame *ref = s->framep[mb->ref_frame];
1280     VP56mv *bmv = mb->bmv;
1281
1282     switch (mb->partitioning) {
1283     case VP8_SPLITMVMODE_NONE:
1284         vp8_mc_part(s, dst, ref, x_off, y_off,
1285                     0, 0, 16, 16, width, height, &mb->mv);
1286         break;
1287     case VP8_SPLITMVMODE_4x4: {
1288         int x, y;
1289         VP56mv uvmv;
1290
1291         /* Y */
1292         for (y = 0; y < 4; y++) {
1293             for (x = 0; x < 4; x++) {
1294                 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1295                             ref, &bmv[4*y + x],
1296                             4*x + x_off, 4*y + y_off, 4, 4,
1297                             width, height, s->linesize,
1298                             s->put_pixels_tab[2]);
1299             }
1300         }
1301
1302         /* U/V */
1303         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1304         for (y = 0; y < 2; y++) {
1305             for (x = 0; x < 2; x++) {
1306                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1307                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1308                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1309                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1310                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1311                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1312                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1313                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1314                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1315                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1316                 if (s->profile == 3) {
1317                     uvmv.x &= ~7;
1318                     uvmv.y &= ~7;
1319                 }
1320                 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1321                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1322                               4*x + x_off, 4*y + y_off, 4, 4,
1323                               width, height, s->uvlinesize,
1324                               s->put_pixels_tab[2]);
1325             }
1326         }
1327         break;
1328     }
1329     case VP8_SPLITMVMODE_16x8:
1330         vp8_mc_part(s, dst, ref, x_off, y_off,
1331                     0, 0, 16, 8, width, height, &bmv[0]);
1332         vp8_mc_part(s, dst, ref, x_off, y_off,
1333                     0, 8, 16, 8, width, height, &bmv[1]);
1334         break;
1335     case VP8_SPLITMVMODE_8x16:
1336         vp8_mc_part(s, dst, ref, x_off, y_off,
1337                     0, 0, 8, 16, width, height, &bmv[0]);
1338         vp8_mc_part(s, dst, ref, x_off, y_off,
1339                     8, 0, 8, 16, width, height, &bmv[1]);
1340         break;
1341     case VP8_SPLITMVMODE_8x8:
1342         vp8_mc_part(s, dst, ref, x_off, y_off,
1343                     0, 0, 8, 8, width, height, &bmv[0]);
1344         vp8_mc_part(s, dst, ref, x_off, y_off,
1345                     8, 0, 8, 8, width, height, &bmv[1]);
1346         vp8_mc_part(s, dst, ref, x_off, y_off,
1347                     0, 8, 8, 8, width, height, &bmv[2]);
1348         vp8_mc_part(s, dst, ref, x_off, y_off,
1349                     8, 8, 8, 8, width, height, &bmv[3]);
1350         break;
1351     }
1352 }
1353
1354 static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
1355 {
1356     int x, y, ch;
1357
1358     if (mb->mode != MODE_I4x4) {
1359         uint8_t *y_dst = dst[0];
1360         for (y = 0; y < 4; y++) {
1361             uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
1362             if (nnz4) {
1363                 if (nnz4&~0x01010101) {
1364                     for (x = 0; x < 4; x++) {
1365                         if ((uint8_t)nnz4 == 1)
1366                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1367                         else if((uint8_t)nnz4 > 1)
1368                             s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1369                         nnz4 >>= 8;
1370                         if (!nnz4)
1371                             break;
1372                     }
1373                 } else {
1374                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1375                 }
1376             }
1377             y_dst += 4*s->linesize;
1378         }
1379     }
1380
1381     for (ch = 0; ch < 2; ch++) {
1382         uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1383         if (nnz4) {
1384             uint8_t *ch_dst = dst[1+ch];
1385             if (nnz4&~0x01010101) {
1386                 for (y = 0; y < 2; y++) {
1387                     for (x = 0; x < 2; x++) {
1388                         if ((uint8_t)nnz4 == 1)
1389                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1390                         else if((uint8_t)nnz4 > 1)
1391                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1392                         nnz4 >>= 8;
1393                         if (!nnz4)
1394                             goto chroma_idct_end;
1395                     }
1396                     ch_dst += 4*s->uvlinesize;
1397                 }
1398             } else {
1399                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
1400             }
1401         }
1402 chroma_idct_end: ;
1403     }
1404 }
1405
1406 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1407 {
1408     int interior_limit, filter_level;
1409
1410     if (s->segmentation.enabled) {
1411         filter_level = s->segmentation.filter_level[s->segment];
1412         if (!s->segmentation.absolute_vals)
1413             filter_level += s->filter.level;
1414     } else
1415         filter_level = s->filter.level;
1416
1417     if (s->lf_delta.enabled) {
1418         filter_level += s->lf_delta.ref[mb->ref_frame];
1419         filter_level += s->lf_delta.mode[mb->mode];
1420     }
1421
1422     filter_level = av_clip_uintp2(filter_level, 6);
1423
1424     interior_limit = filter_level;
1425     if (s->filter.sharpness) {
1426         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1427         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1428     }
1429     interior_limit = FFMAX(interior_limit, 1);
1430
1431     f->filter_level = filter_level;
1432     f->inner_limit = interior_limit;
1433     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1434 }
1435
1436 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1437 {
1438     int mbedge_lim, bedge_lim, hev_thresh;
1439     int filter_level = f->filter_level;
1440     int inner_limit = f->inner_limit;
1441     int inner_filter = f->inner_filter;
1442     int linesize = s->linesize;
1443     int uvlinesize = s->uvlinesize;
1444     static const uint8_t hev_thresh_lut[2][64] = {
1445         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1446           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1447           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1448           3, 3, 3, 3 },
1449         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1450           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1451           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1452           2, 2, 2, 2 }
1453     };
1454
1455     if (!filter_level)
1456         return;
1457
1458      bedge_lim = 2*filter_level + inner_limit;
1459     mbedge_lim = bedge_lim + 4;
1460
1461     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1462
1463     if (mb_x) {
1464         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1465                                        mbedge_lim, inner_limit, hev_thresh);
1466         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1467                                        mbedge_lim, inner_limit, hev_thresh);
1468     }
1469
1470     if (inner_filter) {
1471         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1472                                              inner_limit, hev_thresh);
1473         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1474                                              inner_limit, hev_thresh);
1475         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1476                                              inner_limit, hev_thresh);
1477         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1478                                              uvlinesize,  bedge_lim,
1479                                              inner_limit, hev_thresh);
1480     }
1481
1482     if (mb_y) {
1483         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1484                                        mbedge_lim, inner_limit, hev_thresh);
1485         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1486                                        mbedge_lim, inner_limit, hev_thresh);
1487     }
1488
1489     if (inner_filter) {
1490         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1491                                              linesize,    bedge_lim,
1492                                              inner_limit, hev_thresh);
1493         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1494                                              linesize,    bedge_lim,
1495                                              inner_limit, hev_thresh);
1496         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1497                                              linesize,    bedge_lim,
1498                                              inner_limit, hev_thresh);
1499         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1500                                              dst[2] + 4 * uvlinesize,
1501                                              uvlinesize,  bedge_lim,
1502                                              inner_limit, hev_thresh);
1503     }
1504 }
1505
1506 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1507 {
1508     int mbedge_lim, bedge_lim;
1509     int filter_level = f->filter_level;
1510     int inner_limit = f->inner_limit;
1511     int inner_filter = f->inner_filter;
1512     int linesize = s->linesize;
1513
1514     if (!filter_level)
1515         return;
1516
1517      bedge_lim = 2*filter_level + inner_limit;
1518     mbedge_lim = bedge_lim + 4;
1519
1520     if (mb_x)
1521         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1522     if (inner_filter) {
1523         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1524         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1525         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1526     }
1527
1528     if (mb_y)
1529         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1530     if (inner_filter) {
1531         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1532         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1533         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1534     }
1535 }
1536
1537 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1538 {
1539     VP8FilterStrength *f = s->filter_strength;
1540     uint8_t *dst[3] = {
1541         curframe->data[0] + 16*mb_y*s->linesize,
1542         curframe->data[1] +  8*mb_y*s->uvlinesize,
1543         curframe->data[2] +  8*mb_y*s->uvlinesize
1544     };
1545     int mb_x;
1546
1547     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1548         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1549         filter_mb(s, dst, f++, mb_x, mb_y);
1550         dst[0] += 16;
1551         dst[1] += 8;
1552         dst[2] += 8;
1553     }
1554 }
1555
1556 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1557 {
1558     VP8FilterStrength *f = s->filter_strength;
1559     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1560     int mb_x;
1561
1562     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1563         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1564         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1565         dst += 16;
1566     }
1567 }
1568
1569 static void release_queued_segmaps(VP8Context *s, int is_close)
1570 {
1571     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1572     while (s->num_maps_to_be_freed > leave_behind)
1573         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1574     s->maps_are_invalid = 0;
1575 }
1576
1577 #define MARGIN (16 << 2)
1578 static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe,
1579                               AVFrame *prev_frame, int mb_y)
1580 {
1581     VP8Context *s = avctx->priv_data;
1582     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1583     VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1584     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1585     uint8_t *dst[3] = {
1586         curframe->data[0] + 16*mb_y*s->linesize,
1587         curframe->data[1] +  8*mb_y*s->uvlinesize,
1588         curframe->data[2] +  8*mb_y*s->uvlinesize
1589     };
1590
1591     memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1592     memset(s->left_nnz, 0, sizeof(s->left_nnz));
1593     AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1594
1595     // left edge of 129 for intra prediction
1596     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1597         for (i = 0; i < 3; i++)
1598             for (y = 0; y < 16>>!!i; y++)
1599                 dst[i][y*curframe->linesize[i]-1] = 129;
1600         if (mb_y == 1) // top left edge is also 129
1601             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1602     }
1603
1604     s->mv_min.x = -MARGIN;
1605     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1606
1607     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1608         /* Prefetch the current frame, 4 MBs ahead */
1609         s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1610         s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1611
1612         decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1613                        prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
1614
1615         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1616
1617         if (!mb->skip)
1618             decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1619
1620         if (mb->mode <= MODE_I4x4)
1621             intra_predict(s, dst, mb, mb_x, mb_y);
1622         else
1623             inter_predict(s, dst, mb, mb_x, mb_y);
1624
1625         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1626
1627         if (!mb->skip) {
1628             idct_mb(s, dst, mb);
1629         } else {
1630             AV_ZERO64(s->left_nnz);
1631             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1632
1633             // Reset DC block predictors if they would exist if the mb had coefficients
1634             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1635                 s->left_nnz[8]      = 0;
1636                 s->top_nnz[mb_x][8] = 0;
1637             }
1638         }
1639
1640         if (s->deblock_filter)
1641             filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1642
1643         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1644
1645         dst[0] += 16;
1646         dst[1] += 8;
1647         dst[2] += 8;
1648         s->mv_min.x -= 64;
1649         s->mv_max.x -= 64;
1650     }
1651     if (s->deblock_filter) {
1652         if (s->filter.simple)
1653             filter_mb_row_simple(s, curframe, mb_y);
1654         else
1655             filter_mb_row(s, curframe, mb_y);
1656     }
1657     s->mv_min.y -= 64;
1658     s->mv_max.y -= 64;
1659 }
1660
1661 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1662                             AVPacket *avpkt)
1663 {
1664     VP8Context *s = avctx->priv_data;
1665     int ret, mb_y, i, referenced;
1666     enum AVDiscard skip_thresh;
1667     AVFrame *av_uninit(curframe), *prev_frame;
1668
1669     release_queued_segmaps(s, 0);
1670
1671     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1672         goto err;
1673
1674     prev_frame = s->framep[VP56_FRAME_CURRENT];
1675
1676     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1677                                 || s->update_altref == VP56_FRAME_CURRENT;
1678
1679     skip_thresh = !referenced ? AVDISCARD_NONREF :
1680                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1681
1682     if (avctx->skip_frame >= skip_thresh) {
1683         s->invisible = 1;
1684         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1685         goto skip_decode;
1686     }
1687     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1688
1689     // release no longer referenced frames
1690     for (i = 0; i < 5; i++)
1691         if (s->frames[i].data[0] &&
1692             &s->frames[i] != prev_frame &&
1693             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1694             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1695             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1696             vp8_release_frame(s, &s->frames[i], 1, 0);
1697
1698     // find a free buffer
1699     for (i = 0; i < 5; i++)
1700         if (&s->frames[i] != prev_frame &&
1701             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1702             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1703             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1704             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1705             break;
1706         }
1707     if (i == 5) {
1708         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1709         abort();
1710     }
1711     if (curframe->data[0])
1712         vp8_release_frame(s, curframe, 1, 0);
1713
1714     // Given that arithmetic probabilities are updated every frame, it's quite likely
1715     // that the values we have on a random interframe are complete junk if we didn't
1716     // start decode on a keyframe. So just don't display anything rather than junk.
1717     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1718                          !s->framep[VP56_FRAME_GOLDEN] ||
1719                          !s->framep[VP56_FRAME_GOLDEN2])) {
1720         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1721         ret = AVERROR_INVALIDDATA;
1722         goto err;
1723     }
1724
1725     curframe->key_frame = s->keyframe;
1726     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1727     curframe->reference = referenced ? 3 : 0;
1728     if ((ret = vp8_alloc_frame(s, curframe))) {
1729         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1730         goto err;
1731     }
1732
1733     // check if golden and altref are swapped
1734     if (s->update_altref != VP56_FRAME_NONE) {
1735         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1736     } else {
1737         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1738     }
1739     if (s->update_golden != VP56_FRAME_NONE) {
1740         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1741     } else {
1742         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1743     }
1744     if (s->update_last) {
1745         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1746     } else {
1747         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1748     }
1749     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1750
1751     ff_thread_finish_setup(avctx);
1752
1753     s->linesize   = curframe->linesize[0];
1754     s->uvlinesize = curframe->linesize[1];
1755
1756     if (!s->edge_emu_buffer)
1757         s->edge_emu_buffer = av_malloc(21*s->linesize);
1758
1759     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1760
1761     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1762     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1763
1764     // top edge of 127 for intra prediction
1765     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1766         s->top_border[0][15] = s->top_border[0][23] = 127;
1767         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1768     }
1769     memset(s->ref_count, 0, sizeof(s->ref_count));
1770     if (s->keyframe)
1771         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1772
1773     s->mv_min.y = -MARGIN;
1774     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1775
1776     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1777         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1778             ff_thread_await_progress(prev_frame, mb_y, 0);
1779
1780         vp8_decode_mb_row(avctx, curframe, prev_frame, mb_y);
1781
1782         ff_thread_report_progress(curframe, mb_y, 0);
1783     }
1784
1785     ff_thread_report_progress(curframe, INT_MAX, 0);
1786     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1787
1788 skip_decode:
1789     // if future frames don't use the updated probabilities,
1790     // reset them to the values we saved
1791     if (!s->update_probabilities)
1792         s->prob[0] = s->prob[1];
1793
1794     if (!s->invisible) {
1795         *(AVFrame*)data = *curframe;
1796         *data_size = sizeof(AVFrame);
1797     }
1798
1799     return avpkt->size;
1800 err:
1801     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1802     return ret;
1803 }
1804
1805 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1806 {
1807     VP8Context *s = avctx->priv_data;
1808
1809     s->avctx = avctx;
1810     avctx->pix_fmt = PIX_FMT_YUV420P;
1811
1812     ff_dsputil_init(&s->dsp, avctx);
1813     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
1814     ff_vp8dsp_init(&s->vp8dsp);
1815
1816     return 0;
1817 }
1818
1819 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1820 {
1821     vp8_decode_flush_impl(avctx, 0, 1, 1);
1822     release_queued_segmaps(avctx->priv_data, 1);
1823     return 0;
1824 }
1825
1826 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1827 {
1828     VP8Context *s = avctx->priv_data;
1829
1830     s->avctx = avctx;
1831
1832     return 0;
1833 }
1834
1835 #define REBASE(pic) \
1836     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
1837
1838 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1839 {
1840     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1841
1842     if (s->macroblocks_base &&
1843         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
1844         free_buffers(s);
1845         s->maps_are_invalid = 1;
1846         s->mb_width  = s_src->mb_width;
1847         s->mb_height = s_src->mb_height;
1848     }
1849
1850     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1851     s->segmentation = s_src->segmentation;
1852     s->lf_delta = s_src->lf_delta;
1853     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1854
1855     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1856     s->framep[0] = REBASE(s_src->next_framep[0]);
1857     s->framep[1] = REBASE(s_src->next_framep[1]);
1858     s->framep[2] = REBASE(s_src->next_framep[2]);
1859     s->framep[3] = REBASE(s_src->next_framep[3]);
1860
1861     return 0;
1862 }
1863
1864 AVCodec ff_vp8_decoder = {
1865     .name                  = "vp8",
1866     .type                  = AVMEDIA_TYPE_VIDEO,
1867     .id                    = CODEC_ID_VP8,
1868     .priv_data_size        = sizeof(VP8Context),
1869     .init                  = vp8_decode_init,
1870     .close                 = vp8_decode_free,
1871     .decode                = vp8_decode_frame,
1872     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1873     .flush                 = vp8_decode_flush,
1874     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
1875     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1876     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1877 };