git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "internal.h"
  28 #include "vp8.h"
  29 #include "vp8data.h"
  30 #include "rectangle.h"
  31 #include "thread.h"
  32
  33 #if ARCH_ARM
  34 #   include "arm/vp8.h"
  35 #endif
  36
  37 static void free_buffers(VP8Context *s)
  38 {
  39     av_freep(&s->macroblocks_base);
  40     av_freep(&s->filter_strength);
  41     av_freep(&s->intra4x4_pred_mode_top);
  42     av_freep(&s->top_nnz);
  43     av_freep(&s->edge_emu_buffer);
  44     av_freep(&s->top_border);
  45
  46     s->macroblocks = NULL;
  47 }
  48
  49 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  50 {
  51     int ret;
  52     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  53         return ret;
  54     if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
  55         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  56     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  57         ff_thread_release_buffer(s->avctx, f);
  58         return AVERROR(ENOMEM);
  59     }
  60     return 0;
  61 }
  62
  63 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  64 {
  65     if (f->ref_index[0]) {
  66         if (prefer_delayed_free) {
  67             /* Upon a size change, we want to free the maps but other threads may still
  68              * be using them, so queue them. Upon a seek, all threads are inactive so
  69              * we want to cache one to prevent re-allocation in the next decoding
  70              * iteration, but the rest we can free directly. */
  71             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  72             if (s->num_maps_to_be_freed < max_queued_maps) {
  73                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  74             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  75                 av_free(f->ref_index[0]);
  76             } /* else: MEMLEAK (should never happen, but better that than crash) */
  77             f->ref_index[0] = NULL;
  78         } else /* vp8_decode_free() */ {
  79             av_free(f->ref_index[0]);
  80         }
  81     }
  82     ff_thread_release_buffer(s->avctx, f);
  83 }
  84
  85 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  86                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  87 {
  88     VP8Context *s = avctx->priv_data;
  89     int i;
  90
  91     if (!avctx->internal->is_copy) {
  92         for (i = 0; i < 5; i++)
  93             if (s->frames[i].data[0])
  94                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
  95     }
  96     memset(s->framep, 0, sizeof(s->framep));
  97
  98     if (free_mem) {
  99         free_buffers(s);
 100         s->maps_are_invalid = 1;
 101     }
 102 }
 103
 104 static void vp8_decode_flush(AVCodecContext *avctx)
 105 {
 106     vp8_decode_flush_impl(avctx, 1, 1, 0);
 107 }
 108
 109 static int update_dimensions(VP8Context *s, int width, int height)
 110 {
 111     if (width  != s->avctx->width ||
 112         height != s->avctx->height) {
 113         if (av_image_check_size(width, height, 0, s->avctx))
 114             return AVERROR_INVALIDDATA;
 115
 116         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 117
 118         avcodec_set_dimensions(s->avctx, width, height);
 119     }
 120
 121     s->mb_width  = (s->avctx->coded_width +15) / 16;
 122     s->mb_height = (s->avctx->coded_height+15) / 16;
 123
 124     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 125     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
 126     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
 127     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 128     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 129
 130     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
 131         !s->top_nnz || !s->top_border)
 132         return AVERROR(ENOMEM);
 133
 134     s->macroblocks        = s->macroblocks_base + 1;
 135
 136     return 0;
 137 }
 138
 139 static void parse_segment_info(VP8Context *s)
 140 {
 141     VP56RangeCoder *c = &s->c;
 142     int i;
 143
 144     s->segmentation.update_map = vp8_rac_get(c);
 145
 146     if (vp8_rac_get(c)) { // update segment feature data
 147         s->segmentation.absolute_vals = vp8_rac_get(c);
 148
 149         for (i = 0; i < 4; i++)
 150             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 151
 152         for (i = 0; i < 4; i++)
 153             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 154     }
 155     if (s->segmentation.update_map)
 156         for (i = 0; i < 3; i++)
 157             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 158 }
 159
 160 static void update_lf_deltas(VP8Context *s)
 161 {
 162     VP56RangeCoder *c = &s->c;
 163     int i;
 164
 165     for (i = 0; i < 4; i++)
 166         s->lf_delta.ref[i]  = vp8_rac_get_sint(c, 6);
 167
 168     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
 169         s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
 170 }
 171
 172 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 173 {
 174     const uint8_t *sizes = buf;
 175     int i;
 176
 177     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 178
 179     buf      += 3*(s->num_coeff_partitions-1);
 180     buf_size -= 3*(s->num_coeff_partitions-1);
 181     if (buf_size < 0)
 182         return -1;
 183
 184     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 185         int size = AV_RL24(sizes + 3*i);
 186         if (buf_size - size < 0)
 187             return -1;
 188
 189         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 190         buf      += size;
 191         buf_size -= size;
 192     }
 193     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 194
 195     return 0;
 196 }
 197
 198 static void get_quants(VP8Context *s)
 199 {
 200     VP56RangeCoder *c = &s->c;
 201     int i, base_qi;
 202
 203     int yac_qi     = vp8_rac_get_uint(c, 7);
 204     int ydc_delta  = vp8_rac_get_sint(c, 4);
 205     int y2dc_delta = vp8_rac_get_sint(c, 4);
 206     int y2ac_delta = vp8_rac_get_sint(c, 4);
 207     int uvdc_delta = vp8_rac_get_sint(c, 4);
 208     int uvac_delta = vp8_rac_get_sint(c, 4);
 209
 210     for (i = 0; i < 4; i++) {
 211         if (s->segmentation.enabled) {
 212             base_qi = s->segmentation.base_quant[i];
 213             if (!s->segmentation.absolute_vals)
 214                 base_qi += yac_qi;
 215         } else
 216             base_qi = yac_qi;
 217
 218         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 219         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 220         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 221         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 222         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 223         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 224
 225         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 226         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 227     }
 228 }
 229
 230 /**
 231  * Determine which buffers golden and altref should be updated with after this frame.
 232  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 233  *
 234  * Intra frames update all 3 references
 235  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 236  * If the update (golden|altref) flag is set, it's updated with the current frame
 237  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 238  * If the flag is not set, the number read means:
 239  *      0: no update
 240  *      1: VP56_FRAME_PREVIOUS
 241  *      2: update golden with altref, or update altref with golden
 242  */
 243 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 244 {
 245     VP56RangeCoder *c = &s->c;
 246
 247     if (update)
 248         return VP56_FRAME_CURRENT;
 249
 250     switch (vp8_rac_get_uint(c, 2)) {
 251     case 1:
 252         return VP56_FRAME_PREVIOUS;
 253     case 2:
 254         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 255     }
 256     return VP56_FRAME_NONE;
 257 }
 258
 259 static void update_refs(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262
 263     int update_golden = vp8_rac_get(c);
 264     int update_altref = vp8_rac_get(c);
 265
 266     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 267     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 268 }
 269
 270 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 271 {
 272     VP56RangeCoder *c = &s->c;
 273     int header_size, hscale, vscale, i, j, k, l, m, ret;
 274     int width  = s->avctx->width;
 275     int height = s->avctx->height;
 276
 277     s->keyframe  = !(buf[0] & 1);
 278     s->profile   =  (buf[0]>>1) & 7;
 279     s->invisible = !(buf[0] & 0x10);
 280     header_size  = AV_RL24(buf) >> 5;
 281     buf      += 3;
 282     buf_size -= 3;
 283
 284     if (s->profile > 3)
 285         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 286
 287     if (!s->profile)
 288         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 289     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 290         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 291
 292     if (header_size > buf_size - 7*s->keyframe) {
 293         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 294         return AVERROR_INVALIDDATA;
 295     }
 296
 297     if (s->keyframe) {
 298         if (AV_RL24(buf) != 0x2a019d) {
 299             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 300             return AVERROR_INVALIDDATA;
 301         }
 302         width  = AV_RL16(buf+3) & 0x3fff;
 303         height = AV_RL16(buf+5) & 0x3fff;
 304         hscale = buf[4] >> 6;
 305         vscale = buf[6] >> 6;
 306         buf      += 7;
 307         buf_size -= 7;
 308
 309         if (hscale || vscale)
 310             av_log_missing_feature(s->avctx, "Upscaling", 1);
 311
 312         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 313         for (i = 0; i < 4; i++)
 314             for (j = 0; j < 16; j++)
 315                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 316                        sizeof(s->prob->token[i][j]));
 317         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 318         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 319         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 320         memset(&s->segmentation, 0, sizeof(s->segmentation));
 321     }
 322
 323     if (!s->macroblocks_base || /* first frame */
 324         width != s->avctx->width || height != s->avctx->height) {
 325         if ((ret = update_dimensions(s, width, height)) < 0)
 326             return ret;
 327     }
 328
 329     ff_vp56_init_range_decoder(c, buf, header_size);
 330     buf      += header_size;
 331     buf_size -= header_size;
 332
 333     if (s->keyframe) {
 334         if (vp8_rac_get(c))
 335             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 336         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 337     }
 338
 339     if ((s->segmentation.enabled = vp8_rac_get(c)))
 340         parse_segment_info(s);
 341     else
 342         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 343
 344     s->filter.simple    = vp8_rac_get(c);
 345     s->filter.level     = vp8_rac_get_uint(c, 6);
 346     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 347
 348     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 349         if (vp8_rac_get(c))
 350             update_lf_deltas(s);
 351
 352     if (setup_partitions(s, buf, buf_size)) {
 353         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 354         return AVERROR_INVALIDDATA;
 355     }
 356
 357     get_quants(s);
 358
 359     if (!s->keyframe) {
 360         update_refs(s);
 361         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 362         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 363     }
 364
 365     // if we aren't saving this frame's probabilities for future frames,
 366     // make a copy of the current probabilities
 367     if (!(s->update_probabilities = vp8_rac_get(c)))
 368         s->prob[1] = s->prob[0];
 369
 370     s->update_last = s->keyframe || vp8_rac_get(c);
 371
 372     for (i = 0; i < 4; i++)
 373         for (j = 0; j < 8; j++)
 374             for (k = 0; k < 3; k++)
 375                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 376                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 377                         int prob = vp8_rac_get_uint(c, 8);
 378                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 379                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 380                     }
 381
 382     if ((s->mbskip_enabled = vp8_rac_get(c)))
 383         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 384
 385     if (!s->keyframe) {
 386         s->prob->intra  = vp8_rac_get_uint(c, 8);
 387         s->prob->last   = vp8_rac_get_uint(c, 8);
 388         s->prob->golden = vp8_rac_get_uint(c, 8);
 389
 390         if (vp8_rac_get(c))
 391             for (i = 0; i < 4; i++)
 392                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 393         if (vp8_rac_get(c))
 394             for (i = 0; i < 3; i++)
 395                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 396
 397         // 17.2 MV probability update
 398         for (i = 0; i < 2; i++)
 399             for (j = 0; j < 19; j++)
 400                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 401                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 402     }
 403
 404     return 0;
 405 }
 406
 407 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 408 {
 409     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 410     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 411 }
 412
 413 /**
 414  * Motion vector coding, 17.1.
 415  */
 416 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 417 {
 418     int bit, x = 0;
 419
 420     if (vp56_rac_get_prob_branchy(c, p[0])) {
 421         int i;
 422
 423         for (i = 0; i < 3; i++)
 424             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 425         for (i = 9; i > 3; i--)
 426             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 427         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 428             x += 8;
 429     } else {
 430         // small_mvtree
 431         const uint8_t *ps = p+2;
 432         bit = vp56_rac_get_prob(c, *ps);
 433         ps += 1 + 3*bit;
 434         x  += 4*bit;
 435         bit = vp56_rac_get_prob(c, *ps);
 436         ps += 1 + bit;
 437         x  += 2*bit;
 438         x  += vp56_rac_get_prob(c, *ps);
 439     }
 440
 441     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 442 }
 443
 444 static av_always_inline
 445 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 446 {
 447     if (left == top)
 448         return vp8_submv_prob[4-!!left];
 449     if (!top)
 450         return vp8_submv_prob[2];
 451     return vp8_submv_prob[1-!!left];
 452 }
 453
 454 /**
 455  * Split motion vector prediction, 16.4.
 456  * @returns the number of motion vectors parsed (2, 4 or 16)
 457  */
 458 static av_always_inline
 459 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
 460 {
 461     int part_idx;
 462     int n, num;
 463     VP8Macroblock *top_mb  = &mb[2];
 464     VP8Macroblock *left_mb = &mb[-1];
 465     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 466                   *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
 467                   *mbsplits_cur, *firstidx;
 468     VP56mv *top_mv  = top_mb->bmv;
 469     VP56mv *left_mv = left_mb->bmv;
 470     VP56mv *cur_mv  = mb->bmv;
 471
 472     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 473         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 474             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 475         } else {
 476             part_idx = VP8_SPLITMVMODE_8x8;
 477         }
 478     } else {
 479         part_idx = VP8_SPLITMVMODE_4x4;
 480     }
 481
 482     num = vp8_mbsplit_count[part_idx];
 483     mbsplits_cur = vp8_mbsplits[part_idx],
 484     firstidx = vp8_mbfirstidx[part_idx];
 485     mb->partitioning = part_idx;
 486
 487     for (n = 0; n < num; n++) {
 488         int k = firstidx[n];
 489         uint32_t left, above;
 490         const uint8_t *submv_prob;
 491
 492         if (!(k & 3))
 493             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 494         else
 495             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 496         if (k <= 3)
 497             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 498         else
 499             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 500
 501         submv_prob = get_submv_prob(left, above);
 502
 503         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 504             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 505                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 506                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 507                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 508                 } else {
 509                     AV_ZERO32(&mb->bmv[n]);
 510                 }
 511             } else {
 512                 AV_WN32A(&mb->bmv[n], above);
 513             }
 514         } else {
 515             AV_WN32A(&mb->bmv[n], left);
 516         }
 517     }
 518
 519     return num;
 520 }
 521
 522 static av_always_inline
 523 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 524 {
 525     VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
 526                                   mb - 1 /* left */,
 527                                   mb + 1 /* top-left */ };
 528     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 529     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 530     int idx = CNT_ZERO;
 531     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 532     int8_t *sign_bias = s->sign_bias;
 533     VP56mv near_mv[4];
 534     uint8_t cnt[4] = { 0 };
 535     VP56RangeCoder *c = &s->c;
 536
 537     AV_ZERO32(&near_mv[0]);
 538     AV_ZERO32(&near_mv[1]);
 539     AV_ZERO32(&near_mv[2]);
 540
 541     /* Process MB on top, left and top-left */
 542     #define MV_EDGE_CHECK(n)\
 543     {\
 544         VP8Macroblock *edge = mb_edge[n];\
 545         int edge_ref = edge->ref_frame;\
 546         if (edge_ref != VP56_FRAME_CURRENT) {\
 547             uint32_t mv = AV_RN32A(&edge->mv);\
 548             if (mv) {\
 549                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 550                     /* SWAR negate of the values in mv. */\
 551                     mv = ~mv;\
 552                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 553                 }\
 554                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 555                     AV_WN32A(&near_mv[++idx], mv);\
 556                 cnt[idx]      += 1 + (n != 2);\
 557             } else\
 558                 cnt[CNT_ZERO] += 1 + (n != 2);\
 559         }\
 560     }
 561
 562     MV_EDGE_CHECK(0)
 563     MV_EDGE_CHECK(1)
 564     MV_EDGE_CHECK(2)
 565
 566     mb->partitioning = VP8_SPLITMVMODE_NONE;
 567     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 568         mb->mode = VP8_MVMODE_MV;
 569
 570         /* If we have three distinct MVs, merge first and last if they're the same */
 571         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 572             cnt[CNT_NEAREST] += 1;
 573
 574         /* Swap near and nearest if necessary */
 575         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 576             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 577             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 578         }
 579
 580         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 581             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 582
 583                 /* Choose the best mv out of 0,0 and the nearest mv */
 584                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 585                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 586                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 587                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 588
 589                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 590                     mb->mode = VP8_MVMODE_SPLIT;
 591                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
 592                 } else {
 593                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 594                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 595                     mb->bmv[0] = mb->mv;
 596                 }
 597             } else {
 598                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 599                 mb->bmv[0] = mb->mv;
 600             }
 601         } else {
 602             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 603             mb->bmv[0] = mb->mv;
 604         }
 605     } else {
 606         mb->mode = VP8_MVMODE_ZERO;
 607         AV_ZERO32(&mb->mv);
 608         mb->bmv[0] = mb->mv;
 609     }
 610 }
 611
 612 static av_always_inline
 613 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 614                            int mb_x, int keyframe)
 615 {
 616     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 617     if (keyframe) {
 618         int x, y;
 619         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 620         uint8_t* const left = s->intra4x4_pred_mode_left;
 621         for (y = 0; y < 4; y++) {
 622             for (x = 0; x < 4; x++) {
 623                 const uint8_t *ctx;
 624                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 625                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 626                 left[y] = top[x] = *intra4x4;
 627                 intra4x4++;
 628             }
 629         }
 630     } else {
 631         int i;
 632         for (i = 0; i < 16; i++)
 633             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 634     }
 635 }
 636
 637 static av_always_inline
 638 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
 639 {
 640     VP56RangeCoder *c = &s->c;
 641
 642     if (s->segmentation.update_map)
 643         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 644     else
 645         *segment = ref ? *ref : *segment;
 646     s->segment = *segment;
 647
 648     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 649
 650     if (s->keyframe) {
 651         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 652
 653         if (mb->mode == MODE_I4x4) {
 654             decode_intra4x4_modes(s, c, mb_x, 1);
 655         } else {
 656             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 657             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 658             AV_WN32A(s->intra4x4_pred_mode_left, modes);
 659         }
 660
 661         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 662         mb->ref_frame = VP56_FRAME_CURRENT;
 663     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 664         // inter MB, 16.2
 665         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 666             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 667                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 668         else
 669             mb->ref_frame = VP56_FRAME_PREVIOUS;
 670         s->ref_count[mb->ref_frame-1]++;
 671
 672         // motion vectors, 16.3
 673         decode_mvs(s, mb, mb_x, mb_y);
 674     } else {
 675         // intra MB, 16.1
 676         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 677
 678         if (mb->mode == MODE_I4x4)
 679             decode_intra4x4_modes(s, c, mb_x, 0);
 680
 681         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 682         mb->ref_frame = VP56_FRAME_CURRENT;
 683         mb->partitioning = VP8_SPLITMVMODE_NONE;
 684         AV_ZERO32(&mb->bmv[0]);
 685     }
 686 }
 687
 688 #ifndef decode_block_coeffs_internal
 689 /**
 690  * @param c arithmetic bitstream reader context
 691  * @param block destination for block coefficients
 692  * @param probs probabilities to use when reading trees from the bitstream
 693  * @param i initial coeff index, 0 unless a separate DC block is coded
 694  * @param qmul array holding the dc/ac dequant factor at position 0/1
 695  * @return 0 if no coeffs were decoded
 696  *         otherwise, the index of the last coeff decoded plus one
 697  */
 698 static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
 699                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 700                                         int i, uint8_t *token_prob, int16_t qmul[2])
 701 {
 702     goto skip_eob;
 703     do {
 704         int coeff;
 705         if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 706             return i;
 707
 708 skip_eob:
 709         if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
 710             if (++i == 16)
 711                 return i; // invalid input; blocks should end with EOB
 712             token_prob = probs[i][0];
 713             goto skip_eob;
 714         }
 715
 716         if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
 717             coeff = 1;
 718             token_prob = probs[i+1][1];
 719         } else {
 720             if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
 721                 coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
 722                 if (coeff)
 723                     coeff += vp56_rac_get_prob(c, token_prob[5]);
 724                 coeff += 2;
 725             } else {
 726                 // DCT_CAT*
 727                 if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
 728                     if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
 729                         coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
 730                     } else {                                    // DCT_CAT2
 731                         coeff  = 7;
 732                         coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
 733                         coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
 734                     }
 735                 } else {    // DCT_CAT3 and up
 736                     int a = vp56_rac_get_prob(c, token_prob[8]);
 737                     int b = vp56_rac_get_prob(c, token_prob[9+a]);
 738                     int cat = (a<<1) + b;
 739                     coeff  = 3 + (8<<cat);
 740                     coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
 741                 }
 742             }
 743             token_prob = probs[i+1][2];
 744         }
 745         block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
 746     } while (++i < 16);
 747
 748     return i;
 749 }
 750 #endif
 751
 752 /**
 753  * @param c arithmetic bitstream reader context
 754  * @param block destination for block coefficients
 755  * @param probs probabilities to use when reading trees from the bitstream
 756  * @param i initial coeff index, 0 unless a separate DC block is coded
 757  * @param zero_nhood the initial prediction context for number of surrounding
 758  *                   all-zero blocks (only left/top, so 0-2)
 759  * @param qmul array holding the dc/ac dequant factor at position 0/1
 760  * @return 0 if no coeffs were decoded
 761  *         otherwise, the index of the last coeff decoded plus one
 762  */
 763 static av_always_inline
 764 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 765                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 766                         int i, int zero_nhood, int16_t qmul[2])
 767 {
 768     uint8_t *token_prob = probs[i][zero_nhood];
 769     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 770         return 0;
 771     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 772 }
 773
 774 static av_always_inline
 775 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 776                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 777 {
 778     int i, x, y, luma_start = 0, luma_ctx = 3;
 779     int nnz_pred, nnz, nnz_total = 0;
 780     int segment = s->segment;
 781     int block_dc = 0;
 782
 783     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 784         nnz_pred = t_nnz[8] + l_nnz[8];
 785
 786         // decode DC values and do hadamard
 787         nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
 788                                   s->qmat[segment].luma_dc_qmul);
 789         l_nnz[8] = t_nnz[8] = !!nnz;
 790         if (nnz) {
 791             nnz_total += nnz;
 792             block_dc = 1;
 793             if (nnz == 1)
 794                 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
 795             else
 796                 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
 797         }
 798         luma_start = 1;
 799         luma_ctx = 0;
 800     }
 801
 802     // luma blocks
 803     for (y = 0; y < 4; y++)
 804         for (x = 0; x < 4; x++) {
 805             nnz_pred = l_nnz[y] + t_nnz[x];
 806             nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
 807                                       nnz_pred, s->qmat[segment].luma_qmul);
 808             // nnz+block_dc may be one more than the actual last index, but we don't care
 809             s->non_zero_count_cache[y][x] = nnz + block_dc;
 810             t_nnz[x] = l_nnz[y] = !!nnz;
 811             nnz_total += nnz;
 812         }
 813
 814     // chroma blocks
 815     // TODO: what to do about dimensions? 2nd dim for luma is x,
 816     // but for chroma it's (y<<1)|x
 817     for (i = 4; i < 6; i++)
 818         for (y = 0; y < 2; y++)
 819             for (x = 0; x < 2; x++) {
 820                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 821                 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
 822                                           nnz_pred, s->qmat[segment].chroma_qmul);
 823                 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
 824                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 825                 nnz_total += nnz;
 826             }
 827
 828     // if there were no coded coeffs despite the macroblock not being marked skip,
 829     // we MUST not do the inner loop filter and should not do IDCT
 830     // Since skip isn't used for bitstream prediction, just manually set it.
 831     if (!nnz_total)
 832         mb->skip = 1;
 833 }
 834
 835 static av_always_inline
 836 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 837                       int linesize, int uvlinesize, int simple)
 838 {
 839     AV_COPY128(top_border, src_y + 15*linesize);
 840     if (!simple) {
 841         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 842         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 843     }
 844 }
 845
 846 static av_always_inline
 847 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 848                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 849                     int simple, int xchg)
 850 {
 851     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 852     src_y  -=   linesize;
 853     src_cb -= uvlinesize;
 854     src_cr -= uvlinesize;
 855
 856 #define XCHG(a,b,xchg) do {                     \
 857         if (xchg) AV_SWAP64(b,a);               \
 858         else      AV_COPY64(b,a);               \
 859     } while (0)
 860
 861     XCHG(top_border_m1+8, src_y-8, xchg);
 862     XCHG(top_border,      src_y,   xchg);
 863     XCHG(top_border+8,    src_y+8, 1);
 864     if (mb_x < mb_width-1)
 865         XCHG(top_border+32, src_y+16, 1);
 866
 867     // only copy chroma for normal loop filter
 868     // or to initialize the top row to 127
 869     if (!simple || !mb_y) {
 870         XCHG(top_border_m1+16, src_cb-8, xchg);
 871         XCHG(top_border_m1+24, src_cr-8, xchg);
 872         XCHG(top_border+16,    src_cb, 1);
 873         XCHG(top_border+24,    src_cr, 1);
 874     }
 875 }
 876
 877 static av_always_inline
 878 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 879 {
 880     if (!mb_x) {
 881         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 882     } else {
 883         return mb_y ? mode : LEFT_DC_PRED8x8;
 884     }
 885 }
 886
 887 static av_always_inline
 888 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 889 {
 890     if (!mb_x) {
 891         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 892     } else {
 893         return mb_y ? mode : HOR_PRED8x8;
 894     }
 895 }
 896
 897 static av_always_inline
 898 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 899 {
 900     if (mode == DC_PRED8x8) {
 901         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 902     } else {
 903         return mode;
 904     }
 905 }
 906
 907 static av_always_inline
 908 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 909 {
 910     switch (mode) {
 911     case DC_PRED8x8:
 912         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 913     case VERT_PRED8x8:
 914         return !mb_y ? DC_127_PRED8x8 : mode;
 915     case HOR_PRED8x8:
 916         return !mb_x ? DC_129_PRED8x8 : mode;
 917     case PLANE_PRED8x8 /*TM*/:
 918         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 919     }
 920     return mode;
 921 }
 922
 923 static av_always_inline
 924 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 925 {
 926     if (!mb_x) {
 927         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 928     } else {
 929         return mb_y ? mode : HOR_VP8_PRED;
 930     }
 931 }
 932
 933 static av_always_inline
 934 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 935 {
 936     switch (mode) {
 937     case VERT_PRED:
 938         if (!mb_x && mb_y) {
 939             *copy_buf = 1;
 940             return mode;
 941         }
 942         /* fall-through */
 943     case DIAG_DOWN_LEFT_PRED:
 944     case VERT_LEFT_PRED:
 945         return !mb_y ? DC_127_PRED : mode;
 946     case HOR_PRED:
 947         if (!mb_y) {
 948             *copy_buf = 1;
 949             return mode;
 950         }
 951         /* fall-through */
 952     case HOR_UP_PRED:
 953         return !mb_x ? DC_129_PRED : mode;
 954     case TM_VP8_PRED:
 955         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 956     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 957     case DIAG_DOWN_RIGHT_PRED:
 958     case VERT_RIGHT_PRED:
 959     case HOR_DOWN_PRED:
 960         if (!mb_y || !mb_x)
 961             *copy_buf = 1;
 962         return mode;
 963     }
 964     return mode;
 965 }
 966
 967 static av_always_inline
 968 void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
 969                    int mb_x, int mb_y)
 970 {
 971     AVCodecContext *avctx = s->avctx;
 972     int x, y, mode, nnz;
 973     uint32_t tr;
 974
 975     // for the first row, we need to run xchg_mb_border to init the top edge to 127
 976     // otherwise, skip it if we aren't going to deblock
 977     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
 978         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
 979                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
 980                        s->filter.simple, 1);
 981
 982     if (mb->mode < MODE_I4x4) {
 983         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
 984             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
 985         } else {
 986             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
 987         }
 988         s->hpc.pred16x16[mode](dst[0], s->linesize);
 989     } else {
 990         uint8_t *ptr = dst[0];
 991         uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 992         uint8_t tr_top[4] = { 127, 127, 127, 127 };
 993
 994         // all blocks on the right edge of the macroblock use bottom edge
 995         // the top macroblock for their topright edge
 996         uint8_t *tr_right = ptr - s->linesize + 16;
 997
 998         // if we're on the right edge of the frame, said edge is extended
 999         // from the top macroblock
1000         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1001             mb_x == s->mb_width-1) {
1002             tr = tr_right[-1]*0x01010101u;
1003             tr_right = (uint8_t *)&tr;
1004         }
1005
1006         if (mb->skip)
1007             AV_ZERO128(s->non_zero_count_cache);
1008
1009         for (y = 0; y < 4; y++) {
1010             uint8_t *topright = ptr + 4 - s->linesize;
1011             for (x = 0; x < 4; x++) {
1012                 int copy = 0, linesize = s->linesize;
1013                 uint8_t *dst = ptr+4*x;
1014                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1015
1016                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1017                     topright = tr_top;
1018                 } else if (x == 3)
1019                     topright = tr_right;
1020
1021                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1022                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1023                     if (copy) {
1024                         dst = copy_dst + 12;
1025                         linesize = 8;
1026                         if (!(mb_y + y)) {
1027                             copy_dst[3] = 127U;
1028                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1029                         } else {
1030                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1031                             if (!(mb_x + x)) {
1032                                 copy_dst[3] = 129U;
1033                             } else {
1034                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1035                             }
1036                         }
1037                         if (!(mb_x + x)) {
1038                             copy_dst[11] =
1039                             copy_dst[19] =
1040                             copy_dst[27] =
1041                             copy_dst[35] = 129U;
1042                         } else {
1043                             copy_dst[11] = ptr[4*x              -1];
1044                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1045                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1046                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1047                         }
1048                     }
1049                 } else {
1050                     mode = intra4x4[x];
1051                 }
1052                 s->hpc.pred4x4[mode](dst, topright, linesize);
1053                 if (copy) {
1054                     AV_COPY32(ptr+4*x              , copy_dst+12);
1055                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1056                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1057                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1058                 }
1059
1060                 nnz = s->non_zero_count_cache[y][x];
1061                 if (nnz) {
1062                     if (nnz == 1)
1063                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
1064                     else
1065                         s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1066                 }
1067                 topright += 4;
1068             }
1069
1070             ptr   += 4*s->linesize;
1071             intra4x4 += 4;
1072         }
1073     }
1074
1075     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1076         mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1077     } else {
1078         mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1079     }
1080     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1081     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1082
1083     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1084         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1085                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1086                        s->filter.simple, 0);
1087 }
1088
/*
 * Per-subpel-phase lookup, indexed as subpel_idx[kind][phase], phase = 0..7:
 *   [0] nr. of left extra pixels, also function pointer index
 *   [1] nr. of extra pixels required
 *   [2] nr. of right extra pixels
 */
static const uint8_t subpel_idx[3][8] = {
    /* left  */ { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* total */ { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* right */ { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1095
1096 /**
1097  * luma MC function
1098  *
1099  * @param s VP8 decoding context
1100  * @param dst target buffer for block data at block position
1101  * @param ref reference picture buffer at origin (0, 0)
1102  * @param mv motion vector (relative to block position) to get pixel data from
1103  * @param x_off horizontal position of block from origin (0, 0)
1104  * @param y_off vertical position of block from origin (0, 0)
1105  * @param block_w width of block (16, 8 or 4)
1106  * @param block_h height of block (always same as block_w)
1107  * @param width width of src/dst plane data
1108  * @param height height of src/dst plane data
1109  * @param linesize size of a single line of plane data, including padding
1110  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1111  */
1112 static av_always_inline
1113 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1114                  int x_off, int y_off, int block_w, int block_h,
1115                  int width, int height, int linesize,
1116                  vp8_mc_func mc_func[3][3])
1117 {
1118     uint8_t *src = ref->data[0];
1119
1120     if (AV_RN32A(mv)) {
1121
1122         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1123         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1124
1125         x_off += mv->x >> 2;
1126         y_off += mv->y >> 2;
1127
1128         // edge emulation
1129         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1130         src += y_off * linesize + x_off;
1131         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1132             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1133             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1134                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1135                                     x_off - mx_idx, y_off - my_idx, width, height);
1136             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1137         }
1138         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1139     } else {
1140         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1141         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1142     }
1143 }
1144
1145 /**
1146  * chroma MC function
1147  *
1148  * @param s VP8 decoding context
1149  * @param dst1 target buffer for block data at block position (U plane)
1150  * @param dst2 target buffer for block data at block position (V plane)
1151  * @param ref reference picture buffer at origin (0, 0)
1152  * @param mv motion vector (relative to block position) to get pixel data from
1153  * @param x_off horizontal position of block from origin (0, 0)
1154  * @param y_off vertical position of block from origin (0, 0)
1155  * @param block_w width of block (16, 8 or 4)
1156  * @param block_h height of block (always same as block_w)
1157  * @param width width of src/dst plane data
1158  * @param height height of src/dst plane data
1159  * @param linesize size of a single line of plane data, including padding
1160  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1161  */
1162 static av_always_inline
1163 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1164                    const VP56mv *mv, int x_off, int y_off,
1165                    int block_w, int block_h, int width, int height, int linesize,
1166                    vp8_mc_func mc_func[3][3])
1167 {
1168     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1169
1170     if (AV_RN32A(mv)) {
1171         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1172         int my = mv->y&7, my_idx = subpel_idx[0][my];
1173
1174         x_off += mv->x >> 3;
1175         y_off += mv->y >> 3;
1176
1177         // edge emulation
1178         src1 += y_off * linesize + x_off;
1179         src2 += y_off * linesize + x_off;
1180         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1181         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1182             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1183             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1184                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1185                                     x_off - mx_idx, y_off - my_idx, width, height);
1186             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1187             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1188
1189             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1190                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1191                                     x_off - mx_idx, y_off - my_idx, width, height);
1192             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1193             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1194         } else {
1195             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1196             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1197         }
1198     } else {
1199         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1200         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1201         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1202     }
1203 }
1204
1205 static av_always_inline
1206 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1207                  AVFrame *ref_frame, int x_off, int y_off,
1208                  int bx_off, int by_off,
1209                  int block_w, int block_h,
1210                  int width, int height, VP56mv *mv)
1211 {
1212     VP56mv uvmv = *mv;
1213
1214     /* Y */
1215     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1216                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1217                 block_w, block_h, width, height, s->linesize,
1218                 s->put_pixels_tab[block_w == 8]);
1219
1220     /* U/V */
1221     if (s->profile == 3) {
1222         uvmv.x &= ~7;
1223         uvmv.y &= ~7;
1224     }
1225     x_off   >>= 1; y_off   >>= 1;
1226     bx_off  >>= 1; by_off  >>= 1;
1227     width   >>= 1; height  >>= 1;
1228     block_w >>= 1; block_h >>= 1;
1229     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1230                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1231                   &uvmv, x_off + bx_off, y_off + by_off,
1232                   block_w, block_h, width, height, s->uvlinesize,
1233                   s->put_pixels_tab[1 + (block_w == 4)]);
1234 }
1235
1236 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1237  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1238 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1239 {
1240     /* Don't prefetch refs that haven't been used very often this frame. */
1241     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1242         int x_off = mb_x << 4, y_off = mb_y << 4;
1243         int mx = (mb->mv.x>>2) + x_off + 8;
1244         int my = (mb->mv.y>>2) + y_off;
1245         uint8_t **src= s->framep[ref]->data;
1246         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1247         /* For threading, a ff_thread_await_progress here might be useful, but
1248          * it actually slows down the decoder. Since a bad prefetch doesn't
1249          * generate bad decoder output, we don't run it here. */
1250         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1251         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1252         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1253     }
1254 }
1255
1256 /**
1257  * Apply motion vectors to prediction buffer, chapter 18.
1258  */
1259 static av_always_inline
1260 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1261                    int mb_x, int mb_y)
1262 {
1263     int x_off = mb_x << 4, y_off = mb_y << 4;
1264     int width = 16*s->mb_width, height = 16*s->mb_height;
1265     AVFrame *ref = s->framep[mb->ref_frame];
1266     VP56mv *bmv = mb->bmv;
1267
1268     switch (mb->partitioning) {
1269     case VP8_SPLITMVMODE_NONE:
1270         vp8_mc_part(s, dst, ref, x_off, y_off,
1271                     0, 0, 16, 16, width, height, &mb->mv);
1272         break;
1273     case VP8_SPLITMVMODE_4x4: {
1274         int x, y;
1275         VP56mv uvmv;
1276
1277         /* Y */
1278         for (y = 0; y < 4; y++) {
1279             for (x = 0; x < 4; x++) {
1280                 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1281                             ref, &bmv[4*y + x],
1282                             4*x + x_off, 4*y + y_off, 4, 4,
1283                             width, height, s->linesize,
1284                             s->put_pixels_tab[2]);
1285             }
1286         }
1287
1288         /* U/V */
1289         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1290         for (y = 0; y < 2; y++) {
1291             for (x = 0; x < 2; x++) {
1292                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1293                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1294                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1295                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1296                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1297                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1298                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1299                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1300                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1301                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1302                 if (s->profile == 3) {
1303                     uvmv.x &= ~7;
1304                     uvmv.y &= ~7;
1305                 }
1306                 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1307                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1308                               4*x + x_off, 4*y + y_off, 4, 4,
1309                               width, height, s->uvlinesize,
1310                               s->put_pixels_tab[2]);
1311             }
1312         }
1313         break;
1314     }
1315     case VP8_SPLITMVMODE_16x8:
1316         vp8_mc_part(s, dst, ref, x_off, y_off,
1317                     0, 0, 16, 8, width, height, &bmv[0]);
1318         vp8_mc_part(s, dst, ref, x_off, y_off,
1319                     0, 8, 16, 8, width, height, &bmv[1]);
1320         break;
1321     case VP8_SPLITMVMODE_8x16:
1322         vp8_mc_part(s, dst, ref, x_off, y_off,
1323                     0, 0, 8, 16, width, height, &bmv[0]);
1324         vp8_mc_part(s, dst, ref, x_off, y_off,
1325                     8, 0, 8, 16, width, height, &bmv[1]);
1326         break;
1327     case VP8_SPLITMVMODE_8x8:
1328         vp8_mc_part(s, dst, ref, x_off, y_off,
1329                     0, 0, 8, 8, width, height, &bmv[0]);
1330         vp8_mc_part(s, dst, ref, x_off, y_off,
1331                     8, 0, 8, 8, width, height, &bmv[1]);
1332         vp8_mc_part(s, dst, ref, x_off, y_off,
1333                     0, 8, 8, 8, width, height, &bmv[2]);
1334         vp8_mc_part(s, dst, ref, x_off, y_off,
1335                     8, 8, 8, 8, width, height, &bmv[3]);
1336         break;
1337     }
1338 }
1339
1340 static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
1341 {
1342     int x, y, ch;
1343
1344     if (mb->mode != MODE_I4x4) {
1345         uint8_t *y_dst = dst[0];
1346         for (y = 0; y < 4; y++) {
1347             uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
1348             if (nnz4) {
1349                 if (nnz4&~0x01010101) {
1350                     for (x = 0; x < 4; x++) {
1351                         if ((uint8_t)nnz4 == 1)
1352                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1353                         else if((uint8_t)nnz4 > 1)
1354                             s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1355                         nnz4 >>= 8;
1356                         if (!nnz4)
1357                             break;
1358                     }
1359                 } else {
1360                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1361                 }
1362             }
1363             y_dst += 4*s->linesize;
1364         }
1365     }
1366
1367     for (ch = 0; ch < 2; ch++) {
1368         uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1369         if (nnz4) {
1370             uint8_t *ch_dst = dst[1+ch];
1371             if (nnz4&~0x01010101) {
1372                 for (y = 0; y < 2; y++) {
1373                     for (x = 0; x < 2; x++) {
1374                         if ((uint8_t)nnz4 == 1)
1375                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1376                         else if((uint8_t)nnz4 > 1)
1377                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1378                         nnz4 >>= 8;
1379                         if (!nnz4)
1380                             goto chroma_idct_end;
1381                     }
1382                     ch_dst += 4*s->uvlinesize;
1383                 }
1384             } else {
1385                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
1386             }
1387         }
1388 chroma_idct_end: ;
1389     }
1390 }
1391
1392 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1393 {
1394     int interior_limit, filter_level;
1395
1396     if (s->segmentation.enabled) {
1397         filter_level = s->segmentation.filter_level[s->segment];
1398         if (!s->segmentation.absolute_vals)
1399             filter_level += s->filter.level;
1400     } else
1401         filter_level = s->filter.level;
1402
1403     if (s->lf_delta.enabled) {
1404         filter_level += s->lf_delta.ref[mb->ref_frame];
1405         filter_level += s->lf_delta.mode[mb->mode];
1406     }
1407
1408     filter_level = av_clip_uintp2(filter_level, 6);
1409
1410     interior_limit = filter_level;
1411     if (s->filter.sharpness) {
1412         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1413         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1414     }
1415     interior_limit = FFMAX(interior_limit, 1);
1416
1417     f->filter_level = filter_level;
1418     f->inner_limit = interior_limit;
1419     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1420 }
1421
1422 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1423 {
1424     int mbedge_lim, bedge_lim, hev_thresh;
1425     int filter_level = f->filter_level;
1426     int inner_limit = f->inner_limit;
1427     int inner_filter = f->inner_filter;
1428     int linesize = s->linesize;
1429     int uvlinesize = s->uvlinesize;
1430     static const uint8_t hev_thresh_lut[2][64] = {
1431         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1432           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1433           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1434           3, 3, 3, 3 },
1435         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1436           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1437           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1438           2, 2, 2, 2 }
1439     };
1440
1441     if (!filter_level)
1442         return;
1443
1444      bedge_lim = 2*filter_level + inner_limit;
1445     mbedge_lim = bedge_lim + 4;
1446
1447     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1448
1449     if (mb_x) {
1450         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1451                                        mbedge_lim, inner_limit, hev_thresh);
1452         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1453                                        mbedge_lim, inner_limit, hev_thresh);
1454     }
1455
1456     if (inner_filter) {
1457         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1458                                              inner_limit, hev_thresh);
1459         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1460                                              inner_limit, hev_thresh);
1461         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1462                                              inner_limit, hev_thresh);
1463         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1464                                              uvlinesize,  bedge_lim,
1465                                              inner_limit, hev_thresh);
1466     }
1467
1468     if (mb_y) {
1469         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1470                                        mbedge_lim, inner_limit, hev_thresh);
1471         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1472                                        mbedge_lim, inner_limit, hev_thresh);
1473     }
1474
1475     if (inner_filter) {
1476         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1477                                              linesize,    bedge_lim,
1478                                              inner_limit, hev_thresh);
1479         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1480                                              linesize,    bedge_lim,
1481                                              inner_limit, hev_thresh);
1482         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1483                                              linesize,    bedge_lim,
1484                                              inner_limit, hev_thresh);
1485         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1486                                              dst[2] + 4 * uvlinesize,
1487                                              uvlinesize,  bedge_lim,
1488                                              inner_limit, hev_thresh);
1489     }
1490 }
1491
1492 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1493 {
1494     int mbedge_lim, bedge_lim;
1495     int filter_level = f->filter_level;
1496     int inner_limit = f->inner_limit;
1497     int inner_filter = f->inner_filter;
1498     int linesize = s->linesize;
1499
1500     if (!filter_level)
1501         return;
1502
1503      bedge_lim = 2*filter_level + inner_limit;
1504     mbedge_lim = bedge_lim + 4;
1505
1506     if (mb_x)
1507         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1508     if (inner_filter) {
1509         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1510         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1511         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1512     }
1513
1514     if (mb_y)
1515         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1516     if (inner_filter) {
1517         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1518         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1519         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1520     }
1521 }
1522
1523 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1524 {
1525     VP8FilterStrength *f = s->filter_strength;
1526     uint8_t *dst[3] = {
1527         curframe->data[0] + 16*mb_y*s->linesize,
1528         curframe->data[1] +  8*mb_y*s->uvlinesize,
1529         curframe->data[2] +  8*mb_y*s->uvlinesize
1530     };
1531     int mb_x;
1532
1533     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1534         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1535         filter_mb(s, dst, f++, mb_x, mb_y);
1536         dst[0] += 16;
1537         dst[1] += 8;
1538         dst[2] += 8;
1539     }
1540 }
1541
1542 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1543 {
1544     VP8FilterStrength *f = s->filter_strength;
1545     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1546     int mb_x;
1547
1548     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1549         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1550         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1551         dst += 16;
1552     }
1553 }
1554
1555 static void release_queued_segmaps(VP8Context *s, int is_close)
1556 {
1557     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1558     while (s->num_maps_to_be_freed > leave_behind)
1559         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1560     s->maps_are_invalid = 0;
1561 }
1562
1563 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1564                             AVPacket *avpkt)
1565 {
1566     VP8Context *s = avctx->priv_data;
1567     int ret, mb_x, mb_y, i, y, referenced;
1568     enum AVDiscard skip_thresh;
1569     AVFrame *av_uninit(curframe), *prev_frame;
1570
1571     release_queued_segmaps(s, 0);
1572
1573     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1574         return ret;
1575
1576     prev_frame = s->framep[VP56_FRAME_CURRENT];
1577
1578     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1579                                 || s->update_altref == VP56_FRAME_CURRENT;
1580
1581     skip_thresh = !referenced ? AVDISCARD_NONREF :
1582                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1583
1584     if (avctx->skip_frame >= skip_thresh) {
1585         s->invisible = 1;
1586         goto skip_decode;
1587     }
1588     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1589
1590     // release no longer referenced frames
1591     for (i = 0; i < 5; i++)
1592         if (s->frames[i].data[0] &&
1593             &s->frames[i] != prev_frame &&
1594             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1595             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1596             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1597             vp8_release_frame(s, &s->frames[i], 1, 0);
1598
1599     // find a free buffer
1600     for (i = 0; i < 5; i++)
1601         if (&s->frames[i] != prev_frame &&
1602             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1603             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1604             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1605             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1606             break;
1607         }
1608     if (i == 5) {
1609         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1610         abort();
1611     }
1612     if (curframe->data[0])
1613         vp8_release_frame(s, curframe, 1, 0);
1614
1615     curframe->key_frame = s->keyframe;
1616     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1617     curframe->reference = referenced ? 3 : 0;
1618     if ((ret = vp8_alloc_frame(s, curframe))) {
1619         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1620         return ret;
1621     }
1622
1623     // check if golden and altref are swapped
1624     if (s->update_altref != VP56_FRAME_NONE) {
1625         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1626     } else {
1627         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1628     }
1629     if (s->update_golden != VP56_FRAME_NONE) {
1630         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1631     } else {
1632         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1633     }
1634     if (s->update_last) {
1635         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1636     } else {
1637         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1638     }
1639     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1640
1641     ff_thread_finish_setup(avctx);
1642
1643     // Given that arithmetic probabilities are updated every frame, it's quite likely
1644     // that the values we have on a random interframe are complete junk if we didn't
1645     // start decode on a keyframe. So just don't display anything rather than junk.
1646     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1647                          !s->framep[VP56_FRAME_GOLDEN] ||
1648                          !s->framep[VP56_FRAME_GOLDEN2])) {
1649         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1650         return AVERROR_INVALIDDATA;
1651     }
1652
1653     s->linesize   = curframe->linesize[0];
1654     s->uvlinesize = curframe->linesize[1];
1655
1656     if (!s->edge_emu_buffer)
1657         s->edge_emu_buffer = av_malloc(21*s->linesize);
1658
1659     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1660
1661     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1662     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1663
1664     // top edge of 127 for intra prediction
1665     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1666         s->top_border[0][15] = s->top_border[0][23] = 127;
1667         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1668     }
1669     memset(s->ref_count, 0, sizeof(s->ref_count));
1670     if (s->keyframe)
1671         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1672
1673 #define MARGIN (16 << 2)
1674     s->mv_min.y = -MARGIN;
1675     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1676
1677     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1678         VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1679         VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1680         int mb_xy = mb_y*s->mb_width;
1681         uint8_t *dst[3] = {
1682             curframe->data[0] + 16*mb_y*s->linesize,
1683             curframe->data[1] +  8*mb_y*s->uvlinesize,
1684             curframe->data[2] +  8*mb_y*s->uvlinesize
1685         };
1686
1687         memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1688         memset(s->left_nnz, 0, sizeof(s->left_nnz));
1689         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1690
1691         // left edge of 129 for intra prediction
1692         if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1693             for (i = 0; i < 3; i++)
1694                 for (y = 0; y < 16>>!!i; y++)
1695                     dst[i][y*curframe->linesize[i]-1] = 129;
1696             if (mb_y == 1) // top left edge is also 129
1697                 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1698         }
1699
1700         s->mv_min.x = -MARGIN;
1701         s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1702         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1703             ff_thread_await_progress(prev_frame, mb_y, 0);
1704
1705         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1706             /* Prefetch the current frame, 4 MBs ahead */
1707             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1708             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1709
1710             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1711                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
1712
1713             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1714
1715             if (!mb->skip)
1716                 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1717
1718             if (mb->mode <= MODE_I4x4)
1719                 intra_predict(s, dst, mb, mb_x, mb_y);
1720             else
1721                 inter_predict(s, dst, mb, mb_x, mb_y);
1722
1723             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1724
1725             if (!mb->skip) {
1726                 idct_mb(s, dst, mb);
1727             } else {
1728                 AV_ZERO64(s->left_nnz);
1729                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1730
1731                 // Reset DC block predictors if they would exist if the mb had coefficients
1732                 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1733                     s->left_nnz[8]      = 0;
1734                     s->top_nnz[mb_x][8] = 0;
1735                 }
1736             }
1737
1738             if (s->deblock_filter)
1739                 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1740
1741             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1742
1743             dst[0] += 16;
1744             dst[1] += 8;
1745             dst[2] += 8;
1746             s->mv_min.x -= 64;
1747             s->mv_max.x -= 64;
1748         }
1749         if (s->deblock_filter) {
1750             if (s->filter.simple)
1751                 filter_mb_row_simple(s, curframe, mb_y);
1752             else
1753                 filter_mb_row(s, curframe, mb_y);
1754         }
1755         s->mv_min.y -= 64;
1756         s->mv_max.y -= 64;
1757
1758         ff_thread_report_progress(curframe, mb_y, 0);
1759     }
1760
1761     ff_thread_report_progress(curframe, INT_MAX, 0);
1762 skip_decode:
1763     // if future frames don't use the updated probabilities,
1764     // reset them to the values we saved
1765     if (!s->update_probabilities)
1766         s->prob[0] = s->prob[1];
1767
1768     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1769
1770     if (!s->invisible) {
1771         *(AVFrame*)data = *curframe;
1772         *data_size = sizeof(AVFrame);
1773     }
1774
1775     return avpkt->size;
1776 }
1777
1778 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1779 {
1780     VP8Context *s = avctx->priv_data;
1781
1782     s->avctx = avctx;
1783     avctx->pix_fmt = PIX_FMT_YUV420P;
1784
1785     dsputil_init(&s->dsp, avctx);
1786     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
1787     ff_vp8dsp_init(&s->vp8dsp);
1788
1789     return 0;
1790 }
1791
1792 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1793 {
1794     vp8_decode_flush_impl(avctx, 0, 1, 1);
1795     release_queued_segmaps(avctx->priv_data, 1);
1796     return 0;
1797 }
1798
1799 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1800 {
1801     VP8Context *s = avctx->priv_data;
1802
1803     s->avctx = avctx;
1804
1805     return 0;
1806 }
1807
1808 #define REBASE(pic) \
1809     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
1810
1811 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1812 {
1813     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1814
1815     if (s->macroblocks_base &&
1816         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
1817         free_buffers(s);
1818         s->maps_are_invalid = 1;
1819     }
1820
1821     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1822     s->segmentation = s_src->segmentation;
1823     s->lf_delta = s_src->lf_delta;
1824     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1825
1826     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1827     s->framep[0] = REBASE(s_src->next_framep[0]);
1828     s->framep[1] = REBASE(s_src->next_framep[1]);
1829     s->framep[2] = REBASE(s_src->next_framep[2]);
1830     s->framep[3] = REBASE(s_src->next_framep[3]);
1831
1832     return 0;
1833 }
1834
1835 AVCodec ff_vp8_decoder = {
1836     .name           = "vp8",
1837     .type           = AVMEDIA_TYPE_VIDEO,
1838     .id             = CODEC_ID_VP8,
1839     .priv_data_size = sizeof(VP8Context),
1840     .init           = vp8_decode_init,
1841     .close          = vp8_decode_free,
1842     .decode         = vp8_decode_frame,
1843     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1844     .flush = vp8_decode_flush,
1845     .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1846     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1847     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1848 };