git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /**
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "vp8.h"
  28 #include "vp8data.h"
  29 #include "rectangle.h"
  30 #include "thread.h"
  31
  32 #if ARCH_ARM
  33 #   include "arm/vp8.h"
  34 #endif
  35
  36 static void free_buffers(VP8Context *s)
  37 {
  38     av_freep(&s->macroblocks_base);
  39     av_freep(&s->filter_strength);
  40     av_freep(&s->intra4x4_pred_mode_top);
  41     av_freep(&s->top_nnz);
  42     av_freep(&s->edge_emu_buffer);
  43     av_freep(&s->top_border);
  44
  45     s->macroblocks = NULL;
  46 }
  47
  48 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  49 {
  50     int ret;
  51     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  52         return ret;
  53     if (!s->maps_are_invalid && s->num_maps_to_be_freed) {
  54         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  55     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  56         ff_thread_release_buffer(s->avctx, f);
  57         return AVERROR(ENOMEM);
  58     }
  59     return 0;
  60 }
  61
  62 static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close)
  63 {
  64     if (!is_close) {
  65         if (f->ref_index[0]) {
  66             assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps));
  67             s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  68             f->ref_index[0] = NULL;
  69         }
  70     } else {
  71         av_freep(&f->ref_index[0]);
  72     }
  73     ff_thread_release_buffer(s->avctx, f);
  74 }
  75
  76 static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close)
  77 {
  78     VP8Context *s = avctx->priv_data;
  79     int i;
  80
  81     if (!avctx->is_copy || force) {
  82         for (i = 0; i < 5; i++)
  83             if (s->frames[i].data[0])
  84                 vp8_release_frame(s, &s->frames[i], is_close);
  85     }
  86     memset(s->framep, 0, sizeof(s->framep));
  87
  88     free_buffers(s);
  89     s->maps_are_invalid = 1;
  90 }
  91
/**
 * Flush entry point: flushes without forcing release on copied contexts
 * (force = 0) and without freeing segmentation maps immediately (is_close = 0).
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0, 0);
}
  96
  97 static int update_dimensions(VP8Context *s, int width, int height)
  98 {
  99     if (width  != s->avctx->width ||
 100         height != s->avctx->height) {
 101         if (av_image_check_size(width, height, 0, s->avctx))
 102             return AVERROR_INVALIDDATA;
 103
 104         vp8_decode_flush_impl(s->avctx, 1, 0);
 105
 106         avcodec_set_dimensions(s->avctx, width, height);
 107     }
 108
 109     s->mb_width  = (s->avctx->coded_width +15) / 16;
 110     s->mb_height = (s->avctx->coded_height+15) / 16;
 111
 112     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 113     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
 114     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
 115     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 116     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 117
 118     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
 119         !s->top_nnz || !s->top_border)
 120         return AVERROR(ENOMEM);
 121
 122     s->macroblocks        = s->macroblocks_base + 1;
 123
 124     return 0;
 125 }
 126
 127 static void parse_segment_info(VP8Context *s)
 128 {
 129     VP56RangeCoder *c = &s->c;
 130     int i;
 131
 132     s->segmentation.update_map = vp8_rac_get(c);
 133
 134     if (vp8_rac_get(c)) { // update segment feature data
 135         s->segmentation.absolute_vals = vp8_rac_get(c);
 136
 137         for (i = 0; i < 4; i++)
 138             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 139
 140         for (i = 0; i < 4; i++)
 141             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 142     }
 143     if (s->segmentation.update_map)
 144         for (i = 0; i < 3; i++)
 145             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 146 }
 147
 148 static void update_lf_deltas(VP8Context *s)
 149 {
 150     VP56RangeCoder *c = &s->c;
 151     int i;
 152
 153     for (i = 0; i < 4; i++)
 154         s->lf_delta.ref[i]  = vp8_rac_get_sint(c, 6);
 155
 156     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
 157         s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
 158 }
 159
 160 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 161 {
 162     const uint8_t *sizes = buf;
 163     int i;
 164
 165     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 166
 167     buf      += 3*(s->num_coeff_partitions-1);
 168     buf_size -= 3*(s->num_coeff_partitions-1);
 169     if (buf_size < 0)
 170         return -1;
 171
 172     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 173         int size = AV_RL24(sizes + 3*i);
 174         if (buf_size - size < 0)
 175             return -1;
 176
 177         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 178         buf      += size;
 179         buf_size -= size;
 180     }
 181     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 182
 183     return 0;
 184 }
 185
 186 static void get_quants(VP8Context *s)
 187 {
 188     VP56RangeCoder *c = &s->c;
 189     int i, base_qi;
 190
 191     int yac_qi     = vp8_rac_get_uint(c, 7);
 192     int ydc_delta  = vp8_rac_get_sint(c, 4);
 193     int y2dc_delta = vp8_rac_get_sint(c, 4);
 194     int y2ac_delta = vp8_rac_get_sint(c, 4);
 195     int uvdc_delta = vp8_rac_get_sint(c, 4);
 196     int uvac_delta = vp8_rac_get_sint(c, 4);
 197
 198     for (i = 0; i < 4; i++) {
 199         if (s->segmentation.enabled) {
 200             base_qi = s->segmentation.base_quant[i];
 201             if (!s->segmentation.absolute_vals)
 202                 base_qi += yac_qi;
 203         } else
 204             base_qi = yac_qi;
 205
 206         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 207         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 208         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 209         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 210         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 211         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 212
 213         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 214         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 215     }
 216 }
 217
 218 /**
 219  * Determine which buffers golden and altref should be updated with after this frame.
 220  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 221  *
 222  * Intra frames update all 3 references
 223  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 224  * If the update (golden|altref) flag is set, it's updated with the current frame
 225  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 226  * If the flag is not set, the number read means:
 227  *      0: no update
 228  *      1: VP56_FRAME_PREVIOUS
 229  *      2: update golden with altref, or update altref with golden
 230  */
 231 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 232 {
 233     VP56RangeCoder *c = &s->c;
 234
 235     if (update)
 236         return VP56_FRAME_CURRENT;
 237
 238     switch (vp8_rac_get_uint(c, 2)) {
 239     case 1:
 240         return VP56_FRAME_PREVIOUS;
 241     case 2:
 242         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 243     }
 244     return VP56_FRAME_NONE;
 245 }
 246
 247 static void update_refs(VP8Context *s)
 248 {
 249     VP56RangeCoder *c = &s->c;
 250
 251     int update_golden = vp8_rac_get(c);
 252     int update_altref = vp8_rac_get(c);
 253
 254     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 255     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 256 }
 257
 258 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 259 {
 260     VP56RangeCoder *c = &s->c;
 261     int header_size, hscale, vscale, i, j, k, l, m, ret;
 262     int width  = s->avctx->width;
 263     int height = s->avctx->height;
 264
 265     s->keyframe  = !(buf[0] & 1);
 266     s->profile   =  (buf[0]>>1) & 7;
 267     s->invisible = !(buf[0] & 0x10);
 268     header_size  = AV_RL24(buf) >> 5;
 269     buf      += 3;
 270     buf_size -= 3;
 271
 272     if (s->profile > 3)
 273         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 274
 275     if (!s->profile)
 276         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 277     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 278         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 279
 280     if (header_size > buf_size - 7*s->keyframe) {
 281         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 282         return AVERROR_INVALIDDATA;
 283     }
 284
 285     if (s->keyframe) {
 286         if (AV_RL24(buf) != 0x2a019d) {
 287             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 288             return AVERROR_INVALIDDATA;
 289         }
 290         width  = AV_RL16(buf+3) & 0x3fff;
 291         height = AV_RL16(buf+5) & 0x3fff;
 292         hscale = buf[4] >> 6;
 293         vscale = buf[6] >> 6;
 294         buf      += 7;
 295         buf_size -= 7;
 296
 297         if (hscale || vscale)
 298             av_log_missing_feature(s->avctx, "Upscaling", 1);
 299
 300         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 301         for (i = 0; i < 4; i++)
 302             for (j = 0; j < 16; j++)
 303                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 304                        sizeof(s->prob->token[i][j]));
 305         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 306         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 307         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 308         memset(&s->segmentation, 0, sizeof(s->segmentation));
 309     }
 310
 311     if (!s->macroblocks_base || /* first frame */
 312         width != s->avctx->width || height != s->avctx->height) {
 313         if ((ret = update_dimensions(s, width, height)) < 0)
 314             return ret;
 315     }
 316
 317     ff_vp56_init_range_decoder(c, buf, header_size);
 318     buf      += header_size;
 319     buf_size -= header_size;
 320
 321     if (s->keyframe) {
 322         if (vp8_rac_get(c))
 323             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 324         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 325     }
 326
 327     if ((s->segmentation.enabled = vp8_rac_get(c)))
 328         parse_segment_info(s);
 329     else
 330         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 331
 332     s->filter.simple    = vp8_rac_get(c);
 333     s->filter.level     = vp8_rac_get_uint(c, 6);
 334     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 335
 336     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 337         if (vp8_rac_get(c))
 338             update_lf_deltas(s);
 339
 340     if (setup_partitions(s, buf, buf_size)) {
 341         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 342         return AVERROR_INVALIDDATA;
 343     }
 344
 345     get_quants(s);
 346
 347     if (!s->keyframe) {
 348         update_refs(s);
 349         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 350         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 351     }
 352
 353     // if we aren't saving this frame's probabilities for future frames,
 354     // make a copy of the current probabilities
 355     if (!(s->update_probabilities = vp8_rac_get(c)))
 356         s->prob[1] = s->prob[0];
 357
 358     s->update_last = s->keyframe || vp8_rac_get(c);
 359
 360     for (i = 0; i < 4; i++)
 361         for (j = 0; j < 8; j++)
 362             for (k = 0; k < 3; k++)
 363                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 364                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 365                         int prob = vp8_rac_get_uint(c, 8);
 366                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 367                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 368                     }
 369
 370     if ((s->mbskip_enabled = vp8_rac_get(c)))
 371         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 372
 373     if (!s->keyframe) {
 374         s->prob->intra  = vp8_rac_get_uint(c, 8);
 375         s->prob->last   = vp8_rac_get_uint(c, 8);
 376         s->prob->golden = vp8_rac_get_uint(c, 8);
 377
 378         if (vp8_rac_get(c))
 379             for (i = 0; i < 4; i++)
 380                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 381         if (vp8_rac_get(c))
 382             for (i = 0; i < 3; i++)
 383                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 384
 385         // 17.2 MV probability update
 386         for (i = 0; i < 2; i++)
 387             for (j = 0; j < 19; j++)
 388                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 389                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 390     }
 391
 392     return 0;
 393 }
 394
 395 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 396 {
 397     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 398     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 399 }
 400
/**
 * Motion vector coding, 17.1.
 *
 * Read one motion vector component from the range coder.
 *
 * @param c range coder to read from
 * @param p probability table for this component, as indexed below:
 *          p[0] selects the long form, p[1] is the sign, p[2..] drive the
 *          short-form tree, p[9 + i] is the probability for bit i of the
 *          long form
 * @return the signed component value
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        // long form: bits 0-2 first, then bits 9 down to 4
        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        // bit 3 is read last; when no bit above it is set it is not coded
        // and is taken as set
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        // three-level binary tree for values 0-7; each decoded bit picks
        // the probability used for the next one
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    // a sign bit is only coded for non-zero magnitudes
    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
 431
 432 static av_always_inline
 433 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 434 {
 435     if (left == top)
 436         return vp8_submv_prob[4-!!left];
 437     if (!top)
 438         return vp8_submv_prob[2];
 439     return vp8_submv_prob[1-!!left];
 440 }
 441
/**
 * Split motion vector prediction, 16.4.
 *
 * Reads the partitioning of the macroblock and one motion vector per
 * partition into mb->bmv[]. Each vector is either copied from the left
 * or above neighbouring block, set to zero, or coded as a delta on top
 * of mb->mv.
 *
 * @param s  decoder context (supplies the MV probabilities)
 * @param c  range coder to read from
 * @param mb current macroblock; mb[2] is used as the top neighbour and
 *           mb[-1] as the left neighbour
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb  = &mb[2];
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
                  *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
                  *mbsplits_cur, *firstidx;
    VP56mv *top_mv  = top_mb->bmv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    // partitioning tree: 4x4, 8x8, or one of the two modes starting at
    // VP8_SPLITMVMODE_16x8
    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        } else {
            part_idx = VP8_SPLITMVMODE_8x8;
        }
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx],
    firstidx = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        // k: index (0-15, 4 per row) of the first 4x4 block of partition n
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        // blocks in the first column take their left vector from the
        // last column of the left macroblock, others from this macroblock
        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        // blocks in the first row take their above vector from the
        // last row of the top macroblock, others from this macroblock
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above);

        // sub-mv tree: left / above / zero / new (delta relative to mb->mv)
        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}
 509
/**
 * Decode the motion vector mode and motion vector(s) of an inter
 * macroblock, using the top, left and top-left neighbours as context.
 *
 * Sets mb->mode, mb->partitioning, mb->mv and mb->bmv[].
 *
 * @param s    decoder context
 * @param mb   current macroblock; mb->ref_frame must already be set
 * @param mb_x macroblock column (not used in this function body)
 * @param mb_y macroblock row (not used in this function body)
 */
static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
{
    VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
                                  mb - 1 /* left */,
                                  mb + 1 /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For each inter-coded neighbour: flip its vector if its reference has
     * a different sign bias than ours, append it to near_mv[] unless it
     * repeats the most recent entry, and add its weight (2 for top and
     * left, 1 for top-left) to the matching counter. Zero vectors are
     * counted in cnt[CNT_ZERO]. */
    #define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    // mode tree, each branch conditioned on one of the counters:
    // zero / nearest / near / (split or new)
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                // cnt[CNT_SPLITMV] is reused as the split context: left and
                // top neighbours in split mode weigh 2, top-left weighs 1
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    // the macroblock-level vector becomes the last split vector
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
                } else {
                    // new vector: coded delta added to the clamped prediction
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
 599
 600 static av_always_inline
 601 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 602                            int mb_x, int keyframe)
 603 {
 604     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 605     if (keyframe) {
 606         int x, y;
 607         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 608         uint8_t* const left = s->intra4x4_pred_mode_left;
 609         for (y = 0; y < 4; y++) {
 610             for (x = 0; x < 4; x++) {
 611                 const uint8_t *ctx;
 612                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 613                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 614                 left[y] = top[x] = *intra4x4;
 615                 intra4x4++;
 616             }
 617         }
 618     } else {
 619         int i;
 620         for (i = 0; i < 16; i++)
 621             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 622     }
 623 }
 624
/**
 * Decode the per-macroblock header: segment id, skip flag, and either the
 * intra prediction modes or the reference frame and motion vectors.
 *
 * @param s       decoder context
 * @param mb      macroblock to fill in
 * @param mb_x    macroblock column
 * @param mb_y    macroblock row
 * @param segment in/out: segment id of this macroblock; overwritten when
 *                the segment map is updated or when ref is non-NULL
 * @param ref     optional pointer to a segment id to inherit when the
 *                segment map is not updated; may be NULL
 */
static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
{
    VP56RangeCoder *c = &s->c;

    if (s->segmentation.update_map) {
        // two-level tree: the first bit picks the probability for the second
        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else
        *segment = ref ? *ref : *segment;
    s->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb_x, 1);
        } else {
            // fill the top/left 4x4 mode contexts with the 4x4 mode that
            // corresponds to this 16x16 mode, replicated into all 4 bytes
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        // per-reference usage counter for this frame
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb_x, 0);

        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame = VP56_FRAME_CURRENT;
        // intra macroblocks present a zero, unsplit vector to later
        // motion vector prediction
        mb->partitioning = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
 676
#ifndef decode_block_coeffs_internal
/**
 * Decode the coefficient tokens of one block. The caller has already
 * consumed the first end-of-block check and found it false, so decoding
 * starts directly at the "zero token" test.
 *
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token, i.e. the row of
 *                   probs selected by the caller for position i
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    // enter the loop past the EOB check, which the caller already did
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
            return i;

skip_eob:
        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
            if (++i == 16)
                return i; // invalid input; blocks should end with EOB
            // after a zero no EOB check is made; context 0 is used next
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
            coeff = 1;
            // context 1 for the next position: previous token was a one
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    // two bits select the category (0-3); the base value is
                    // 3 + (8 << cat), i.e. 11, 19, 35 or 67, plus extra bits
                    int a = vp56_rac_get_prob(c, token_prob[8]);
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            // context 2 for the next position: previous token was > 1
            token_prob = probs[i+1][2];
        }
        // sign bit, then dequantize: qmul[0] for position 0, qmul[1] otherwise
        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    return i;
}
#endif
 740
 741 /**
 742  * @param c arithmetic bitstream reader context
 743  * @param block destination for block coefficients
 744  * @param probs probabilities to use when reading trees from the bitstream
 745  * @param i initial coeff index, 0 unless a separate DC block is coded
 746  * @param zero_nhood the initial prediction context for number of surrounding
 747  *                   all-zero blocks (only left/top, so 0-2)
 748  * @param qmul array holding the dc/ac dequant factor at position 0/1
 749  * @return 0 if no coeffs were decoded
 750  *         otherwise, the index of the last coeff decoded plus one
 751  */
 752 static av_always_inline
 753 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 754                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 755                         int i, int zero_nhood, int16_t qmul[2])
 756 {
 757     uint8_t *token_prob = probs[i][zero_nhood];
 758     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 759         return 0;
 760     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 761 }
 762
/**
 * Decode all coefficient blocks of one macroblock: the optional luma DC
 * block, the 16 luma blocks and the 2x4 chroma blocks.
 *
 * @param s     decoder context (destination blocks, probabilities,
 *              quantizers and non-zero count cache)
 * @param c     range coder of the coefficient partition to read from
 * @param mb    current macroblock; mb->skip is set if nothing was coded
 * @param t_nnz top non-zero context, updated in place; indices 0-3 luma,
 *              4-7 chroma, 8 luma DC block
 * @param l_nnz left non-zero context, same layout as t_nnz
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = s->segment;
    int block_dc = 0;

    // all modes except I4x4 and split MVs code the luma DC values in a
    // separate block
    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc = 1;
            // a single coefficient allows the DC-only transform variant
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
        }
        // the luma blocks then start at coefficient 1 and use token set 0
        luma_start = 1;
        luma_ctx = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            s->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    // i = 4 and i = 5 are the two chroma planes
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                s->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
 823
 824 static av_always_inline
 825 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 826                       int linesize, int uvlinesize, int simple)
 827 {
 828     AV_COPY128(top_border, src_y + 15*linesize);
 829     if (!simple) {
 830         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 831         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 832     }
 833 }
 834
 835 static av_always_inline
 836 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 837                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 838                     int simple, int xchg)
 839 {
 840     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 841     src_y  -=   linesize;
 842     src_cb -= uvlinesize;
 843     src_cr -= uvlinesize;
 844
 845 #define XCHG(a,b,xchg) do {                     \
 846         if (xchg) AV_SWAP64(b,a);               \
 847         else      AV_COPY64(b,a);               \
 848     } while (0)
 849
 850     XCHG(top_border_m1+8, src_y-8, xchg);
 851     XCHG(top_border,      src_y,   xchg);
 852     XCHG(top_border+8,    src_y+8, 1);
 853     if (mb_x < mb_width-1)
 854         XCHG(top_border+32, src_y+16, 1);
 855
 856     // only copy chroma for normal loop filter
 857     // or to initialize the top row to 127
 858     if (!simple || !mb_y) {
 859         XCHG(top_border_m1+16, src_cb-8, xchg);
 860         XCHG(top_border_m1+24, src_cr-8, xchg);
 861         XCHG(top_border+16,    src_cb, 1);
 862         XCHG(top_border+24,    src_cr, 1);
 863     }
 864 }
 865
 866 static av_always_inline
 867 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 868 {
 869     if (!mb_x) {
 870         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 871     } else {
 872         return mb_y ? mode : LEFT_DC_PRED8x8;
 873     }
 874 }
 875
 876 static av_always_inline
 877 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 878 {
 879     if (!mb_x) {
 880         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 881     } else {
 882         return mb_y ? mode : HOR_PRED8x8;
 883     }
 884 }
 885
 886 static av_always_inline
 887 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 888 {
 889     if (mode == DC_PRED8x8) {
 890         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 891     } else {
 892         return mode;
 893     }
 894 }
 895
 896 static av_always_inline
 897 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 898 {
 899     switch (mode) {
 900     case DC_PRED8x8:
 901         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 902     case VERT_PRED8x8:
 903         return !mb_y ? DC_127_PRED8x8 : mode;
 904     case HOR_PRED8x8:
 905         return !mb_x ? DC_129_PRED8x8 : mode;
 906     case PLANE_PRED8x8 /*TM*/:
 907         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 908     }
 909     return mode;
 910 }
 911
 912 static av_always_inline
 913 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 914 {
 915     if (!mb_x) {
 916         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 917     } else {
 918         return mb_y ? mode : HOR_VP8_PRED;
 919     }
 920 }
 921
 922 static av_always_inline
 923 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 924 {
 925     switch (mode) {
 926     case VERT_PRED:
 927         if (!mb_x && mb_y) {
 928             *copy_buf = 1;
 929             return mode;
 930         }
 931         /* fall-through */
 932     case DIAG_DOWN_LEFT_PRED:
 933     case VERT_LEFT_PRED:
 934         return !mb_y ? DC_127_PRED : mode;
 935     case HOR_PRED:
 936         if (!mb_y) {
 937             *copy_buf = 1;
 938             return mode;
 939         }
 940         /* fall-through */
 941     case HOR_UP_PRED:
 942         return !mb_x ? DC_129_PRED : mode;
 943     case TM_VP8_PRED:
 944         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 945     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 946     case DIAG_DOWN_RIGHT_PRED:
 947     case VERT_RIGHT_PRED:
 948     case HOR_DOWN_PRED:
 949         if (!mb_y || !mb_x)
 950             *copy_buf = 1;
 951         return mode;
 952     }
 953     return mode;
 954 }
 955
/**
 * Intra-predict one macroblock (luma and both chroma planes) in place.
 *
 * For 16x16 modes a single pred16x16 call is made. For MODE_I4x4 each of
 * the 16 luma blocks is predicted and its residual is added immediately,
 * because later blocks predict from the reconstructed earlier ones.
 * Chroma is always predicted with pred8x8 using s->chroma_pred_mode.
 *
 * @param s    VP8 decoding context
 * @param dst  pointers to the top-left pixel of the macroblock in Y, U, V
 * @param mb   macroblock being predicted
 * @param mb_x macroblock column
 * @param mb_y macroblock row
 */
static av_always_inline
void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    // (with EMU_EDGE the first row is handled by the mode substitutions instead)
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        // whole-macroblock luma prediction
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
        // constant top-right pixels used for the first row with EMU_EDGE
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge
        // of the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            // replicate the last pixel above this macroblock four times
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        // a skipped macroblock has no residual: clear the counts so that no
        // IDCT is run in the loop below
        if (mb->skip)
            AV_ZERO128(s->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                // scratch block with a stride of 8: the 4x4 block starts at
                // offset 12, its top row is at 4..7, its top-left pixel at 3
                // and its left column at 11, 19, 27, 35
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    // the sums are zero only for blocks touching the left/top frame edge
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        // predict inside the scratch block, whose edges are
                        // filled with 127 (top) / 129 (left) where the frame
                        // has no neighbour, and with frame pixels otherwise
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    // move the four predicted rows from the scratch block
                    // back into the frame
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                // add the residual right away; a count of 1 takes the
                // DC-only routine
                nnz = s->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr   += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    // chroma: one mode shared by both planes
    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    // second border pass, same condition as at the top but with xchg = 0
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
1077
/*
 * Lookup tables for motion compensation, indexed by the 3-bit fractional
 * part of a motion vector component (mx or my in vp8_mc_luma() and
 * vp8_mc_chroma()). Entry 0 (no fraction) needs no extra pixels.
 */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
                                // (left + right)
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1084
1085 /**
1086  * luma MC function
1087  *
1088  * @param s VP8 decoding context
1089  * @param dst target buffer for block data at block position
1090  * @param ref reference picture buffer at origin (0, 0)
1091  * @param mv motion vector (relative to block position) to get pixel data from
1092  * @param x_off horizontal position of block from origin (0, 0)
1093  * @param y_off vertical position of block from origin (0, 0)
1094  * @param block_w width of block (16, 8 or 4)
1095  * @param block_h height of block (always same as block_w)
1096  * @param width width of src/dst plane data
1097  * @param height height of src/dst plane data
1098  * @param linesize size of a single line of plane data, including padding
1099  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1100  */
1101 static av_always_inline
1102 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1103                  int x_off, int y_off, int block_w, int block_h,
1104                  int width, int height, int linesize,
1105                  vp8_mc_func mc_func[3][3])
1106 {
1107     uint8_t *src = ref->data[0];
1108
1109     if (AV_RN32A(mv)) {
1110
1111         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1112         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1113
1114         x_off += mv->x >> 2;
1115         y_off += mv->y >> 2;
1116
1117         // edge emulation
1118         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1119         src += y_off * linesize + x_off;
1120         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1121             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1122             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1123                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1124                                     x_off - mx_idx, y_off - my_idx, width, height);
1125             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1126         }
1127         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1128     } else {
1129         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1130         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1131     }
1132 }
1133
1134 /**
1135  * chroma MC function
1136  *
1137  * @param s VP8 decoding context
1138  * @param dst1 target buffer for block data at block position (U plane)
1139  * @param dst2 target buffer for block data at block position (V plane)
1140  * @param ref reference picture buffer at origin (0, 0)
1141  * @param mv motion vector (relative to block position) to get pixel data from
1142  * @param x_off horizontal position of block from origin (0, 0)
1143  * @param y_off vertical position of block from origin (0, 0)
1144  * @param block_w width of block (16, 8 or 4)
1145  * @param block_h height of block (always same as block_w)
1146  * @param width width of src/dst plane data
1147  * @param height height of src/dst plane data
1148  * @param linesize size of a single line of plane data, including padding
1149  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1150  */
1151 static av_always_inline
1152 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1153                    const VP56mv *mv, int x_off, int y_off,
1154                    int block_w, int block_h, int width, int height, int linesize,
1155                    vp8_mc_func mc_func[3][3])
1156 {
1157     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1158
1159     if (AV_RN32A(mv)) {
1160         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1161         int my = mv->y&7, my_idx = subpel_idx[0][my];
1162
1163         x_off += mv->x >> 3;
1164         y_off += mv->y >> 3;
1165
1166         // edge emulation
1167         src1 += y_off * linesize + x_off;
1168         src2 += y_off * linesize + x_off;
1169         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1170         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1171             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1172             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1173                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1174                                     x_off - mx_idx, y_off - my_idx, width, height);
1175             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1176             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1177
1178             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1179                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1180                                     x_off - mx_idx, y_off - my_idx, width, height);
1181             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1182             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1183         } else {
1184             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1185             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1186         }
1187     } else {
1188         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1189         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1190         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1191     }
1192 }
1193
1194 static av_always_inline
1195 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1196                  AVFrame *ref_frame, int x_off, int y_off,
1197                  int bx_off, int by_off,
1198                  int block_w, int block_h,
1199                  int width, int height, VP56mv *mv)
1200 {
1201     VP56mv uvmv = *mv;
1202
1203     /* Y */
1204     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1205                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1206                 block_w, block_h, width, height, s->linesize,
1207                 s->put_pixels_tab[block_w == 8]);
1208
1209     /* U/V */
1210     if (s->profile == 3) {
1211         uvmv.x &= ~7;
1212         uvmv.y &= ~7;
1213     }
1214     x_off   >>= 1; y_off   >>= 1;
1215     bx_off  >>= 1; by_off  >>= 1;
1216     width   >>= 1; height  >>= 1;
1217     block_w >>= 1; block_h >>= 1;
1218     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1219                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1220                   &uvmv, x_off + bx_off, y_off + by_off,
1221                   block_w, block_h, width, height, s->uvlinesize,
1222                   s->put_pixels_tab[1 + (block_w == 4)]);
1223 }
1224
1225 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1226  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1227 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1228 {
1229     /* Don't prefetch refs that haven't been used very often this frame. */
1230     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1231         int x_off = mb_x << 4, y_off = mb_y << 4;
1232         int mx = (mb->mv.x>>2) + x_off + 8;
1233         int my = (mb->mv.y>>2) + y_off;
1234         uint8_t **src= s->framep[ref]->data;
1235         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1236         /* For threading, a ff_thread_await_progress here might be useful, but
1237          * it actually slows down the decoder. Since a bad prefetch doesn't
1238          * generate bad decoder output, we don't run it here. */
1239         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1240         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1241         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1242     }
1243 }
1244
/**
 * Apply motion vectors to prediction buffer, chapter 18.
 *
 * Dispatches on mb->partitioning and motion-compensates every partition
 * of the macroblock from the reference frame selected by mb->ref_frame.
 *
 * @param s    VP8 decoding context
 * @param dst  pointers to the macroblock's top-left pixel in Y, U, V
 * @param mb   macroblock holding the motion vectors
 * @param mb_x macroblock column
 * @param mb_y macroblock row
 */
static av_always_inline
void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    // luma pixel position of the macroblock and luma plane dimensions
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    AVFrame *ref = s->framep[mb->ref_frame];
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        // a single vector for the whole macroblock
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        // one vector per 4x4 luma block
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
                            ref, &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        // each 4x4 chroma block uses a vector derived from the four luma
        // vectors of the 8x8 luma area it covers
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
                // divide the sum by 4: add 2, plus the sign term (-1 for a
                // negative sum, assuming an arithmetic right shift), then
                // shift right by 2
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                // profile 3 drops the fractional bits of the chroma vector
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        // top half, then bottom half
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        // left half, then right half
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        // four quadrants in raster order
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}
1328
/**
 * Add the inverse-transformed residual of a macroblock to the prediction.
 *
 * Luma is skipped for MODE_I4x4, where intra_predict() already adds the
 * residual block by block. The per-block counts in
 * s->non_zero_count_cache select the routine: 0 = nothing, 1 = DC-only add,
 * more than 1 = full IDCT.
 *
 * @param s   VP8 decoding context
 * @param dst pointers to the macroblock's top-left pixel in Y, U, V
 * @param mb  macroblock being reconstructed
 */
static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            // the four counts of this block row, one per byte, lowest byte
            // first (x = 0)
            uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
            if (nnz4) {
                // some count is larger than 1: handle the blocks one by one
                if (nnz4&~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        // the remaining blocks of the row are all empty
                        if (!nnz4)
                            break;
                    }
                } else {
                    // every count is 0 or 1: one call covers the whole row
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    // chroma: cache rows 4 and 5 hold the four counts of the U and V plane
    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        // nothing left in this plane: leave both loops
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                // every count is 0 or 1: one call covers the whole plane
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
            }
        }
chroma_idct_end: ;
    }
}
1380
1381 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1382 {
1383     int interior_limit, filter_level;
1384
1385     if (s->segmentation.enabled) {
1386         filter_level = s->segmentation.filter_level[s->segment];
1387         if (!s->segmentation.absolute_vals)
1388             filter_level += s->filter.level;
1389     } else
1390         filter_level = s->filter.level;
1391
1392     if (s->lf_delta.enabled) {
1393         filter_level += s->lf_delta.ref[mb->ref_frame];
1394         filter_level += s->lf_delta.mode[mb->mode];
1395     }
1396
1397     filter_level = av_clip_uintp2(filter_level, 6);
1398
1399     interior_limit = filter_level;
1400     if (s->filter.sharpness) {
1401         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1402         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1403     }
1404     interior_limit = FFMAX(interior_limit, 1);
1405
1406     f->filter_level = filter_level;
1407     f->inner_limit = interior_limit;
1408     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1409 }
1410
/**
 * Run the normal loop filter on one macroblock (luma and chroma).
 *
 * Order: left macroblock edge, inner vertical edges, top macroblock edge,
 * inner horizontal edges.
 *
 * @param s    VP8 decoding context
 * @param dst  pointers to the macroblock's top-left pixel in Y, U, V
 * @param f    filter parameters computed by filter_level_for_mb()
 * @param mb_x macroblock column; the left edge is skipped for column 0
 * @param mb_y macroblock row; the top edge is skipped for row 0
 */
static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;
    int uvlinesize = s->uvlinesize;
    // high-edge-variance threshold, indexed by [s->keyframe][filter_level]
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    // a level of zero disables filtering for this macroblock
    if (!filter_level)
        return;

    // edge limits: inner block edges, and a limit 4 higher for
    // macroblock edges
     bedge_lim = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    // left macroblock edge
    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    // inner vertical edges: luma columns 4, 8, 12 and chroma column 4
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize,  bedge_lim,
                                             inner_limit, hev_thresh);
    }

    // top macroblock edge
    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    // inner horizontal edges: luma rows 4, 8, 12 and chroma row 4
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize,  bedge_lim,
                                             inner_limit, hev_thresh);
    }
}
1480
1481 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1482 {
1483     int mbedge_lim, bedge_lim;
1484     int filter_level = f->filter_level;
1485     int inner_limit = f->inner_limit;
1486     int inner_filter = f->inner_filter;
1487     int linesize = s->linesize;
1488
1489     if (!filter_level)
1490         return;
1491
1492      bedge_lim = 2*filter_level + inner_limit;
1493     mbedge_lim = bedge_lim + 4;
1494
1495     if (mb_x)
1496         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1497     if (inner_filter) {
1498         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1499         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1500         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1501     }
1502
1503     if (mb_y)
1504         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1505     if (inner_filter) {
1506         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1507         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1508         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1509     }
1510 }
1511
1512 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1513 {
1514     VP8FilterStrength *f = s->filter_strength;
1515     uint8_t *dst[3] = {
1516         curframe->data[0] + 16*mb_y*s->linesize,
1517         curframe->data[1] +  8*mb_y*s->uvlinesize,
1518         curframe->data[2] +  8*mb_y*s->uvlinesize
1519     };
1520     int mb_x;
1521
1522     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1523         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1524         filter_mb(s, dst, f++, mb_x, mb_y);
1525         dst[0] += 16;
1526         dst[1] += 8;
1527         dst[2] += 8;
1528     }
1529 }
1530
1531 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1532 {
1533     VP8FilterStrength *f = s->filter_strength;
1534     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1535     int mb_x;
1536
1537     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1538         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1539         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1540         dst += 16;
1541     }
1542 }
1543
1544 static void release_queued_segmaps(VP8Context *s, int is_close)
1545 {
1546     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1547     while (s->num_maps_to_be_freed > leave_behind)
1548         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1549     s->maps_are_invalid = 0;
1550 }
1551
1552 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1553                             AVPacket *avpkt)
1554 {
1555     VP8Context *s = avctx->priv_data;
1556     int ret, mb_x, mb_y, i, y, referenced;
1557     enum AVDiscard skip_thresh;
1558     AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
1559
1560     release_queued_segmaps(s, 0);
1561
1562     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1563         return ret;
1564
1565     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1566                                 || s->update_altref == VP56_FRAME_CURRENT;
1567
1568     skip_thresh = !referenced ? AVDISCARD_NONREF :
1569                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1570
1571     if (avctx->skip_frame >= skip_thresh) {
1572         s->invisible = 1;
1573         goto skip_decode;
1574     }
1575     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1576
1577     // release no longer referenced frames
1578     for (i = 0; i < 5; i++)
1579         if (s->frames[i].data[0] &&
1580             &s->frames[i] != prev_frame &&
1581             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1582             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1583             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1584             vp8_release_frame(s, &s->frames[i], 0);
1585
1586     // find a free buffer
1587     for (i = 0; i < 5; i++)
1588         if (&s->frames[i] != prev_frame &&
1589             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1590             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1591             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1592             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1593             break;
1594         }
1595     if (i == 5) {
1596         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1597         abort();
1598     }
1599     if (curframe->data[0])
1600         ff_thread_release_buffer(avctx, curframe);
1601
1602     curframe->key_frame = s->keyframe;
1603     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1604     curframe->reference = referenced ? 3 : 0;
1605     if ((ret = vp8_alloc_frame(s, curframe))) {
1606         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1607         return ret;
1608     }
1609
1610     // check if golden and altref are swapped
1611     if (s->update_altref != VP56_FRAME_NONE) {
1612         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1613     } else {
1614         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1615     }
1616     if (s->update_golden != VP56_FRAME_NONE) {
1617         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1618     } else {
1619         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1620     }
1621     if (s->update_last) {
1622         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1623     } else {
1624         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1625     }
1626     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1627
1628     ff_thread_finish_setup(avctx);
1629
1630     // Given that arithmetic probabilities are updated every frame, it's quite likely
1631     // that the values we have on a random interframe are complete junk if we didn't
1632     // start decode on a keyframe. So just don't display anything rather than junk.
1633     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1634                          !s->framep[VP56_FRAME_GOLDEN] ||
1635                          !s->framep[VP56_FRAME_GOLDEN2])) {
1636         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1637         return AVERROR_INVALIDDATA;
1638     }
1639
1640     s->linesize   = curframe->linesize[0];
1641     s->uvlinesize = curframe->linesize[1];
1642
1643     if (!s->edge_emu_buffer)
1644         s->edge_emu_buffer = av_malloc(21*s->linesize);
1645
1646     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1647
1648     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1649     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1650
1651     // top edge of 127 for intra prediction
1652     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1653         s->top_border[0][15] = s->top_border[0][23] = 127;
1654         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1655     }
1656     memset(s->ref_count, 0, sizeof(s->ref_count));
1657     if (s->keyframe)
1658         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1659
1660 #define MARGIN (16 << 2)
1661     s->mv_min.y = -MARGIN;
1662     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1663
1664     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1665         VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1666         VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1667         int mb_xy = mb_y*s->mb_width;
1668         uint8_t *dst[3] = {
1669             curframe->data[0] + 16*mb_y*s->linesize,
1670             curframe->data[1] +  8*mb_y*s->uvlinesize,
1671             curframe->data[2] +  8*mb_y*s->uvlinesize
1672         };
1673
1674         memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1675         memset(s->left_nnz, 0, sizeof(s->left_nnz));
1676         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1677
1678         // left edge of 129 for intra prediction
1679         if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1680             for (i = 0; i < 3; i++)
1681                 for (y = 0; y < 16>>!!i; y++)
1682                     dst[i][y*curframe->linesize[i]-1] = 129;
1683             if (mb_y == 1) // top left edge is also 129
1684                 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1685         }
1686
1687         s->mv_min.x = -MARGIN;
1688         s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1689         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1690             ff_thread_await_progress(prev_frame, mb_y, 0);
1691
1692         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1693             /* Prefetch the current frame, 4 MBs ahead */
1694             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1695             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1696
1697             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1698                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
1699
1700             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1701
1702             if (!mb->skip)
1703                 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1704
1705             if (mb->mode <= MODE_I4x4)
1706                 intra_predict(s, dst, mb, mb_x, mb_y);
1707             else
1708                 inter_predict(s, dst, mb, mb_x, mb_y);
1709
1710             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1711
1712             if (!mb->skip) {
1713                 idct_mb(s, dst, mb);
1714             } else {
1715                 AV_ZERO64(s->left_nnz);
1716                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1717
1718                 // Reset DC block predictors if they would exist if the mb had coefficients
1719                 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1720                     s->left_nnz[8]      = 0;
1721                     s->top_nnz[mb_x][8] = 0;
1722                 }
1723             }
1724
1725             if (s->deblock_filter)
1726                 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1727
1728             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1729
1730             dst[0] += 16;
1731             dst[1] += 8;
1732             dst[2] += 8;
1733             s->mv_min.x -= 64;
1734             s->mv_max.x -= 64;
1735         }
1736         if (s->deblock_filter) {
1737             if (s->filter.simple)
1738                 filter_mb_row_simple(s, curframe, mb_y);
1739             else
1740                 filter_mb_row(s, curframe, mb_y);
1741         }
1742         s->mv_min.y -= 64;
1743         s->mv_max.y -= 64;
1744
1745         ff_thread_report_progress(curframe, mb_y, 0);
1746     }
1747
1748     ff_thread_report_progress(curframe, INT_MAX, 0);
1749 skip_decode:
1750     // if future frames don't use the updated probabilities,
1751     // reset them to the values we saved
1752     if (!s->update_probabilities)
1753         s->prob[0] = s->prob[1];
1754
1755     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1756
1757     if (!s->invisible) {
1758         *(AVFrame*)data = *curframe;
1759         *data_size = sizeof(AVFrame);
1760     }
1761
1762     return avpkt->size;
1763 }
1764
1765 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1766 {
1767     VP8Context *s = avctx->priv_data;
1768
1769     s->avctx = avctx;
1770     avctx->pix_fmt = PIX_FMT_YUV420P;
1771
1772     dsputil_init(&s->dsp, avctx);
1773     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
1774     ff_vp8dsp_init(&s->vp8dsp);
1775
1776     return 0;
1777 }
1778
1779 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1780 {
1781     vp8_decode_flush_impl(avctx, 0, 1);
1782     release_queued_segmaps(avctx->priv_data, 1);
1783     return 0;
1784 }
1785
1786 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1787 {
1788     VP8Context *s = avctx->priv_data;
1789
1790     s->avctx = avctx;
1791
1792     return 0;
1793 }
1794
/**
 * Translate a frame pointer that points into s_src->frames[] to the same
 * slot of s->frames[]; a NULL pointer stays NULL.
 *
 * Relies on variables named s (destination context) and s_src (source
 * context) being in scope where the macro is expanded. The argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define REBASE(pic) \
    ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
1797
1798 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1799 {
1800     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1801
1802     if (s->macroblocks_base &&
1803         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
1804         free_buffers(s);
1805     }
1806
1807     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1808     s->segmentation = s_src->segmentation;
1809     s->lf_delta = s_src->lf_delta;
1810     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1811
1812     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1813     s->framep[0] = REBASE(s_src->next_framep[0]);
1814     s->framep[1] = REBASE(s_src->next_framep[1]);
1815     s->framep[2] = REBASE(s_src->next_framep[2]);
1816     s->framep[3] = REBASE(s_src->next_framep[3]);
1817
1818     return 0;
1819 }
1820
1821 AVCodec ff_vp8_decoder = {
1822     .name           = "vp8",
1823     .type           = AVMEDIA_TYPE_VIDEO,
1824     .id             = CODEC_ID_VP8,
1825     .priv_data_size = sizeof(VP8Context),
1826     .init           = vp8_decode_init,
1827     .close          = vp8_decode_free,
1828     .decode         = vp8_decode_frame,
1829     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1830     .flush = vp8_decode_flush,
1831     .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1832     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1833     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1834 };