git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /**
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "internal.h"
  28 #include "vp8.h"
  29 #include "vp8data.h"
  30 #include "rectangle.h"
  31 #include "thread.h"
  32
  33 #if ARCH_ARM
  34 #   include "arm/vp8.h"
  35 #endif
  36
  37 static void free_buffers(VP8Context *s)
  38 {
  39     av_freep(&s->macroblocks_base);
  40     av_freep(&s->filter_strength);
  41     av_freep(&s->intra4x4_pred_mode_top);
  42     av_freep(&s->top_nnz);
  43     av_freep(&s->edge_emu_buffer);
  44     av_freep(&s->top_border);
  45
  46     s->macroblocks = NULL;
  47 }
  48
  49 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  50 {
  51     int ret;
  52     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  53         return ret;
  54     if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
  55         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  56     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  57         ff_thread_release_buffer(s->avctx, f);
  58         return AVERROR(ENOMEM);
  59     }
  60     return 0;
  61 }
  62
  63 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  64 {
  65     if (f->ref_index[0]) {
  66         if (prefer_delayed_free) {
  67             /* Upon a size change, we want to free the maps but other threads may still
  68              * be using them, so queue them. Upon a seek, all threads are inactive so
  69              * we want to cache one to prevent re-allocation in the next decoding
  70              * iteration, but the rest we can free directly. */
  71             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  72             if (s->num_maps_to_be_freed < max_queued_maps) {
  73                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  74             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  75                 av_free(f->ref_index[0]);
  76             } /* else: MEMLEAK (should never happen, but better that than crash) */
  77             f->ref_index[0] = NULL;
  78         } else /* vp8_decode_free() */ {
  79             av_free(f->ref_index[0]);
  80         }
  81     }
  82     ff_thread_release_buffer(s->avctx, f);
  83 }
  84
  85 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  86                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  87 {
  88     VP8Context *s = avctx->priv_data;
  89     int i;
  90
  91     if (!avctx->internal->is_copy) {
  92         for (i = 0; i < 5; i++)
  93             if (s->frames[i].data[0])
  94                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
  95     }
  96     memset(s->framep, 0, sizeof(s->framep));
  97
  98     if (free_mem) {
  99         free_buffers(s);
 100         s->maps_are_invalid = 1;
 101     }
 102 }
 103
 104 static void vp8_decode_flush(AVCodecContext *avctx)
 105 {
 106     vp8_decode_flush_impl(avctx, 1, 1, 0);
 107 }
 108
 109 static int update_dimensions(VP8Context *s, int width, int height)
 110 {
 111     if (width  != s->avctx->width ||
 112         height != s->avctx->height) {
 113         if (av_image_check_size(width, height, 0, s->avctx))
 114             return AVERROR_INVALIDDATA;
 115
 116         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 117
 118         avcodec_set_dimensions(s->avctx, width, height);
 119     }
 120
 121     s->mb_width  = (s->avctx->coded_width +15) / 16;
 122     s->mb_height = (s->avctx->coded_height+15) / 16;
 123
 124     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 125     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
 126     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
 127     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 128     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 129
 130     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
 131         !s->top_nnz || !s->top_border)
 132         return AVERROR(ENOMEM);
 133
 134     s->macroblocks        = s->macroblocks_base + 1;
 135
 136     return 0;
 137 }
 138
 139 static void parse_segment_info(VP8Context *s)
 140 {
 141     VP56RangeCoder *c = &s->c;
 142     int i;
 143
 144     s->segmentation.update_map = vp8_rac_get(c);
 145
 146     if (vp8_rac_get(c)) { // update segment feature data
 147         s->segmentation.absolute_vals = vp8_rac_get(c);
 148
 149         for (i = 0; i < 4; i++)
 150             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 151
 152         for (i = 0; i < 4; i++)
 153             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 154     }
 155     if (s->segmentation.update_map)
 156         for (i = 0; i < 3; i++)
 157             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 158 }
 159
 160 static void update_lf_deltas(VP8Context *s)
 161 {
 162     VP56RangeCoder *c = &s->c;
 163     int i;
 164
 165     for (i = 0; i < 4; i++)
 166         s->lf_delta.ref[i]  = vp8_rac_get_sint(c, 6);
 167
 168     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
 169         s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
 170 }
 171
 172 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 173 {
 174     const uint8_t *sizes = buf;
 175     int i;
 176
 177     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 178
 179     buf      += 3*(s->num_coeff_partitions-1);
 180     buf_size -= 3*(s->num_coeff_partitions-1);
 181     if (buf_size < 0)
 182         return -1;
 183
 184     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 185         int size = AV_RL24(sizes + 3*i);
 186         if (buf_size - size < 0)
 187             return -1;
 188
 189         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 190         buf      += size;
 191         buf_size -= size;
 192     }
 193     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 194
 195     return 0;
 196 }
 197
 198 static void get_quants(VP8Context *s)
 199 {
 200     VP56RangeCoder *c = &s->c;
 201     int i, base_qi;
 202
 203     int yac_qi     = vp8_rac_get_uint(c, 7);
 204     int ydc_delta  = vp8_rac_get_sint(c, 4);
 205     int y2dc_delta = vp8_rac_get_sint(c, 4);
 206     int y2ac_delta = vp8_rac_get_sint(c, 4);
 207     int uvdc_delta = vp8_rac_get_sint(c, 4);
 208     int uvac_delta = vp8_rac_get_sint(c, 4);
 209
 210     for (i = 0; i < 4; i++) {
 211         if (s->segmentation.enabled) {
 212             base_qi = s->segmentation.base_quant[i];
 213             if (!s->segmentation.absolute_vals)
 214                 base_qi += yac_qi;
 215         } else
 216             base_qi = yac_qi;
 217
 218         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 219         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 220         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 221         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 222         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 223         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 224
 225         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 226         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 227     }
 228 }
 229
 230 /**
 231  * Determine which buffers golden and altref should be updated with after this frame.
 232  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 233  *
 234  * Intra frames update all 3 references
 235  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 236  * If the update (golden|altref) flag is set, it's updated with the current frame
 237  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 238  * If the flag is not set, the number read means:
 239  *      0: no update
 240  *      1: VP56_FRAME_PREVIOUS
 241  *      2: update golden with altref, or update altref with golden
 242  */
 243 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 244 {
 245     VP56RangeCoder *c = &s->c;
 246
 247     if (update)
 248         return VP56_FRAME_CURRENT;
 249
 250     switch (vp8_rac_get_uint(c, 2)) {
 251     case 1:
 252         return VP56_FRAME_PREVIOUS;
 253     case 2:
 254         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 255     }
 256     return VP56_FRAME_NONE;
 257 }
 258
 259 static void update_refs(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262
 263     int update_golden = vp8_rac_get(c);
 264     int update_altref = vp8_rac_get(c);
 265
 266     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 267     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 268 }
 269
 270 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 271 {
 272     VP56RangeCoder *c = &s->c;
 273     int header_size, hscale, vscale, i, j, k, l, m, ret;
 274     int width  = s->avctx->width;
 275     int height = s->avctx->height;
 276
 277     s->keyframe  = !(buf[0] & 1);
 278     s->profile   =  (buf[0]>>1) & 7;
 279     s->invisible = !(buf[0] & 0x10);
 280     header_size  = AV_RL24(buf) >> 5;
 281     buf      += 3;
 282     buf_size -= 3;
 283
 284     if (s->profile > 3)
 285         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 286
 287     if (!s->profile)
 288         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 289     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 290         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 291
 292     if (header_size > buf_size - 7*s->keyframe) {
 293         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 294         return AVERROR_INVALIDDATA;
 295     }
 296
 297     if (s->keyframe) {
 298         if (AV_RL24(buf) != 0x2a019d) {
 299             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 300             return AVERROR_INVALIDDATA;
 301         }
 302         width  = AV_RL16(buf+3) & 0x3fff;
 303         height = AV_RL16(buf+5) & 0x3fff;
 304         hscale = buf[4] >> 6;
 305         vscale = buf[6] >> 6;
 306         buf      += 7;
 307         buf_size -= 7;
 308
 309         if (hscale || vscale)
 310             av_log_missing_feature(s->avctx, "Upscaling", 1);
 311
 312         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 313         for (i = 0; i < 4; i++)
 314             for (j = 0; j < 16; j++)
 315                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 316                        sizeof(s->prob->token[i][j]));
 317         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 318         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 319         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 320         memset(&s->segmentation, 0, sizeof(s->segmentation));
 321     }
 322
 323     if (!s->macroblocks_base || /* first frame */
 324         width != s->avctx->width || height != s->avctx->height) {
 325         if ((ret = update_dimensions(s, width, height)) < 0)
 326             return ret;
 327     }
 328
 329     ff_vp56_init_range_decoder(c, buf, header_size);
 330     buf      += header_size;
 331     buf_size -= header_size;
 332
 333     if (s->keyframe) {
 334         if (vp8_rac_get(c))
 335             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 336         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 337     }
 338
 339     if ((s->segmentation.enabled = vp8_rac_get(c)))
 340         parse_segment_info(s);
 341     else
 342         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 343
 344     s->filter.simple    = vp8_rac_get(c);
 345     s->filter.level     = vp8_rac_get_uint(c, 6);
 346     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 347
 348     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 349         if (vp8_rac_get(c))
 350             update_lf_deltas(s);
 351
 352     if (setup_partitions(s, buf, buf_size)) {
 353         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 354         return AVERROR_INVALIDDATA;
 355     }
 356
 357     get_quants(s);
 358
 359     if (!s->keyframe) {
 360         update_refs(s);
 361         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 362         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 363     }
 364
 365     // if we aren't saving this frame's probabilities for future frames,
 366     // make a copy of the current probabilities
 367     if (!(s->update_probabilities = vp8_rac_get(c)))
 368         s->prob[1] = s->prob[0];
 369
 370     s->update_last = s->keyframe || vp8_rac_get(c);
 371
 372     for (i = 0; i < 4; i++)
 373         for (j = 0; j < 8; j++)
 374             for (k = 0; k < 3; k++)
 375                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 376                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 377                         int prob = vp8_rac_get_uint(c, 8);
 378                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 379                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 380                     }
 381
 382     if ((s->mbskip_enabled = vp8_rac_get(c)))
 383         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 384
 385     if (!s->keyframe) {
 386         s->prob->intra  = vp8_rac_get_uint(c, 8);
 387         s->prob->last   = vp8_rac_get_uint(c, 8);
 388         s->prob->golden = vp8_rac_get_uint(c, 8);
 389
 390         if (vp8_rac_get(c))
 391             for (i = 0; i < 4; i++)
 392                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 393         if (vp8_rac_get(c))
 394             for (i = 0; i < 3; i++)
 395                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 396
 397         // 17.2 MV probability update
 398         for (i = 0; i < 2; i++)
 399             for (j = 0; j < 19; j++)
 400                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 401                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 402     }
 403
 404     return 0;
 405 }
 406
 407 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 408 {
 409     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 410     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 411 }
 412
 413 /**
 414  * Motion vector coding, 17.1.
 415  */
 416 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 417 {
 418     int bit, x = 0;
 419
 420     if (vp56_rac_get_prob_branchy(c, p[0])) {
 421         int i;
 422
 423         for (i = 0; i < 3; i++)
 424             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 425         for (i = 9; i > 3; i--)
 426             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 427         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 428             x += 8;
 429     } else {
 430         // small_mvtree
 431         const uint8_t *ps = p+2;
 432         bit = vp56_rac_get_prob(c, *ps);
 433         ps += 1 + 3*bit;
 434         x  += 4*bit;
 435         bit = vp56_rac_get_prob(c, *ps);
 436         ps += 1 + bit;
 437         x  += 2*bit;
 438         x  += vp56_rac_get_prob(c, *ps);
 439     }
 440
 441     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 442 }
 443
 444 static av_always_inline
 445 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 446 {
 447     if (left == top)
 448         return vp8_submv_prob[4-!!left];
 449     if (!top)
 450         return vp8_submv_prob[2];
 451     return vp8_submv_prob[1-!!left];
 452 }
 453
 454 /**
 455  * Split motion vector prediction, 16.4.
 456  * @returns the number of motion vectors parsed (2, 4 or 16)
 457  */
 458 static av_always_inline
 459 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
 460 {
 461     int part_idx;
 462     int n, num;
 463     VP8Macroblock *top_mb  = &mb[2];
 464     VP8Macroblock *left_mb = &mb[-1];
 465     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 466                   *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
 467                   *mbsplits_cur, *firstidx;
 468     VP56mv *top_mv  = top_mb->bmv;
 469     VP56mv *left_mv = left_mb->bmv;
 470     VP56mv *cur_mv  = mb->bmv;
 471
 472     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 473         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 474             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 475         } else {
 476             part_idx = VP8_SPLITMVMODE_8x8;
 477         }
 478     } else {
 479         part_idx = VP8_SPLITMVMODE_4x4;
 480     }
 481
 482     num = vp8_mbsplit_count[part_idx];
 483     mbsplits_cur = vp8_mbsplits[part_idx],
 484     firstidx = vp8_mbfirstidx[part_idx];
 485     mb->partitioning = part_idx;
 486
 487     for (n = 0; n < num; n++) {
 488         int k = firstidx[n];
 489         uint32_t left, above;
 490         const uint8_t *submv_prob;
 491
 492         if (!(k & 3))
 493             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 494         else
 495             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 496         if (k <= 3)
 497             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 498         else
 499             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 500
 501         submv_prob = get_submv_prob(left, above);
 502
 503         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 504             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 505                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 506                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 507                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 508                 } else {
 509                     AV_ZERO32(&mb->bmv[n]);
 510                 }
 511             } else {
 512                 AV_WN32A(&mb->bmv[n], above);
 513             }
 514         } else {
 515             AV_WN32A(&mb->bmv[n], left);
 516         }
 517     }
 518
 519     return num;
 520 }
 521
 522 static av_always_inline
 523 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 524 {
 525     VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
 526                                   mb - 1 /* left */,
 527                                   mb + 1 /* top-left */ };
 528     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 529     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 530     int idx = CNT_ZERO;
 531     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 532     int8_t *sign_bias = s->sign_bias;
 533     VP56mv near_mv[4];
 534     uint8_t cnt[4] = { 0 };
 535     VP56RangeCoder *c = &s->c;
 536
 537     AV_ZERO32(&near_mv[0]);
 538     AV_ZERO32(&near_mv[1]);
 539     AV_ZERO32(&near_mv[2]);
 540
 541     /* Process MB on top, left and top-left */
 542     #define MV_EDGE_CHECK(n)\
 543     {\
 544         VP8Macroblock *edge = mb_edge[n];\
 545         int edge_ref = edge->ref_frame;\
 546         if (edge_ref != VP56_FRAME_CURRENT) {\
 547             uint32_t mv = AV_RN32A(&edge->mv);\
 548             if (mv) {\
 549                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 550                     /* SWAR negate of the values in mv. */\
 551                     mv = ~mv;\
 552                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 553                 }\
 554                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 555                     AV_WN32A(&near_mv[++idx], mv);\
 556                 cnt[idx]      += 1 + (n != 2);\
 557             } else\
 558                 cnt[CNT_ZERO] += 1 + (n != 2);\
 559         }\
 560     }
 561
 562     MV_EDGE_CHECK(0)
 563     MV_EDGE_CHECK(1)
 564     MV_EDGE_CHECK(2)
 565
 566     mb->partitioning = VP8_SPLITMVMODE_NONE;
 567     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 568         mb->mode = VP8_MVMODE_MV;
 569
 570         /* If we have three distinct MVs, merge first and last if they're the same */
 571         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 572             cnt[CNT_NEAREST] += 1;
 573
 574         /* Swap near and nearest if necessary */
 575         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 576             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 577             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 578         }
 579
 580         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 581             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 582
 583                 /* Choose the best mv out of 0,0 and the nearest mv */
 584                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 585                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 586                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 587                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 588
 589                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 590                     mb->mode = VP8_MVMODE_SPLIT;
 591                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
 592                 } else {
 593                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 594                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 595                     mb->bmv[0] = mb->mv;
 596                 }
 597             } else {
 598                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 599                 mb->bmv[0] = mb->mv;
 600             }
 601         } else {
 602             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 603             mb->bmv[0] = mb->mv;
 604         }
 605     } else {
 606         mb->mode = VP8_MVMODE_ZERO;
 607         AV_ZERO32(&mb->mv);
 608         mb->bmv[0] = mb->mv;
 609     }
 610 }
 611
 612 static av_always_inline
 613 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 614                            int mb_x, int keyframe)
 615 {
 616     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 617     if (keyframe) {
 618         int x, y;
 619         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 620         uint8_t* const left = s->intra4x4_pred_mode_left;
 621         for (y = 0; y < 4; y++) {
 622             for (x = 0; x < 4; x++) {
 623                 const uint8_t *ctx;
 624                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 625                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 626                 left[y] = top[x] = *intra4x4;
 627                 intra4x4++;
 628             }
 629         }
 630     } else {
 631         int i;
 632         for (i = 0; i < 16; i++)
 633             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 634     }
 635 }
 636
 637 static av_always_inline
 638 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
 639 {
 640     VP56RangeCoder *c = &s->c;
 641
 642     if (s->segmentation.update_map) {
 643         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
 644         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
 645     } else
 646         *segment = ref ? *ref : *segment;
 647     s->segment = *segment;
 648
 649     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 650
 651     if (s->keyframe) {
 652         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 653
 654         if (mb->mode == MODE_I4x4) {
 655             decode_intra4x4_modes(s, c, mb_x, 1);
 656         } else {
 657             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 658             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 659             AV_WN32A(s->intra4x4_pred_mode_left, modes);
 660         }
 661
 662         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 663         mb->ref_frame = VP56_FRAME_CURRENT;
 664     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 665         // inter MB, 16.2
 666         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 667             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 668                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 669         else
 670             mb->ref_frame = VP56_FRAME_PREVIOUS;
 671         s->ref_count[mb->ref_frame-1]++;
 672
 673         // motion vectors, 16.3
 674         decode_mvs(s, mb, mb_x, mb_y);
 675     } else {
 676         // intra MB, 16.1
 677         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 678
 679         if (mb->mode == MODE_I4x4)
 680             decode_intra4x4_modes(s, c, mb_x, 0);
 681
 682         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 683         mb->ref_frame = VP56_FRAME_CURRENT;
 684         mb->partitioning = VP8_SPLITMVMODE_NONE;
 685         AV_ZERO32(&mb->bmv[0]);
 686     }
 687 }
 688
 689 #ifndef decode_block_coeffs_internal
 690 /**
 691  * @param c arithmetic bitstream reader context
 692  * @param block destination for block coefficients
 693  * @param probs probabilities to use when reading trees from the bitstream
 694  * @param i initial coeff index, 0 unless a separate DC block is coded
 695  * @param qmul array holding the dc/ac dequant factor at position 0/1
 696  * @return 0 if no coeffs were decoded
 697  *         otherwise, the index of the last coeff decoded plus one
 698  */
 699 static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
 700                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 701                                         int i, uint8_t *token_prob, int16_t qmul[2])
 702 {
 703     goto skip_eob;
 704     do {
 705         int coeff;
 706         if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 707             return i;
 708
 709 skip_eob:
 710         if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
 711             if (++i == 16)
 712                 return i; // invalid input; blocks should end with EOB
 713             token_prob = probs[i][0];
 714             goto skip_eob;
 715         }
 716
 717         if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
 718             coeff = 1;
 719             token_prob = probs[i+1][1];
 720         } else {
 721             if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
 722                 coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
 723                 if (coeff)
 724                     coeff += vp56_rac_get_prob(c, token_prob[5]);
 725                 coeff += 2;
 726             } else {
 727                 // DCT_CAT*
 728                 if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
 729                     if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
 730                         coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
 731                     } else {                                    // DCT_CAT2
 732                         coeff  = 7;
 733                         coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
 734                         coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
 735                     }
 736                 } else {    // DCT_CAT3 and up
 737                     int a = vp56_rac_get_prob(c, token_prob[8]);
 738                     int b = vp56_rac_get_prob(c, token_prob[9+a]);
 739                     int cat = (a<<1) + b;
 740                     coeff  = 3 + (8<<cat);
 741                     coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
 742                 }
 743             }
 744             token_prob = probs[i+1][2];
 745         }
 746         block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
 747     } while (++i < 16);
 748
 749     return i;
 750 }
 751 #endif
 752
 753 /**
 754  * @param c arithmetic bitstream reader context
 755  * @param block destination for block coefficients
 756  * @param probs probabilities to use when reading trees from the bitstream
 757  * @param i initial coeff index, 0 unless a separate DC block is coded
 758  * @param zero_nhood the initial prediction context for number of surrounding
 759  *                   all-zero blocks (only left/top, so 0-2)
 760  * @param qmul array holding the dc/ac dequant factor at position 0/1
 761  * @return 0 if no coeffs were decoded
 762  *         otherwise, the index of the last coeff decoded plus one
 763  */
 764 static av_always_inline
 765 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 766                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 767                         int i, int zero_nhood, int16_t qmul[2])
 768 {
 769     uint8_t *token_prob = probs[i][zero_nhood];
 770     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 771         return 0;
 772     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 773 }
 774
 775 static av_always_inline
 776 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 777                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 778 {
 779     int i, x, y, luma_start = 0, luma_ctx = 3;
 780     int nnz_pred, nnz, nnz_total = 0;
 781     int segment = s->segment;
 782     int block_dc = 0;
 783
 784     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 785         nnz_pred = t_nnz[8] + l_nnz[8];
 786
 787         // decode DC values and do hadamard
 788         nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
 789                                   s->qmat[segment].luma_dc_qmul);
 790         l_nnz[8] = t_nnz[8] = !!nnz;
 791         if (nnz) {
 792             nnz_total += nnz;
 793             block_dc = 1;
 794             if (nnz == 1)
 795                 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
 796             else
 797                 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
 798         }
 799         luma_start = 1;
 800         luma_ctx = 0;
 801     }
 802
 803     // luma blocks
 804     for (y = 0; y < 4; y++)
 805         for (x = 0; x < 4; x++) {
 806             nnz_pred = l_nnz[y] + t_nnz[x];
 807             nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
 808                                       nnz_pred, s->qmat[segment].luma_qmul);
 809             // nnz+block_dc may be one more than the actual last index, but we don't care
 810             s->non_zero_count_cache[y][x] = nnz + block_dc;
 811             t_nnz[x] = l_nnz[y] = !!nnz;
 812             nnz_total += nnz;
 813         }
 814
 815     // chroma blocks
 816     // TODO: what to do about dimensions? 2nd dim for luma is x,
 817     // but for chroma it's (y<<1)|x
 818     for (i = 4; i < 6; i++)
 819         for (y = 0; y < 2; y++)
 820             for (x = 0; x < 2; x++) {
 821                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 822                 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
 823                                           nnz_pred, s->qmat[segment].chroma_qmul);
 824                 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
 825                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 826                 nnz_total += nnz;
 827             }
 828
 829     // if there were no coded coeffs despite the macroblock not being marked skip,
 830     // we MUST not do the inner loop filter and should not do IDCT
 831     // Since skip isn't used for bitstream prediction, just manually set it.
 832     if (!nnz_total)
 833         mb->skip = 1;
 834 }
 835
 836 static av_always_inline
 837 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 838                       int linesize, int uvlinesize, int simple)
 839 {
 840     AV_COPY128(top_border, src_y + 15*linesize);
 841     if (!simple) {
 842         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 843         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 844     }
 845 }
 846
 847 static av_always_inline
 848 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 849                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 850                     int simple, int xchg)
 851 {
 852     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 853     src_y  -=   linesize;
 854     src_cb -= uvlinesize;
 855     src_cr -= uvlinesize;
 856
 857 #define XCHG(a,b,xchg) do {                     \
 858         if (xchg) AV_SWAP64(b,a);               \
 859         else      AV_COPY64(b,a);               \
 860     } while (0)
 861
 862     XCHG(top_border_m1+8, src_y-8, xchg);
 863     XCHG(top_border,      src_y,   xchg);
 864     XCHG(top_border+8,    src_y+8, 1);
 865     if (mb_x < mb_width-1)
 866         XCHG(top_border+32, src_y+16, 1);
 867
 868     // only copy chroma for normal loop filter
 869     // or to initialize the top row to 127
 870     if (!simple || !mb_y) {
 871         XCHG(top_border_m1+16, src_cb-8, xchg);
 872         XCHG(top_border_m1+24, src_cr-8, xchg);
 873         XCHG(top_border+16,    src_cb, 1);
 874         XCHG(top_border+24,    src_cr, 1);
 875     }
 876 }
 877
 878 static av_always_inline
 879 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 880 {
 881     if (!mb_x) {
 882         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 883     } else {
 884         return mb_y ? mode : LEFT_DC_PRED8x8;
 885     }
 886 }
 887
 888 static av_always_inline
 889 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 890 {
 891     if (!mb_x) {
 892         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 893     } else {
 894         return mb_y ? mode : HOR_PRED8x8;
 895     }
 896 }
 897
 898 static av_always_inline
 899 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 900 {
 901     if (mode == DC_PRED8x8) {
 902         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 903     } else {
 904         return mode;
 905     }
 906 }
 907
 908 static av_always_inline
 909 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 910 {
 911     switch (mode) {
 912     case DC_PRED8x8:
 913         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 914     case VERT_PRED8x8:
 915         return !mb_y ? DC_127_PRED8x8 : mode;
 916     case HOR_PRED8x8:
 917         return !mb_x ? DC_129_PRED8x8 : mode;
 918     case PLANE_PRED8x8 /*TM*/:
 919         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 920     }
 921     return mode;
 922 }
 923
 924 static av_always_inline
 925 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 926 {
 927     if (!mb_x) {
 928         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 929     } else {
 930         return mb_y ? mode : HOR_VP8_PRED;
 931     }
 932 }
 933
 934 static av_always_inline
 935 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 936 {
 937     switch (mode) {
 938     case VERT_PRED:
 939         if (!mb_x && mb_y) {
 940             *copy_buf = 1;
 941             return mode;
 942         }
 943         /* fall-through */
 944     case DIAG_DOWN_LEFT_PRED:
 945     case VERT_LEFT_PRED:
 946         return !mb_y ? DC_127_PRED : mode;
 947     case HOR_PRED:
 948         if (!mb_y) {
 949             *copy_buf = 1;
 950             return mode;
 951         }
 952         /* fall-through */
 953     case HOR_UP_PRED:
 954         return !mb_x ? DC_129_PRED : mode;
 955     case TM_VP8_PRED:
 956         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 957     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 958     case DIAG_DOWN_RIGHT_PRED:
 959     case VERT_RIGHT_PRED:
 960     case HOR_DOWN_PRED:
 961         if (!mb_y || !mb_x)
 962             *copy_buf = 1;
 963         return mode;
 964     }
 965     return mode;
 966 }
 967
 968 static av_always_inline
 969 void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
 970                    int mb_x, int mb_y)
 971 {
 972     AVCodecContext *avctx = s->avctx;
 973     int x, y, mode, nnz;
 974     uint32_t tr;
 975
 976     // for the first row, we need to run xchg_mb_border to init the top edge to 127
 977     // otherwise, skip it if we aren't going to deblock
 978     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
 979         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
 980                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
 981                        s->filter.simple, 1);
 982
 983     if (mb->mode < MODE_I4x4) {
 984         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
 985             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
 986         } else {
 987             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
 988         }
 989         s->hpc.pred16x16[mode](dst[0], s->linesize);
 990     } else {
 991         uint8_t *ptr = dst[0];
 992         uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 993         uint8_t tr_top[4] = { 127, 127, 127, 127 };
 994
 995         // all blocks on the right edge of the macroblock use bottom edge
 996         // the top macroblock for their topright edge
 997         uint8_t *tr_right = ptr - s->linesize + 16;
 998
 999         // if we're on the right edge of the frame, said edge is extended
1000         // from the top macroblock
1001         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1002             mb_x == s->mb_width-1) {
1003             tr = tr_right[-1]*0x01010101u;
1004             tr_right = (uint8_t *)&tr;
1005         }
1006
1007         if (mb->skip)
1008             AV_ZERO128(s->non_zero_count_cache);
1009
1010         for (y = 0; y < 4; y++) {
1011             uint8_t *topright = ptr + 4 - s->linesize;
1012             for (x = 0; x < 4; x++) {
1013                 int copy = 0, linesize = s->linesize;
1014                 uint8_t *dst = ptr+4*x;
1015                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1016
1017                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1018                     topright = tr_top;
1019                 } else if (x == 3)
1020                     topright = tr_right;
1021
1022                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1023                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1024                     if (copy) {
1025                         dst = copy_dst + 12;
1026                         linesize = 8;
1027                         if (!(mb_y + y)) {
1028                             copy_dst[3] = 127U;
1029                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1030                         } else {
1031                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1032                             if (!(mb_x + x)) {
1033                                 copy_dst[3] = 129U;
1034                             } else {
1035                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1036                             }
1037                         }
1038                         if (!(mb_x + x)) {
1039                             copy_dst[11] =
1040                             copy_dst[19] =
1041                             copy_dst[27] =
1042                             copy_dst[35] = 129U;
1043                         } else {
1044                             copy_dst[11] = ptr[4*x              -1];
1045                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1046                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1047                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1048                         }
1049                     }
1050                 } else {
1051                     mode = intra4x4[x];
1052                 }
1053                 s->hpc.pred4x4[mode](dst, topright, linesize);
1054                 if (copy) {
1055                     AV_COPY32(ptr+4*x              , copy_dst+12);
1056                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1057                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1058                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1059                 }
1060
1061                 nnz = s->non_zero_count_cache[y][x];
1062                 if (nnz) {
1063                     if (nnz == 1)
1064                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
1065                     else
1066                         s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1067                 }
1068                 topright += 4;
1069             }
1070
1071             ptr   += 4*s->linesize;
1072             intra4x4 += 4;
1073         }
1074     }
1075
1076     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1077         mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1078     } else {
1079         mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1080     }
1081     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1082     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1083
1084     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1085         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1086                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1087                        s->filter.simple, 0);
1088 }
1089
/* Per-fraction lookup used by vp8_mc_luma() and vp8_mc_chroma(); the second
 * index is the mx/my value those functions compute (0..7). */
static const uint8_t subpel_idx[3][8] = {
    /* [0] nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1] nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2] nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1096
1097 /**
1098  * luma MC function
1099  *
1100  * @param s VP8 decoding context
1101  * @param dst target buffer for block data at block position
1102  * @param ref reference picture buffer at origin (0, 0)
1103  * @param mv motion vector (relative to block position) to get pixel data from
1104  * @param x_off horizontal position of block from origin (0, 0)
1105  * @param y_off vertical position of block from origin (0, 0)
1106  * @param block_w width of block (16, 8 or 4)
1107  * @param block_h height of block (always same as block_w)
1108  * @param width width of src/dst plane data
1109  * @param height height of src/dst plane data
1110  * @param linesize size of a single line of plane data, including padding
1111  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1112  */
1113 static av_always_inline
1114 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1115                  int x_off, int y_off, int block_w, int block_h,
1116                  int width, int height, int linesize,
1117                  vp8_mc_func mc_func[3][3])
1118 {
1119     uint8_t *src = ref->data[0];
1120
1121     if (AV_RN32A(mv)) {
1122
1123         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1124         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1125
1126         x_off += mv->x >> 2;
1127         y_off += mv->y >> 2;
1128
1129         // edge emulation
1130         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1131         src += y_off * linesize + x_off;
1132         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1133             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1134             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1135                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1136                                     x_off - mx_idx, y_off - my_idx, width, height);
1137             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1138         }
1139         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1140     } else {
1141         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1142         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1143     }
1144 }
1145
1146 /**
1147  * chroma MC function
1148  *
1149  * @param s VP8 decoding context
1150  * @param dst1 target buffer for block data at block position (U plane)
1151  * @param dst2 target buffer for block data at block position (V plane)
1152  * @param ref reference picture buffer at origin (0, 0)
1153  * @param mv motion vector (relative to block position) to get pixel data from
1154  * @param x_off horizontal position of block from origin (0, 0)
1155  * @param y_off vertical position of block from origin (0, 0)
1156  * @param block_w width of block (16, 8 or 4)
1157  * @param block_h height of block (always same as block_w)
1158  * @param width width of src/dst plane data
1159  * @param height height of src/dst plane data
1160  * @param linesize size of a single line of plane data, including padding
1161  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1162  */
1163 static av_always_inline
1164 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1165                    const VP56mv *mv, int x_off, int y_off,
1166                    int block_w, int block_h, int width, int height, int linesize,
1167                    vp8_mc_func mc_func[3][3])
1168 {
1169     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1170
1171     if (AV_RN32A(mv)) {
1172         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1173         int my = mv->y&7, my_idx = subpel_idx[0][my];
1174
1175         x_off += mv->x >> 3;
1176         y_off += mv->y >> 3;
1177
1178         // edge emulation
1179         src1 += y_off * linesize + x_off;
1180         src2 += y_off * linesize + x_off;
1181         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1182         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1183             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1184             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1185                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1186                                     x_off - mx_idx, y_off - my_idx, width, height);
1187             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1188             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1189
1190             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1191                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1192                                     x_off - mx_idx, y_off - my_idx, width, height);
1193             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1194             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1195         } else {
1196             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1197             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1198         }
1199     } else {
1200         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1201         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1202         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1203     }
1204 }
1205
1206 static av_always_inline
1207 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1208                  AVFrame *ref_frame, int x_off, int y_off,
1209                  int bx_off, int by_off,
1210                  int block_w, int block_h,
1211                  int width, int height, VP56mv *mv)
1212 {
1213     VP56mv uvmv = *mv;
1214
1215     /* Y */
1216     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1217                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1218                 block_w, block_h, width, height, s->linesize,
1219                 s->put_pixels_tab[block_w == 8]);
1220
1221     /* U/V */
1222     if (s->profile == 3) {
1223         uvmv.x &= ~7;
1224         uvmv.y &= ~7;
1225     }
1226     x_off   >>= 1; y_off   >>= 1;
1227     bx_off  >>= 1; by_off  >>= 1;
1228     width   >>= 1; height  >>= 1;
1229     block_w >>= 1; block_h >>= 1;
1230     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1231                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1232                   &uvmv, x_off + bx_off, y_off + by_off,
1233                   block_w, block_h, width, height, s->uvlinesize,
1234                   s->put_pixels_tab[1 + (block_w == 4)]);
1235 }
1236
1237 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1238  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1239 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1240 {
1241     /* Don't prefetch refs that haven't been used very often this frame. */
1242     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1243         int x_off = mb_x << 4, y_off = mb_y << 4;
1244         int mx = (mb->mv.x>>2) + x_off + 8;
1245         int my = (mb->mv.y>>2) + y_off;
1246         uint8_t **src= s->framep[ref]->data;
1247         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1248         /* For threading, a ff_thread_await_progress here might be useful, but
1249          * it actually slows down the decoder. Since a bad prefetch doesn't
1250          * generate bad decoder output, we don't run it here. */
1251         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1252         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1253         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1254     }
1255 }
1256
1257 /**
1258  * Apply motion vectors to prediction buffer, chapter 18.
1259  */
1260 static av_always_inline
1261 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1262                    int mb_x, int mb_y)
1263 {
1264     int x_off = mb_x << 4, y_off = mb_y << 4;
1265     int width = 16*s->mb_width, height = 16*s->mb_height;
1266     AVFrame *ref = s->framep[mb->ref_frame];
1267     VP56mv *bmv = mb->bmv;
1268
1269     switch (mb->partitioning) {
1270     case VP8_SPLITMVMODE_NONE:
1271         vp8_mc_part(s, dst, ref, x_off, y_off,
1272                     0, 0, 16, 16, width, height, &mb->mv);
1273         break;
1274     case VP8_SPLITMVMODE_4x4: {
1275         int x, y;
1276         VP56mv uvmv;
1277
1278         /* Y */
1279         for (y = 0; y < 4; y++) {
1280             for (x = 0; x < 4; x++) {
1281                 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1282                             ref, &bmv[4*y + x],
1283                             4*x + x_off, 4*y + y_off, 4, 4,
1284                             width, height, s->linesize,
1285                             s->put_pixels_tab[2]);
1286             }
1287         }
1288
1289         /* U/V */
1290         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1291         for (y = 0; y < 2; y++) {
1292             for (x = 0; x < 2; x++) {
1293                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1294                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1295                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1296                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1297                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1298                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1299                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1300                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1301                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1302                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1303                 if (s->profile == 3) {
1304                     uvmv.x &= ~7;
1305                     uvmv.y &= ~7;
1306                 }
1307                 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1308                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1309                               4*x + x_off, 4*y + y_off, 4, 4,
1310                               width, height, s->uvlinesize,
1311                               s->put_pixels_tab[2]);
1312             }
1313         }
1314         break;
1315     }
1316     case VP8_SPLITMVMODE_16x8:
1317         vp8_mc_part(s, dst, ref, x_off, y_off,
1318                     0, 0, 16, 8, width, height, &bmv[0]);
1319         vp8_mc_part(s, dst, ref, x_off, y_off,
1320                     0, 8, 16, 8, width, height, &bmv[1]);
1321         break;
1322     case VP8_SPLITMVMODE_8x16:
1323         vp8_mc_part(s, dst, ref, x_off, y_off,
1324                     0, 0, 8, 16, width, height, &bmv[0]);
1325         vp8_mc_part(s, dst, ref, x_off, y_off,
1326                     8, 0, 8, 16, width, height, &bmv[1]);
1327         break;
1328     case VP8_SPLITMVMODE_8x8:
1329         vp8_mc_part(s, dst, ref, x_off, y_off,
1330                     0, 0, 8, 8, width, height, &bmv[0]);
1331         vp8_mc_part(s, dst, ref, x_off, y_off,
1332                     8, 0, 8, 8, width, height, &bmv[1]);
1333         vp8_mc_part(s, dst, ref, x_off, y_off,
1334                     0, 8, 8, 8, width, height, &bmv[2]);
1335         vp8_mc_part(s, dst, ref, x_off, y_off,
1336                     8, 8, 8, 8, width, height, &bmv[3]);
1337         break;
1338     }
1339 }
1340
1341 static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
1342 {
1343     int x, y, ch;
1344
1345     if (mb->mode != MODE_I4x4) {
1346         uint8_t *y_dst = dst[0];
1347         for (y = 0; y < 4; y++) {
1348             uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
1349             if (nnz4) {
1350                 if (nnz4&~0x01010101) {
1351                     for (x = 0; x < 4; x++) {
1352                         if ((uint8_t)nnz4 == 1)
1353                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1354                         else if((uint8_t)nnz4 > 1)
1355                             s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1356                         nnz4 >>= 8;
1357                         if (!nnz4)
1358                             break;
1359                     }
1360                 } else {
1361                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1362                 }
1363             }
1364             y_dst += 4*s->linesize;
1365         }
1366     }
1367
1368     for (ch = 0; ch < 2; ch++) {
1369         uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1370         if (nnz4) {
1371             uint8_t *ch_dst = dst[1+ch];
1372             if (nnz4&~0x01010101) {
1373                 for (y = 0; y < 2; y++) {
1374                     for (x = 0; x < 2; x++) {
1375                         if ((uint8_t)nnz4 == 1)
1376                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1377                         else if((uint8_t)nnz4 > 1)
1378                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1379                         nnz4 >>= 8;
1380                         if (!nnz4)
1381                             goto chroma_idct_end;
1382                     }
1383                     ch_dst += 4*s->uvlinesize;
1384                 }
1385             } else {
1386                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
1387             }
1388         }
1389 chroma_idct_end: ;
1390     }
1391 }
1392
1393 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1394 {
1395     int interior_limit, filter_level;
1396
1397     if (s->segmentation.enabled) {
1398         filter_level = s->segmentation.filter_level[s->segment];
1399         if (!s->segmentation.absolute_vals)
1400             filter_level += s->filter.level;
1401     } else
1402         filter_level = s->filter.level;
1403
1404     if (s->lf_delta.enabled) {
1405         filter_level += s->lf_delta.ref[mb->ref_frame];
1406         filter_level += s->lf_delta.mode[mb->mode];
1407     }
1408
1409     filter_level = av_clip_uintp2(filter_level, 6);
1410
1411     interior_limit = filter_level;
1412     if (s->filter.sharpness) {
1413         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1414         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1415     }
1416     interior_limit = FFMAX(interior_limit, 1);
1417
1418     f->filter_level = filter_level;
1419     f->inner_limit = interior_limit;
1420     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1421 }
1422
1423 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1424 {
1425     int mbedge_lim, bedge_lim, hev_thresh;
1426     int filter_level = f->filter_level;
1427     int inner_limit = f->inner_limit;
1428     int inner_filter = f->inner_filter;
1429     int linesize = s->linesize;
1430     int uvlinesize = s->uvlinesize;
1431     static const uint8_t hev_thresh_lut[2][64] = {
1432         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1433           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1434           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1435           3, 3, 3, 3 },
1436         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1437           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1438           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1439           2, 2, 2, 2 }
1440     };
1441
1442     if (!filter_level)
1443         return;
1444
1445      bedge_lim = 2*filter_level + inner_limit;
1446     mbedge_lim = bedge_lim + 4;
1447
1448     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1449
1450     if (mb_x) {
1451         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1452                                        mbedge_lim, inner_limit, hev_thresh);
1453         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1454                                        mbedge_lim, inner_limit, hev_thresh);
1455     }
1456
1457     if (inner_filter) {
1458         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1459                                              inner_limit, hev_thresh);
1460         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1461                                              inner_limit, hev_thresh);
1462         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1463                                              inner_limit, hev_thresh);
1464         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1465                                              uvlinesize,  bedge_lim,
1466                                              inner_limit, hev_thresh);
1467     }
1468
1469     if (mb_y) {
1470         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1471                                        mbedge_lim, inner_limit, hev_thresh);
1472         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1473                                        mbedge_lim, inner_limit, hev_thresh);
1474     }
1475
1476     if (inner_filter) {
1477         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1478                                              linesize,    bedge_lim,
1479                                              inner_limit, hev_thresh);
1480         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1481                                              linesize,    bedge_lim,
1482                                              inner_limit, hev_thresh);
1483         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1484                                              linesize,    bedge_lim,
1485                                              inner_limit, hev_thresh);
1486         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1487                                              dst[2] + 4 * uvlinesize,
1488                                              uvlinesize,  bedge_lim,
1489                                              inner_limit, hev_thresh);
1490     }
1491 }
1492
1493 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1494 {
1495     int mbedge_lim, bedge_lim;
1496     int filter_level = f->filter_level;
1497     int inner_limit = f->inner_limit;
1498     int inner_filter = f->inner_filter;
1499     int linesize = s->linesize;
1500
1501     if (!filter_level)
1502         return;
1503
1504      bedge_lim = 2*filter_level + inner_limit;
1505     mbedge_lim = bedge_lim + 4;
1506
1507     if (mb_x)
1508         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1509     if (inner_filter) {
1510         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1511         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1512         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1513     }
1514
1515     if (mb_y)
1516         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1517     if (inner_filter) {
1518         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1519         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1520         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1521     }
1522 }
1523
1524 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1525 {
1526     VP8FilterStrength *f = s->filter_strength;
1527     uint8_t *dst[3] = {
1528         curframe->data[0] + 16*mb_y*s->linesize,
1529         curframe->data[1] +  8*mb_y*s->uvlinesize,
1530         curframe->data[2] +  8*mb_y*s->uvlinesize
1531     };
1532     int mb_x;
1533
1534     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1535         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1536         filter_mb(s, dst, f++, mb_x, mb_y);
1537         dst[0] += 16;
1538         dst[1] += 8;
1539         dst[2] += 8;
1540     }
1541 }
1542
1543 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1544 {
1545     VP8FilterStrength *f = s->filter_strength;
1546     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1547     int mb_x;
1548
1549     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1550         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1551         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1552         dst += 16;
1553     }
1554 }
1555
1556 static void release_queued_segmaps(VP8Context *s, int is_close)
1557 {
1558     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1559     while (s->num_maps_to_be_freed > leave_behind)
1560         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1561     s->maps_are_invalid = 0;
1562 }
1563
1564 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1565                             AVPacket *avpkt)
1566 {
1567     VP8Context *s = avctx->priv_data;
1568     int ret, mb_x, mb_y, i, y, referenced;
1569     enum AVDiscard skip_thresh;
1570     AVFrame *av_uninit(curframe), *prev_frame;
1571
1572     release_queued_segmaps(s, 0);
1573
1574     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1575         return ret;
1576
1577     prev_frame = s->framep[VP56_FRAME_CURRENT];
1578
1579     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1580                                 || s->update_altref == VP56_FRAME_CURRENT;
1581
1582     skip_thresh = !referenced ? AVDISCARD_NONREF :
1583                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1584
1585     if (avctx->skip_frame >= skip_thresh) {
1586         s->invisible = 1;
1587         goto skip_decode;
1588     }
1589     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1590
1591     // release no longer referenced frames
1592     for (i = 0; i < 5; i++)
1593         if (s->frames[i].data[0] &&
1594             &s->frames[i] != prev_frame &&
1595             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1596             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1597             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1598             vp8_release_frame(s, &s->frames[i], 1, 0);
1599
1600     // find a free buffer
1601     for (i = 0; i < 5; i++)
1602         if (&s->frames[i] != prev_frame &&
1603             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1604             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1605             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1606             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1607             break;
1608         }
1609     if (i == 5) {
1610         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1611         abort();
1612     }
1613     if (curframe->data[0])
1614         vp8_release_frame(s, curframe, 1, 0);
1615
1616     curframe->key_frame = s->keyframe;
1617     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1618     curframe->reference = referenced ? 3 : 0;
1619     if ((ret = vp8_alloc_frame(s, curframe))) {
1620         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1621         return ret;
1622     }
1623
1624     // check if golden and altref are swapped
1625     if (s->update_altref != VP56_FRAME_NONE) {
1626         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1627     } else {
1628         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1629     }
1630     if (s->update_golden != VP56_FRAME_NONE) {
1631         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1632     } else {
1633         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1634     }
1635     if (s->update_last) {
1636         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1637     } else {
1638         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1639     }
1640     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1641
1642     ff_thread_finish_setup(avctx);
1643
1644     // Given that arithmetic probabilities are updated every frame, it's quite likely
1645     // that the values we have on a random interframe are complete junk if we didn't
1646     // start decode on a keyframe. So just don't display anything rather than junk.
1647     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1648                          !s->framep[VP56_FRAME_GOLDEN] ||
1649                          !s->framep[VP56_FRAME_GOLDEN2])) {
1650         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1651         return AVERROR_INVALIDDATA;
1652     }
1653
1654     s->linesize   = curframe->linesize[0];
1655     s->uvlinesize = curframe->linesize[1];
1656
1657     if (!s->edge_emu_buffer)
1658         s->edge_emu_buffer = av_malloc(21*s->linesize);
1659
1660     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1661
1662     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1663     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1664
1665     // top edge of 127 for intra prediction
1666     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1667         s->top_border[0][15] = s->top_border[0][23] = 127;
1668         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1669     }
1670     memset(s->ref_count, 0, sizeof(s->ref_count));
1671     if (s->keyframe)
1672         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1673
1674 #define MARGIN (16 << 2)
1675     s->mv_min.y = -MARGIN;
1676     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1677
1678     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1679         VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1680         VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1681         int mb_xy = mb_y*s->mb_width;
1682         uint8_t *dst[3] = {
1683             curframe->data[0] + 16*mb_y*s->linesize,
1684             curframe->data[1] +  8*mb_y*s->uvlinesize,
1685             curframe->data[2] +  8*mb_y*s->uvlinesize
1686         };
1687
1688         memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1689         memset(s->left_nnz, 0, sizeof(s->left_nnz));
1690         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1691
1692         // left edge of 129 for intra prediction
1693         if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1694             for (i = 0; i < 3; i++)
1695                 for (y = 0; y < 16>>!!i; y++)
1696                     dst[i][y*curframe->linesize[i]-1] = 129;
1697             if (mb_y == 1) // top left edge is also 129
1698                 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1699         }
1700
1701         s->mv_min.x = -MARGIN;
1702         s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1703         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1704             ff_thread_await_progress(prev_frame, mb_y, 0);
1705
1706         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1707             /* Prefetch the current frame, 4 MBs ahead */
1708             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1709             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1710
1711             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1712                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
1713
1714             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1715
1716             if (!mb->skip)
1717                 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1718
1719             if (mb->mode <= MODE_I4x4)
1720                 intra_predict(s, dst, mb, mb_x, mb_y);
1721             else
1722                 inter_predict(s, dst, mb, mb_x, mb_y);
1723
1724             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1725
1726             if (!mb->skip) {
1727                 idct_mb(s, dst, mb);
1728             } else {
1729                 AV_ZERO64(s->left_nnz);
1730                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1731
1732                 // Reset DC block predictors if they would exist if the mb had coefficients
1733                 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1734                     s->left_nnz[8]      = 0;
1735                     s->top_nnz[mb_x][8] = 0;
1736                 }
1737             }
1738
1739             if (s->deblock_filter)
1740                 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1741
1742             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1743
1744             dst[0] += 16;
1745             dst[1] += 8;
1746             dst[2] += 8;
1747             s->mv_min.x -= 64;
1748             s->mv_max.x -= 64;
1749         }
1750         if (s->deblock_filter) {
1751             if (s->filter.simple)
1752                 filter_mb_row_simple(s, curframe, mb_y);
1753             else
1754                 filter_mb_row(s, curframe, mb_y);
1755         }
1756         s->mv_min.y -= 64;
1757         s->mv_max.y -= 64;
1758
1759         ff_thread_report_progress(curframe, mb_y, 0);
1760     }
1761
1762     ff_thread_report_progress(curframe, INT_MAX, 0);
1763 skip_decode:
1764     // if future frames don't use the updated probabilities,
1765     // reset them to the values we saved
1766     if (!s->update_probabilities)
1767         s->prob[0] = s->prob[1];
1768
1769     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1770
1771     if (!s->invisible) {
1772         *(AVFrame*)data = *curframe;
1773         *data_size = sizeof(AVFrame);
1774     }
1775
1776     return avpkt->size;
1777 }
1778
1779 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1780 {
1781     VP8Context *s = avctx->priv_data;
1782
1783     s->avctx = avctx;
1784     avctx->pix_fmt = PIX_FMT_YUV420P;
1785
1786     dsputil_init(&s->dsp, avctx);
1787     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
1788     ff_vp8dsp_init(&s->vp8dsp);
1789
1790     return 0;
1791 }
1792
1793 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1794 {
1795     vp8_decode_flush_impl(avctx, 0, 1, 1);
1796     release_queued_segmaps(avctx->priv_data, 1);
1797     return 0;
1798 }
1799
1800 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1801 {
1802     VP8Context *s = avctx->priv_data;
1803
1804     s->avctx = avctx;
1805
1806     return 0;
1807 }
1808
1809 #define REBASE(pic) \
1810     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
1811
1812 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1813 {
1814     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1815
1816     if (s->macroblocks_base &&
1817         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
1818         free_buffers(s);
1819         s->maps_are_invalid = 1;
1820     }
1821
1822     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1823     s->segmentation = s_src->segmentation;
1824     s->lf_delta = s_src->lf_delta;
1825     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1826
1827     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1828     s->framep[0] = REBASE(s_src->next_framep[0]);
1829     s->framep[1] = REBASE(s_src->next_framep[1]);
1830     s->framep[2] = REBASE(s_src->next_framep[2]);
1831     s->framep[3] = REBASE(s_src->next_framep[3]);
1832
1833     return 0;
1834 }
1835
1836 AVCodec ff_vp8_decoder = {
1837     .name           = "vp8",
1838     .type           = AVMEDIA_TYPE_VIDEO,
1839     .id             = CODEC_ID_VP8,
1840     .priv_data_size = sizeof(VP8Context),
1841     .init           = vp8_decode_init,
1842     .close          = vp8_decode_free,
1843     .decode         = vp8_decode_frame,
1844     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1845     .flush = vp8_decode_flush,
1846     .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1847     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1848     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1849 };