git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of Libav.
  11  *
  12  * Libav is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * Libav is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with Libav; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "mathops.h"
  32 #include "rectangle.h"
  33 #include "thread.h"
  34 #include "vp8.h"
  35 #include "vp8data.h"
  36
  37 #if ARCH_ARM
  38 #   include "arm/vp8.h"
  39 #endif
  40
  41 static void free_buffers(VP8Context *s)
  42 {
  43     int i;
  44     if (s->thread_data)
  45         for (i = 0; i < MAX_THREADS; i++) {
  46 #if HAVE_THREADS
  47             pthread_cond_destroy(&s->thread_data[i].cond);
  48             pthread_mutex_destroy(&s->thread_data[i].lock);
  49 #endif
  50             av_freep(&s->thread_data[i].filter_strength);
  51         }
  52     av_freep(&s->thread_data);
  53     av_freep(&s->macroblocks_base);
  54     av_freep(&s->intra4x4_pred_mode_top);
  55     av_freep(&s->top_nnz);
  56     av_freep(&s->top_border);
  57
  58     s->macroblocks = NULL;
  59 }
  60
  61 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  62 {
  63     int ret;
  64     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  65                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  66         return ret;
  67     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  68         ff_thread_release_buffer(s->avctx, &f->tf);
  69         return AVERROR(ENOMEM);
  70     }
  71     return 0;
  72 }
  73
  74 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  75 {
  76     av_buffer_unref(&f->seg_map);
  77     ff_thread_release_buffer(s->avctx, &f->tf);
  78 }
  79
  80 #if CONFIG_VP8_DECODER
  81 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  82 {
  83     int ret;
  84
  85     vp8_release_frame(s, dst);
  86
  87     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  88         return ret;
  89     if (src->seg_map &&
  90         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  91         vp8_release_frame(s, dst);
  92         return AVERROR(ENOMEM);
  93     }
  94
  95     return 0;
  96 }
  97 #endif /* CONFIG_VP8_DECODER */
  98
  99 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 100 {
 101     VP8Context *s = avctx->priv_data;
 102     int i;
 103
 104     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 105         vp8_release_frame(s, &s->frames[i]);
 106     memset(s->framep, 0, sizeof(s->framep));
 107
 108     if (free_mem)
 109         free_buffers(s);
 110 }
 111
 112 static void vp8_decode_flush(AVCodecContext *avctx)
 113 {
 114     vp8_decode_flush_impl(avctx, 0);
 115 }
 116
 117 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 118 {
 119     VP8Frame *frame = NULL;
 120     int i;
 121
 122     // find a free buffer
 123     for (i = 0; i < 5; i++)
 124         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 125             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 126             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 127             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 128             frame = &s->frames[i];
 129             break;
 130         }
 131     if (i == 5) {
 132         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 133         abort();
 134     }
 135     if (frame->tf.f->data[0])
 136         vp8_release_frame(s, frame);
 137
 138     return frame;
 139 }
 140
 141 static av_always_inline
 142 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 143 {
 144     AVCodecContext *avctx = s->avctx;
 145     int i, ret;
 146
 147     if (width  != s->avctx->width ||
 148         height != s->avctx->height) {
 149         vp8_decode_flush_impl(s->avctx, 1);
 150
 151         ret = ff_set_dimensions(s->avctx, width, height);
 152         if (ret < 0)
 153             return ret;
 154     }
 155
 156     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 157     s->mb_height = (s->avctx->coded_height + 15) / 16;
 158
 159     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 160                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 161     if (!s->mb_layout) { // Frame threading and one thread
 162         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 163                                                sizeof(*s->macroblocks));
 164         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 165     } else // Sliced threading
 166         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 167                                          sizeof(*s->macroblocks));
 168     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 169     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 170     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 171
 172     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 173         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 174         free_buffers(s);
 175         return AVERROR(ENOMEM);
 176     }
 177
 178     for (i = 0; i < MAX_THREADS; i++) {
 179         s->thread_data[i].filter_strength =
 180             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 181         if (!s->thread_data[i].filter_strength) {
 182             free_buffers(s);
 183             return AVERROR(ENOMEM);
 184         }
 185 #if HAVE_THREADS
 186         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 187         pthread_cond_init(&s->thread_data[i].cond, NULL);
 188 #endif
 189     }
 190
 191     s->macroblocks = s->macroblocks_base + 1;
 192
 193     return 0;
 194 }
 195
 196 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 197 {
 198     return update_dimensions(s, width, height, IS_VP7);
 199 }
 200
 201 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 202 {
 203     return update_dimensions(s, width, height, IS_VP8);
 204 }
 205
 206 static void parse_segment_info(VP8Context *s)
 207 {
 208     VP56RangeCoder *c = &s->c;
 209     int i;
 210
 211     s->segmentation.update_map = vp8_rac_get(c);
 212
 213     if (vp8_rac_get(c)) { // update segment feature data
 214         s->segmentation.absolute_vals = vp8_rac_get(c);
 215
 216         for (i = 0; i < 4; i++)
 217             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 218
 219         for (i = 0; i < 4; i++)
 220             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 221     }
 222     if (s->segmentation.update_map)
 223         for (i = 0; i < 3; i++)
 224             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 225 }
 226
 227 static void update_lf_deltas(VP8Context *s)
 228 {
 229     VP56RangeCoder *c = &s->c;
 230     int i;
 231
 232     for (i = 0; i < 4; i++) {
 233         if (vp8_rac_get(c)) {
 234             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 235
 236             if (vp8_rac_get(c))
 237                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 238         }
 239     }
 240
 241     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 242         if (vp8_rac_get(c)) {
 243             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 244
 245             if (vp8_rac_get(c))
 246                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 247         }
 248     }
 249 }
 250
 251 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 252 {
 253     const uint8_t *sizes = buf;
 254     int i;
 255
 256     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 257
 258     buf      += 3 * (s->num_coeff_partitions - 1);
 259     buf_size -= 3 * (s->num_coeff_partitions - 1);
 260     if (buf_size < 0)
 261         return -1;
 262
 263     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 264         int size = AV_RL24(sizes + 3 * i);
 265         if (buf_size - size < 0)
 266             return -1;
 267
 268         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 269         buf      += size;
 270         buf_size -= size;
 271     }
 272     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 273
 274     return 0;
 275 }
 276
 277 static void vp7_get_quants(VP8Context *s)
 278 {
 279     VP56RangeCoder *c = &s->c;
 280
 281     int yac_qi  = vp8_rac_get_uint(c, 7);
 282     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 283     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 284     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 285     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 286     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 287
 288     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 289     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 290     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 291     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 292     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 293     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 294 }
 295
 296 static void get_quants(VP8Context *s)
 297 {
 298     VP56RangeCoder *c = &s->c;
 299     int i, base_qi;
 300
 301     int yac_qi     = vp8_rac_get_uint(c, 7);
 302     int ydc_delta  = vp8_rac_get_sint(c, 4);
 303     int y2dc_delta = vp8_rac_get_sint(c, 4);
 304     int y2ac_delta = vp8_rac_get_sint(c, 4);
 305     int uvdc_delta = vp8_rac_get_sint(c, 4);
 306     int uvac_delta = vp8_rac_get_sint(c, 4);
 307
 308     for (i = 0; i < 4; i++) {
 309         if (s->segmentation.enabled) {
 310             base_qi = s->segmentation.base_quant[i];
 311             if (!s->segmentation.absolute_vals)
 312                 base_qi += yac_qi;
 313         } else
 314             base_qi = yac_qi;
 315
 316         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 317         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 318         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 319         /* 101581>>16 is equivalent to 155/100 */
 320         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 321         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 322         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 323
 324         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 325         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 326     }
 327 }
 328
 329 /**
 330  * Determine which buffers golden and altref should be updated with after this frame.
 331  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 332  *
 333  * Intra frames update all 3 references
 334  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 335  * If the update (golden|altref) flag is set, it's updated with the current frame
 336  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 337  * If the flag is not set, the number read means:
 338  *      0: no update
 339  *      1: VP56_FRAME_PREVIOUS
 340  *      2: update golden with altref, or update altref with golden
 341  */
 342 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 343 {
 344     VP56RangeCoder *c = &s->c;
 345
 346     if (update)
 347         return VP56_FRAME_CURRENT;
 348
 349     switch (vp8_rac_get_uint(c, 2)) {
 350     case 1:
 351         return VP56_FRAME_PREVIOUS;
 352     case 2:
 353         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 354     }
 355     return VP56_FRAME_NONE;
 356 }
 357
 358 static void vp78_reset_probability_tables(VP8Context *s)
 359 {
 360     int i, j;
 361     for (i = 0; i < 4; i++)
 362         for (j = 0; j < 16; j++)
 363             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 364                    sizeof(s->prob->token[i][j]));
 365 }
 366
 367 static void vp78_update_probability_tables(VP8Context *s)
 368 {
 369     VP56RangeCoder *c = &s->c;
 370     int i, j, k, l, m;
 371
 372     for (i = 0; i < 4; i++)
 373         for (j = 0; j < 8; j++)
 374             for (k = 0; k < 3; k++)
 375                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 376                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 377                         int prob = vp8_rac_get_uint(c, 8);
 378                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 379                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 380                     }
 381 }
 382
 383 #define VP7_MVC_SIZE 17
 384 #define VP8_MVC_SIZE 19
 385
 386 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 387                                                             int mvc_size)
 388 {
 389     VP56RangeCoder *c = &s->c;
 390     int i, j;
 391
 392     if (vp8_rac_get(c))
 393         for (i = 0; i < 4; i++)
 394             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 395     if (vp8_rac_get(c))
 396         for (i = 0; i < 3; i++)
 397             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 398
 399     // 17.2 MV probability update
 400     for (i = 0; i < 2; i++)
 401         for (j = 0; j < mvc_size; j++)
 402             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 403                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 404 }
 405
 406 static void update_refs(VP8Context *s)
 407 {
 408     VP56RangeCoder *c = &s->c;
 409
 410     int update_golden = vp8_rac_get(c);
 411     int update_altref = vp8_rac_get(c);
 412
 413     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 414     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 415 }
 416
 417 static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
 418 {
 419     int i, j;
 420
 421     for (j = 1; j < 3; j++) {
 422         for (i = 0; i < height / 2; i++)
 423             memcpy(dst->data[j] + i * dst->linesize[j],
 424                    src->data[j] + i * src->linesize[j], width / 2);
 425     }
 426 }
 427
 428 static void fade(uint8_t *dst, uint8_t *src,
 429                  int width, int height, int linesize,
 430                  int alpha, int beta)
 431 {
 432     int i, j;
 433
 434     for (j = 0; j < height; j++) {
 435         for (i = 0; i < width; i++) {
 436             uint8_t y = src[j * linesize + i];
 437             dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 438         }
 439     }
 440 }
 441
 442 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 443 {
 444     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 445     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 446     int ret;
 447
 448     if (!s->keyframe && (alpha || beta)) {
 449         int width  = s->mb_width * 16;
 450         int height = s->mb_height * 16;
 451         AVFrame *src, *dst;
 452
 453         if (!s->framep[VP56_FRAME_PREVIOUS])
 454             return AVERROR_INVALIDDATA;
 455
 456         dst =
 457         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 458
 459         /* preserve the golden frame, write a new previous frame */
 460         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 461             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 462             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 463                return ret;
 464
 465             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 466
 467             copy_luma(dst, src, width, height);
 468         }
 469
 470         fade(dst->data[0], src->data[0],
 471              width, height, dst->linesize[0], alpha, beta);
 472     }
 473
 474     return 0;
 475 }
 476
 477 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 478 {
 479     VP56RangeCoder *c = &s->c;
 480     int part1_size, hscale, vscale, i, j, ret;
 481     int width  = s->avctx->width;
 482     int height = s->avctx->height;
 483
 484     if (buf_size < 4) {
 485         return AVERROR_INVALIDDATA;
 486     }
 487
 488     s->profile = (buf[0] >> 1) & 7;
 489     if (s->profile > 1) {
 490         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 491         return AVERROR_INVALIDDATA;
 492     }
 493
 494     s->keyframe  = !(buf[0] & 1);
 495     s->invisible = 0;
 496     part1_size   = AV_RL24(buf) >> 4;
 497
 498     buf      += 4 - s->profile;
 499     buf_size -= 4 - s->profile;
 500
 501     if (buf_size < part1_size) {
 502         return AVERROR_INVALIDDATA;
 503     }
 504
 505     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 506
 507     ff_vp56_init_range_decoder(c, buf, part1_size);
 508     buf      += part1_size;
 509     buf_size -= part1_size;
 510
 511     /* A. Dimension information (keyframes only) */
 512     if (s->keyframe) {
 513         width  = vp8_rac_get_uint(c, 12);
 514         height = vp8_rac_get_uint(c, 12);
 515         hscale = vp8_rac_get_uint(c, 2);
 516         vscale = vp8_rac_get_uint(c, 2);
 517         if (hscale || vscale)
 518             avpriv_request_sample(s->avctx, "Upscaling");
 519
 520         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 521         vp78_reset_probability_tables(s);
 522         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 523                sizeof(s->prob->pred16x16));
 524         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 525                sizeof(s->prob->pred8x8c));
 526         for (i = 0; i < 2; i++)
 527             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 528                    sizeof(vp7_mv_default_prob[i]));
 529         memset(&s->segmentation, 0, sizeof(s->segmentation));
 530         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 531         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 532     }
 533
 534     if (s->keyframe || s->profile > 0)
 535         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 536
 537     /* B. Decoding information for all four macroblock-level features */
 538     for (i = 0; i < 4; i++) {
 539         s->feature_enabled[i] = vp8_rac_get(c);
 540         if (s->feature_enabled[i]) {
 541              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 542
 543              for (j = 0; j < 3; j++)
 544                  s->feature_index_prob[i][j] =
 545                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 546
 547              if (vp7_feature_value_size[s->profile][i])
 548                  for (j = 0; j < 4; j++)
 549                      s->feature_value[i][j] =
 550                          vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 551         }
 552     }
 553
 554     s->segmentation.enabled    = 0;
 555     s->segmentation.update_map = 0;
 556     s->lf_delta.enabled        = 0;
 557
 558     s->num_coeff_partitions = 1;
 559     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 560
 561     if (!s->macroblocks_base || /* first frame */
 562         width != s->avctx->width || height != s->avctx->height ||
 563         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 564         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 565             return ret;
 566     }
 567
 568     /* C. Dequantization indices */
 569     vp7_get_quants(s);
 570
 571     /* D. Golden frame update flag (a Flag) for interframes only */
 572     if (!s->keyframe) {
 573         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 574         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 575     }
 576
 577     s->update_last          = 1;
 578     s->update_probabilities = 1;
 579     s->fade_present         = 1;
 580
 581     if (s->profile > 0) {
 582         s->update_probabilities = vp8_rac_get(c);
 583         if (!s->update_probabilities)
 584             s->prob[1] = s->prob[0];
 585
 586         if (!s->keyframe)
 587             s->fade_present = vp8_rac_get(c);
 588     }
 589
 590     /* E. Fading information for previous frame */
 591     if (s->fade_present && vp8_rac_get(c)) {
 592         if ((ret = vp7_fade_frame(s ,c)) < 0)
 593             return ret;
 594     }
 595
 596     /* F. Loop filter type */
 597     if (!s->profile)
 598         s->filter.simple = vp8_rac_get(c);
 599
 600     /* G. DCT coefficient ordering specification */
 601     if (vp8_rac_get(c))
 602         for (i = 1; i < 16; i++)
 603             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 604
 605     /* H. Loop filter levels  */
 606     if (s->profile > 0)
 607         s->filter.simple = vp8_rac_get(c);
 608     s->filter.level     = vp8_rac_get_uint(c, 6);
 609     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 610
 611     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 612     vp78_update_probability_tables(s);
 613
 614     s->mbskip_enabled = 0;
 615
 616     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 617     if (!s->keyframe) {
 618         s->prob->intra  = vp8_rac_get_uint(c, 8);
 619         s->prob->last   = vp8_rac_get_uint(c, 8);
 620         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 621     }
 622
 623     return 0;
 624 }
 625
 626 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 627 {
 628     VP56RangeCoder *c = &s->c;
 629     int header_size, hscale, vscale, ret;
 630     int width  = s->avctx->width;
 631     int height = s->avctx->height;
 632
 633     s->keyframe  = !(buf[0] & 1);
 634     s->profile   =  (buf[0]>>1) & 7;
 635     s->invisible = !(buf[0] & 0x10);
 636     header_size  = AV_RL24(buf) >> 5;
 637     buf      += 3;
 638     buf_size -= 3;
 639
 640     if (s->profile > 3)
 641         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 642
 643     if (!s->profile)
 644         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 645                sizeof(s->put_pixels_tab));
 646     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 647         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 648                sizeof(s->put_pixels_tab));
 649
 650     if (header_size > buf_size - 7 * s->keyframe) {
 651         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 652         return AVERROR_INVALIDDATA;
 653     }
 654
 655     if (s->keyframe) {
 656         if (AV_RL24(buf) != 0x2a019d) {
 657             av_log(s->avctx, AV_LOG_ERROR,
 658                    "Invalid start code 0x%x\n", AV_RL24(buf));
 659             return AVERROR_INVALIDDATA;
 660         }
 661         width     = AV_RL16(buf + 3) & 0x3fff;
 662         height    = AV_RL16(buf + 5) & 0x3fff;
 663         hscale    = buf[4] >> 6;
 664         vscale    = buf[6] >> 6;
 665         buf      += 7;
 666         buf_size -= 7;
 667
 668         if (hscale || vscale)
 669             avpriv_request_sample(s->avctx, "Upscaling");
 670
 671         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 672         vp78_reset_probability_tables(s);
 673         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 674                sizeof(s->prob->pred16x16));
 675         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 676                sizeof(s->prob->pred8x8c));
 677         memcpy(s->prob->mvc, vp8_mv_default_prob,
 678                sizeof(s->prob->mvc));
 679         memset(&s->segmentation, 0, sizeof(s->segmentation));
 680         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 681     }
 682
 683     ff_vp56_init_range_decoder(c, buf, header_size);
 684     buf      += header_size;
 685     buf_size -= header_size;
 686
 687     if (s->keyframe) {
 688         s->colorspace = vp8_rac_get(c);
 689         if (s->colorspace)
 690             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 691         s->fullrange = vp8_rac_get(c);
 692     }
 693
 694     if ((s->segmentation.enabled = vp8_rac_get(c)))
 695         parse_segment_info(s);
 696     else
 697         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 698
 699     s->filter.simple    = vp8_rac_get(c);
 700     s->filter.level     = vp8_rac_get_uint(c, 6);
 701     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 702
 703     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 704         if (vp8_rac_get(c))
 705             update_lf_deltas(s);
 706
 707     if (setup_partitions(s, buf, buf_size)) {
 708         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 709         return AVERROR_INVALIDDATA;
 710     }
 711
 712     if (!s->macroblocks_base || /* first frame */
 713         width != s->avctx->width || height != s->avctx->height)
 714         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 715             return ret;
 716
 717     get_quants(s);
 718
 719     if (!s->keyframe) {
 720         update_refs(s);
 721         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 722         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 723     }
 724
 725     // if we aren't saving this frame's probabilities for future frames,
 726     // make a copy of the current probabilities
 727     if (!(s->update_probabilities = vp8_rac_get(c)))
 728         s->prob[1] = s->prob[0];
 729
 730     s->update_last = s->keyframe || vp8_rac_get(c);
 731
 732     vp78_update_probability_tables(s);
 733
 734     if ((s->mbskip_enabled = vp8_rac_get(c)))
 735         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 736
 737     if (!s->keyframe) {
 738         s->prob->intra  = vp8_rac_get_uint(c, 8);
 739         s->prob->last   = vp8_rac_get_uint(c, 8);
 740         s->prob->golden = vp8_rac_get_uint(c, 8);
 741         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 742     }
 743
 744     return 0;
 745 }
 746
 747 static av_always_inline
 748 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 749 {
 750     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 751     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 752 }
 753
 754 /**
 755  * Motion vector coding, 17.1.
 756  */
 757 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 758 {
 759     int bit, x = 0;
 760
 761     if (vp56_rac_get_prob_branchy(c, p[0])) {
 762         int i;
 763
 764         for (i = 0; i < 3; i++)
 765             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 766         for (i = (vp7 ? 7 : 9); i > 3; i--)
 767             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 768         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 769             x += 8;
 770     } else {
 771         // small_mvtree
 772         const uint8_t *ps = p + 2;
 773         bit = vp56_rac_get_prob(c, *ps);
 774         ps += 1 + 3 * bit;
 775         x  += 4 * bit;
 776         bit = vp56_rac_get_prob(c, *ps);
 777         ps += 1 + bit;
 778         x  += 2 * bit;
 779         x  += vp56_rac_get_prob(c, *ps);
 780     }
 781
 782     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 783 }
 784
 785 static av_always_inline
 786 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 787 {
 788     if (is_vp7)
 789         return vp7_submv_prob;
 790
 791     if (left == top)
 792         return vp8_submv_prob[4 - !!left];
 793     if (!top)
 794         return vp8_submv_prob[2];
 795     return vp8_submv_prob[1 - !!left];
 796 }
 797
 798 /**
 799  * Split motion vector prediction, 16.4.
 800  * @returns the number of motion vectors parsed (2, 4 or 16)
 801  */
 802 static av_always_inline
 803 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 804                     int layout, int is_vp7)
 805 {
 806     int part_idx;
 807     int n, num;
 808     VP8Macroblock *top_mb;
 809     VP8Macroblock *left_mb = &mb[-1];
 810     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 811     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 812     VP56mv *top_mv;
 813     VP56mv *left_mv = left_mb->bmv;
 814     VP56mv *cur_mv  = mb->bmv;
 815
 816     if (!layout) // layout is inlined, s->mb_layout is not
 817         top_mb = &mb[2];
 818     else
 819         top_mb = &mb[-s->mb_width - 1];
 820     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 821     top_mv       = top_mb->bmv;
 822
 823     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 824         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 825             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 826         else
 827             part_idx = VP8_SPLITMVMODE_8x8;
 828     } else {
 829         part_idx = VP8_SPLITMVMODE_4x4;
 830     }
 831
 832     num              = vp8_mbsplit_count[part_idx];
 833     mbsplits_cur     = vp8_mbsplits[part_idx],
 834     firstidx         = vp8_mbfirstidx[part_idx];
 835     mb->partitioning = part_idx;
 836
 837     for (n = 0; n < num; n++) {
 838         int k = firstidx[n];
 839         uint32_t left, above;
 840         const uint8_t *submv_prob;
 841
 842         if (!(k & 3))
 843             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 844         else
 845             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 846         if (k <= 3)
 847             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 848         else
 849             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 850
 851         submv_prob = get_submv_prob(left, above, is_vp7);
 852
 853         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 854             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 855                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 856                     mb->bmv[n].y = mb->mv.y +
 857                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 858                     mb->bmv[n].x = mb->mv.x +
 859                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 860                 } else {
 861                     AV_ZERO32(&mb->bmv[n]);
 862                 }
 863             } else {
 864                 AV_WN32A(&mb->bmv[n], above);
 865             }
 866         } else {
 867             AV_WN32A(&mb->bmv[n], left);
 868         }
 869     }
 870
 871     return num;
 872 }
 873
 874 /**
 875  * The vp7 reference decoder uses a padding macroblock column (added to right
 876  * edge of the frame) to guard against illegal macroblock offsets. The
 877  * algorithm has bugs that permit offsets to straddle the padding column.
 878  * This function replicates those bugs.
 879  *
 880  * @param[out] edge_x macroblock x address
 881  * @param[out] edge_y macroblock y address
 882  *
 883  * @return macroblock offset legal (boolean)
 884  */
 885 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 886                                    int xoffset, int yoffset, int boundary,
 887                                    int *edge_x, int *edge_y)
 888 {
 889     int vwidth = mb_width + 1;
 890     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 891     if (new < boundary || new % vwidth == vwidth - 1)
 892         return 0;
 893     *edge_y = new / vwidth;
 894     *edge_x = new % vwidth;
 895     return 1;
 896 }
 897
 898 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 899 {
 900     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 901 }
 902
/**
 * Decode the motion-vector mode and the vector(s) of a VP7 inter macroblock.
 *
 * Every entry of vp7_mv_pred[] names a neighbour offset, a sub-block inside
 * that neighbour and a score. The neighbour vectors are sorted into three
 * classes (zero, nearest, near) and the scores are accumulated per class in
 * cnt[]; the accumulated scores then select the probabilities used to read
 * the mode tree from the range coder.
 *
 * On return mb->mode, mb->mv, mb->partitioning and mb->bmv[] are set.
 *
 * @param s      decoder context (range coder s->c, probabilities, MB arrays)
 * @param mb     macroblock being decoded
 * @param mb_x   x address of the macroblock
 * @param mb_y   y address of the macroblock
 * @param layout only forwarded to decode_splitmvs(); the neighbour address
 *               computation below reads s->mb_layout instead
 */
static av_always_inline
void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[12]; // written in the loop below, never read in this function
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT }; // not referenced in this function
    int idx = CNT_ZERO;
    VP56mv near_mv[3];
    uint8_t cnt[3] = { 0 };
    VP56RangeCoder *c = &s->c;
    int i;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
        const VP7MVPred * pred = &vp7_mv_pred[i];
        int edge_x, edge_y;

        /* boundary is 1 when s->profile is 0 and 0 otherwise */
        if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
                                    pred->yoffset, !s->profile, &edge_x, &edge_y)) {
            /* locate the neighbour in whichever macroblock layout is in use */
            VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
                                             ? s->macroblocks_base + 1 + edge_x +
                                               (s->mb_width + 1) * (edge_y + 1)
                                             : s->macroblocks + edge_x +
                                               (s->mb_height - edge_y - 1) * 2;
            /* both components of the vector are handled as one 32-bit word */
            uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
            if (mv) {
                if (AV_RN32A(&near_mv[CNT_NEAREST])) {
                    if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
                        idx = CNT_NEAREST;
                    } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
                        /* a third distinct vector is ignored and its score
                         * is not added to any class */
                        if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
                            continue;
                        idx = CNT_NEAR;
                    } else {
                        /* second distinct non-zero vector becomes "near" */
                        AV_WN32A(&near_mv[CNT_NEAR], mv);
                        idx = CNT_NEAR;
                    }
                } else {
                    /* first non-zero vector becomes "nearest" */
                    AV_WN32A(&near_mv[CNT_NEAREST], mv);
                    idx = CNT_NEAREST;
                }
            } else {
                idx = CNT_ZERO;
            }
        } else {
            /* illegal offsets are scored as zero vectors */
            idx = CNT_ZERO;
        }
        cnt[idx] += vp7_mv_pred[i].score;
    }

    mb->partitioning = VP8_SPLITMVMODE_NONE;

    /* mode tree: zero -> nearest -> near -> (split | new); every decision
     * uses the probability row selected by one of the accumulated scores */
    if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {

            if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* base vector: the better-scored of nearest/near, or zero
                 * when the zero class scored strictly higher than it */
                if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
                else
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));

                if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* decode_splitmvs() returns the partition count; the
                     * vector of the last partition becomes mb->mv */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                } else {
                    /* new vector: a delta is added to the base vector */
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                mb->mv = near_mv[CNT_NEAR];
                mb->bmv[0] = mb->mv;
            }
        } else {
            mb->mv = near_mv[CNT_NEAREST];
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
 993
/**
 * Decode the motion-vector mode and the vector(s) of a VP8 inter macroblock.
 *
 * The top, left and top-left neighbours are inspected; their vectors are
 * collected in near_mv[] and weighted in cnt[] (weight 2 for top and left,
 * 1 for top-left). The weights select the probabilities for the mode tree
 * that is then read from the range coder s->c.
 *
 * On return mb->mode, mb->mv, mb->partitioning and mb->bmv[] are set.
 *
 * @param s      decoder context
 * @param mb     macroblock being decoded
 * @param mb_x   x address of the macroblock (not used in this function body)
 * @param mb_y   y address of the macroblock (not used in this function body)
 * @param layout macroblock array layout; selects how the top and top-left
 *               neighbours are addressed relative to mb
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    /* near_mv[3] is deliberately left unset: it is only written when a third
     * distinct vector is found, and only read when cnt[CNT_SPLITMV] is set */
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For edge n: skip neighbours whose ref_frame is VP56_FRAME_CURRENT.
     * A zero vector adds its weight to cnt[CNT_ZERO]. A non-zero vector is
     * negated first when the neighbour's sign bias differs from the current
     * one; it then opens a new near_mv[] slot unless it repeats the most
     * recently stored vector (edge 0 always opens a slot), and its weight is
     * added to that slot. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* cnt[CNT_SPLITMV] is reused here as the split context:
                 * left and top split neighbours count 2, top-left counts 1 */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* decode_splitmvs() returns the partition count; the
                     * vector of the last partition becomes mb->mv */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    /* new vector: a delta is added to the clamped best mv */
                    mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
                    mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1093
1094 static av_always_inline
1095 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1096                            int mb_x, int keyframe, int layout)
1097 {
1098     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1099
1100     if (layout == 1) {
1101         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1102         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1103     }
1104     if (keyframe) {
1105         int x, y;
1106         uint8_t *top;
1107         uint8_t *const left = s->intra4x4_pred_mode_left;
1108         if (layout == 1)
1109             top = mb->intra4x4_pred_mode_top;
1110         else
1111             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1112         for (y = 0; y < 4; y++) {
1113             for (x = 0; x < 4; x++) {
1114                 const uint8_t *ctx;
1115                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1116                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1117                 left[y]   = top[x] = *intra4x4;
1118                 intra4x4++;
1119             }
1120         }
1121     } else {
1122         int i;
1123         for (i = 0; i < 16; i++)
1124             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1125                                            vp8_pred4x4_prob_inter);
1126     }
1127 }
1128
1129 static av_always_inline
1130 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1131                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1132 {
1133     VP56RangeCoder *c = &s->c;
1134     const char *vp7_feature_name[] = { "q-index",
1135                                        "lf-delta",
1136                                        "partial-golden-update",
1137                                        "blit-pitch" };
1138     if (is_vp7) {
1139         int i;
1140         *segment = 0;
1141         for (i = 0; i < 4; i++) {
1142             if (s->feature_enabled[i]) {
1143                 if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
1144                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1145                                                    s->feature_index_prob[i]);
1146                       av_log(s->avctx, AV_LOG_WARNING,
1147                              "Feature %s present in macroblock (value 0x%x)\n",
1148                              vp7_feature_name[i], s->feature_value[i][index]);
1149                 }
1150            }
1151         }
1152     } else if (s->segmentation.update_map)
1153         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
1154     else if (s->segmentation.enabled)
1155         *segment = ref ? *ref : *segment;
1156     mb->segment = *segment;
1157
1158     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1159
1160     if (s->keyframe) {
1161         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1162                                     vp8_pred16x16_prob_intra);
1163
1164         if (mb->mode == MODE_I4x4) {
1165             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1166         } else {
1167             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1168                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1169             if (s->mb_layout == 1)
1170                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1171             else
1172                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1173             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1174         }
1175
1176         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1177                                                 vp8_pred8x8c_prob_intra);
1178         mb->ref_frame        = VP56_FRAME_CURRENT;
1179     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1180         // inter MB, 16.2
1181         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1182             mb->ref_frame =
1183                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1184                                                                    : VP56_FRAME_GOLDEN;
1185         else
1186             mb->ref_frame = VP56_FRAME_PREVIOUS;
1187         s->ref_count[mb->ref_frame - 1]++;
1188
1189         // motion vectors, 16.3
1190         if (is_vp7)
1191             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1192         else
1193             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1194     } else {
1195         // intra MB, 16.1
1196         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1197
1198         if (mb->mode == MODE_I4x4)
1199             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1200
1201         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1202                                                 s->prob->pred8x8c);
1203         mb->ref_frame        = VP56_FRAME_CURRENT;
1204         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1205         AV_ZERO32(&mb->bmv[0]);
1206     }
1207 }
1208
/**
 * Decode the DCT tokens of one block, starting after the first end-of-block
 * decision: the function jumps straight to the DCT_0 test, so the caller
 * must already have read token_prob[0] (see decode_block_coeffs()).
 *
 * @param r          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token to be read
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 * @param scan       maps the coefficient index to its position in block
 * @param vp7        non-zero to test for DCT_EOB again after a zero token;
 *                   when zero, decoding continues with the DCT_0 test
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    /* work on a local copy of the range coder; it is stored back on exit */
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            /* a zero coefficient selects context 0 for the next position */
            token_prob = probs[i][0];
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            /* a coefficient of magnitude 1 selects context 1 */
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    /* base value of the category: 11, 19, 35 or 67 */
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            /* any larger magnitude selects context 2 */
            token_prob = probs[i + 1][2];
        }
        /* read the sign, then dequantize: qmul[0] for index 0, qmul[1] otherwise */
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1278
/**
 * Apply and update the running DC predictor kept in pred[].
 *
 * pred[0] holds the DC value stored by the previous call and pred[1] a
 * repeat counter. Once the counter exceeds 3, pred[0] is added to the DC
 * coefficient of the block. Afterwards the counter is reset when the old or
 * the new DC is zero or when their signs differ, and incremented when both
 * are equal; the new DC is stored in both block[0] and pred[0].
 *
 * @param block coefficient block whose element 0 is the DC value (in/out)
 * @param pred  predictor state: { last DC, repeat counter } (in/out)
 *
 * @return 1 if the prediction was added to the DC value, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int predicted = pred[1] > 3;
    int16_t   dc        = block[0];
    int       last;

    if (predicted)
        dc += pred[0];

    last = pred[0];
    if (!last || !dc || (last ^ dc) < 0)
        pred[1] = 0;          /* zero or sign change: restart counting */
    else if (last == dc)
        pred[1]++;            /* same value again */

    block[0] = pred[0] = dc;

    return predicted;
}
1301
1302 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1303                                             int16_t block[16],
1304                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1305                                             int i, uint8_t *token_prob,
1306                                             int16_t qmul[2],
1307                                             const uint8_t scan[16])
1308 {
1309     return decode_block_coeffs_internal(r, block, probs, i,
1310                                         token_prob, qmul, scan, IS_VP7);
1311 }
1312
1313 #ifndef vp8_decode_block_coeffs_internal
1314 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1315                                             int16_t block[16],
1316                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1317                                             int i, uint8_t *token_prob,
1318                                             int16_t qmul[2])
1319 {
1320     return decode_block_coeffs_internal(r, block, probs, i,
1321                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1322 }
1323 #endif
1324
1325 /**
1326  * @param c          arithmetic bitstream reader context
1327  * @param block      destination for block coefficients
1328  * @param probs      probabilities to use when reading trees from the bitstream
1329  * @param i          initial coeff index, 0 unless a separate DC block is coded
1330  * @param zero_nhood the initial prediction context for number of surrounding
1331  *                   all-zero blocks (only left/top, so 0-2)
1332  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1333  *
1334  * @return 0 if no coeffs were decoded
1335  *         otherwise, the index of the last coeff decoded plus one
1336  */
1337 static av_always_inline
1338 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1339                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1340                         int i, int zero_nhood, int16_t qmul[2],
1341                         const uint8_t scan[16], int vp7)
1342 {
1343     uint8_t *token_prob = probs[i][zero_nhood];
1344     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1345         return 0;
1346     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1347                                                   token_prob, qmul, scan)
1348                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1349                                                   token_prob, qmul);
1350 }
1351
/**
 * Decode all coefficient blocks of one macroblock: the optional separate
 * luma DC block, the 16 luma blocks and the 2x4 chroma blocks.
 *
 * The per-block results are stored in td->non_zero_count_cache, and the
 * top/left "block had coefficients" contexts are updated in place. If no
 * coefficient was decoded at all, mb->skip is forced to 1.
 *
 * @param s      decoder context (probabilities, dequant tables, dsp)
 * @param td     per-thread data receiving the coefficients
 * @param c      range coder to read the coefficients from
 * @param mb     macroblock being decoded
 * @param t_nnz  top context: entries 0-3 luma, 4-7 chroma, 8 luma DC block
 * @param l_nnz  left context, indexed like t_nnz
 * @param is_vp7 non-zero for VP7 bitstreams
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    /* defaults for macroblocks without a separate DC block: luma blocks
     * start at coefficient 0 and use token probability set 3 */
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    /* a separate DC block is coded unless the mode is I4x4 or, for VP8
     * only, split-mv */
    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  ff_zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        /* VP7 only, modes above MODE_I4x4: run the DC predictor of the
         * reference frame; its return value (0/1) is OR-ed into nnz */
        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |=  inter_predict_dc(td->block_dc,
                                     s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            /* nnz == 1 means only the first coefficient can be non-zero,
             * so the DC-only transform variant is used */
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        /* luma blocks now start at coefficient 1 and use probability set 0 */
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    /* i selects the chroma plane (td->block[4] and td->block[5]); the
     * context entries used are i + 2 * x (top) and i + 2 * y (left) */
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
1426
1427 static av_always_inline
1428 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1429                       uint8_t *src_cb, uint8_t *src_cr,
1430                       int linesize, int uvlinesize, int simple)
1431 {
1432     AV_COPY128(top_border, src_y + 15 * linesize);
1433     if (!simple) {
1434         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1435         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1436     }
1437 }
1438
/**
 * Exchange (or copy) 8-byte groups between the saved top border and the
 * pixel row directly above a macroblock.
 *
 * The groups at and to the right of the macroblock are always swapped; the
 * groups that also touch the previous macroblock's border entry are swapped
 * only when xchg is set and copied otherwise.
 * NOTE(review): the copy direction (border -> picture) assumes AV_COPY64
 * takes (dst, src) - confirm against the macro definition.
 *
 * @param top_border border entry of this macroblock (32 bytes per entry;
 *                   the previous entry is read at top_border - 32)
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left Cb sample of the macroblock
 * @param src_cr     top-left Cr sample of the macroblock
 * @param linesize   luma stride
 * @param uvlinesize chroma stride
 * @param mb_x       x address of the macroblock
 * @param mb_y       y address of the macroblock
 * @param mb_width   frame width in macroblocks
 * @param simple     non-zero if the simple loop filter is used
 * @param xchg       non-zero to swap the left-hand groups, zero to copy them
 */
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
                    uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
                    int mb_y, int mb_width, int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
    /* step all three plane pointers up to the row above the macroblock */
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    /* swap a and b when xchg is set, otherwise AV_COPY64(b, a);
     * note that the macro stays defined after this function */
#define XCHG(a, b, xchg)                                                      \
    do {                                                                      \
        if (xchg)                                                             \
            AV_SWAP64(b, a);                                                  \
        else                                                                  \
            AV_COPY64(b, a);                                                  \
    } while (0)

    XCHG(top_border_m1 + 8, src_y - 8, xchg);
    XCHG(top_border, src_y, xchg);
    XCHG(top_border + 8, src_y + 8, 1);
    /* the next macroblock's entry is only touched when one exists */
    if (mb_x < mb_width - 1)
        XCHG(top_border + 32, src_y + 16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1 + 16, src_cb - 8, xchg);
        XCHG(top_border_m1 + 24, src_cr - 8, xchg);
        XCHG(top_border + 16, src_cb, 1);
        XCHG(top_border + 24, src_cr, 1);
    }
}
1472
1473 static av_always_inline
1474 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1475 {
1476     if (!mb_x)
1477         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1478     else
1479         return mb_y ? mode : LEFT_DC_PRED8x8;
1480 }
1481
1482 static av_always_inline
1483 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1484 {
1485     if (!mb_x)
1486         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1487     else
1488         return mb_y ? mode : HOR_PRED8x8;
1489 }
1490
1491 static av_always_inline
1492 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1493 {
1494     switch (mode) {
1495     case DC_PRED8x8:
1496         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1497     case VERT_PRED8x8:
1498         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1499     case HOR_PRED8x8:
1500         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1501     case PLANE_PRED8x8: /* TM */
1502         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1503     }
1504     return mode;
1505 }
1506
1507 static av_always_inline
1508 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1509 {
1510     if (!mb_x) {
1511         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1512     } else {
1513         return mb_y ? mode : HOR_VP8_PRED;
1514     }
1515 }
1516
1517 static av_always_inline
1518 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1519                                      int *copy_buf, int vp7)
1520 {
1521     switch (mode) {
1522     case VERT_PRED:
1523         if (!mb_x && mb_y) {
1524             *copy_buf = 1;
1525             return mode;
1526         }
1527         /* fall-through */
1528     case DIAG_DOWN_LEFT_PRED:
1529     case VERT_LEFT_PRED:
1530         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1531     case HOR_PRED:
1532         if (!mb_y) {
1533             *copy_buf = 1;
1534             return mode;
1535         }
1536         /* fall-through */
1537     case HOR_UP_PRED:
1538         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1539     case TM_VP8_PRED:
1540         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1541     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1542                    * as 16x16/8x8 DC */
1543     case DIAG_DOWN_RIGHT_PRED:
1544     case VERT_RIGHT_PRED:
1545     case HOR_DOWN_PRED:
1546         if (!mb_y || !mb_x)
1547             *copy_buf = 1;
1548         return mode;
1549     }
1550     return mode;
1551 }
1552
1553 static av_always_inline
1554 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1555                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1556 {
1557     int x, y, mode, nnz;
1558     uint32_t tr;
1559
1560     /* for the first row, we need to run xchg_mb_border to init the top edge
1561      * to 127 otherwise, skip it if we aren't going to deblock */
1562     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1563         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1564                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1565                        s->filter.simple, 1);
1566
1567     if (mb->mode < MODE_I4x4) {
1568         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1569         s->hpc.pred16x16[mode](dst[0], s->linesize);
1570     } else {
1571         uint8_t *ptr = dst[0];
1572         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1573         const uint8_t lo = is_vp7 ? 128 : 127;
1574         const uint8_t hi = is_vp7 ? 128 : 129;
1575         uint8_t tr_top[4] = { lo, lo, lo, lo };
1576
1577         // all blocks on the right edge of the macroblock use bottom edge
1578         // the top macroblock for their topright edge
1579         uint8_t *tr_right = ptr - s->linesize + 16;
1580
1581         // if we're on the right edge of the frame, said edge is extended
1582         // from the top macroblock
1583         if (mb_y && mb_x == s->mb_width - 1) {
1584             tr       = tr_right[-1] * 0x01010101u;
1585             tr_right = (uint8_t *) &tr;
1586         }
1587
1588         if (mb->skip)
1589             AV_ZERO128(td->non_zero_count_cache);
1590
1591         for (y = 0; y < 4; y++) {
1592             uint8_t *topright = ptr + 4 - s->linesize;
1593             for (x = 0; x < 4; x++) {
1594                 int copy = 0, linesize = s->linesize;
1595                 uint8_t *dst = ptr + 4 * x;
1596                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1597
1598                 if ((y == 0 || x == 3) && mb_y == 0) {
1599                     topright = tr_top;
1600                 } else if (x == 3)
1601                     topright = tr_right;
1602
1603                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1604                                                         mb_y + y, &copy, is_vp7);
1605                 if (copy) {
1606                     dst      = copy_dst + 12;
1607                     linesize = 8;
1608                     if (!(mb_y + y)) {
1609                         copy_dst[3] = lo;
1610                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1611                     } else {
1612                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1613                         if (!(mb_x + x)) {
1614                             copy_dst[3] = hi;
1615                         } else {
1616                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1617                         }
1618                     }
1619                     if (!(mb_x + x)) {
1620                         copy_dst[11] =
1621                         copy_dst[19] =
1622                         copy_dst[27] =
1623                         copy_dst[35] = hi;
1624                     } else {
1625                         copy_dst[11] = ptr[4 * x                   - 1];
1626                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1627                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1628                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1629                     }
1630                 }
1631                 s->hpc.pred4x4[mode](dst, topright, linesize);
1632                 if (copy) {
1633                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1634                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1635                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1636                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1637                 }
1638
1639                 nnz = td->non_zero_count_cache[y][x];
1640                 if (nnz) {
1641                     if (nnz == 1)
1642                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1643                                                   td->block[y][x], s->linesize);
1644                     else
1645                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1646                                                td->block[y][x], s->linesize);
1647                 }
1648                 topright += 4;
1649             }
1650
1651             ptr      += 4 * s->linesize;
1652             intra4x4 += 4;
1653         }
1654     }
1655
1656     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1657                                             mb_x, mb_y, is_vp7);
1658     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1659     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1660
1661     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1662         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1663                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1664                        s->filter.simple, 0);
1665 }
1666
/*
 * Lookup tables indexed by the 3-bit fractional part of a motion vector
 * component:
 *   row 0: nr. of extra pixels needed on the left (or top, when indexed by
 *          the vertical component); also the MC function pointer index
 *   row 1: total nr. of extra pixels required
 *   row 2: nr. of extra pixels needed on the right (or bottom)
 */
static const uint8_t subpel_idx[3][8] = {
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1673
1674 /**
1675  * luma MC function
1676  *
1677  * @param s        VP8 decoding context
1678  * @param dst      target buffer for block data at block position
1679  * @param ref      reference picture buffer at origin (0, 0)
1680  * @param mv       motion vector (relative to block position) to get pixel data from
1681  * @param x_off    horizontal position of block from origin (0, 0)
1682  * @param y_off    vertical position of block from origin (0, 0)
1683  * @param block_w  width of block (16, 8 or 4)
1684  * @param block_h  height of block (always same as block_w)
1685  * @param width    width of src/dst plane data
1686  * @param height   height of src/dst plane data
1687  * @param linesize size of a single line of plane data, including padding
1688  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1689  */
1690 static av_always_inline
1691 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1692                  ThreadFrame *ref, const VP56mv *mv,
1693                  int x_off, int y_off, int block_w, int block_h,
1694                  int width, int height, ptrdiff_t linesize,
1695                  vp8_mc_func mc_func[3][3])
1696 {
1697     uint8_t *src = ref->f->data[0];
1698
1699     if (AV_RN32A(mv)) {
1700         int src_linesize = linesize;
1701
1702         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1703         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1704
1705         x_off += mv->x >> 2;
1706         y_off += mv->y >> 2;
1707
1708         // edge emulation
1709         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1710         src += y_off * linesize + x_off;
1711         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1712             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1713             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1714                                      src - my_idx * linesize - mx_idx,
1715                                      EDGE_EMU_LINESIZE, linesize,
1716                                      block_w + subpel_idx[1][mx],
1717                                      block_h + subpel_idx[1][my],
1718                                      x_off - mx_idx, y_off - my_idx,
1719                                      width, height);
1720             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1721             src_linesize = EDGE_EMU_LINESIZE;
1722         }
1723         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1724     } else {
1725         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1726         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1727                       linesize, block_h, 0, 0);
1728     }
1729 }
1730
1731 /**
1732  * chroma MC function
1733  *
1734  * @param s        VP8 decoding context
1735  * @param dst1     target buffer for block data at block position (U plane)
1736  * @param dst2     target buffer for block data at block position (V plane)
1737  * @param ref      reference picture buffer at origin (0, 0)
1738  * @param mv       motion vector (relative to block position) to get pixel data from
1739  * @param x_off    horizontal position of block from origin (0, 0)
1740  * @param y_off    vertical position of block from origin (0, 0)
1741  * @param block_w  width of block (16, 8 or 4)
1742  * @param block_h  height of block (always same as block_w)
1743  * @param width    width of src/dst plane data
1744  * @param height   height of src/dst plane data
1745  * @param linesize size of a single line of plane data, including padding
1746  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1747  */
1748 static av_always_inline
1749 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1750                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1751                    int x_off, int y_off, int block_w, int block_h,
1752                    int width, int height, ptrdiff_t linesize,
1753                    vp8_mc_func mc_func[3][3])
1754 {
1755     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1756
1757     if (AV_RN32A(mv)) {
1758         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1759         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1760
1761         x_off += mv->x >> 3;
1762         y_off += mv->y >> 3;
1763
1764         // edge emulation
1765         src1 += y_off * linesize + x_off;
1766         src2 += y_off * linesize + x_off;
1767         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1768         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1769             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1770             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1771                                      src1 - my_idx * linesize - mx_idx,
1772                                      EDGE_EMU_LINESIZE, linesize,
1773                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1774                                      x_off - mx_idx, y_off - my_idx, width, height);
1775             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1776             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1777
1778             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1779                                      src2 - my_idx * linesize - mx_idx,
1780                                      EDGE_EMU_LINESIZE, linesize,
1781                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1782                                      x_off - mx_idx, y_off - my_idx, width, height);
1783             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1784             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1785         } else {
1786             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1787             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1788         }
1789     } else {
1790         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1791         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1792         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1793     }
1794 }
1795
1796 static av_always_inline
1797 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1798                  ThreadFrame *ref_frame, int x_off, int y_off,
1799                  int bx_off, int by_off, int block_w, int block_h,
1800                  int width, int height, VP56mv *mv)
1801 {
1802     VP56mv uvmv = *mv;
1803
1804     /* Y */
1805     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1806                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1807                 block_w, block_h, width, height, s->linesize,
1808                 s->put_pixels_tab[block_w == 8]);
1809
1810     /* U/V */
1811     if (s->profile == 3) {
1812         /* this block only applies VP8; it is safe to check
1813          * only the profile, as VP7 profile <= 1 */
1814         uvmv.x &= ~7;
1815         uvmv.y &= ~7;
1816     }
1817     x_off   >>= 1;
1818     y_off   >>= 1;
1819     bx_off  >>= 1;
1820     by_off  >>= 1;
1821     width   >>= 1;
1822     height  >>= 1;
1823     block_w >>= 1;
1824     block_h >>= 1;
1825     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1826                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1827                   &uvmv, x_off + bx_off, y_off + by_off,
1828                   block_w, block_h, width, height, s->uvlinesize,
1829                   s->put_pixels_tab[1 + (block_w == 4)]);
1830 }
1831
1832 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1833  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1834 static av_always_inline
1835 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1836                      int mb_xy, int ref)
1837 {
1838     /* Don't prefetch refs that haven't been used very often this frame. */
1839     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1840         int x_off = mb_x << 4, y_off = mb_y << 4;
1841         int mx = (mb->mv.x >> 2) + x_off + 8;
1842         int my = (mb->mv.y >> 2) + y_off;
1843         uint8_t **src = s->framep[ref]->tf.f->data;
1844         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1845         /* For threading, a ff_thread_await_progress here might be useful, but
1846          * it actually slows down the decoder. Since a bad prefetch doesn't
1847          * generate bad decoder output, we don't run it here. */
1848         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1849         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1850         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1851     }
1852 }
1853
1854 /**
1855  * Apply motion vectors to prediction buffer, chapter 18.
1856  */
1857 static av_always_inline
1858 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1859                    VP8Macroblock *mb, int mb_x, int mb_y)
1860 {
1861     int x_off = mb_x << 4, y_off = mb_y << 4;
1862     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1863     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1864     VP56mv *bmv = mb->bmv;
1865
1866     switch (mb->partitioning) {
1867     case VP8_SPLITMVMODE_NONE:
1868         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1869                     0, 0, 16, 16, width, height, &mb->mv);
1870         break;
1871     case VP8_SPLITMVMODE_4x4: {
1872         int x, y;
1873         VP56mv uvmv;
1874
1875         /* Y */
1876         for (y = 0; y < 4; y++) {
1877             for (x = 0; x < 4; x++) {
1878                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1879                             ref, &bmv[4 * y + x],
1880                             4 * x + x_off, 4 * y + y_off, 4, 4,
1881                             width, height, s->linesize,
1882                             s->put_pixels_tab[2]);
1883             }
1884         }
1885
1886         /* U/V */
1887         x_off  >>= 1;
1888         y_off  >>= 1;
1889         width  >>= 1;
1890         height >>= 1;
1891         for (y = 0; y < 2; y++) {
1892             for (x = 0; x < 2; x++) {
1893                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1894                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1895                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1896                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1897                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1898                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1899                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1900                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1901                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1902                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1903                 if (s->profile == 3) {
1904                     uvmv.x &= ~7;
1905                     uvmv.y &= ~7;
1906                 }
1907                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1908                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1909                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1910                               width, height, s->uvlinesize,
1911                               s->put_pixels_tab[2]);
1912             }
1913         }
1914         break;
1915     }
1916     case VP8_SPLITMVMODE_16x8:
1917         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1918                     0, 0, 16, 8, width, height, &bmv[0]);
1919         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1920                     0, 8, 16, 8, width, height, &bmv[1]);
1921         break;
1922     case VP8_SPLITMVMODE_8x16:
1923         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1924                     0, 0, 8, 16, width, height, &bmv[0]);
1925         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1926                     8, 0, 8, 16, width, height, &bmv[1]);
1927         break;
1928     case VP8_SPLITMVMODE_8x8:
1929         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1930                     0, 0, 8, 8, width, height, &bmv[0]);
1931         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1932                     8, 0, 8, 8, width, height, &bmv[1]);
1933         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1934                     0, 8, 8, 8, width, height, &bmv[2]);
1935         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1936                     8, 8, 8, 8, width, height, &bmv[3]);
1937         break;
1938     }
1939 }
1940
1941 static av_always_inline
1942 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1943 {
1944     int x, y, ch;
1945
1946     if (mb->mode != MODE_I4x4) {
1947         uint8_t *y_dst = dst[0];
1948         for (y = 0; y < 4; y++) {
1949             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1950             if (nnz4) {
1951                 if (nnz4 & ~0x01010101) {
1952                     for (x = 0; x < 4; x++) {
1953                         if ((uint8_t) nnz4 == 1)
1954                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1955                                                       td->block[y][x],
1956                                                       s->linesize);
1957                         else if ((uint8_t) nnz4 > 1)
1958                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1959                                                    td->block[y][x],
1960                                                    s->linesize);
1961                         nnz4 >>= 8;
1962                         if (!nnz4)
1963                             break;
1964                     }
1965                 } else {
1966                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1967                 }
1968             }
1969             y_dst += 4 * s->linesize;
1970         }
1971     }
1972
1973     for (ch = 0; ch < 2; ch++) {
1974         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1975         if (nnz4) {
1976             uint8_t *ch_dst = dst[1 + ch];
1977             if (nnz4 & ~0x01010101) {
1978                 for (y = 0; y < 2; y++) {
1979                     for (x = 0; x < 2; x++) {
1980                         if ((uint8_t) nnz4 == 1)
1981                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
1982                                                       td->block[4 + ch][(y << 1) + x],
1983                                                       s->uvlinesize);
1984                         else if ((uint8_t) nnz4 > 1)
1985                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
1986                                                    td->block[4 + ch][(y << 1) + x],
1987                                                    s->uvlinesize);
1988                         nnz4 >>= 8;
1989                         if (!nnz4)
1990                             goto chroma_idct_end;
1991                     }
1992                     ch_dst += 4 * s->uvlinesize;
1993                 }
1994             } else {
1995                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
1996             }
1997         }
1998 chroma_idct_end:
1999         ;
2000     }
2001 }
2002
2003 static av_always_inline
2004 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2005                          VP8FilterStrength *f, int is_vp7)
2006 {
2007     int interior_limit, filter_level;
2008
2009     if (s->segmentation.enabled) {
2010         filter_level = s->segmentation.filter_level[mb->segment];
2011         if (!s->segmentation.absolute_vals)
2012             filter_level += s->filter.level;
2013     } else
2014         filter_level = s->filter.level;
2015
2016     if (s->lf_delta.enabled) {
2017         filter_level += s->lf_delta.ref[mb->ref_frame];
2018         filter_level += s->lf_delta.mode[mb->mode];
2019     }
2020
2021     filter_level = av_clip_uintp2(filter_level, 6);
2022
2023     interior_limit = filter_level;
2024     if (s->filter.sharpness) {
2025         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2026         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2027     }
2028     interior_limit = FFMAX(interior_limit, 1);
2029
2030     f->filter_level = filter_level;
2031     f->inner_limit = interior_limit;
2032     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2033                       mb->mode == VP8_MVMODE_SPLIT;
2034 }
2035
2036 static av_always_inline
2037 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2038                int mb_x, int mb_y, int is_vp7)
2039 {
2040     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2041     int filter_level = f->filter_level;
2042     int inner_limit = f->inner_limit;
2043     int inner_filter = f->inner_filter;
2044     int linesize = s->linesize;
2045     int uvlinesize = s->uvlinesize;
2046     static const uint8_t hev_thresh_lut[2][64] = {
2047         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2048           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2049           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2050           3, 3, 3, 3 },
2051         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2052           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2053           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2054           2, 2, 2, 2 }
2055     };
2056
2057     if (!filter_level)
2058         return;
2059
2060     if (is_vp7) {
2061         bedge_lim_y  = filter_level;
2062         bedge_lim_uv = filter_level * 2;
2063         mbedge_lim   = filter_level + 2;
2064     } else {
2065         bedge_lim_y  =
2066         bedge_lim_uv = filter_level * 2 + inner_limit;
2067         mbedge_lim   = bedge_lim_y + 4;
2068     }
2069
2070     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2071
2072     if (mb_x) {
2073         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2074                                        mbedge_lim, inner_limit, hev_thresh);
2075         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2076                                        mbedge_lim, inner_limit, hev_thresh);
2077     }
2078
2079 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2080     if (cond && inner_filter) {                                               \
2081         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2082                                              bedge_lim_y, inner_limit,        \
2083                                              hev_thresh);                     \
2084         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2085                                              bedge_lim_y, inner_limit,        \
2086                                              hev_thresh);                     \
2087         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2088                                              bedge_lim_y, inner_limit,        \
2089                                              hev_thresh);                     \
2090         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2091                                              uvlinesize,  bedge_lim_uv,       \
2092                                              inner_limit, hev_thresh);        \
2093     }
2094
2095     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2096
2097     if (mb_y) {
2098         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2099                                        mbedge_lim, inner_limit, hev_thresh);
2100         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2101                                        mbedge_lim, inner_limit, hev_thresh);
2102     }
2103
2104     if (inner_filter) {
2105         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2106                                              linesize, bedge_lim_y,
2107                                              inner_limit, hev_thresh);
2108         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2109                                              linesize, bedge_lim_y,
2110                                              inner_limit, hev_thresh);
2111         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2112                                              linesize, bedge_lim_y,
2113                                              inner_limit, hev_thresh);
2114         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2115                                              dst[2] +  4 * uvlinesize,
2116                                              uvlinesize, bedge_lim_uv,
2117                                              inner_limit, hev_thresh);
2118     }
2119
2120     H_LOOP_FILTER_16Y_INNER(is_vp7)
2121 }
2122
2123 static av_always_inline
2124 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2125                       int mb_x, int mb_y)
2126 {
2127     int mbedge_lim, bedge_lim;
2128     int filter_level = f->filter_level;
2129     int inner_limit  = f->inner_limit;
2130     int inner_filter = f->inner_filter;
2131     int linesize     = s->linesize;
2132
2133     if (!filter_level)
2134         return;
2135
2136     bedge_lim  = 2 * filter_level + inner_limit;
2137     mbedge_lim = bedge_lim + 4;
2138
2139     if (mb_x)
2140         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2141     if (inner_filter) {
2142         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2143         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2144         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2145     }
2146
2147     if (mb_y)
2148         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2149     if (inner_filter) {
2150         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2151         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2152         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2153     }
2154 }
2155
2156 #define MARGIN (16 << 2)
2157 static av_always_inline
2158 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2159                                     VP8Frame *prev_frame, int is_vp7)
2160 {
2161     VP8Context *s = avctx->priv_data;
2162     int mb_x, mb_y;
2163
2164     s->mv_min.y = -MARGIN;
2165     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2166     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2167         VP8Macroblock *mb = s->macroblocks_base +
2168                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2169         int mb_xy = mb_y * s->mb_width;
2170
2171         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2172
2173         s->mv_min.x = -MARGIN;
2174         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2175         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2176             if (mb_y == 0)
2177                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2178                          DC_PRED * 0x01010101);
2179             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2180                            prev_frame && prev_frame->seg_map ?
2181                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2182             s->mv_min.x -= 64;
2183             s->mv_max.x -= 64;
2184         }
2185         s->mv_min.y -= 64;
2186         s->mv_max.y -= 64;
2187     }
2188 }
2189
2190 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2191                                    VP8Frame *prev_frame)
2192 {
2193     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2194 }
2195
2196 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2197                                    VP8Frame *prev_frame)
2198 {
2199     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2200 }
2201
#if HAVE_THREADS
/*
 * Macroblock positions are packed into one int as (row * 65536) | column,
 * so that comparing two packed values orders them by row first. The row is
 * multiplied rather than shifted because callers may pass a negative row
 * (e.g. mb_y - 2), and left-shifting a negative value is undefined.
 *
 * NOTE: both macros expand to a statement that already ends in ';' (kept
 * so that existing call sites are unaffected). Do not use an invocation as
 * the unbraced body of an if that has an else branch.
 */

/*
 * check_thread_pos: block the calling thread (whose data is td) until the
 * thread owning otd has reported a position of at least
 * (mb_x_check, mb_y_check). The awaited position is published in
 * td->wait_mb_pos while waiting and reset to INT_MAX afterwards.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) * (1 << 16)) | ((mb_x_check) & 0xFFFF);       \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0);

/*
 * update_pos: record (mb_x, mb_y) as the current position of td and, when
 * slice threading with more than one job is active, wake up waiters if a
 * neighbouring thread is waiting for a position that has now been reached
 * (or if a neighbour pointer is NULL).
 *
 * Relies on avctx, num_jobs, prev_td and next_td being in scope at the
 * point of use.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) * (1 << 16)) | ((mb_x) & 0xFFFF);      \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0);
#else
/* Without thread support both macros expand to nothing. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
2241
2242 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2243                                         int jobnr, int threadnr, int is_vp7)
2244 {
2245     VP8Context *s = avctx->priv_data;
2246     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2247     int mb_y = td->thread_mb_pos >> 16;
2248     int mb_x, mb_xy = mb_y * s->mb_width;
2249     int num_jobs = s->num_jobs;
2250     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2251     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2252     VP8Macroblock *mb;
2253     uint8_t *dst[3] = {
2254         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2255         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2256         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2257     };
2258     if (mb_y == 0)
2259         prev_td = td;
2260     else
2261         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2262     if (mb_y == s->mb_height - 1)
2263         next_td = td;
2264     else
2265         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2266     if (s->mb_layout == 1)
2267         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2268     else {
2269         // Make sure the previous frame has read its segmentation map,
2270         // if we re-use the same map.
2271         if (prev_frame && s->segmentation.enabled &&
2272             !s->segmentation.update_map)
2273             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2274         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2275         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2276         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2277     }
2278
2279     if (!is_vp7 || mb_y == 0)
2280         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2281
2282     s->mv_min.x = -MARGIN;
2283     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2284
2285     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2286         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2287         if (prev_td != td) {
2288             if (threadnr != 0) {
2289                 check_thread_pos(td, prev_td,
2290                                  mb_x + (is_vp7 ? 2 : 1),
2291                                  mb_y - (is_vp7 ? 2 : 1));
2292             } else {
2293                 check_thread_pos(td, prev_td,
2294                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2295                                  mb_y - (is_vp7 ? 2 : 1));
2296             }
2297         }
2298
2299         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2300                          s->linesize, 4);
2301         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2302                          dst[2] - dst[1], 2);
2303
2304         if (!s->mb_layout)
2305             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2306                            prev_frame && prev_frame->seg_map ?
2307                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2308
2309         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2310
2311         if (!mb->skip)
2312             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2313
2314         if (mb->mode <= MODE_I4x4)
2315             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2316         else
2317             inter_predict(s, td, dst, mb, mb_x, mb_y);
2318
2319         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2320
2321         if (!mb->skip) {
2322             idct_mb(s, td, dst, mb);
2323         } else {
2324             AV_ZERO64(td->left_nnz);
2325             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2326
2327             /* Reset DC block predictors if they would exist
2328              * if the mb had coefficients */
2329             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2330                 td->left_nnz[8]     = 0;
2331                 s->top_nnz[mb_x][8] = 0;
2332             }
2333         }
2334
2335         if (s->deblock_filter)
2336             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2337
2338         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2339             if (s->filter.simple)
2340                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2341                                  NULL, NULL, s->linesize, 0, 1);
2342             else
2343                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2344                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2345         }
2346
2347         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2348
2349         dst[0]      += 16;
2350         dst[1]      += 8;
2351         dst[2]      += 8;
2352         s->mv_min.x -= 64;
2353         s->mv_max.x -= 64;
2354
2355         if (mb_x == s->mb_width + 1) {
2356             update_pos(td, mb_y, s->mb_width + 3);
2357         } else {
2358             update_pos(td, mb_y, mb_x);
2359         }
2360     }
2361 }
2362
2363 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2364                               int jobnr, int threadnr, int is_vp7)
2365 {
2366     VP8Context *s = avctx->priv_data;
2367     VP8ThreadData *td = &s->thread_data[threadnr];
2368     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2369     AVFrame *curframe = s->curframe->tf.f;
2370     VP8Macroblock *mb;
2371     VP8ThreadData *prev_td, *next_td;
2372     uint8_t *dst[3] = {
2373         curframe->data[0] + 16 * mb_y * s->linesize,
2374         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2375         curframe->data[2] +  8 * mb_y * s->uvlinesize
2376     };
2377
2378     if (s->mb_layout == 1)
2379         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2380     else
2381         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2382
2383     if (mb_y == 0)
2384         prev_td = td;
2385     else
2386         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2387     if (mb_y == s->mb_height - 1)
2388         next_td = td;
2389     else
2390         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2391
2392     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2393         VP8FilterStrength *f = &td->filter_strength[mb_x];
2394         if (prev_td != td)
2395             check_thread_pos(td, prev_td,
2396                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2397         if (next_td != td)
2398             if (next_td != &s->thread_data[0])
2399                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2400
2401         if (num_jobs == 1) {
2402             if (s->filter.simple)
2403                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2404                                  NULL, NULL, s->linesize, 0, 1);
2405             else
2406                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2407                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2408         }
2409
2410         if (s->filter.simple)
2411             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2412         else
2413             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2414         dst[0] += 16;
2415         dst[1] += 8;
2416         dst[2] += 8;
2417
2418         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2419     }
2420 }
2421
2422 static av_always_inline
2423 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2424                               int threadnr, int is_vp7)
2425 {
2426     VP8Context *s = avctx->priv_data;
2427     VP8ThreadData *td = &s->thread_data[jobnr];
2428     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2429     VP8Frame *curframe = s->curframe;
2430     int mb_y, num_jobs = s->num_jobs;
2431
2432     td->thread_nr = threadnr;
2433     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2434         if (mb_y >= s->mb_height)
2435             break;
2436         td->thread_mb_pos = mb_y << 16;
2437         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
2438         if (s->deblock_filter)
2439             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
2440         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2441
2442         s->mv_min.y -= 64;
2443         s->mv_max.y -= 64;
2444
2445         if (avctx->active_thread_type == FF_THREAD_FRAME)
2446             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2447     }
2448
2449     return 0;
2450 }
2451
2452 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2453                                     int jobnr, int threadnr)
2454 {
2455     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2456 }
2457
2458 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2459                                     int jobnr, int threadnr)
2460 {
2461     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2462 }
2463
2464
2465 static av_always_inline
2466 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2467                       AVPacket *avpkt, int is_vp7)
2468 {
2469     VP8Context *s = avctx->priv_data;
2470     int ret, i, referenced, num_jobs;
2471     enum AVDiscard skip_thresh;
2472     VP8Frame *av_uninit(curframe), *prev_frame;
2473
2474     if (is_vp7)
2475         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2476     else
2477         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2478
2479     if (ret < 0)
2480         goto err;
2481
2482     prev_frame = s->framep[VP56_FRAME_CURRENT];
2483
2484     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2485                  s->update_altref == VP56_FRAME_CURRENT;
2486
2487     skip_thresh = !referenced ? AVDISCARD_NONREF
2488                               : !s->keyframe ? AVDISCARD_NONKEY
2489                                              : AVDISCARD_ALL;
2490
2491     if (avctx->skip_frame >= skip_thresh) {
2492         s->invisible = 1;
2493         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2494         goto skip_decode;
2495     }
2496     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2497
2498     // release no longer referenced frames
2499     for (i = 0; i < 5; i++)
2500         if (s->frames[i].tf.f->data[0] &&
2501             &s->frames[i] != prev_frame &&
2502             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2503             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2504             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2505             vp8_release_frame(s, &s->frames[i]);
2506
2507     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2508
2509     if (!s->colorspace)
2510         avctx->colorspace = AVCOL_SPC_BT470BG;
2511     if (s->fullrange)
2512         avctx->color_range = AVCOL_RANGE_JPEG;
2513     else
2514         avctx->color_range = AVCOL_RANGE_MPEG;
2515
2516     /* Given that arithmetic probabilities are updated every frame, it's quite
2517      * likely that the values we have on a random interframe are complete
2518      * junk if we didn't start decode on a keyframe. So just don't display
2519      * anything rather than junk. */
2520     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2521                          !s->framep[VP56_FRAME_GOLDEN]   ||
2522                          !s->framep[VP56_FRAME_GOLDEN2])) {
2523         av_log(avctx, AV_LOG_WARNING,
2524                "Discarding interframe without a prior keyframe!\n");
2525         ret = AVERROR_INVALIDDATA;
2526         goto err;
2527     }
2528
2529     curframe->tf.f->key_frame = s->keyframe;
2530     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2531                                             : AV_PICTURE_TYPE_P;
2532     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
2533         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
2534         goto err;
2535     }
2536
2537     // check if golden and altref are swapped
2538     if (s->update_altref != VP56_FRAME_NONE)
2539         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2540     else
2541         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2542
2543     if (s->update_golden != VP56_FRAME_NONE)
2544         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2545     else
2546         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2547
2548     if (s->update_last)
2549         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2550     else
2551         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2552
2553     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2554
2555     ff_thread_finish_setup(avctx);
2556
2557     s->linesize   = curframe->tf.f->linesize[0];
2558     s->uvlinesize = curframe->tf.f->linesize[1];
2559
2560     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2561     /* Zero macroblock structures for top/top-left prediction
2562      * from outside the frame. */
2563     if (!s->mb_layout)
2564         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2565                (s->mb_width + 1) * sizeof(*s->macroblocks));
2566     if (!s->mb_layout && s->keyframe)
2567         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2568
2569     memset(s->ref_count, 0, sizeof(s->ref_count));
2570
2571     if (s->mb_layout == 1) {
2572         // Make sure the previous frame has read its segmentation map,
2573         // if we re-use the same map.
2574         if (prev_frame && s->segmentation.enabled &&
2575             !s->segmentation.update_map)
2576             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2577         if (is_vp7)
2578             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2579         else
2580             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2581     }
2582
2583     if (avctx->active_thread_type == FF_THREAD_FRAME)
2584         num_jobs = 1;
2585     else
2586         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2587     s->num_jobs   = num_jobs;
2588     s->curframe   = curframe;
2589     s->prev_frame = prev_frame;
2590     s->mv_min.y   = -MARGIN;
2591     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2592     for (i = 0; i < MAX_THREADS; i++) {
2593         s->thread_data[i].thread_mb_pos = 0;
2594         s->thread_data[i].wait_mb_pos   = INT_MAX;
2595     }
2596     if (is_vp7)
2597         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2598                         num_jobs);
2599     else
2600         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2601                         num_jobs);
2602
2603     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2604     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2605
2606 skip_decode:
2607     // if future frames don't use the updated probabilities,
2608     // reset them to the values we saved
2609     if (!s->update_probabilities)
2610         s->prob[0] = s->prob[1];
2611
2612     if (!s->invisible) {
2613         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2614             return ret;
2615         *got_frame = 1;
2616     }
2617
2618     return avpkt->size;
2619 err:
2620     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2621     return ret;
2622 }
2623
2624 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2625                         AVPacket *avpkt)
2626 {
2627     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2628 }
2629
#if CONFIG_VP7_DECODER
/** Decode callback for VP7: forwards to vp78_decode_frame() in VP7 mode. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    const int result = vp78_decode_frame(avctx, data, got_frame, avpkt,
                                         IS_VP7);

    return result;
}
#endif /* CONFIG_VP7_DECODER */
2637
2638 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2639 {
2640     VP8Context *s = avctx->priv_data;
2641     int i;
2642
2643     vp8_decode_flush_impl(avctx, 1);
2644     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2645         av_frame_free(&s->frames[i].tf.f);
2646
2647     return 0;
2648 }
2649
2650 static av_cold int vp8_init_frames(VP8Context *s)
2651 {
2652     int i;
2653     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2654         s->frames[i].tf.f = av_frame_alloc();
2655         if (!s->frames[i].tf.f)
2656             return AVERROR(ENOMEM);
2657     }
2658     return 0;
2659 }
2660
2661 static av_always_inline
2662 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2663 {
2664     VP8Context *s = avctx->priv_data;
2665     int ret;
2666
2667     s->avctx = avctx;
2668     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2669     avctx->internal->allocate_progress = 1;
2670
2671     ff_videodsp_init(&s->vdsp, 8);
2672
2673     ff_vp78dsp_init(&s->vp8dsp);
2674     if (CONFIG_VP7_DECODER && is_vp7) {
2675         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2676         ff_vp7dsp_init(&s->vp8dsp);
2677     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2678         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2679         ff_vp8dsp_init(&s->vp8dsp);
2680     }
2681
2682     /* does not change for VP8 */
2683     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2684
2685     if ((ret = vp8_init_frames(s)) < 0) {
2686         ff_vp8_decode_free(avctx);
2687         return ret;
2688     }
2689
2690     return 0;
2691 }
2692
#if CONFIG_VP7_DECODER
/**
 * Init callback for the VP7 decoder: forwards to vp78_decode_init() in
 * VP7 mode.
 *
 * Marked av_cold like the other init functions in this file
 * (ff_vp8_decode_init, vp8_decode_init_thread_copy), since it runs once
 * per decoder instance.
 *
 * @return 0 on success, a negative error code otherwise
 */
static av_cold int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2699
2700 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2701 {
2702     return vp78_decode_init(avctx, IS_VP8);
2703 }
2704
2705 #if CONFIG_VP8_DECODER
2706 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2707 {
2708     VP8Context *s = avctx->priv_data;
2709     int ret;
2710
2711     s->avctx = avctx;
2712
2713     if ((ret = vp8_init_frames(s)) < 0) {
2714         ff_vp8_decode_free(avctx);
2715         return ret;
2716     }
2717
2718     return 0;
2719 }
2720
/* Translate a frame pointer into s_src->frames[] to the entry with the same
 * index in s->frames[]; NULL stays NULL. Expects `s` and `s_src` to be in
 * scope at the point of use. The argument and the whole expansion are
 * parenthesized so that compound arguments expand correctly. */
#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2722
2723 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2724                                             const AVCodecContext *src)
2725 {
2726     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2727     int i;
2728
2729     if (s->macroblocks_base &&
2730         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2731         free_buffers(s);
2732         s->mb_width  = s_src->mb_width;
2733         s->mb_height = s_src->mb_height;
2734     }
2735
2736     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2737     s->segmentation = s_src->segmentation;
2738     s->lf_delta     = s_src->lf_delta;
2739     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2740
2741     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2742         if (s_src->frames[i].tf.f->data[0]) {
2743             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2744             if (ret < 0)
2745                 return ret;
2746         }
2747     }
2748
2749     s->framep[0] = REBASE(s_src->next_framep[0]);
2750     s->framep[1] = REBASE(s_src->next_framep[1]);
2751     s->framep[2] = REBASE(s_src->next_framep[2]);
2752     s->framep[3] = REBASE(s_src->next_framep[3]);
2753
2754     return 0;
2755 }
2756 #endif /* CONFIG_VP8_DECODER */
2757
#if CONFIG_VP7_DECODER
/* Codec descriptor for the VP7 decoder. */
AVCodec ff_vp7_decoder = {
    /* identification */
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    /* private context and capabilities */
    .priv_data_size = sizeof(VP8Context),
    .capabilities   = AV_CODEC_CAP_DR1,
    /* callbacks */
    .init           = vp7_decode_init,
    .decode         = vp7_decode_frame,
    .flush          = vp8_decode_flush,
    .close          = ff_vp8_decode_free,
};
#endif /* CONFIG_VP7_DECODER */
2772
#if CONFIG_VP8_DECODER
/* Codec descriptor for the VP8 decoder; advertises both frame and slice
 * threading. */
AVCodec ff_vp8_decoder = {
    /* identification */
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    /* private context and capabilities */
    .priv_data_size        = sizeof(VP8Context),
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    /* callbacks */
    .init                  = ff_vp8_decode_init,
    .decode                = ff_vp8_decode_frame,
    .flush                 = vp8_decode_flush,
    .close                 = ff_vp8_decode_free,
    /* frame-threading hooks */
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};
#endif /* CONFIG_VP8_DECODER */