git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
  40 #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  41 #define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
  42 #elif CONFIG_VP7_DECODER
  43 #define VPX(vp7, f) vp7_ ## f
  44 #else // CONFIG_VP8_DECODER
  45 #define VPX(vp7, f) vp8_ ## f
  46 #endif
  47
  48 static void free_buffers(VP8Context *s)
  49 {
  50     int i;
  51     if (s->thread_data)
  52         for (i = 0; i < MAX_THREADS; i++) {
  53 #if HAVE_THREADS
  54             pthread_cond_destroy(&s->thread_data[i].cond);
  55             pthread_mutex_destroy(&s->thread_data[i].lock);
  56 #endif
  57             av_freep(&s->thread_data[i].filter_strength);
  58         }
  59     av_freep(&s->thread_data);
  60     av_freep(&s->macroblocks_base);
  61     av_freep(&s->intra4x4_pred_mode_top);
  62     av_freep(&s->top_nnz);
  63     av_freep(&s->top_border);
  64
  65     s->macroblocks = NULL;
  66 }
  67
  68 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  69 {
  70     int ret;
  71     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  72                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  73         return ret;
  74     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  75         ff_thread_release_buffer(s->avctx, &f->tf);
  76         return AVERROR(ENOMEM);
  77     }
  78     return 0;
  79 }
  80
  81 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  82 {
  83     av_buffer_unref(&f->seg_map);
  84     ff_thread_release_buffer(s->avctx, &f->tf);
  85 }
  86
  87 #if CONFIG_VP8_DECODER
  88 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  89 {
  90     int ret;
  91
  92     vp8_release_frame(s, dst);
  93
  94     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  95         return ret;
  96     if (src->seg_map &&
  97         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  98         vp8_release_frame(s, dst);
  99         return AVERROR(ENOMEM);
 100     }
 101
 102     return 0;
 103 }
 104 #endif /* CONFIG_VP8_DECODER */
 105
 106 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 107 {
 108     VP8Context *s = avctx->priv_data;
 109     int i;
 110
 111     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 112         vp8_release_frame(s, &s->frames[i]);
 113     memset(s->framep, 0, sizeof(s->framep));
 114
 115     if (free_mem)
 116         free_buffers(s);
 117 }
 118
 119 static void vp8_decode_flush(AVCodecContext *avctx)
 120 {
 121     vp8_decode_flush_impl(avctx, 0);
 122 }
 123
 124 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 125 {
 126     VP8Frame *frame = NULL;
 127     int i;
 128
 129     // find a free buffer
 130     for (i = 0; i < 5; i++)
 131         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 132             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 133             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 134             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 135             frame = &s->frames[i];
 136             break;
 137         }
 138     if (i == 5) {
 139         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 140         abort();
 141     }
 142     if (frame->tf.f->data[0])
 143         vp8_release_frame(s, frame);
 144
 145     return frame;
 146 }
 147
 148 static av_always_inline
 149 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 150 {
 151     AVCodecContext *avctx = s->avctx;
 152     int i, ret;
 153
 154     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 155         height != s->avctx->height) {
 156         vp8_decode_flush_impl(s->avctx, 1);
 157
 158         ret = ff_set_dimensions(s->avctx, width, height);
 159         if (ret < 0)
 160             return ret;
 161     }
 162
 163     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 164     s->mb_height = (s->avctx->coded_height + 15) / 16;
 165
 166     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 167                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 168     if (!s->mb_layout) { // Frame threading and one thread
 169         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 170                                                sizeof(*s->macroblocks));
 171         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 172     } else // Sliced threading
 173         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 174                                          sizeof(*s->macroblocks));
 175     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 176     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 177     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 178
 179     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 180         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 181         free_buffers(s);
 182         return AVERROR(ENOMEM);
 183     }
 184
 185     for (i = 0; i < MAX_THREADS; i++) {
 186         s->thread_data[i].filter_strength =
 187             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 188         if (!s->thread_data[i].filter_strength) {
 189             free_buffers(s);
 190             return AVERROR(ENOMEM);
 191         }
 192 #if HAVE_THREADS
 193         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 194         pthread_cond_init(&s->thread_data[i].cond, NULL);
 195 #endif
 196     }
 197
 198     s->macroblocks = s->macroblocks_base + 1;
 199
 200     return 0;
 201 }
 202
 203 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 204 {
 205     return update_dimensions(s, width, height, IS_VP7);
 206 }
 207
 208 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 209 {
 210     return update_dimensions(s, width, height, IS_VP8);
 211 }
 212
 213
 214 static void parse_segment_info(VP8Context *s)
 215 {
 216     VP56RangeCoder *c = &s->c;
 217     int i;
 218
 219     s->segmentation.update_map = vp8_rac_get(c);
 220
 221     if (vp8_rac_get(c)) { // update segment feature data
 222         s->segmentation.absolute_vals = vp8_rac_get(c);
 223
 224         for (i = 0; i < 4; i++)
 225             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 226
 227         for (i = 0; i < 4; i++)
 228             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 229     }
 230     if (s->segmentation.update_map)
 231         for (i = 0; i < 3; i++)
 232             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 233 }
 234
 235 static void update_lf_deltas(VP8Context *s)
 236 {
 237     VP56RangeCoder *c = &s->c;
 238     int i;
 239
 240     for (i = 0; i < 4; i++) {
 241         if (vp8_rac_get(c)) {
 242             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 243
 244             if (vp8_rac_get(c))
 245                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 246         }
 247     }
 248
 249     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 250         if (vp8_rac_get(c)) {
 251             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 252
 253             if (vp8_rac_get(c))
 254                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 255         }
 256     }
 257 }
 258
 259 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 260 {
 261     const uint8_t *sizes = buf;
 262     int i;
 263
 264     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 265
 266     buf      += 3 * (s->num_coeff_partitions - 1);
 267     buf_size -= 3 * (s->num_coeff_partitions - 1);
 268     if (buf_size < 0)
 269         return -1;
 270
 271     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 272         int size = AV_RL24(sizes + 3 * i);
 273         if (buf_size - size < 0)
 274             return -1;
 275
 276         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 277         buf      += size;
 278         buf_size -= size;
 279     }
 280     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 281
 282     return 0;
 283 }
 284
 285 static void vp7_get_quants(VP8Context *s)
 286 {
 287     VP56RangeCoder *c = &s->c;
 288
 289     int yac_qi  = vp8_rac_get_uint(c, 7);
 290     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 291     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 292     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 293     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 294     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 295
 296     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 297     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 298     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 299     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 300     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 301     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 302 }
 303
 304 static void vp8_get_quants(VP8Context *s)
 305 {
 306     VP56RangeCoder *c = &s->c;
 307     int i, base_qi;
 308
 309     int yac_qi     = vp8_rac_get_uint(c, 7);
 310     int ydc_delta  = vp8_rac_get_sint(c, 4);
 311     int y2dc_delta = vp8_rac_get_sint(c, 4);
 312     int y2ac_delta = vp8_rac_get_sint(c, 4);
 313     int uvdc_delta = vp8_rac_get_sint(c, 4);
 314     int uvac_delta = vp8_rac_get_sint(c, 4);
 315
 316     for (i = 0; i < 4; i++) {
 317         if (s->segmentation.enabled) {
 318             base_qi = s->segmentation.base_quant[i];
 319             if (!s->segmentation.absolute_vals)
 320                 base_qi += yac_qi;
 321         } else
 322             base_qi = yac_qi;
 323
 324         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 325         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 326         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 327         /* 101581>>16 is equivalent to 155/100 */
 328         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 329         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 330         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 331
 332         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 333         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 334     }
 335 }
 336
 337 /**
 338  * Determine which buffers golden and altref should be updated with after this frame.
 339  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 340  *
 341  * Intra frames update all 3 references
 342  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 343  * If the update (golden|altref) flag is set, it's updated with the current frame
 344  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 345  * If the flag is not set, the number read means:
 346  *      0: no update
 347  *      1: VP56_FRAME_PREVIOUS
 348  *      2: update golden with altref, or update altref with golden
 349  */
 350 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 351 {
 352     VP56RangeCoder *c = &s->c;
 353
 354     if (update)
 355         return VP56_FRAME_CURRENT;
 356
 357     switch (vp8_rac_get_uint(c, 2)) {
 358     case 1:
 359         return VP56_FRAME_PREVIOUS;
 360     case 2:
 361         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 362     }
 363     return VP56_FRAME_NONE;
 364 }
 365
 366 static void vp78_reset_probability_tables(VP8Context *s)
 367 {
 368     int i, j;
 369     for (i = 0; i < 4; i++)
 370         for (j = 0; j < 16; j++)
 371             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 372                    sizeof(s->prob->token[i][j]));
 373 }
 374
 375 static void vp78_update_probability_tables(VP8Context *s)
 376 {
 377     VP56RangeCoder *c = &s->c;
 378     int i, j, k, l, m;
 379
 380     for (i = 0; i < 4; i++)
 381         for (j = 0; j < 8; j++)
 382             for (k = 0; k < 3; k++)
 383                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 384                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 385                         int prob = vp8_rac_get_uint(c, 8);
 386                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 387                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 388                     }
 389 }
 390
 391 #define VP7_MVC_SIZE 17
 392 #define VP8_MVC_SIZE 19
 393
 394 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 395                                                             int mvc_size)
 396 {
 397     VP56RangeCoder *c = &s->c;
 398     int i, j;
 399
 400     if (vp8_rac_get(c))
 401         for (i = 0; i < 4; i++)
 402             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 403     if (vp8_rac_get(c))
 404         for (i = 0; i < 3; i++)
 405             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 406
 407     // 17.2 MV probability update
 408     for (i = 0; i < 2; i++)
 409         for (j = 0; j < mvc_size; j++)
 410             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 411                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 412 }
 413
 414 static void update_refs(VP8Context *s)
 415 {
 416     VP56RangeCoder *c = &s->c;
 417
 418     int update_golden = vp8_rac_get(c);
 419     int update_altref = vp8_rac_get(c);
 420
 421     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 422     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 423 }
 424
 425 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 426 {
 427     int i, j;
 428
 429     for (j = 1; j < 3; j++) {
 430         for (i = 0; i < height / 2; i++)
 431             memcpy(dst->data[j] + i * dst->linesize[j],
 432                    src->data[j] + i * src->linesize[j], width / 2);
 433     }
 434 }
 435
 436 static void fade(uint8_t *dst, int dst_linesize,
 437                  const uint8_t *src, int src_linesize,
 438                  int width, int height,
 439                  int alpha, int beta)
 440 {
 441     int i, j;
 442     for (j = 0; j < height; j++) {
 443         for (i = 0; i < width; i++) {
 444             uint8_t y = src[j * src_linesize + i];
 445             dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 446         }
 447     }
 448 }
 449
 450 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 451 {
 452     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 453     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 454     int ret;
 455
 456     if (!s->keyframe && (alpha || beta)) {
 457         int width  = s->mb_width * 16;
 458         int height = s->mb_height * 16;
 459         AVFrame *src, *dst;
 460
 461         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 462             !s->framep[VP56_FRAME_GOLDEN]) {
 463             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 464             return AVERROR_INVALIDDATA;
 465         }
 466
 467         dst =
 468         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 469
 470         /* preserve the golden frame, write a new previous frame */
 471         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 472             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 473             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 474                 return ret;
 475
 476             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 477
 478             copy_chroma(dst, src, width, height);
 479         }
 480
 481         fade(dst->data[0], dst->linesize[0],
 482              src->data[0], src->linesize[0],
 483              width, height, alpha, beta);
 484     }
 485
 486     return 0;
 487 }
 488
 489 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 490 {
 491     VP56RangeCoder *c = &s->c;
 492     int part1_size, hscale, vscale, i, j, ret;
 493     int width  = s->avctx->width;
 494     int height = s->avctx->height;
 495
 496     s->profile = (buf[0] >> 1) & 7;
 497     if (s->profile > 1) {
 498         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 499         return AVERROR_INVALIDDATA;
 500     }
 501
 502     s->keyframe  = !(buf[0] & 1);
 503     s->invisible = 0;
 504     part1_size   = AV_RL24(buf) >> 4;
 505
 506     if (buf_size < 4 - s->profile + part1_size) {
 507         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 508         return AVERROR_INVALIDDATA;
 509     }
 510
 511     buf      += 4 - s->profile;
 512     buf_size -= 4 - s->profile;
 513
 514     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 515
 516     ff_vp56_init_range_decoder(c, buf, part1_size);
 517     buf      += part1_size;
 518     buf_size -= part1_size;
 519
 520     /* A. Dimension information (keyframes only) */
 521     if (s->keyframe) {
 522         width  = vp8_rac_get_uint(c, 12);
 523         height = vp8_rac_get_uint(c, 12);
 524         hscale = vp8_rac_get_uint(c, 2);
 525         vscale = vp8_rac_get_uint(c, 2);
 526         if (hscale || vscale)
 527             avpriv_request_sample(s->avctx, "Upscaling");
 528
 529         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 530         vp78_reset_probability_tables(s);
 531         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 532                sizeof(s->prob->pred16x16));
 533         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 534                sizeof(s->prob->pred8x8c));
 535         for (i = 0; i < 2; i++)
 536             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 537                    sizeof(vp7_mv_default_prob[i]));
 538         memset(&s->segmentation, 0, sizeof(s->segmentation));
 539         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 540         memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
 541     }
 542
 543     if (s->keyframe || s->profile > 0)
 544         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 545
 546     /* B. Decoding information for all four macroblock-level features */
 547     for (i = 0; i < 4; i++) {
 548         s->feature_enabled[i] = vp8_rac_get(c);
 549         if (s->feature_enabled[i]) {
 550              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 551
 552              for (j = 0; j < 3; j++)
 553                  s->feature_index_prob[i][j] =
 554                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 555
 556              if (vp7_feature_value_size[s->profile][i])
 557                  for (j = 0; j < 4; j++)
 558                      s->feature_value[i][j] =
 559                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 560         }
 561     }
 562
 563     s->segmentation.enabled    = 0;
 564     s->segmentation.update_map = 0;
 565     s->lf_delta.enabled        = 0;
 566
 567     s->num_coeff_partitions = 1;
 568     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 569
 570     if (!s->macroblocks_base || /* first frame */
 571         width != s->avctx->width || height != s->avctx->height ||
 572         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 573         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 574             return ret;
 575     }
 576
 577     /* C. Dequantization indices */
 578     vp7_get_quants(s);
 579
 580     /* D. Golden frame update flag (a Flag) for interframes only */
 581     if (!s->keyframe) {
 582         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 583         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 584     }
 585
 586     s->update_last          = 1;
 587     s->update_probabilities = 1;
 588     s->fade_present         = 1;
 589
 590     if (s->profile > 0) {
 591         s->update_probabilities = vp8_rac_get(c);
 592         if (!s->update_probabilities)
 593             s->prob[1] = s->prob[0];
 594
 595         if (!s->keyframe)
 596             s->fade_present = vp8_rac_get(c);
 597     }
 598
 599     /* E. Fading information for previous frame */
 600     if (s->fade_present && vp8_rac_get(c)) {
 601         if ((ret = vp7_fade_frame(s ,c)) < 0)
 602             return ret;
 603     }
 604
 605     /* F. Loop filter type */
 606     if (!s->profile)
 607         s->filter.simple = vp8_rac_get(c);
 608
 609     /* G. DCT coefficient ordering specification */
 610     if (vp8_rac_get(c))
 611         for (i = 1; i < 16; i++)
 612             s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];
 613
 614     /* H. Loop filter levels  */
 615     if (s->profile > 0)
 616         s->filter.simple = vp8_rac_get(c);
 617     s->filter.level     = vp8_rac_get_uint(c, 6);
 618     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 619
 620     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 621     vp78_update_probability_tables(s);
 622
 623     s->mbskip_enabled = 0;
 624
 625     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 626     if (!s->keyframe) {
 627         s->prob->intra  = vp8_rac_get_uint(c, 8);
 628         s->prob->last   = vp8_rac_get_uint(c, 8);
 629         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 630     }
 631
 632     return 0;
 633 }
 634
 635 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 636 {
 637     VP56RangeCoder *c = &s->c;
 638     int header_size, hscale, vscale, ret;
 639     int width  = s->avctx->width;
 640     int height = s->avctx->height;
 641
 642     if (buf_size < 3) {
 643         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 644         return AVERROR_INVALIDDATA;
 645     }
 646
 647     s->keyframe  = !(buf[0] & 1);
 648     s->profile   =  (buf[0]>>1) & 7;
 649     s->invisible = !(buf[0] & 0x10);
 650     header_size  = AV_RL24(buf) >> 5;
 651     buf      += 3;
 652     buf_size -= 3;
 653
 654     if (s->profile > 3)
 655         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 656
 657     if (!s->profile)
 658         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 659                sizeof(s->put_pixels_tab));
 660     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 661         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 662                sizeof(s->put_pixels_tab));
 663
 664     if (header_size > buf_size - 7 * s->keyframe) {
 665         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 666         return AVERROR_INVALIDDATA;
 667     }
 668
 669     if (s->keyframe) {
 670         if (AV_RL24(buf) != 0x2a019d) {
 671             av_log(s->avctx, AV_LOG_ERROR,
 672                    "Invalid start code 0x%x\n", AV_RL24(buf));
 673             return AVERROR_INVALIDDATA;
 674         }
 675         width     = AV_RL16(buf + 3) & 0x3fff;
 676         height    = AV_RL16(buf + 5) & 0x3fff;
 677         hscale    = buf[4] >> 6;
 678         vscale    = buf[6] >> 6;
 679         buf      += 7;
 680         buf_size -= 7;
 681
 682         if (hscale || vscale)
 683             avpriv_request_sample(s->avctx, "Upscaling");
 684
 685         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 686         vp78_reset_probability_tables(s);
 687         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 688                sizeof(s->prob->pred16x16));
 689         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 690                sizeof(s->prob->pred8x8c));
 691         memcpy(s->prob->mvc, vp8_mv_default_prob,
 692                sizeof(s->prob->mvc));
 693         memset(&s->segmentation, 0, sizeof(s->segmentation));
 694         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 695     }
 696
 697     ff_vp56_init_range_decoder(c, buf, header_size);
 698     buf      += header_size;
 699     buf_size -= header_size;
 700
 701     if (s->keyframe) {
 702         s->colorspace = vp8_rac_get(c);
 703         if (s->colorspace)
 704             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 705         s->fullrange = vp8_rac_get(c);
 706     }
 707
 708     if ((s->segmentation.enabled = vp8_rac_get(c)))
 709         parse_segment_info(s);
 710     else
 711         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 712
 713     s->filter.simple    = vp8_rac_get(c);
 714     s->filter.level     = vp8_rac_get_uint(c, 6);
 715     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 716
 717     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 718         if (vp8_rac_get(c))
 719             update_lf_deltas(s);
 720
 721     if (setup_partitions(s, buf, buf_size)) {
 722         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 723         return AVERROR_INVALIDDATA;
 724     }
 725
 726     if (!s->macroblocks_base || /* first frame */
 727         width != s->avctx->width || height != s->avctx->height ||
 728         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 729         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 730             return ret;
 731
 732     vp8_get_quants(s);
 733
 734     if (!s->keyframe) {
 735         update_refs(s);
 736         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 737         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 738     }
 739
 740     // if we aren't saving this frame's probabilities for future frames,
 741     // make a copy of the current probabilities
 742     if (!(s->update_probabilities = vp8_rac_get(c)))
 743         s->prob[1] = s->prob[0];
 744
 745     s->update_last = s->keyframe || vp8_rac_get(c);
 746
 747     vp78_update_probability_tables(s);
 748
 749     if ((s->mbskip_enabled = vp8_rac_get(c)))
 750         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 751
 752     if (!s->keyframe) {
 753         s->prob->intra  = vp8_rac_get_uint(c, 8);
 754         s->prob->last   = vp8_rac_get_uint(c, 8);
 755         s->prob->golden = vp8_rac_get_uint(c, 8);
 756         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 757     }
 758
 759     return 0;
 760 }
 761
 762 static av_always_inline
 763 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 764 {
 765     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 766                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 767     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 768                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 769 }
 770
 771 /**
 772  * Motion vector coding, 17.1.
 773  */
 774 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 775 {
 776     int bit, x = 0;
 777
 778     if (vp56_rac_get_prob_branchy(c, p[0])) {
 779         int i;
 780
 781         for (i = 0; i < 3; i++)
 782             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 783         for (i = (vp7 ? 7 : 9); i > 3; i--)
 784             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 785         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 786             x += 8;
 787     } else {
 788         // small_mvtree
 789         const uint8_t *ps = p + 2;
 790         bit = vp56_rac_get_prob(c, *ps);
 791         ps += 1 + 3 * bit;
 792         x  += 4 * bit;
 793         bit = vp56_rac_get_prob(c, *ps);
 794         ps += 1 + bit;
 795         x  += 2 * bit;
 796         x  += vp56_rac_get_prob(c, *ps);
 797     }
 798
 799     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 800 }
 801
 802 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 803 {
 804     return read_mv_component(c, p, 1);
 805 }
 806
 807 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 808 {
 809     return read_mv_component(c, p, 0);
 810 }
 811
 812 static av_always_inline
 813 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 814 {
 815     if (is_vp7)
 816         return vp7_submv_prob;
 817
 818     if (left == top)
 819         return vp8_submv_prob[4 - !!left];
 820     if (!top)
 821         return vp8_submv_prob[2];
 822     return vp8_submv_prob[1 - !!left];
 823 }
 824
 825 /**
 826  * Split motion vector prediction, 16.4.
 827  * @returns the number of motion vectors parsed (2, 4 or 16)
 828  */
 829 static av_always_inline
 830 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 831                     int layout, int is_vp7)
 832 {
 833     int part_idx;
 834     int n, num;
 835     VP8Macroblock *top_mb;
 836     VP8Macroblock *left_mb = &mb[-1];
 837     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 838     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 839     VP56mv *top_mv;
 840     VP56mv *left_mv = left_mb->bmv;
 841     VP56mv *cur_mv  = mb->bmv;
 842
 843     if (!layout) // layout is inlined, s->mb_layout is not
 844         top_mb = &mb[2];
 845     else
 846         top_mb = &mb[-s->mb_width - 1];
 847     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 848     top_mv       = top_mb->bmv;
 849
 850     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 851         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 852             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 853         else
 854             part_idx = VP8_SPLITMVMODE_8x8;
 855     } else {
 856         part_idx = VP8_SPLITMVMODE_4x4;
 857     }
 858
 859     num              = vp8_mbsplit_count[part_idx];
 860     mbsplits_cur     = vp8_mbsplits[part_idx],
 861     firstidx         = vp8_mbfirstidx[part_idx];
 862     mb->partitioning = part_idx;
 863
 864     for (n = 0; n < num; n++) {
 865         int k = firstidx[n];
 866         uint32_t left, above;
 867         const uint8_t *submv_prob;
 868
 869         if (!(k & 3))
 870             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 871         else
 872             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 873         if (k <= 3)
 874             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 875         else
 876             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 877
 878         submv_prob = get_submv_prob(left, above, is_vp7);
 879
 880         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 881             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 882                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 883                     mb->bmv[n].y = mb->mv.y +
 884                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 885                     mb->bmv[n].x = mb->mv.x +
 886                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 887                 } else {
 888                     AV_ZERO32(&mb->bmv[n]);
 889                 }
 890             } else {
 891                 AV_WN32A(&mb->bmv[n], above);
 892             }
 893         } else {
 894             AV_WN32A(&mb->bmv[n], left);
 895         }
 896     }
 897
 898     return num;
 899 }
 900
 901 /**
 902  * The vp7 reference decoder uses a padding macroblock column (added to right
 903  * edge of the frame) to guard against illegal macroblock offsets. The
 904  * algorithm has bugs that permit offsets to straddle the padding column.
 905  * This function replicates those bugs.
 906  *
 907  * @param[out] edge_x macroblock x address
 908  * @param[out] edge_y macroblock y address
 909  *
 910  * @return macroblock offset legal (boolean)
 911  */
 912 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 913                                    int xoffset, int yoffset, int boundary,
 914                                    int *edge_x, int *edge_y)
 915 {
 916     int vwidth = mb_width + 1;
 917     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 918     if (new < boundary || new % vwidth == vwidth - 1)
 919         return 0;
 920     *edge_y = new / vwidth;
 921     *edge_x = new % vwidth;
 922     return 1;
 923 }
 924
 925 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 926 {
 927     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 928 }
 929
 930 static av_always_inline
 931 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 932                     int mb_x, int mb_y, int layout)
 933 {
 934     VP8Macroblock *mb_edge[12];
 935     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 936     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 937     int idx = CNT_ZERO;
 938     VP56mv near_mv[3];
 939     uint8_t cnt[3] = { 0 };
 940     VP56RangeCoder *c = &s->c;
 941     int i;
 942
 943     AV_ZERO32(&near_mv[0]);
 944     AV_ZERO32(&near_mv[1]);
 945     AV_ZERO32(&near_mv[2]);
 946
 947     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 948         const VP7MVPred * pred = &vp7_mv_pred[i];
 949         int edge_x, edge_y;
 950
 951         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 952                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 953             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 954                                              ? s->macroblocks_base + 1 + edge_x +
 955                                                (s->mb_width + 1) * (edge_y + 1)
 956                                              : s->macroblocks + edge_x +
 957                                                (s->mb_height - edge_y - 1) * 2;
 958             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 959             if (mv) {
 960                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 961                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 962                         idx = CNT_NEAREST;
 963                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 964                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 965                             continue;
 966                         idx = CNT_NEAR;
 967                     } else {
 968                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 969                         idx = CNT_NEAR;
 970                     }
 971                 } else {
 972                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 973                     idx = CNT_NEAREST;
 974                 }
 975             } else {
 976                 idx = CNT_ZERO;
 977             }
 978         } else {
 979             idx = CNT_ZERO;
 980         }
 981         cnt[idx] += vp7_mv_pred[i].score;
 982     }
 983
 984     mb->partitioning = VP8_SPLITMVMODE_NONE;
 985
 986     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 987         mb->mode = VP8_MVMODE_MV;
 988
 989         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 990
 991             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 992
 993                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 994                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 995                 else
 996                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 997
 998                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 999                     mb->mode = VP8_MVMODE_SPLIT;
1000                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1001                 } else {
1002                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1003                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1004                     mb->bmv[0] = mb->mv;
1005                 }
1006             } else {
1007                 mb->mv = near_mv[CNT_NEAR];
1008                 mb->bmv[0] = mb->mv;
1009             }
1010         } else {
1011             mb->mv = near_mv[CNT_NEAREST];
1012             mb->bmv[0] = mb->mv;
1013         }
1014     } else {
1015         mb->mode = VP8_MVMODE_ZERO;
1016         AV_ZERO32(&mb->mv);
1017         mb->bmv[0] = mb->mv;
1018     }
1019 }
1020
1021 static av_always_inline
1022 void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1023                     int mb_x, int mb_y, int layout)
1024 {
1025     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1026                                   mb - 1 /* left */,
1027                                   0      /* top-left */ };
1028     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1029     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1030     int idx = CNT_ZERO;
1031     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1032     int8_t *sign_bias = s->sign_bias;
1033     VP56mv near_mv[4];
1034     uint8_t cnt[4] = { 0 };
1035     VP56RangeCoder *c = &s->c;
1036
1037     if (!layout) { // layout is inlined (s->mb_layout is not)
1038         mb_edge[0] = mb + 2;
1039         mb_edge[2] = mb + 1;
1040     } else {
1041         mb_edge[0] = mb - s->mb_width - 1;
1042         mb_edge[2] = mb - s->mb_width - 2;
1043     }
1044
1045     AV_ZERO32(&near_mv[0]);
1046     AV_ZERO32(&near_mv[1]);
1047     AV_ZERO32(&near_mv[2]);
1048
1049     /* Process MB on top, left and top-left */
1050 #define MV_EDGE_CHECK(n)                                                      \
1051     {                                                                         \
1052         VP8Macroblock *edge = mb_edge[n];                                     \
1053         int edge_ref = edge->ref_frame;                                       \
1054         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1055             uint32_t mv = AV_RN32A(&edge->mv);                                \
1056             if (mv) {                                                         \
1057                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1058                     /* SWAR negate of the values in mv. */                    \
1059                     mv = ~mv;                                                 \
1060                     mv = ((mv & 0x7fff7fff) +                                 \
1061                           0x00010001) ^ (mv & 0x80008000);                    \
1062                 }                                                             \
1063                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1064                     AV_WN32A(&near_mv[++idx], mv);                            \
1065                 cnt[idx] += 1 + (n != 2);                                     \
1066             } else                                                            \
1067                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1068         }                                                                     \
1069     }
1070
1071     MV_EDGE_CHECK(0)
1072     MV_EDGE_CHECK(1)
1073     MV_EDGE_CHECK(2)
1074
1075     mb->partitioning = VP8_SPLITMVMODE_NONE;
1076     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1077         mb->mode = VP8_MVMODE_MV;
1078
1079         /* If we have three distinct MVs, merge first and last if they're the same */
1080         if (cnt[CNT_SPLITMV] &&
1081             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1082             cnt[CNT_NEAREST] += 1;
1083
1084         /* Swap near and nearest if necessary */
1085         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1086             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1087             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1088         }
1089
1090         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1091             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1092                 /* Choose the best mv out of 0,0 and the nearest mv */
1093                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1094                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1095                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1096                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1097
1098                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1099                     mb->mode = VP8_MVMODE_SPLIT;
1100                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1101                 } else {
1102                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1103                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1104                     mb->bmv[0] = mb->mv;
1105                 }
1106             } else {
1107                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
1108                 mb->bmv[0] = mb->mv;
1109             }
1110         } else {
1111             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
1112             mb->bmv[0] = mb->mv;
1113         }
1114     } else {
1115         mb->mode = VP8_MVMODE_ZERO;
1116         AV_ZERO32(&mb->mv);
1117         mb->bmv[0] = mb->mv;
1118     }
1119 }
1120
1121 static av_always_inline
1122 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1123                            int mb_x, int keyframe, int layout)
1124 {
1125     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1126
1127     if (layout) {
1128         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1129         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1130     }
1131     if (keyframe) {
1132         int x, y;
1133         uint8_t *top;
1134         uint8_t *const left = s->intra4x4_pred_mode_left;
1135         if (layout)
1136             top = mb->intra4x4_pred_mode_top;
1137         else
1138             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1139         for (y = 0; y < 4; y++) {
1140             for (x = 0; x < 4; x++) {
1141                 const uint8_t *ctx;
1142                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1143                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1144                 left[y]   = top[x] = *intra4x4;
1145                 intra4x4++;
1146             }
1147         }
1148     } else {
1149         int i;
1150         for (i = 0; i < 16; i++)
1151             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1152                                            vp8_pred4x4_prob_inter);
1153     }
1154 }
1155
1156 static av_always_inline
1157 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1158                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1159 {
1160     VP56RangeCoder *c = &s->c;
1161     const char *vp7_feature_name[] = { "q-index",
1162                                        "lf-delta",
1163                                        "partial-golden-update",
1164                                        "blit-pitch" };
1165     if (is_vp7) {
1166         int i;
1167         *segment = 0;
1168         for (i = 0; i < 4; i++) {
1169             if (s->feature_enabled[i]) {
1170                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1171                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1172                                                    s->feature_index_prob[i]);
1173                       av_log(s->avctx, AV_LOG_WARNING,
1174                              "Feature %s present in macroblock (value 0x%x)\n",
1175                              vp7_feature_name[i], s->feature_value[i][index]);
1176                 }
1177            }
1178         }
1179     } else if (s->segmentation.update_map) {
1180         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1181         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1182     } else if (s->segmentation.enabled)
1183         *segment = ref ? *ref : *segment;
1184     mb->segment = *segment;
1185
1186     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1187
1188     if (s->keyframe) {
1189         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1190                                     vp8_pred16x16_prob_intra);
1191
1192         if (mb->mode == MODE_I4x4) {
1193             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1194         } else {
1195             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1196                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1197             if (s->mb_layout)
1198                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1199             else
1200                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1201             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1202         }
1203
1204         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1205                                                 vp8_pred8x8c_prob_intra);
1206         mb->ref_frame        = VP56_FRAME_CURRENT;
1207     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1208         // inter MB, 16.2
1209         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1210             mb->ref_frame =
1211                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1212                                                                    : VP56_FRAME_GOLDEN;
1213         else
1214             mb->ref_frame = VP56_FRAME_PREVIOUS;
1215         s->ref_count[mb->ref_frame - 1]++;
1216
1217         // motion vectors, 16.3
1218         if (is_vp7)
1219             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1220         else
1221             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1222     } else {
1223         // intra MB, 16.1
1224         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1225
1226         if (mb->mode == MODE_I4x4)
1227             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1228
1229         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1230                                                 s->prob->pred8x8c);
1231         mb->ref_frame        = VP56_FRAME_CURRENT;
1232         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1233         AV_ZERO32(&mb->bmv[0]);
1234     }
1235 }
1236
1237 /**
1238  * @param r     arithmetic bitstream reader context
1239  * @param block destination for block coefficients
1240  * @param probs probabilities to use when reading trees from the bitstream
1241  * @param i     initial coeff index, 0 unless a separate DC block is coded
1242  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1243  *
1244  * @return 0 if no coeffs were decoded
1245  *         otherwise, the index of the last coeff decoded plus one
1246  */
1247 static av_always_inline
1248 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1249                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1250                                  int i, uint8_t *token_prob, int16_t qmul[2],
1251                                  const uint8_t scan[16], int vp7)
1252 {
1253     VP56RangeCoder c = *r;
1254     goto skip_eob;
1255     do {
1256         int coeff;
1257 restart:
1258         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1259             break;
1260
1261 skip_eob:
1262         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1263             if (++i == 16)
1264                 break; // invalid input; blocks should end with EOB
1265             token_prob = probs[i][0];
1266             if (vp7)
1267                 goto restart;
1268             goto skip_eob;
1269         }
1270
1271         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1272             coeff = 1;
1273             token_prob = probs[i + 1][1];
1274         } else {
1275             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1276                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1277                 if (coeff)
1278                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1279                 coeff += 2;
1280             } else {
1281                 // DCT_CAT*
1282                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1283                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1284                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1285                     } else {                                    // DCT_CAT2
1286                         coeff  = 7;
1287                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1288                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1289                     }
1290                 } else {    // DCT_CAT3 and up
1291                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1292                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1293                     int cat = (a << 1) + b;
1294                     coeff  = 3 + (8 << cat);
1295                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1296                 }
1297             }
1298             token_prob = probs[i + 1][2];
1299         }
1300         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1301     } while (++i < 16);
1302
1303     *r = c;
1304     return i;
1305 }
1306
1307 static av_always_inline
1308 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1309 {
1310     int16_t dc = block[0];
1311     int ret = 0;
1312
1313     if (pred[1] > 3) {
1314         dc += pred[0];
1315         ret = 1;
1316     }
1317
1318     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1319         block[0] = pred[0] = dc;
1320         pred[1] = 0;
1321     } else {
1322         if (pred[0] == dc)
1323             pred[1]++;
1324         block[0] = pred[0] = dc;
1325     }
1326
1327     return ret;
1328 }
1329
1330 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1331                                             int16_t block[16],
1332                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1333                                             int i, uint8_t *token_prob,
1334                                             int16_t qmul[2],
1335                                             const uint8_t scan[16])
1336 {
1337     return decode_block_coeffs_internal(r, block, probs, i,
1338                                         token_prob, qmul, scan, IS_VP7);
1339 }
1340
1341 #ifndef vp8_decode_block_coeffs_internal
1342 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1343                                             int16_t block[16],
1344                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1345                                             int i, uint8_t *token_prob,
1346                                             int16_t qmul[2])
1347 {
1348     return decode_block_coeffs_internal(r, block, probs, i,
1349                                         token_prob, qmul, zigzag_scan, IS_VP8);
1350 }
1351 #endif
1352
1353 /**
1354  * @param c          arithmetic bitstream reader context
1355  * @param block      destination for block coefficients
1356  * @param probs      probabilities to use when reading trees from the bitstream
1357  * @param i          initial coeff index, 0 unless a separate DC block is coded
1358  * @param zero_nhood the initial prediction context for number of surrounding
1359  *                   all-zero blocks (only left/top, so 0-2)
1360  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1361  * @param scan       scan pattern (VP7 only)
1362  *
1363  * @return 0 if no coeffs were decoded
1364  *         otherwise, the index of the last coeff decoded plus one
1365  */
1366 static av_always_inline
1367 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1368                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1369                         int i, int zero_nhood, int16_t qmul[2],
1370                         const uint8_t scan[16], int vp7)
1371 {
1372     uint8_t *token_prob = probs[i][zero_nhood];
1373     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1374         return 0;
1375     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1376                                                   token_prob, qmul, scan)
1377                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1378                                                   token_prob, qmul);
1379 }
1380
1381 static av_always_inline
1382 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1383                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1384                       int is_vp7)
1385 {
1386     int i, x, y, luma_start = 0, luma_ctx = 3;
1387     int nnz_pred, nnz, nnz_total = 0;
1388     int segment = mb->segment;
1389     int block_dc = 0;
1390
1391     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1392         nnz_pred = t_nnz[8] + l_nnz[8];
1393
1394         // decode DC values and do hadamard
1395         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1396                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1397                                   zigzag_scan, is_vp7);
1398         l_nnz[8] = t_nnz[8] = !!nnz;
1399
1400         if (is_vp7 && mb->mode > MODE_I4x4) {
1401             nnz |=  inter_predict_dc(td->block_dc,
1402                                      s->inter_dc_pred[mb->ref_frame - 1]);
1403         }
1404
1405         if (nnz) {
1406             nnz_total += nnz;
1407             block_dc   = 1;
1408             if (nnz == 1)
1409                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1410             else
1411                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1412         }
1413         luma_start = 1;
1414         luma_ctx   = 0;
1415     }
1416
1417     // luma blocks
1418     for (y = 0; y < 4; y++)
1419         for (x = 0; x < 4; x++) {
1420             nnz_pred = l_nnz[y] + t_nnz[x];
1421             nnz = decode_block_coeffs(c, td->block[y][x],
1422                                       s->prob->token[luma_ctx],
1423                                       luma_start, nnz_pred,
1424                                       s->qmat[segment].luma_qmul,
1425                                       s->prob[0].scan, is_vp7);
1426             /* nnz+block_dc may be one more than the actual last index,
1427              * but we don't care */
1428             td->non_zero_count_cache[y][x] = nnz + block_dc;
1429             t_nnz[x] = l_nnz[y] = !!nnz;
1430             nnz_total += nnz;
1431         }
1432
1433     // chroma blocks
1434     // TODO: what to do about dimensions? 2nd dim for luma is x,
1435     // but for chroma it's (y<<1)|x
1436     for (i = 4; i < 6; i++)
1437         for (y = 0; y < 2; y++)
1438             for (x = 0; x < 2; x++) {
1439                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1440                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1441                                           s->prob->token[2], 0, nnz_pred,
1442                                           s->qmat[segment].chroma_qmul,
1443                                           s->prob[0].scan, is_vp7);
1444                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1445                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1446                 nnz_total += nnz;
1447             }
1448
1449     // if there were no coded coeffs despite the macroblock not being marked skip,
1450     // we MUST not do the inner loop filter and should not do IDCT
1451     // Since skip isn't used for bitstream prediction, just manually set it.
1452     if (!nnz_total)
1453         mb->skip = 1;
1454 }
1455
1456 static av_always_inline
1457 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1458                       uint8_t *src_cb, uint8_t *src_cr,
1459                       int linesize, int uvlinesize, int simple)
1460 {
1461     AV_COPY128(top_border, src_y + 15 * linesize);
1462     if (!simple) {
1463         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1464         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1465     }
1466 }
1467
1468 static av_always_inline
1469 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1470                     uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
1471                     int mb_y, int mb_width, int simple, int xchg)
1472 {
1473     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1474     src_y  -= linesize;
1475     src_cb -= uvlinesize;
1476     src_cr -= uvlinesize;
1477
1478 #define XCHG(a, b, xchg)                                                      \
1479     do {                                                                      \
1480         if (xchg)                                                             \
1481             AV_SWAP64(b, a);                                                  \
1482         else                                                                  \
1483             AV_COPY64(b, a);                                                  \
1484     } while (0)
1485
1486     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1487     XCHG(top_border, src_y, xchg);
1488     XCHG(top_border + 8, src_y + 8, 1);
1489     if (mb_x < mb_width - 1)
1490         XCHG(top_border + 32, src_y + 16, 1);
1491
1492     // only copy chroma for normal loop filter
1493     // or to initialize the top row to 127
1494     if (!simple || !mb_y) {
1495         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1496         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1497         XCHG(top_border + 16, src_cb, 1);
1498         XCHG(top_border + 24, src_cr, 1);
1499     }
1500 }
1501
1502 static av_always_inline
1503 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1504 {
1505     if (!mb_x)
1506         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1507     else
1508         return mb_y ? mode : LEFT_DC_PRED8x8;
1509 }
1510
1511 static av_always_inline
1512 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1513 {
1514     if (!mb_x)
1515         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1516     else
1517         return mb_y ? mode : HOR_PRED8x8;
1518 }
1519
1520 static av_always_inline
1521 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1522 {
1523     switch (mode) {
1524     case DC_PRED8x8:
1525         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1526     case VERT_PRED8x8:
1527         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1528     case HOR_PRED8x8:
1529         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1530     case PLANE_PRED8x8: /* TM */
1531         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1532     }
1533     return mode;
1534 }
1535
1536 static av_always_inline
1537 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1538 {
1539     if (!mb_x) {
1540         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1541     } else {
1542         return mb_y ? mode : HOR_VP8_PRED;
1543     }
1544 }
1545
1546 static av_always_inline
1547 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1548                                      int *copy_buf, int vp7)
1549 {
1550     switch (mode) {
1551     case VERT_PRED:
1552         if (!mb_x && mb_y) {
1553             *copy_buf = 1;
1554             return mode;
1555         }
1556         /* fall-through */
1557     case DIAG_DOWN_LEFT_PRED:
1558     case VERT_LEFT_PRED:
1559         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1560     case HOR_PRED:
1561         if (!mb_y) {
1562             *copy_buf = 1;
1563             return mode;
1564         }
1565         /* fall-through */
1566     case HOR_UP_PRED:
1567         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1568     case TM_VP8_PRED:
1569         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1570     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1571                    * as 16x16/8x8 DC */
1572     case DIAG_DOWN_RIGHT_PRED:
1573     case VERT_RIGHT_PRED:
1574     case HOR_DOWN_PRED:
1575         if (!mb_y || !mb_x)
1576             *copy_buf = 1;
1577         return mode;
1578     }
1579     return mode;
1580 }
1581
1582 static av_always_inline
1583 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1584                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1585 {
1586     int x, y, mode, nnz;
1587     uint32_t tr;
1588
1589     /* for the first row, we need to run xchg_mb_border to init the top edge
1590      * to 127 otherwise, skip it if we aren't going to deblock */
1591     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1592         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1593                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1594                        s->filter.simple, 1);
1595
1596     if (mb->mode < MODE_I4x4) {
1597         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1598         s->hpc.pred16x16[mode](dst[0], s->linesize);
1599     } else {
1600         uint8_t *ptr = dst[0];
1601         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1602         const uint8_t lo = is_vp7 ? 128 : 127;
1603         const uint8_t hi = is_vp7 ? 128 : 129;
1604         uint8_t tr_top[4] = { lo, lo, lo, lo };
1605
1606         // all blocks on the right edge of the macroblock use bottom edge
1607         // the top macroblock for their topright edge
1608         uint8_t *tr_right = ptr - s->linesize + 16;
1609
1610         // if we're on the right edge of the frame, said edge is extended
1611         // from the top macroblock
1612         if (mb_y && mb_x == s->mb_width - 1) {
1613             tr       = tr_right[-1] * 0x01010101u;
1614             tr_right = (uint8_t *) &tr;
1615         }
1616
1617         if (mb->skip)
1618             AV_ZERO128(td->non_zero_count_cache);
1619
1620         for (y = 0; y < 4; y++) {
1621             uint8_t *topright = ptr + 4 - s->linesize;
1622             for (x = 0; x < 4; x++) {
1623                 int copy = 0, linesize = s->linesize;
1624                 uint8_t *dst = ptr + 4 * x;
1625                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1626
1627                 if ((y == 0 || x == 3) && mb_y == 0) {
1628                     topright = tr_top;
1629                 } else if (x == 3)
1630                     topright = tr_right;
1631
1632                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1633                                                         mb_y + y, &copy, is_vp7);
1634                 if (copy) {
1635                     dst      = copy_dst + 12;
1636                     linesize = 8;
1637                     if (!(mb_y + y)) {
1638                         copy_dst[3] = lo;
1639                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1640                     } else {
1641                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1642                         if (!(mb_x + x)) {
1643                             copy_dst[3] = hi;
1644                         } else {
1645                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1646                         }
1647                     }
1648                     if (!(mb_x + x)) {
1649                         copy_dst[11] =
1650                         copy_dst[19] =
1651                         copy_dst[27] =
1652                         copy_dst[35] = hi;
1653                     } else {
1654                         copy_dst[11] = ptr[4 * x                   - 1];
1655                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1656                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1657                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1658                     }
1659                 }
1660                 s->hpc.pred4x4[mode](dst, topright, linesize);
1661                 if (copy) {
1662                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1663                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1664                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1665                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1666                 }
1667
1668                 nnz = td->non_zero_count_cache[y][x];
1669                 if (nnz) {
1670                     if (nnz == 1)
1671                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1672                                                   td->block[y][x], s->linesize);
1673                     else
1674                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1675                                                td->block[y][x], s->linesize);
1676                 }
1677                 topright += 4;
1678             }
1679
1680             ptr      += 4 * s->linesize;
1681             intra4x4 += 4;
1682         }
1683     }
1684
1685     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1686                                             mb_x, mb_y, is_vp7);
1687     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1688     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1689
1690     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1691         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1692                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1693                        s->filter.simple, 0);
1694 }
1695
/* Sub-pixel helper tables, indexed by [table][fractional position 0..7]. */
static const uint8_t subpel_idx[3][8] = {
    /* [0]: number of extra pixels on the left/top side; also used as the
     *      index into the mc_func[][] function-pointer tables */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1]: total number of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2]: number of extra pixels on the right/bottom side */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1702
1703 /**
1704  * luma MC function
1705  *
1706  * @param s        VP8 decoding context
1707  * @param dst      target buffer for block data at block position
1708  * @param ref      reference picture buffer at origin (0, 0)
1709  * @param mv       motion vector (relative to block position) to get pixel data from
1710  * @param x_off    horizontal position of block from origin (0, 0)
1711  * @param y_off    vertical position of block from origin (0, 0)
1712  * @param block_w  width of block (16, 8 or 4)
1713  * @param block_h  height of block (always same as block_w)
1714  * @param width    width of src/dst plane data
1715  * @param height   height of src/dst plane data
1716  * @param linesize size of a single line of plane data, including padding
1717  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1718  */
1719 static av_always_inline
1720 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1721                  ThreadFrame *ref, const VP56mv *mv,
1722                  int x_off, int y_off, int block_w, int block_h,
1723                  int width, int height, ptrdiff_t linesize,
1724                  vp8_mc_func mc_func[3][3])
1725 {
1726     uint8_t *src = ref->f->data[0];
1727
1728     if (AV_RN32A(mv)) {
1729         int src_linesize = linesize;
1730
1731         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1732         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1733
1734         x_off += mv->x >> 2;
1735         y_off += mv->y >> 2;
1736
1737         // edge emulation
1738         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1739         src += y_off * linesize + x_off;
1740         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1741             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1742             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1743                                      src - my_idx * linesize - mx_idx,
1744                                      EDGE_EMU_LINESIZE, linesize,
1745                                      block_w + subpel_idx[1][mx],
1746                                      block_h + subpel_idx[1][my],
1747                                      x_off - mx_idx, y_off - my_idx,
1748                                      width, height);
1749             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1750             src_linesize = EDGE_EMU_LINESIZE;
1751         }
1752         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1753     } else {
1754         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1755         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1756                       linesize, block_h, 0, 0);
1757     }
1758 }
1759
1760 /**
1761  * chroma MC function
1762  *
1763  * @param s        VP8 decoding context
1764  * @param dst1     target buffer for block data at block position (U plane)
1765  * @param dst2     target buffer for block data at block position (V plane)
1766  * @param ref      reference picture buffer at origin (0, 0)
1767  * @param mv       motion vector (relative to block position) to get pixel data from
1768  * @param x_off    horizontal position of block from origin (0, 0)
1769  * @param y_off    vertical position of block from origin (0, 0)
1770  * @param block_w  width of block (16, 8 or 4)
1771  * @param block_h  height of block (always same as block_w)
1772  * @param width    width of src/dst plane data
1773  * @param height   height of src/dst plane data
1774  * @param linesize size of a single line of plane data, including padding
1775  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1776  */
1777 static av_always_inline
1778 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1779                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1780                    int x_off, int y_off, int block_w, int block_h,
1781                    int width, int height, ptrdiff_t linesize,
1782                    vp8_mc_func mc_func[3][3])
1783 {
1784     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1785
1786     if (AV_RN32A(mv)) {
1787         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1788         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1789
1790         x_off += mv->x >> 3;
1791         y_off += mv->y >> 3;
1792
1793         // edge emulation
1794         src1 += y_off * linesize + x_off;
1795         src2 += y_off * linesize + x_off;
1796         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1797         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1798             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1799             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1800                                      src1 - my_idx * linesize - mx_idx,
1801                                      EDGE_EMU_LINESIZE, linesize,
1802                                      block_w + subpel_idx[1][mx],
1803                                      block_h + subpel_idx[1][my],
1804                                      x_off - mx_idx, y_off - my_idx, width, height);
1805             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1806             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1807
1808             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1809                                      src2 - my_idx * linesize - mx_idx,
1810                                      EDGE_EMU_LINESIZE, linesize,
1811                                      block_w + subpel_idx[1][mx],
1812                                      block_h + subpel_idx[1][my],
1813                                      x_off - mx_idx, y_off - my_idx, width, height);
1814             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1815             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1816         } else {
1817             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1818             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1819         }
1820     } else {
1821         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1822         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1823         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1824     }
1825 }
1826
1827 static av_always_inline
1828 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1829                  ThreadFrame *ref_frame, int x_off, int y_off,
1830                  int bx_off, int by_off, int block_w, int block_h,
1831                  int width, int height, VP56mv *mv)
1832 {
1833     VP56mv uvmv = *mv;
1834
1835     /* Y */
1836     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1837                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1838                 block_w, block_h, width, height, s->linesize,
1839                 s->put_pixels_tab[block_w == 8]);
1840
1841     /* U/V */
1842     if (s->profile == 3) {
1843         /* this block only applies VP8; it is safe to check
1844          * only the profile, as VP7 profile <= 1 */
1845         uvmv.x &= ~7;
1846         uvmv.y &= ~7;
1847     }
1848     x_off   >>= 1;
1849     y_off   >>= 1;
1850     bx_off  >>= 1;
1851     by_off  >>= 1;
1852     width   >>= 1;
1853     height  >>= 1;
1854     block_w >>= 1;
1855     block_h >>= 1;
1856     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1857                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1858                   &uvmv, x_off + bx_off, y_off + by_off,
1859                   block_w, block_h, width, height, s->uvlinesize,
1860                   s->put_pixels_tab[1 + (block_w == 4)]);
1861 }
1862
1863 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1864  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1865 static av_always_inline
1866 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1867                      int mb_xy, int ref)
1868 {
1869     /* Don't prefetch refs that haven't been used very often this frame. */
1870     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1871         int x_off = mb_x << 4, y_off = mb_y << 4;
1872         int mx = (mb->mv.x >> 2) + x_off + 8;
1873         int my = (mb->mv.y >> 2) + y_off;
1874         uint8_t **src = s->framep[ref]->tf.f->data;
1875         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1876         /* For threading, a ff_thread_await_progress here might be useful, but
1877          * it actually slows down the decoder. Since a bad prefetch doesn't
1878          * generate bad decoder output, we don't run it here. */
1879         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1880         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1881         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1882     }
1883 }
1884
1885 /**
1886  * Apply motion vectors to prediction buffer, chapter 18.
1887  */
1888 static av_always_inline
1889 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1890                    VP8Macroblock *mb, int mb_x, int mb_y)
1891 {
1892     int x_off = mb_x << 4, y_off = mb_y << 4;
1893     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1894     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1895     VP56mv *bmv = mb->bmv;
1896
1897     switch (mb->partitioning) {
1898     case VP8_SPLITMVMODE_NONE:
1899         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1900                     0, 0, 16, 16, width, height, &mb->mv);
1901         break;
1902     case VP8_SPLITMVMODE_4x4: {
1903         int x, y;
1904         VP56mv uvmv;
1905
1906         /* Y */
1907         for (y = 0; y < 4; y++) {
1908             for (x = 0; x < 4; x++) {
1909                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1910                             ref, &bmv[4 * y + x],
1911                             4 * x + x_off, 4 * y + y_off, 4, 4,
1912                             width, height, s->linesize,
1913                             s->put_pixels_tab[2]);
1914             }
1915         }
1916
1917         /* U/V */
1918         x_off  >>= 1;
1919         y_off  >>= 1;
1920         width  >>= 1;
1921         height >>= 1;
1922         for (y = 0; y < 2; y++) {
1923             for (x = 0; x < 2; x++) {
1924                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1925                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1926                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1927                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1928                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1929                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1930                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1931                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1932                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1933                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1934                 if (s->profile == 3) {
1935                     uvmv.x &= ~7;
1936                     uvmv.y &= ~7;
1937                 }
1938                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1939                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1940                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1941                               width, height, s->uvlinesize,
1942                               s->put_pixels_tab[2]);
1943             }
1944         }
1945         break;
1946     }
1947     case VP8_SPLITMVMODE_16x8:
1948         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1949                     0, 0, 16, 8, width, height, &bmv[0]);
1950         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1951                     0, 8, 16, 8, width, height, &bmv[1]);
1952         break;
1953     case VP8_SPLITMVMODE_8x16:
1954         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1955                     0, 0, 8, 16, width, height, &bmv[0]);
1956         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1957                     8, 0, 8, 16, width, height, &bmv[1]);
1958         break;
1959     case VP8_SPLITMVMODE_8x8:
1960         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1961                     0, 0, 8, 8, width, height, &bmv[0]);
1962         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1963                     8, 0, 8, 8, width, height, &bmv[1]);
1964         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1965                     0, 8, 8, 8, width, height, &bmv[2]);
1966         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1967                     8, 8, 8, 8, width, height, &bmv[3]);
1968         break;
1969     }
1970 }
1971
1972 static av_always_inline
1973 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1974 {
1975     int x, y, ch;
1976
1977     if (mb->mode != MODE_I4x4) {
1978         uint8_t *y_dst = dst[0];
1979         for (y = 0; y < 4; y++) {
1980             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1981             if (nnz4) {
1982                 if (nnz4 & ~0x01010101) {
1983                     for (x = 0; x < 4; x++) {
1984                         if ((uint8_t) nnz4 == 1)
1985                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1986                                                       td->block[y][x],
1987                                                       s->linesize);
1988                         else if ((uint8_t) nnz4 > 1)
1989                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1990                                                    td->block[y][x],
1991                                                    s->linesize);
1992                         nnz4 >>= 8;
1993                         if (!nnz4)
1994                             break;
1995                     }
1996                 } else {
1997                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1998                 }
1999             }
2000             y_dst += 4 * s->linesize;
2001         }
2002     }
2003
2004     for (ch = 0; ch < 2; ch++) {
2005         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2006         if (nnz4) {
2007             uint8_t *ch_dst = dst[1 + ch];
2008             if (nnz4 & ~0x01010101) {
2009                 for (y = 0; y < 2; y++) {
2010                     for (x = 0; x < 2; x++) {
2011                         if ((uint8_t) nnz4 == 1)
2012                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2013                                                       td->block[4 + ch][(y << 1) + x],
2014                                                       s->uvlinesize);
2015                         else if ((uint8_t) nnz4 > 1)
2016                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2017                                                    td->block[4 + ch][(y << 1) + x],
2018                                                    s->uvlinesize);
2019                         nnz4 >>= 8;
2020                         if (!nnz4)
2021                             goto chroma_idct_end;
2022                     }
2023                     ch_dst += 4 * s->uvlinesize;
2024                 }
2025             } else {
2026                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2027             }
2028         }
2029 chroma_idct_end:
2030         ;
2031     }
2032 }
2033
2034 static av_always_inline
2035 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2036                          VP8FilterStrength *f, int is_vp7)
2037 {
2038     int interior_limit, filter_level;
2039
2040     if (s->segmentation.enabled) {
2041         filter_level = s->segmentation.filter_level[mb->segment];
2042         if (!s->segmentation.absolute_vals)
2043             filter_level += s->filter.level;
2044     } else
2045         filter_level = s->filter.level;
2046
2047     if (s->lf_delta.enabled) {
2048         filter_level += s->lf_delta.ref[mb->ref_frame];
2049         filter_level += s->lf_delta.mode[mb->mode];
2050     }
2051
2052     filter_level = av_clip_uintp2(filter_level, 6);
2053
2054     interior_limit = filter_level;
2055     if (s->filter.sharpness) {
2056         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2057         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2058     }
2059     interior_limit = FFMAX(interior_limit, 1);
2060
2061     f->filter_level = filter_level;
2062     f->inner_limit = interior_limit;
2063     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2064                       mb->mode == VP8_MVMODE_SPLIT;
2065 }
2066
2067 static av_always_inline
2068 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2069                int mb_x, int mb_y, int is_vp7)
2070 {
2071     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2072     int filter_level = f->filter_level;
2073     int inner_limit = f->inner_limit;
2074     int inner_filter = f->inner_filter;
2075     int linesize = s->linesize;
2076     int uvlinesize = s->uvlinesize;
2077     static const uint8_t hev_thresh_lut[2][64] = {
2078         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2079           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2080           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2081           3, 3, 3, 3 },
2082         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2083           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2084           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2085           2, 2, 2, 2 }
2086     };
2087
2088     if (!filter_level)
2089         return;
2090
2091     if (is_vp7) {
2092         bedge_lim_y  = filter_level;
2093         bedge_lim_uv = filter_level * 2;
2094         mbedge_lim   = filter_level + 2;
2095     } else {
2096         bedge_lim_y  =
2097         bedge_lim_uv = filter_level * 2 + inner_limit;
2098         mbedge_lim   = bedge_lim_y + 4;
2099     }
2100
2101     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2102
2103     if (mb_x) {
2104         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2105                                        mbedge_lim, inner_limit, hev_thresh);
2106         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2107                                        mbedge_lim, inner_limit, hev_thresh);
2108     }
2109
2110 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2111     if (cond && inner_filter) {                                               \
2112         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2113                                              bedge_lim_y, inner_limit,        \
2114                                              hev_thresh);                     \
2115         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2116                                              bedge_lim_y, inner_limit,        \
2117                                              hev_thresh);                     \
2118         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2119                                              bedge_lim_y, inner_limit,        \
2120                                              hev_thresh);                     \
2121         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2122                                              uvlinesize,  bedge_lim_uv,       \
2123                                              inner_limit, hev_thresh);        \
2124     }
2125
2126     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2127
2128     if (mb_y) {
2129         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2130                                        mbedge_lim, inner_limit, hev_thresh);
2131         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2132                                        mbedge_lim, inner_limit, hev_thresh);
2133     }
2134
2135     if (inner_filter) {
2136         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2137                                              linesize, bedge_lim_y,
2138                                              inner_limit, hev_thresh);
2139         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2140                                              linesize, bedge_lim_y,
2141                                              inner_limit, hev_thresh);
2142         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2143                                              linesize, bedge_lim_y,
2144                                              inner_limit, hev_thresh);
2145         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2146                                              dst[2] +  4 * uvlinesize,
2147                                              uvlinesize, bedge_lim_uv,
2148                                              inner_limit, hev_thresh);
2149     }
2150
2151     H_LOOP_FILTER_16Y_INNER(is_vp7)
2152 }
2153
2154 static av_always_inline
2155 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2156                       int mb_x, int mb_y)
2157 {
2158     int mbedge_lim, bedge_lim;
2159     int filter_level = f->filter_level;
2160     int inner_limit  = f->inner_limit;
2161     int inner_filter = f->inner_filter;
2162     int linesize     = s->linesize;
2163
2164     if (!filter_level)
2165         return;
2166
2167     bedge_lim  = 2 * filter_level + inner_limit;
2168     mbedge_lim = bedge_lim + 4;
2169
2170     if (mb_x)
2171         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2172     if (inner_filter) {
2173         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2174         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2175         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2176     }
2177
2178     if (mb_y)
2179         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2180     if (inner_filter) {
2181         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2182         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2183         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2184     }
2185 }
2186
2187 #define MARGIN (16 << 2)
2188 static av_always_inline
2189 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2190                                     VP8Frame *prev_frame, int is_vp7)
2191 {
2192     VP8Context *s = avctx->priv_data;
2193     int mb_x, mb_y;
2194
2195     s->mv_min.y = -MARGIN;
2196     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2197     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2198         VP8Macroblock *mb = s->macroblocks_base +
2199                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2200         int mb_xy = mb_y * s->mb_width;
2201
2202         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2203
2204         s->mv_min.x = -MARGIN;
2205         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2206         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2207             if (mb_y == 0)
2208                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2209                          DC_PRED * 0x01010101);
2210             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2211                            prev_frame && prev_frame->seg_map ?
2212                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2213             s->mv_min.x -= 64;
2214             s->mv_max.x -= 64;
2215         }
2216         s->mv_min.y -= 64;
2217         s->mv_max.y -= 64;
2218     }
2219 }
2220
2221 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2222                                    VP8Frame *prev_frame)
2223 {
2224     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2225 }
2226
2227 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2228                                    VP8Frame *prev_frame)
2229 {
2230     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2231 }
2232
2233 #if HAVE_THREADS
2234 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
2235     do {                                                                      \
2236         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
2237         if (otd->thread_mb_pos < tmp) {                                       \
2238             pthread_mutex_lock(&otd->lock);                                   \
2239             td->wait_mb_pos = tmp;                                            \
2240             do {                                                              \
2241                 if (otd->thread_mb_pos >= tmp)                                \
2242                     break;                                                    \
2243                 pthread_cond_wait(&otd->cond, &otd->lock);                    \
2244             } while (1);                                                      \
2245             td->wait_mb_pos = INT_MAX;                                        \
2246             pthread_mutex_unlock(&otd->lock);                                 \
2247         }                                                                     \
2248     } while (0)
2249
/*
 * Publish the decoding position of a thread and wake up waiters.
 *
 * The position packs the macroblock row into the bits above bit 15 and the
 * column (masked to 16 bits) into the low 16 bits; it is stored in
 * td->thread_mb_pos.  td->cond is broadcast (with td->lock held) only when
 * slice threading is active with more than one job and either
 *  - next_td or prev_td is NULL, or
 *  - a neighbour other than td has a wait_mb_pos that this position reaches.
 *
 * The expansion picks up `avctx`, `num_jobs`, `next_td` and `prev_td` from
 * the scope of the call site.  Every macro argument is parenthesized so
 * that arbitrary expressions (ternaries, bitwise ORs, ...) expand correctly.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
2268 #else
/* Fallback definitions for the #else branch: both synchronisation hooks
 * expand to an empty statement and evaluate none of their arguments. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x)                        do { } while (0)
2271 #endif
2272
2273 static av_always_inline void decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2274                                         int jobnr, int threadnr, int is_vp7)
2275 {
2276     VP8Context *s = avctx->priv_data;
2277     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2278     int mb_y = td->thread_mb_pos >> 16;
2279     int mb_x, mb_xy = mb_y * s->mb_width;
2280     int num_jobs = s->num_jobs;
2281     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2282     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2283     VP8Macroblock *mb;
2284     uint8_t *dst[3] = {
2285         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2286         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2287         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2288     };
2289     if (mb_y == 0)
2290         prev_td = td;
2291     else
2292         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2293     if (mb_y == s->mb_height - 1)
2294         next_td = td;
2295     else
2296         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2297     if (s->mb_layout == 1)
2298         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2299     else {
2300         // Make sure the previous frame has read its segmentation map,
2301         // if we re-use the same map.
2302         if (prev_frame && s->segmentation.enabled &&
2303             !s->segmentation.update_map)
2304             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2305         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2306         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2307         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2308     }
2309
2310     if (!is_vp7 || mb_y == 0)
2311         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2312
2313     s->mv_min.x = -MARGIN;
2314     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2315
2316     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2317         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2318         if (prev_td != td) {
2319             if (threadnr != 0) {
2320                 check_thread_pos(td, prev_td,
2321                                  mb_x + (is_vp7 ? 2 : 1),
2322                                  mb_y - (is_vp7 ? 2 : 1));
2323             } else {
2324                 check_thread_pos(td, prev_td,
2325                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2326                                  mb_y - (is_vp7 ? 2 : 1));
2327             }
2328         }
2329
2330         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2331                          s->linesize, 4);
2332         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2333                          dst[2] - dst[1], 2);
2334
2335         if (!s->mb_layout)
2336             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2337                            prev_frame && prev_frame->seg_map ?
2338                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2339
2340         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2341
2342         if (!mb->skip)
2343             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2344
2345         if (mb->mode <= MODE_I4x4)
2346             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2347         else
2348             inter_predict(s, td, dst, mb, mb_x, mb_y);
2349
2350         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2351
2352         if (!mb->skip) {
2353             idct_mb(s, td, dst, mb);
2354         } else {
2355             AV_ZERO64(td->left_nnz);
2356             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2357
2358             /* Reset DC block predictors if they would exist
2359              * if the mb had coefficients */
2360             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2361                 td->left_nnz[8]     = 0;
2362                 s->top_nnz[mb_x][8] = 0;
2363             }
2364         }
2365
2366         if (s->deblock_filter)
2367             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2368
2369         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2370             if (s->filter.simple)
2371                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2372                                  NULL, NULL, s->linesize, 0, 1);
2373             else
2374                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2375                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2376         }
2377
2378         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2379
2380         dst[0]      += 16;
2381         dst[1]      += 8;
2382         dst[2]      += 8;
2383         s->mv_min.x -= 64;
2384         s->mv_max.x -= 64;
2385
2386         if (mb_x == s->mb_width + 1) {
2387             update_pos(td, mb_y, s->mb_width + 3);
2388         } else {
2389             update_pos(td, mb_y, mb_x);
2390         }
2391     }
2392 }
2393
2394 static void vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2395                                         int jobnr, int threadnr)
2396 {
2397     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2398 }
2399
2400 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2401                                         int jobnr, int threadnr)
2402 {
2403     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2404 }
2405
2406 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2407                               int jobnr, int threadnr, int is_vp7)
2408 {
2409     VP8Context *s = avctx->priv_data;
2410     VP8ThreadData *td = &s->thread_data[threadnr];
2411     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2412     AVFrame *curframe = s->curframe->tf.f;
2413     VP8Macroblock *mb;
2414     VP8ThreadData *prev_td, *next_td;
2415     uint8_t *dst[3] = {
2416         curframe->data[0] + 16 * mb_y * s->linesize,
2417         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2418         curframe->data[2] +  8 * mb_y * s->uvlinesize
2419     };
2420
2421     if (s->mb_layout == 1)
2422         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2423     else
2424         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2425
2426     if (mb_y == 0)
2427         prev_td = td;
2428     else
2429         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2430     if (mb_y == s->mb_height - 1)
2431         next_td = td;
2432     else
2433         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2434
2435     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2436         VP8FilterStrength *f = &td->filter_strength[mb_x];
2437         if (prev_td != td)
2438             check_thread_pos(td, prev_td,
2439                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2440         if (next_td != td)
2441             if (next_td != &s->thread_data[0])
2442                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2443
2444         if (num_jobs == 1) {
2445             if (s->filter.simple)
2446                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2447                                  NULL, NULL, s->linesize, 0, 1);
2448             else
2449                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2450                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2451         }
2452
2453         if (s->filter.simple)
2454             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2455         else
2456             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2457         dst[0] += 16;
2458         dst[1] += 8;
2459         dst[2] += 8;
2460
2461         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2462     }
2463 }
2464
2465 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2466                               int jobnr, int threadnr)
2467 {
2468     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2469 }
2470
2471 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2472                               int jobnr, int threadnr)
2473 {
2474     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2475 }
2476
2477 static av_always_inline
2478 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2479                               int threadnr, int is_vp7)
2480 {
2481     VP8Context *s = avctx->priv_data;
2482     VP8ThreadData *td = &s->thread_data[jobnr];
2483     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2484     VP8Frame *curframe = s->curframe;
2485     int mb_y, num_jobs = s->num_jobs;
2486
2487     td->thread_nr = threadnr;
2488     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2489         if (mb_y >= s->mb_height)
2490             break;
2491         td->thread_mb_pos = mb_y << 16;
2492         s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2493         if (s->deblock_filter)
2494             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2495         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2496
2497         s->mv_min.y -= 64;
2498         s->mv_max.y -= 64;
2499
2500         if (avctx->active_thread_type == FF_THREAD_FRAME)
2501             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2502     }
2503
2504     return 0;
2505 }
2506
2507 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2508                                     int jobnr, int threadnr)
2509 {
2510     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2511 }
2512
2513 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2514                                     int jobnr, int threadnr)
2515 {
2516     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2517 }
2518
2519
2520 static av_always_inline
2521 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2522                       AVPacket *avpkt, int is_vp7)
2523 {
2524     VP8Context *s = avctx->priv_data;
2525     int ret, i, referenced, num_jobs;
2526     enum AVDiscard skip_thresh;
2527     VP8Frame *av_uninit(curframe), *prev_frame;
2528
2529     if (is_vp7)
2530         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2531     else
2532         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2533
2534     if (ret < 0)
2535         goto err;
2536
2537     prev_frame = s->framep[VP56_FRAME_CURRENT];
2538
2539     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2540                  s->update_altref == VP56_FRAME_CURRENT;
2541
2542     skip_thresh = !referenced ? AVDISCARD_NONREF
2543                               : !s->keyframe ? AVDISCARD_NONKEY
2544                                              : AVDISCARD_ALL;
2545
2546     if (avctx->skip_frame >= skip_thresh) {
2547         s->invisible = 1;
2548         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2549         goto skip_decode;
2550     }
2551     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2552
2553     // release no longer referenced frames
2554     for (i = 0; i < 5; i++)
2555         if (s->frames[i].tf.f->data[0] &&
2556             &s->frames[i] != prev_frame &&
2557             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2558             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2559             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2560             vp8_release_frame(s, &s->frames[i]);
2561
2562     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2563
2564     if (!s->colorspace)
2565         avctx->colorspace = AVCOL_SPC_BT470BG;
2566     if (s->fullrange)
2567         avctx->color_range = AVCOL_RANGE_JPEG;
2568     else
2569         avctx->color_range = AVCOL_RANGE_MPEG;
2570
2571     /* Given that arithmetic probabilities are updated every frame, it's quite
2572      * likely that the values we have on a random interframe are complete
2573      * junk if we didn't start decode on a keyframe. So just don't display
2574      * anything rather than junk. */
2575     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2576                          !s->framep[VP56_FRAME_GOLDEN]   ||
2577                          !s->framep[VP56_FRAME_GOLDEN2])) {
2578         av_log(avctx, AV_LOG_WARNING,
2579                "Discarding interframe without a prior keyframe!\n");
2580         ret = AVERROR_INVALIDDATA;
2581         goto err;
2582     }
2583
2584     curframe->tf.f->key_frame = s->keyframe;
2585     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2586                                             : AV_PICTURE_TYPE_P;
2587     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2588         goto err;
2589
2590     // check if golden and altref are swapped
2591     if (s->update_altref != VP56_FRAME_NONE)
2592         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2593     else
2594         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2595
2596     if (s->update_golden != VP56_FRAME_NONE)
2597         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2598     else
2599         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2600
2601     if (s->update_last)
2602         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2603     else
2604         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2605
2606     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2607
2608     if (avctx->codec->update_thread_context)
2609         ff_thread_finish_setup(avctx);
2610
2611     s->linesize   = curframe->tf.f->linesize[0];
2612     s->uvlinesize = curframe->tf.f->linesize[1];
2613
2614     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2615     /* Zero macroblock structures for top/top-left prediction
2616      * from outside the frame. */
2617     if (!s->mb_layout)
2618         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2619                (s->mb_width + 1) * sizeof(*s->macroblocks));
2620     if (!s->mb_layout && s->keyframe)
2621         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2622
2623     memset(s->ref_count, 0, sizeof(s->ref_count));
2624
2625     if (s->mb_layout == 1) {
2626         // Make sure the previous frame has read its segmentation map,
2627         // if we re-use the same map.
2628         if (prev_frame && s->segmentation.enabled &&
2629             !s->segmentation.update_map)
2630             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2631         if (is_vp7)
2632             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2633         else
2634             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2635     }
2636
2637     if (avctx->active_thread_type == FF_THREAD_FRAME)
2638         num_jobs = 1;
2639     else
2640         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2641     s->num_jobs   = num_jobs;
2642     s->curframe   = curframe;
2643     s->prev_frame = prev_frame;
2644     s->mv_min.y   = -MARGIN;
2645     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2646     for (i = 0; i < MAX_THREADS; i++) {
2647         s->thread_data[i].thread_mb_pos = 0;
2648         s->thread_data[i].wait_mb_pos   = INT_MAX;
2649     }
2650     if (is_vp7)
2651         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2652                         num_jobs);
2653     else
2654         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2655                         num_jobs);
2656
2657     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2658     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2659
2660 skip_decode:
2661     // if future frames don't use the updated probabilities,
2662     // reset them to the values we saved
2663     if (!s->update_probabilities)
2664         s->prob[0] = s->prob[1];
2665
2666     if (!s->invisible) {
2667         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2668             return ret;
2669         *got_frame = 1;
2670     }
2671
2672     return avpkt->size;
2673 err:
2674     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2675     return ret;
2676 }
2677
2678 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2679                         AVPacket *avpkt)
2680 {
2681     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2682 }
2683
#if CONFIG_VP7_DECODER
/* VP7 decode callback: forwards to the shared VP7/VP8 frame decoder and
 * returns its result unchanged. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    const int ret = vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);

    return ret;
}
#endif /* CONFIG_VP7_DECODER */
2691
2692 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2693 {
2694     VP8Context *s = avctx->priv_data;
2695     int i;
2696
2697     if (!s)
2698         return 0;
2699
2700     vp8_decode_flush_impl(avctx, 1);
2701     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2702         av_frame_free(&s->frames[i].tf.f);
2703
2704     return 0;
2705 }
2706
2707 static av_cold int vp8_init_frames(VP8Context *s)
2708 {
2709     int i;
2710     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2711         s->frames[i].tf.f = av_frame_alloc();
2712         if (!s->frames[i].tf.f)
2713             return AVERROR(ENOMEM);
2714     }
2715     return 0;
2716 }
2717
2718 static av_always_inline
2719 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2720 {
2721     VP8Context *s = avctx->priv_data;
2722     int ret;
2723
2724     s->avctx = avctx;
2725     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2726     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2727     avctx->internal->allocate_progress = 1;
2728
2729     ff_videodsp_init(&s->vdsp, 8);
2730
2731     ff_vp78dsp_init(&s->vp8dsp);
2732     if (CONFIG_VP7_DECODER && is_vp7) {
2733         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2734         ff_vp7dsp_init(&s->vp8dsp);
2735         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2736         s->filter_mb_row           = vp7_filter_mb_row;
2737     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2738         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2739         ff_vp8dsp_init(&s->vp8dsp);
2740         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2741         s->filter_mb_row           = vp8_filter_mb_row;
2742     }
2743
2744     /* does not change for VP8 */
2745     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2746
2747     if ((ret = vp8_init_frames(s)) < 0) {
2748         ff_vp8_decode_free(avctx);
2749         return ret;
2750     }
2751
2752     return 0;
2753 }
2754
#if CONFIG_VP7_DECODER
/* VP7 init callback; marked av_cold like the other init entry points in
 * this file. */
static av_cold int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2761
2762 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2763 {
2764     return vp78_decode_init(avctx, IS_VP8);
2765 }
2766
2767 #if CONFIG_VP8_DECODER
2768 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2769 {
2770     VP8Context *s = avctx->priv_data;
2771     int ret;
2772
2773     s->avctx = avctx;
2774
2775     if ((ret = vp8_init_frames(s)) < 0) {
2776         ff_vp8_decode_free(avctx);
2777         return ret;
2778     }
2779
2780     return 0;
2781 }
2782
/* Map a frame pointer into s_src->frames onto the slot with the same index
 * in s->frames; NULL stays NULL.  Uses `s` and `s_src` from the call site. */
#define REBASE(pic) ((pic) ? &s->frames[(pic) - &s_src->frames[0]] : NULL)
2784
2785 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2786                                             const AVCodecContext *src)
2787 {
2788     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2789     int i;
2790
2791     if (s->macroblocks_base &&
2792         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2793         free_buffers(s);
2794         s->mb_width  = s_src->mb_width;
2795         s->mb_height = s_src->mb_height;
2796     }
2797
2798     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2799     s->segmentation = s_src->segmentation;
2800     s->lf_delta     = s_src->lf_delta;
2801     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2802
2803     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2804         if (s_src->frames[i].tf.f->data[0]) {
2805             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2806             if (ret < 0)
2807                 return ret;
2808         }
2809     }
2810
2811     s->framep[0] = REBASE(s_src->next_framep[0]);
2812     s->framep[1] = REBASE(s_src->next_framep[1]);
2813     s->framep[2] = REBASE(s_src->next_framep[2]);
2814     s->framep[3] = REBASE(s_src->next_framep[3]);
2815
2816     return 0;
2817 }
2818 #endif /* CONFIG_VP8_DECODER */
2819
#if CONFIG_VP7_DECODER
/* Codec registration entry for the VP7 decoder. */
AVCodec ff_vp7_decoder = {
    /* identification */
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    /* context and capabilities */
    .priv_data_size = sizeof(VP8Context),
    .capabilities   = AV_CODEC_CAP_DR1,
    /* callbacks */
    .init           = vp7_decode_init,
    .decode         = vp7_decode_frame,
    .flush          = vp8_decode_flush,
    .close          = ff_vp8_decode_free,
};
#endif /* CONFIG_VP7_DECODER */
2834
#if CONFIG_VP8_DECODER
/* Codec registration entry for the VP8 decoder; advertises both frame and
 * slice threading. */
AVCodec ff_vp8_decoder = {
    /* identification */
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    /* context and capabilities */
    .priv_data_size        = sizeof(VP8Context),
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    /* callbacks */
    .init                  = ff_vp8_decode_init,
    .decode                = ff_vp8_decode_frame,
    .flush                 = vp8_decode_flush,
    .close                 = ff_vp8_decode_free,
    /* frame-threading hooks */
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};
#endif /* CONFIG_VP8_DECODER */