/*
 * VP7/VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 * Copyright (C) 2010 Ronald S. Bultje
 * Copyright (C) 2010 Fiona Glaser
 * Copyright (C) 2012 Daniel Kang
 * Copyright (C) 2014 Peter Ross
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/imgutils.h"

#include "rectangle.h"
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
#elif CONFIG_VP7_DECODER
#define VPX(vp7, f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(vp7, f) vp8_ ## f
#endif
static void free_buffers(VP8Context *s)
{
    int i;
    if (s->thread_data)
        for (i = 0; i < MAX_THREADS; i++) {
#if HAVE_THREADS
            pthread_cond_destroy(&s->thread_data[i].cond);
            pthread_mutex_destroy(&s->thread_data[i].lock);
#endif
            av_freep(&s->thread_data[i].filter_strength);
        }
    av_freep(&s->thread_data);
    av_freep(&s->macroblocks_base);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->top_border);

    s->macroblocks = NULL;
}
static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
{
    int ret;
    if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
                                    ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
        return ret;
    if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
        goto fail;
    if (s->avctx->hwaccel) {
        const AVHWAccel *hwaccel = s->avctx->hwaccel;
        if (hwaccel->frame_priv_data_size) {
            f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
            if (!f->hwaccel_priv_buf)
                goto fail;
            f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
        }
    }
    return 0;

fail:
    av_buffer_unref(&f->seg_map);
    ff_thread_release_buffer(s->avctx, &f->tf);
    return AVERROR(ENOMEM);
}
static void vp8_release_frame(VP8Context *s, VP8Frame *f)
{
    av_buffer_unref(&f->seg_map);
    av_buffer_unref(&f->hwaccel_priv_buf);
    f->hwaccel_picture_private = NULL;
    ff_thread_release_buffer(s->avctx, &f->tf);
}
#if CONFIG_VP8_DECODER
static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
{
    int ret;

    vp8_release_frame(s, dst);

    if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
        return ret;
    if (src->seg_map &&
        !(dst->seg_map = av_buffer_ref(src->seg_map))) {
        vp8_release_frame(s, dst);
        return AVERROR(ENOMEM);
    }
    if (src->hwaccel_picture_private) {
        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
        if (!dst->hwaccel_priv_buf)
            return AVERROR(ENOMEM);
        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
    }

    return 0;
}
#endif /* CONFIG_VP8_DECODER */
static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        vp8_release_frame(s, &s->frames[i]);
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem)
        free_buffers(s);
}

static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
static VP8Frame *vp8_find_free_buffer(VP8Context *s)
{
    VP8Frame *frame = NULL;
    int i;

    // find a free buffer
    for (i = 0; i < 5; i++)
        if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
            frame = &s->frames[i];
            break;
        }
    if (i == 5) {
        av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
        abort();
    }
    if (frame->tf.f->buf[0])
        vp8_release_frame(s, frame);

    return frame;
}
static av_always_inline
int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
{
    AVCodecContext *avctx = s->avctx;
    int i, ret;

    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
        height != s->avctx->height) {
        vp8_decode_flush_impl(s->avctx, 1);

        ret = ff_set_dimensions(s->avctx, width, height);
        if (ret < 0)
            return ret;
    }

    s->mb_width  = (s->avctx->coded_width  + 15) / 16;
    s->mb_height = (s->avctx->coded_height + 15) / 16;

    s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
                   avctx->thread_count > 1;
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                               sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
    } else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
                                         sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));

    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
        free_buffers(s);
        return AVERROR(ENOMEM);
    }

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength =
            av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
        if (!s->thread_data[i].filter_strength) {
            free_buffers(s);
            return AVERROR(ENOMEM);
        }
#if HAVE_THREADS
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
#endif
    }

    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}

static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);
    s->segmentation.update_feature_data = vp8_rac_get(c);

    if (s->segmentation.update_feature_data) {
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i] = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}
static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    for (i = 0; i < 4; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
        }
    }

    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
        }
    }
}
static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i, ret;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);

    buf      += 3 * (s->num_coeff_partitions - 1);
    buf_size -= 3 * (s->num_coeff_partitions - 1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions - 1; i++) {
        int size = AV_RL24(sizes + 3 * i);
        if (buf_size - size < 0)
            return -1;
        s->coeff_partition_size[i] = size;

        ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
        if (ret < 0)
            return ret;
        buf      += size;
        buf_size -= size;
    }

    s->coeff_partition_size[i] = buf_size;
    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);

    return 0;
}
static void vp7_get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int yac_qi  = vp8_rac_get_uint(c, 7);
    int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;

    s->qmat[0].luma_qmul[0]    = vp7_ydc_qlookup[ydc_qi];
    s->qmat[0].luma_qmul[1]    = vp7_yac_qlookup[yac_qi];
    s->qmat[0].luma_dc_qmul[0] = vp7_y2dc_qlookup[y2dc_qi];
    s->qmat[0].luma_dc_qmul[1] = vp7_y2ac_qlookup[y2ac_qi];
    s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
    s->qmat[0].chroma_qmul[1]  = vp7_yac_qlookup[uvac_qi];
}
static void vp8_get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
    s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
    s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
    s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
    s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
    s->quant.uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += s->quant.yac_qi;
        } else
            base_qi = s->quant.yac_qi;

        s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
        s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,                       7)];
        s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
        /* 101581>>16 is equivalent to 155/100 */
        s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
        s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];

        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}
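
/* A worked example of the fixed-point scaling above (illustration, not part
 * of the original source): 155/100 = 1.55 and 1.55 * 65536 = 101580.8, so
 * 101581 is the nearest integer multiplier; a looked-up value of 100 becomes
 * (100 * 101581) >> 16 = 155, i.e. a 55% boost before the FFMAX clamp. */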
/**
 * Determine which buffers golden and altref should be updated with after this frame.
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 *
 * Intra frames update all 3 references
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 * If the update (golden|altref) flag is set, it's updated with the current frame
 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 * If the flag is not set, the number read means:
 *      0: no update
 *      1: VP56_FRAME_PREVIOUS
 *      2: update golden with altref, or update altref with golden
 */
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
{
    VP56RangeCoder *c = &s->c;

    if (update)
        return VP56_FRAME_CURRENT;

    switch (vp8_rac_get_uint(c, 2)) {
    case 1:
        return VP56_FRAME_PREVIOUS;
    case 2:
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
    }
    return VP56_FRAME_NONE;
}
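
/* For illustration (not part of the original source): on an inter frame
 * whose update_golden flag is unset, reading the value 2 here with
 * ref == VP56_FRAME_GOLDEN returns VP56_FRAME_GOLDEN2, i.e. golden is
 * refreshed from the altref contents, and symmetrically for altref. */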
static void vp78_reset_probability_tables(VP8Context *s)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 16; j++)
            memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                   sizeof(s->prob->token[i][j]));
}

static void vp78_update_probability_tables(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, j, k, l, m;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }
}
#define VP7_MVC_SIZE 17
#define VP8_MVC_SIZE 19

static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
                                                            int mvc_size)
{
    VP56RangeCoder *c = &s->c;
    int i, j;

    if (vp8_rac_get(c))
        for (i = 0; i < 4; i++)
            s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
    if (vp8_rac_get(c))
        for (i = 0; i < 3; i++)
            s->prob->pred8x8c[i] = vp8_rac_get_uint(c, 8);

    // 17.2 MV probability update
    for (i = 0; i < 2; i++)
        for (j = 0; j < mvc_size; j++)
            if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                s->prob->mvc[i][j] = vp8_rac_get_nn(c);
}
static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}
static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
{
    int i, j;

    for (j = 1; j < 3; j++) {
        for (i = 0; i < height / 2; i++)
            memcpy(dst->data[j] + i * dst->linesize[j],
                   src->data[j] + i * src->linesize[j], width / 2);
    }
}
static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
                 const uint8_t *src, ptrdiff_t src_linesize,
                 int width, int height,
                 int alpha, int beta)
{
    int i, j;
    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            uint8_t y = src[j * src_linesize + i];
            dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
        }
    }
}
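
/* For illustration (not part of the original source): with alpha = 10 and
 * beta = -64, a source luma sample of 100 maps to
 * av_clip_uint8(100 + ((100 * -64) >> 8) + 10) = av_clip_uint8(85) = 85,
 * i.e. beta scales each sample and alpha adds a flat offset. */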
static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
{
    int alpha = (int8_t) vp8_rac_get_uint(c, 8);
    int beta  = (int8_t) vp8_rac_get_uint(c, 8);
    int ret;

    if (!s->keyframe && (alpha || beta)) {
        int width  = s->mb_width * 16;
        int height = s->mb_height * 16;
        AVFrame *src, *dst;

        if (!s->framep[VP56_FRAME_PREVIOUS] ||
            !s->framep[VP56_FRAME_GOLDEN]) {
            av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
            return AVERROR_INVALIDDATA;
        }

        dst =
        src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;

        /* preserve the golden frame, write a new previous frame */
        if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
            s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
            if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
                return ret;

            dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;

            copy_chroma(dst, src, width, height);
        }

        fade(dst->data[0], dst->linesize[0],
             src->data[0], src->linesize[0],
             width, height, alpha, beta);
    }

    return 0;
}
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 4)
        return AVERROR_INVALIDDATA;

    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe = !(buf[0] & 1);

    part1_size = AV_RL24(buf) >> 4;

    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed: %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    ret = ff_vp56_init_range_decoder(c, buf, part1_size);
    if (ret < 0)
        return ret;
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp8_rac_get_uint(c, 12);
        height = vp8_rac_get_uint(c, 12);
        hscale = vp8_rac_get_uint(c, 2);
        vscale = vp8_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
    }

    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0, sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp8_rac_get(c);
        if (s->feature_enabled[i]) {
            s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);

            for (j = 0; j < 3; j++)
                s->feature_index_prob[i][j] =
                    vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;

            if (vp7_feature_value_size[s->profile][i])
                for (j = 0; j < 4; j++)
                    s->feature_value[i][j] =
                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    s->num_coeff_partitions = 1;
    ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
    if (ret < 0)
        return ret;

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
        s->sign_bias[VP56_FRAME_GOLDEN] = 0;
    }

    s->update_last          = 1;
    s->update_probabilities = 1;
    s->fade_present         = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp8_rac_get(c);
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            s->fade_present = vp8_rac_get(c);
    }

    /* E. Fading information for previous frame */
    if (s->fade_present && vp8_rac_get(c)) {
        if ((ret = vp7_fade_frame(s, c)) < 0)
            return ret;
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp8_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp8_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];

    /* H. Loop filter levels */
    if (s->profile > 0)
        s->filter.simple = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra = vp8_rac_get_uint(c, 8);
        s->prob->last  = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    return 0;
}
static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 3) {
        av_log(s->avctx, AV_LOG_ERROR, "Insufficient data (%d) for header\n", buf_size);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    s->header_partition_size = header_size;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
               sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
               sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7 * s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR,
                   "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        width  = AV_RL16(buf + 3) & 0x3fff;
        height = AV_RL16(buf + 5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc, vp8_mv_default_prob,
               sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ret = ff_vp56_init_range_decoder(c, buf, header_size);
    if (ret < 0)
        return ret;
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        s->colorspace = vp8_rac_get(c);
        if (s->colorspace)
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        s->fullrange = vp8_rac_get(c);
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c))) {
        s->lf_delta.update = vp8_rac_get(c);
        if (s->lf_delta.update)
            update_lf_deltas(s);
    }

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
            return ret;

    vp8_get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    vp78_update_probability_tables(s);

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
    }

    // Record the entropy coder state here so that hwaccels can use it.
    s->c.code_word = vp56_rac_renorm(&s->c);
    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
    s->coder_state_at_header_end.range     = s->c.high;
    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;

    return 0;
}
static av_always_inline
void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
                     av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
    dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
                     av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
}
/**
 * Motion vector coding, 17.1.
 */
static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = (vp7 ? 7 : 9); i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p + 2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3 * bit;
        x  += 4 * bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2 * bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 1);
}

static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 0);
}
static av_always_inline
const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
{
    if (is_vp7)
        return vp7_submv_prob;

    if (left == top)
        return vp8_submv_prob[4 - !!left];
    if (!top)
        return vp8_submv_prob[2];
    return vp8_submv_prob[1 - !!left];
}
/**
 * Split motion vector prediction, 16.4.
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                    int layout, int is_vp7)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb;
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
    const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
    VP56mv *top_mv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    if (!layout) // layout is inlined, s->mb_layout is not
        top_mb = &mb[2];
    else
        top_mb = &mb[-s->mb_width - 1];
    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    top_mv       = top_mb->bmv;

    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        else
            part_idx = VP8_SPLITMVMODE_8x8;
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num          = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx],
    firstidx     = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above, is_vp7);

        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y +
                                   read_mv_component(c, s->prob->mvc[0], is_vp7);
                    mb->bmv[n].x = mb->mv.x +
                                   read_mv_component(c, s->prob->mvc[1], is_vp7);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}
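
/* For illustration (not part of the original source): vp8_mbsplit_count maps
 * the partitionings above to the advertised return values: 16x8 and 8x16
 * carry 2 motion vectors, 8x8 carries 4 and 4x4 carries 16. */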
/**
 * The vp7 reference decoder uses a padding macroblock column (added to right
 * edge of the frame) to guard against illegal macroblock offsets. The
 * algorithm has bugs that permit offsets to straddle the padding column.
 * This function replicates those bugs.
 *
 * @param[out] edge_x macroblock x address
 * @param[out] edge_y macroblock y address
 *
 * @return macroblock offset legal (boolean)
 */
static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
                                   int xoffset, int yoffset, int boundary,
                                   int *edge_x, int *edge_y)
{
    int vwidth = mb_width + 1;
    int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
    if (new < boundary || new % vwidth == vwidth - 1)
        return 0;
    *edge_y = new / vwidth;
    *edge_x = new % vwidth;
    return 1;
}
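
/* For illustration (values invented for this example, not from the original
 * source): with mb_width = 4 the virtual width is 5 and column 4 is the
 * padding column. Starting from mb_x = 3, mb_y = 0, an xoffset of +2 yields
 * new = 5, which passes both checks and resolves to edge_x = 0, edge_y = 1:
 * the offset straddles the padding column into the next row, exactly the
 * reference-decoder bug replicated above. */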
static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
{
    return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
}
static av_always_inline
void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[12];
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    VP56mv near_mv[3];
    uint8_t cnt[3] = { 0 };
    VP56RangeCoder *c = &s->c;
    int i;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
        const VP7MVPred * pred = &vp7_mv_pred[i];
        int edge_x, edge_y;

        if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
                                    pred->yoffset, !s->profile, &edge_x, &edge_y)) {
            VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
                                             ? s->macroblocks_base + 1 + edge_x +
                                               (s->mb_width + 1) * (edge_y + 1)
                                             : s->macroblocks + edge_x +
                                               (s->mb_height - edge_y - 1) * 2;
            uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
            if (mv) {
                if (AV_RN32A(&near_mv[CNT_NEAREST])) {
                    if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
                        idx = CNT_NEAREST;
                    } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
                        if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
                            continue;
                        idx = CNT_NEAR;
                    } else {
                        AV_WN32A(&near_mv[CNT_NEAR], mv);
                        idx = CNT_NEAR;
                    }
                } else {
                    AV_WN32A(&near_mv[CNT_NEAREST], mv);
                    idx = CNT_NEAREST;
                }
            } else {
                idx = CNT_ZERO;
            }
        } else {
            idx = CNT_ZERO;
        }
        cnt[idx] += vp7_mv_pred[i].score;
    }

    mb->partitioning = VP8_SPLITMVMODE_NONE;

    if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {

            if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {

                if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
                else
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));

                if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                } else {
                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                mb->mv = near_mv[CNT_NEAR];
                mb->bmv[0] = mb->mv;
            }
        } else {
            mb->mv = near_mv[CNT_NEAREST];
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);

                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    mb->mv.y += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
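
/* For illustration of the SWAR negate in MV_EDGE_CHECK (not part of the
 * original source): both 16-bit lanes of mv are negated in one 32-bit
 * operation. A lane holding 1 becomes 0xFFFE after ~mv; the masked add sets
 * the low 15 bits to 0x7FFF and the XOR restores the sign bit, giving
 * 0xFFFF = -1 with no carry leaking into the neighbouring lane. */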
static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout) {
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t *top;
        uint8_t *const left = s->intra4x4_pred_mode_left;
        if (layout)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y]   = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
                                           vp8_pred4x4_prob_inter);
    }
}
static av_always_inline
void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
                    VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
{
    VP56RangeCoder *c = &s->c;
    static const char * const vp7_feature_name[] = { "q-index",
                                                     "lf-delta",
                                                     "partial-golden-update",
                                                     "blit-pitch" };
    if (is_vp7) {
        int i;
        for (i = 0; i < 4; i++) {
            if (s->feature_enabled[i]) {
                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                    int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                 s->feature_index_prob[i]);
                    av_log(s->avctx, AV_LOG_WARNING,
                           "Feature %s present in macroblock (value 0x%x)\n",
                           vp7_feature_name[i], s->feature_value[i][index]);
                }
            }
        }
    } else if (s->segmentation.update_map) {
        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
                                    vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                           : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
            if (s->mb_layout)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                vp8_pred8x8c_prob_intra);
        mb->ref_frame        = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame =
                (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
                                                                   : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame - 1]++;

        // motion vectors, 16.3
        if (is_vp7)
            vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
        else
            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
/**
 * @param r          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                             // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else { // DCT_CAT3 and up
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    coeff   = 3 + (8 << cat);
                    coeff  += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i + 1][2];
        }
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
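
/* For illustration (not part of the original source): the DCT_CAT3-and-up
 * branch derives each category's base value from 3 + (8 << cat), i.e. 11,
 * 19, 35 and 67 for cat 0 through 3, with vp8_rac_get_coeff() then reading
 * the offset within the category. */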
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    int16_t dc = block[0];
    int ret = 0;

    if (pred[1] > 3) {
        dc += pred[0];
        ret = 1;
    }

    if (!pred[0] | !dc | ((int32_t) pred[0] ^ (int32_t) dc) >> 31) {
        block[0] = pred[0] = dc;
        pred[1] = 0;
    } else {
        if (pred[0] == dc)
            pred[1]++;
        block[0] = pred[0] = dc;
    }

    return ret;
}
static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, uint8_t *token_prob,
                                            int16_t qmul[2],
                                            const uint8_t scan[16])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, scan, IS_VP7);
}

#ifndef vp8_decode_block_coeffs_internal
static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, uint8_t *token_prob,
                                            int16_t qmul[2])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, ff_zigzag_scan, IS_VP8);
}
#endif
/**
 * @param c          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param zero_nhood the initial prediction context for number of surrounding
 *                   all-zero blocks (only left/top, so 0-2)
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 * @param scan       scan pattern (VP7 only)
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
                        uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                        int i, int zero_nhood, int16_t qmul[2],
                        const uint8_t scan[16], int vp7)
{
    uint8_t *token_prob = probs[i][zero_nhood];
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
        return 0;
    return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
                                                  token_prob, qmul, scan)
               : vp8_decode_block_coeffs_internal(c, block, probs, i,
                                                  token_prob, qmul);
}
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  ff_zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |= inter_predict_dc(td->block_dc,
                                    s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked
    // skip, we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
static av_always_inline
void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
                      uint8_t *src_cb, uint8_t *src_cr,
                      ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
{
    AV_COPY128(top_border, src_y + 15 * linesize);
    if (!simple) {
        AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
        AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
    }
}
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
                    uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
                    int mb_y, int mb_width, int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a, b, xchg)                                                      \
    do {                                                                      \
        if (xchg)                                                             \
            AV_SWAP64(b, a);                                                  \
        else                                                                  \
            AV_COPY64(b, a);                                                  \
    } while (0)

    XCHG(top_border_m1 + 8, src_y - 8, xchg);
    XCHG(top_border,        src_y,     xchg);
    XCHG(top_border + 8,    src_y + 8, 1);
    if (mb_x < mb_width - 1)
        XCHG(top_border + 32, src_y + 16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1 + 16, src_cb - 8, xchg);
        XCHG(top_border_m1 + 24, src_cr - 8, xchg);
        XCHG(top_border + 16, src_cb, 1);
        XCHG(top_border + 24, src_cr, 1);
    }
}
static av_always_inline
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x)
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
    else
        return mb_y ? mode : LEFT_DC_PRED8x8;
}

static av_always_inline
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
{
    if (!mb_x)
        return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
    else
        return mb_y ? mode : HOR_PRED8x8;
}

static av_always_inline
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
{
    switch (mode) {
    case DC_PRED8x8:
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    case VERT_PRED8x8:
        return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
    case HOR_PRED8x8:
        return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
    case PLANE_PRED8x8: /* TM */
        return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
    }
    return mode;
}

static av_always_inline
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
{
    if (!mb_x) {
        return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
    } else {
        return mb_y ? mode : HOR_VP8_PRED;
    }
}

static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
                                     int *copy_buf, int vp7)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
    case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
                   * as 16x16/8x8 DC */
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}
static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
{
    int x, y, mode, nnz;
    uint32_t tr;

    /* for the first row, we need to run xchg_mb_border to init the top edge
     * to 127; otherwise, skip it if we aren't going to deblock */
    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        const uint8_t lo = is_vp7 ? 128 : 127;
        const uint8_t hi = is_vp7 ? 128 : 129;
        uint8_t tr_top[4] = { lo, lo, lo, lo };

        // all blocks on the right edge of the macroblock use the bottom edge
        // of the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (mb_y && mb_x == s->mb_width - 1) {
            tr       = tr_right[-1] * 0x01010101u;
            tr_right = (uint8_t *) &tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0;
                ptrdiff_t linesize = s->linesize;
                uint8_t *dst = ptr + 4 * x;
                LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);

                if ((y == 0 || x == 3) && mb_y == 0) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
                                                        mb_y + y, &copy, is_vp7);
                if (copy) {
                    dst      = copy_dst + 12;
                    linesize = 8;
                    if (!(mb_y + y)) {
                        copy_dst[3] = lo;
                        AV_WN32A(copy_dst + 4, lo * 0x01010101U);
                    } else {
                        AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
                        if (!(mb_x + x)) {
                            copy_dst[3] = hi;
                        } else {
                            copy_dst[3] = ptr[4 * x - s->linesize - 1];
                        }
                    }
                    if (!(mb_x + x)) {
                        copy_dst[11] =
                        copy_dst[19] =
                        copy_dst[27] =
                        copy_dst[35] = hi;
                    } else {
                        copy_dst[11] = ptr[4 * x                   - 1];
                        copy_dst[19] = ptr[4 * x + s->linesize     - 1];
                        copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
                        copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
                    }
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
                    AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
                    AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
                    AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
                }

                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
                                                  td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr + 4 * x,
                                               td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr      += 4 * s->linesize;
            intra4x4 += 4;
        }
    }

    mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
                                            mb_x, mb_y, is_vp7);
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
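
/* For illustration (not part of the original source): the rows satisfy
 * subpel_idx[1][mx] == subpel_idx[0][mx] + subpel_idx[2][mx] for every mx;
 * e.g. the positions needing 2 extra pixels on the left also need 3 on the
 * right, 5 extra source pixels per row in total for the interpolation. */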
/**
 * luma MC function
 *
 * @param s        VP8 decoding context
 * @param dst      target buffer for block data at block position
 * @param ref      reference picture buffer at origin (0, 0)
 * @param mv       motion vector (relative to block position) to get pixel data from
 * @param x_off    horizontal position of block from origin (0, 0)
 * @param y_off    vertical position of block from origin (0, 0)
 * @param block_w  width of block (16, 8 or 4)
 * @param block_h  height of block (always same as block_w)
 * @param width    width of src/dst plane data
 * @param height   height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
                 ThreadFrame *ref, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, ptrdiff_t linesize,
                 vp8_mc_func mc_func[3][3])
{
    uint8_t *src = ref->f->data[0];

    if (AV_RN32A(mv)) {
        ptrdiff_t src_linesize = linesize;

        int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

        // edge emulation
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src - my_idx * linesize - mx_idx,
                                     EDGE_EMU_LINESIZE, linesize,
                                     block_w + subpel_idx[1][mx],
                                     block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx,
                                     width, height);
            src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
            src_linesize = EDGE_EMU_LINESIZE;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
                      linesize, block_h, 0, 0);
    }
}
/**
 * chroma MC function
 *
 * @param s        VP8 decoding context
 * @param dst1     target buffer for block data at block position (U plane)
 * @param dst2     target buffer for block data at block position (V plane)
 * @param ref      reference picture buffer at origin (0, 0)
 * @param mv       motion vector (relative to block position) to get pixel data from
 * @param x_off    horizontal position of block from origin (0, 0)
 * @param y_off    vertical position of block from origin (0, 0)
 * @param block_w  width of block (16, 8 or 4)
 * @param block_h  height of block (always same as block_w)
 * @param width    width of src/dst plane data
 * @param height   height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
                   uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
                   int x_off, int y_off, int block_w, int block_h,
                   int width, int height, ptrdiff_t linesize,
                   vp8_mc_func mc_func[3][3])
{
    uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];

    if (AV_RN32A(mv)) {
        int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
        int my = mv->y & 7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src1 - my_idx * linesize - mx_idx,
                                     EDGE_EMU_LINESIZE, linesize,
                                     block_w + subpel_idx[1][mx],
                                     block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);

            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src2 - my_idx * linesize - mx_idx,
                                     EDGE_EMU_LINESIZE, linesize,
                                     block_w + subpel_idx[1][mx],
                                     block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}
static av_always_inline
void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                 ThreadFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off, int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        /* this block only applies to VP8; it is safe to check
         * only the profile, as VP7 profile <= 1 */
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    x_off   >>= 1;
    y_off   >>= 1;
    bx_off  >>= 1;
    by_off  >>= 1;
    width   >>= 1;
    height  >>= 1;
    block_w >>= 1;
    block_h >>= 1;
    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}
/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
static av_always_inline
void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                     int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        int mx = (mb->mv.x >> 2) + x_off + 8;
        int my = (mb->mv.y >> 2) + y_off;
        uint8_t **src = s->framep[ref]->tf.f->data;
        int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder. Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->vdsp.prefetch(src[0] + off, s->linesize, 4);
        off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
        s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
    }
}
/**
 * Apply motion vectors to prediction buffer, chapter 18.
 */
static av_always_inline
void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16 * s->mb_width, height = 16 * s->mb_height;
    ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
                            ref, &bmv[4 * y + x],
                            4 * x + x_off, 4 * y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        x_off  >>= 1;
        y_off  >>= 1;
        width  >>= 1;
        height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
                         mb->bmv[2 * y       * 4 + 2 * x + 1].x +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
                uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
                         mb->bmv[2 * y       * 4 + 2 * x + 1].y +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
                uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
                uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
                              dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
                              &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}
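
/* For illustration (not part of the original source): in the 4x4 case each
 * chroma vector is the rounded average of four luma vectors; a component
 * sum of 6 gives (6 + 2 + 0) >> 2 = 2, with the FF_SIGNBIT() term
 * correcting the rounding bias when the sum is negative. */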
static av_always_inline
void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4 & ~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t) nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
                                                      td->block[y][x], s->linesize);
                        else if ((uint8_t) nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
                                                   td->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4 * s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1 + ch];
            if (nnz4 & ~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t) nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
                                                      td->block[4 + ch][(y << 1) + x],
                                                      s->uvlinesize);
                        else if ((uint8_t) nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
                                                   td->block[4 + ch][(y << 1) + x],
                                                   s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4 * s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
            }
        }
chroma_idct_end:
        ;
    }
}
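/* The two loops above read four per-subblock non-zero coefficient counts as
 * packed bytes (AV_RL32): a byte equal to 1 means that subblock carries only
 * a DC coefficient, so the cheaper dc_add path suffices.  nnz4 & ~0x01010101
 * is non-zero iff at least one subblock needs the full inverse transform;
 * otherwise the whole row of blocks is handled in one dc_add4y/dc_add4uv
 * call.  Shifting nnz4 right by 8 walks to the next subblock's count and
 * lets the loop exit early once all remaining counts are zero. */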
static av_always_inline
void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
                         VP8FilterStrength *f, int is_vp7)
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[mb->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

    filter_level = av_clip_uintp2(filter_level, 6);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit  = interior_limit;
    f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
                      mb->mode == VP8_MVMODE_SPLIT;
}
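/* Worked example of the level derivation above: with a segment filter level
 * of 20 used as a delta (absolute_vals == 0), a base s->filter.level of 30,
 * a reference delta of -6 and a mode delta of 0, filter_level becomes
 * 20 + 30 - 6 = 44, then av_clip_uintp2(44, 6) clamps it into [0, 63].
 * With sharpness 4 the interior limit is FFMIN(44 >> 1, 9 - 4) = 5, and it
 * is never allowed to drop below 1. */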
static av_always_inline
void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
               int mb_x, int mb_y, int is_vp7)
{
    int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit  = f->inner_limit;
    int inner_filter = f->inner_filter;
    ptrdiff_t linesize   = s->linesize;
    ptrdiff_t uvlinesize = s->uvlinesize;
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    if (is_vp7) {
        bedge_lim_y  = filter_level;
        bedge_lim_uv = filter_level * 2;
        mbedge_lim   = filter_level + 2;
    } else {
        bedge_lim_y  =
        bedge_lim_uv = filter_level * 2 + inner_limit;
        mbedge_lim   = bedge_lim_y + 4;
    }

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

#define H_LOOP_FILTER_16Y_INNER(cond)                                         \
    if (cond && inner_filter) {                                               \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 4, linesize,            \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 8, linesize,            \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,          \
                                             uvlinesize, bedge_lim_uv,        \
                                             inner_limit, hev_thresh);        \
    }

    H_LOOP_FILTER_16Y_INNER(!is_vp7)

    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);

        if (inner_filter) {
            s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 4 * linesize,
                                                 linesize, bedge_lim_y,
                                                 inner_limit, hev_thresh);
            s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 8 * linesize,
                                                 linesize, bedge_lim_y,
                                                 inner_limit, hev_thresh);
            s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
                                                 linesize, bedge_lim_y,
                                                 inner_limit, hev_thresh);
            s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                                 dst[2] + 4 * uvlinesize,
                                                 uvlinesize, bedge_lim_uv,
                                                 inner_limit, hev_thresh);
        }
    }

    H_LOOP_FILTER_16Y_INNER(is_vp7)
}
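/* Ordering note for filter_mb(): the left macroblock edge (only when
 * mb_x > 0) is filtered with the stronger mbedge_lim, then the inner
 * vertical edges, then the top edge (when mb_y > 0) and the inner horizontal
 * edges.  VP8 runs the inner vertical pass before the horizontal filtering
 * while VP7 runs it afterwards, which is why H_LOOP_FILTER_16Y_INNER() is
 * invoked twice with opposite conditions. */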
static av_always_inline
void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
                      int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim;
    int filter_level = f->filter_level;
    int inner_limit  = f->inner_limit;
    int inner_filter = f->inner_filter;
    ptrdiff_t linesize = s->linesize;

    if (!filter_level)
        return;

    bedge_lim  = 2 * filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
        if (inner_filter) {
            s->vp8dsp.vp8_h_loop_filter_simple(dst + 4, linesize, bedge_lim);
            s->vp8dsp.vp8_h_loop_filter_simple(dst + 8, linesize, bedge_lim);
            s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
        }
    }
    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
        if (inner_filter) {
            s->vp8dsp.vp8_v_loop_filter_simple(dst + 4 * linesize, linesize, bedge_lim);
            s->vp8dsp.vp8_v_loop_filter_simple(dst + 8 * linesize, linesize, bedge_lim);
            s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
        }
    }
}
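/* The "simple" loop filter operates on the luma plane only (the call site
 * passes dst[0]) with a single edge limit and no high-edge-variance
 * threshold; chroma is left unfiltered in this mode, which is why no
 * uvlinesize parameter is needed here. */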
#define MARGIN (16 << 2)
static av_always_inline
void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
                             VP8Frame *prev_frame, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_bounds.mv_min.y = -MARGIN;
    s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        VP8Macroblock *mb = s->macroblocks_base +
                            ((s->mb_width + 1) * (mb_y + 1) + 1);
        int mb_xy = mb_y * s->mb_width;

        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);

        s->mv_bounds.mv_min.x = -MARGIN;
        s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (mb_y == 0)
                AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
                         DC_PRED * 0x01010101);
            decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
            s->mv_bounds.mv_min.x -= 64;
            s->mv_bounds.mv_max.x -= 64;
        }
        s->mv_bounds.mv_min.y -= 64;
        s->mv_bounds.mv_max.y -= 64;
    }
}

static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
}

static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
}
#if HAVE_THREADS
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
        if (atomic_load(&otd->thread_mb_pos) < tmp) {                         \
            pthread_mutex_lock(&otd->lock);                                   \
            atomic_store(&td->wait_mb_pos, tmp);                              \
            do {                                                              \
                if (atomic_load(&otd->thread_mb_pos) >= tmp)                  \
                    break;                                                    \
                pthread_cond_wait(&otd->cond, &otd->lock);                    \
            } while (1);                                                      \
            atomic_store(&td->wait_mb_pos, INT_MAX);                          \
            pthread_mutex_unlock(&otd->lock);                                 \
        }                                                                     \
    } while (0)

#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos = (mb_y << 16) | (mb_x & 0xFFFF);                             \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) ||   \
            (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos));     \
        atomic_store(&td->thread_mb_pos, pos);                                \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&td->lock);                                    \
            pthread_cond_broadcast(&td->cond);                                \
            pthread_mutex_unlock(&td->lock);                                  \
        }                                                                     \
    } while (0)
#else
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
#define update_pos(td, mb_y, mb_x) while(0)
#endif
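/* Sketch of the synchronisation scheme above: a thread publishes its
 * progress through the frame as a single int, (mb_y << 16) | mb_x, so one
 * atomic_load() in check_thread_pos() can compare a full 2-D position.
 * A thread blocks on the other thread's condition variable only while that
 * thread is strictly behind the (mb_x_check, mb_y_check) point it depends
 * on, and update_pos() takes the lock and broadcasts only when some waiter
 * could actually be released, keeping the common case lock-free. */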
static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                                    int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
    int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
    int mb_x, mb_xy = mb_y * s->mb_width;
    int num_jobs = s->num_jobs;
    VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
    VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
    VP8Macroblock *mb;
    uint8_t *dst[3] = {
        curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
        curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (c->end <= c->buffer && c->bits >= 0)
        return AVERROR_INVALIDDATA;

    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];
    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else {
        // Make sure the previous frame has read its segmentation map,
        // if we re-use the same map.
        if (prev_frame && s->segmentation.enabled &&
            !s->segmentation.update_map)
            ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
        memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
    }

    if (!is_vp7 || mb_y == 0)
        memset(td->left_nnz, 0, sizeof(td->left_nnz));

    td->mv_bounds.mv_min.x = -MARGIN;
    td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
        if (c->end <= c->buffer && c->bits >= 0)
            return AVERROR_INVALIDDATA;
        // Wait for previous thread to read mb_x+2, and reach mb_y-1.
        if (prev_td != td) {
            if (threadnr != 0) {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1),
                                 mb_y - (is_vp7 ? 2 : 1));
            } else {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
                                 mb_y - (is_vp7 ? 2 : 1));
            }
        }

        s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
                         s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
                         dst[2] - dst[1], 2);

        if (!s->mb_layout)
            decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

        if (!mb->skip)
            decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned

            /* Reset DC block predictors if they would exist
             * if the mb had coefficients */
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);

        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;
        td->mv_bounds.mv_min.x -= 64;
        td->mv_bounds.mv_max.x -= 64;

        if (mb_x == s->mb_width + 1) {
            update_pos(td, mb_y, s->mb_width + 3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }

    return 0;
}

static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                       int jobnr, int threadnr)
{
    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
}

static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                       int jobnr, int threadnr)
{
    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
}
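/* A note on the check_thread_pos() lags in the row loop above: each row
 * waits until the thread decoding the row above has advanced a couple of
 * macroblocks past the current column before touching its pixels for
 * prediction.  VP7 appears to need a two-macroblock lag where VP8 gets by
 * with one, and thread 0, which wraps around to the job that decoded
 * num_jobs rows earlier, adds an extra s->mb_width + 3 of slack to cover
 * the wrap.  The range coder check at the top of each iteration bails out
 * with AVERROR_INVALIDDATA once the partition has been over-read. */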
static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                                           int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[threadnr];
    int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
    AVFrame *curframe = s->curframe->tf.f;
    VP8Macroblock *mb;
    VP8ThreadData *prev_td, *next_td;
    uint8_t *dst[3] = {
        curframe->data[0] + 16 * mb_y * s->linesize,
        curframe->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;

    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
        VP8FilterStrength *f = &td->filter_strength[mb_x];
        if (prev_td != td)
            check_thread_pos(td, prev_td,
                             (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
        if (next_td != td)
            if (next_td != &s->thread_data[0])
                check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);

        if (num_jobs == 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        if (s->filter.simple)
            filter_mb_simple(s, dst[0], f, mb_x, mb_y);
        else
            filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;

        update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
    }
}

static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
}

static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
}
static av_always_inline
int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
                              int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[jobnr];
    VP8ThreadData *next_td = NULL, *prev_td = NULL;
    VP8Frame *curframe = s->curframe;
    int mb_y, num_jobs = s->num_jobs;
    int ret;

    td->thread_nr = threadnr;
    td->mv_bounds.mv_min.y = -MARGIN - 64 * threadnr;
    td->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
    for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
        atomic_store(&td->thread_mb_pos, mb_y << 16);
        ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
        if (ret < 0) {
            update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
            return ret;
        }
        if (s->deblock_filter)
            s->filter_mb_row(avctx, tdata, jobnr, threadnr);
        update_pos(td, mb_y, INT_MAX & 0xFFFF);

        td->mv_bounds.mv_min.y -= 64 * num_jobs;
        td->mv_bounds.mv_max.y -= 64 * num_jobs;

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            ff_thread_report_progress(&curframe->tf, mb_y, 0);
    }

    return 0;
}

static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
}

static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
}
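/* Rows are interleaved across jobs rather than split into contiguous bands:
 * job 0 decodes rows 0, num_jobs, 2 * num_jobs, ..., job 1 decodes rows 1,
 * 1 + num_jobs, and so on.  With num_jobs == 2, for instance, even rows go
 * to job 0 and odd rows to job 1, which keeps neighbouring rows on different
 * threads and lets the check_thread_pos()/update_pos() handshake pipeline
 * them down the frame. */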
static av_always_inline
int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                      AVPacket *avpkt, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    VP8Frame *av_uninit(curframe), *prev_frame;

    if (is_vp7)
        ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
    else
        ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);

    if (ret < 0)
        goto err;

    if (s->actually_webp) {
        // avctx->pix_fmt already set in caller.
    } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
        enum AVPixelFormat pix_fmts[] = {
#if CONFIG_VP8_VAAPI_HWACCEL
            AV_PIX_FMT_VAAPI,
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
            AV_PIX_FMT_CUDA,
#endif
            AV_PIX_FMT_YUV420P,
            AV_PIX_FMT_NONE,
        };

        s->pix_fmt = ff_get_format(s->avctx, pix_fmts);
        if (s->pix_fmt < 0) {
            ret = AVERROR(EINVAL);
            goto err;
        }
        avctx->pix_fmt = s->pix_fmt;
    }

    prev_frame = s->framep[VP56_FRAME_CURRENT];

    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
                 s->update_altref == VP56_FRAME_CURRENT;

    skip_thresh = !referenced ? AVDISCARD_NONREF
                              : !s->keyframe ? AVDISCARD_NONKEY
                                             : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f->buf[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i]);

    curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);

    if (!s->colorspace)
        avctx->colorspace = AVCOL_SPC_BT470BG;
    if (s->fullrange)
        avctx->color_range = AVCOL_RANGE_JPEG;
    else
        avctx->color_range = AVCOL_RANGE_MPEG;

    /* Given that arithmetic probabilities are updated every frame, it's quite
     * likely that the values we have on a random interframe are complete
     * junk if we didn't start decode on a keyframe. So just don't display
     * anything rather than junk. */
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN]   ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING,
               "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    curframe->tf.f->key_frame = s->keyframe;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                            : AV_PICTURE_TYPE_P;
    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
        goto err;

    // check if golden and altref are swapped
    if (s->update_altref != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
    else
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];

    if (s->update_golden != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
    else
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];

    if (s->update_last)
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    else
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];

    s->next_framep[VP56_FRAME_CURRENT] = curframe;

    ff_thread_finish_setup(avctx);

    if (avctx->hwaccel) {
        ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->end_frame(avctx);
        if (ret < 0)
            goto err;
    } else {
        s->linesize   = curframe->tf.f->linesize[0];
        s->uvlinesize = curframe->tf.f->linesize[1];

        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
        /* Zero macroblock structures for top/top-left prediction
         * from outside the frame. */
        if (!s->mb_layout)
            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
                   (s->mb_width + 1) * sizeof(*s->macroblocks));
        if (!s->mb_layout && s->keyframe)
            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);

        memset(s->ref_count, 0, sizeof(s->ref_count));

        if (s->mb_layout == 1) {
            // Make sure the previous frame has read its segmentation map,
            // if we re-use the same map.
            if (prev_frame && s->segmentation.enabled &&
                !s->segmentation.update_map)
                ff_thread_await_progress(&prev_frame->tf, 1, 0);
            if (is_vp7)
                vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
            else
                vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
        }

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            num_jobs = 1;
        else
            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
        s->num_jobs   = num_jobs;
        s->curframe   = curframe;
        s->prev_frame = prev_frame;
        s->mv_bounds.mv_min.y = -MARGIN;
        s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
        for (i = 0; i < MAX_THREADS; i++) {
            VP8ThreadData *td = &s->thread_data[i];
            atomic_init(&td->thread_mb_pos, 0);
            atomic_init(&td->wait_mb_pos, INT_MAX);
        }
        if (is_vp7)
            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
        else
            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
    }

    ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    if (!s->invisible) {
        if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}
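/* Reference-frame bookkeeping above is double-buffered: the new assignments
 * for last/golden/altref are staged in s->next_framep[] and only committed
 * to s->framep[] once the frame has fully decoded, so the error path (err:)
 * can roll back by copying the old framep[] into next_framep[] instead. */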
int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
}

#if CONFIG_VP7_DECODER
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int i;

    vp8_decode_flush_impl(avctx, 1);
    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        av_frame_free(&s->frames[i].tf.f);

    return 0;
}

static av_cold int vp8_init_frames(VP8Context *s)
{
    int i;
    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
        s->frames[i].tf.f = av_frame_alloc();
        if (!s->frames[i].tf.f)
            return AVERROR(ENOMEM);
    }
    return 0;
}
static av_always_inline
int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx   = avctx;
    s->vp7     = avctx->codec->id == AV_CODEC_ID_VP7;
    s->pix_fmt = AV_PIX_FMT_NONE;
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
    avctx->internal->allocate_progress = 1;

    ff_videodsp_init(&s->vdsp, 8);

    ff_vp78dsp_init(&s->vp8dsp);
    if (CONFIG_VP7_DECODER && is_vp7) {
        ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
        ff_vp7dsp_init(&s->vp8dsp);
        s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
        s->filter_mb_row           = vp7_filter_mb_row;
    } else if (CONFIG_VP8_DECODER && !is_vp7) {
        ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
        ff_vp8dsp_init(&s->vp8dsp);
        s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
        s->filter_mb_row           = vp8_filter_mb_row;
    }

    /* does not change for VP8 */
    memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));

    if ((ret = vp8_init_frames(s)) < 0) {
        ff_vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}

#if CONFIG_VP7_DECODER
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */

av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP8);
}
#if CONFIG_VP8_DECODER
#if HAVE_THREADS
static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx = avctx;

    if ((ret = vp8_init_frames(s)) < 0) {
        ff_vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}

#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
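/* REBASE translates a frame pointer from the source thread's context into
 * the corresponding slot of this thread's s->frames[] array: the pointer
 * arithmetic relies on both contexts keeping their VP8Frame objects in
 * identically laid out arrays, so only the array index is carried over. */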
static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
    int i;

    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }

    s->pix_fmt      = s_src->pix_fmt;
    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        if (s_src->frames[i].tf.f->buf[0]) {
            int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
            if (ret < 0)
                return ret;
        }
    }

    s->framep[0] = REBASE(s_src->next_framep[0]);
    s->framep[1] = REBASE(s_src->next_framep[1]);
    s->framep[2] = REBASE(s_src->next_framep[2]);
    s->framep[3] = REBASE(s_src->next_framep[3]);

    return 0;
}
#endif /* HAVE_THREADS */
#endif /* CONFIG_VP8_DECODER */
#if CONFIG_VP7_DECODER
AVCodec ff_vp7_decoder = {
    .name                  = "vp7",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = vp7_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1,
    .flush                 = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */

#if CONFIG_VP8_DECODER
AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    .hw_configs            = (const AVCodecHWConfigInternal*[]) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */