git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28 #include "libavutil/mem_internal.h"
  29
  30 #include "avcodec.h"
  31 #include "hwconfig.h"
  32 #include "internal.h"
  33 #include "mathops.h"
  34 #include "rectangle.h"
  35 #include "thread.h"
  36 #include "vp8.h"
  37 #include "vp8data.h"
  38
  39 #if ARCH_ARM
  40 #   include "arm/vp8.h"
  41 #endif
  42
  43 #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  44 #define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
  45 #elif CONFIG_VP7_DECODER
  46 #define VPX(vp7, f) vp7_ ## f
  47 #else // CONFIG_VP8_DECODER
  48 #define VPX(vp7, f) vp8_ ## f
  49 #endif
  50
  51 static void free_buffers(VP8Context *s)
  52 {
  53     int i;
  54     if (s->thread_data)
  55         for (i = 0; i < MAX_THREADS; i++) {
  56 #if HAVE_THREADS
  57             pthread_cond_destroy(&s->thread_data[i].cond);
  58             pthread_mutex_destroy(&s->thread_data[i].lock);
  59 #endif
  60             av_freep(&s->thread_data[i].filter_strength);
  61         }
  62     av_freep(&s->thread_data);
  63     av_freep(&s->macroblocks_base);
  64     av_freep(&s->intra4x4_pred_mode_top);
  65     av_freep(&s->top_nnz);
  66     av_freep(&s->top_border);
  67
  68     s->macroblocks = NULL;
  69 }
  70
  71 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  72 {
  73     int ret;
  74     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  75                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  76         return ret;
  77     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  78         goto fail;
  79     if (s->avctx->hwaccel) {
  80         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  81         if (hwaccel->frame_priv_data_size) {
  82             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  83             if (!f->hwaccel_priv_buf)
  84                 goto fail;
  85             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  86         }
  87     }
  88     return 0;
  89
  90 fail:
  91     av_buffer_unref(&f->seg_map);
  92     ff_thread_release_buffer(s->avctx, &f->tf);
  93     return AVERROR(ENOMEM);
  94 }
  95
  96 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  97 {
  98     av_buffer_unref(&f->seg_map);
  99     av_buffer_unref(&f->hwaccel_priv_buf);
 100     f->hwaccel_picture_private = NULL;
 101     ff_thread_release_buffer(s->avctx, &f->tf);
 102 }
 103
 104 #if CONFIG_VP8_DECODER
 105 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 106 {
 107     int ret;
 108
 109     vp8_release_frame(s, dst);
 110
 111     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 112         return ret;
 113     if (src->seg_map &&
 114         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 115         vp8_release_frame(s, dst);
 116         return AVERROR(ENOMEM);
 117     }
 118     if (src->hwaccel_picture_private) {
 119         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 120         if (!dst->hwaccel_priv_buf)
 121             return AVERROR(ENOMEM);
 122         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 123     }
 124
 125     return 0;
 126 }
 127 #endif /* CONFIG_VP8_DECODER */
 128
 129 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 130 {
 131     VP8Context *s = avctx->priv_data;
 132     int i;
 133
 134     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 135         vp8_release_frame(s, &s->frames[i]);
 136     memset(s->framep, 0, sizeof(s->framep));
 137
 138     if (free_mem)
 139         free_buffers(s);
 140 }
 141
 142 static void vp8_decode_flush(AVCodecContext *avctx)
 143 {
 144     vp8_decode_flush_impl(avctx, 0);
 145 }
 146
 147 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 148 {
 149     VP8Frame *frame = NULL;
 150     int i;
 151
 152     // find a free buffer
 153     for (i = 0; i < 5; i++)
 154         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 155             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 157             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 158             frame = &s->frames[i];
 159             break;
 160         }
 161     if (i == 5) {
 162         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 163         abort();
 164     }
 165     if (frame->tf.f->buf[0])
 166         vp8_release_frame(s, frame);
 167
 168     return frame;
 169 }
 170
 171 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 172 {
 173     enum AVPixelFormat pix_fmts[] = {
 174 #if CONFIG_VP8_VAAPI_HWACCEL
 175         AV_PIX_FMT_VAAPI,
 176 #endif
 177 #if CONFIG_VP8_NVDEC_HWACCEL
 178         AV_PIX_FMT_CUDA,
 179 #endif
 180         AV_PIX_FMT_YUV420P,
 181         AV_PIX_FMT_NONE,
 182     };
 183
 184     return ff_get_format(s->avctx, pix_fmts);
 185 }
 186
 187 static av_always_inline
 188 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 189 {
 190     AVCodecContext *avctx = s->avctx;
 191     int i, ret, dim_reset = 0;
 192
 193     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 194         height != s->avctx->height) {
 195         vp8_decode_flush_impl(s->avctx, 1);
 196
 197         ret = ff_set_dimensions(s->avctx, width, height);
 198         if (ret < 0)
 199             return ret;
 200
 201         dim_reset = (s->macroblocks_base != NULL);
 202     }
 203
 204     if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) &&
 205          !s->actually_webp && !is_vp7) {
 206         s->pix_fmt = get_pixel_format(s);
 207         if (s->pix_fmt < 0)
 208             return AVERROR(EINVAL);
 209         avctx->pix_fmt = s->pix_fmt;
 210     }
 211
 212     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 213     s->mb_height = (s->avctx->coded_height + 15) / 16;
 214
 215     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 216                    avctx->thread_count > 1;
 217     if (!s->mb_layout) { // Frame threading and one thread
 218         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 219                                                sizeof(*s->macroblocks));
 220         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 221     } else // Sliced threading
 222         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 223                                          sizeof(*s->macroblocks));
 224     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 225     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 226     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 227
 228     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 229         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 230         free_buffers(s);
 231         return AVERROR(ENOMEM);
 232     }
 233
 234     for (i = 0; i < MAX_THREADS; i++) {
 235         s->thread_data[i].filter_strength =
 236             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 237         if (!s->thread_data[i].filter_strength) {
 238             free_buffers(s);
 239             return AVERROR(ENOMEM);
 240         }
 241 #if HAVE_THREADS
 242         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 243         pthread_cond_init(&s->thread_data[i].cond, NULL);
 244 #endif
 245     }
 246
 247     s->macroblocks = s->macroblocks_base + 1;
 248
 249     return 0;
 250 }
 251
 252 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 253 {
 254     return update_dimensions(s, width, height, IS_VP7);
 255 }
 256
 257 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 258 {
 259     return update_dimensions(s, width, height, IS_VP8);
 260 }
 261
 262
 263 static void parse_segment_info(VP8Context *s)
 264 {
 265     VP56RangeCoder *c = &s->c;
 266     int i;
 267
 268     s->segmentation.update_map = vp8_rac_get(c);
 269     s->segmentation.update_feature_data = vp8_rac_get(c);
 270
 271     if (s->segmentation.update_feature_data) {
 272         s->segmentation.absolute_vals = vp8_rac_get(c);
 273
 274         for (i = 0; i < 4; i++)
 275             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 276
 277         for (i = 0; i < 4; i++)
 278             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 279     }
 280     if (s->segmentation.update_map)
 281         for (i = 0; i < 3; i++)
 282             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 283 }
 284
 285 static void update_lf_deltas(VP8Context *s)
 286 {
 287     VP56RangeCoder *c = &s->c;
 288     int i;
 289
 290     for (i = 0; i < 4; i++) {
 291         if (vp8_rac_get(c)) {
 292             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 293
 294             if (vp8_rac_get(c))
 295                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 296         }
 297     }
 298
 299     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 300         if (vp8_rac_get(c)) {
 301             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 302
 303             if (vp8_rac_get(c))
 304                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 305         }
 306     }
 307 }
 308
 309 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 310 {
 311     const uint8_t *sizes = buf;
 312     int i;
 313     int ret;
 314
 315     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 316
 317     buf      += 3 * (s->num_coeff_partitions - 1);
 318     buf_size -= 3 * (s->num_coeff_partitions - 1);
 319     if (buf_size < 0)
 320         return -1;
 321
 322     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 323         int size = AV_RL24(sizes + 3 * i);
 324         if (buf_size - size < 0)
 325             return -1;
 326         s->coeff_partition_size[i] = size;
 327
 328         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 329         if (ret < 0)
 330             return ret;
 331         buf      += size;
 332         buf_size -= size;
 333     }
 334
 335     s->coeff_partition_size[i] = buf_size;
 336     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 337
 338     return 0;
 339 }
 340
 341 static void vp7_get_quants(VP8Context *s)
 342 {
 343     VP56RangeCoder *c = &s->c;
 344
 345     int yac_qi  = vp8_rac_get_uint(c, 7);
 346     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 347     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 348     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 349     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 350     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 351
 352     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 353     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 354     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 355     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 356     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 357     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 358 }
 359
 360 static void vp8_get_quants(VP8Context *s)
 361 {
 362     VP56RangeCoder *c = &s->c;
 363     int i, base_qi;
 364
 365     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 366     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 367     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 368     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 369     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 370     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 371
 372     for (i = 0; i < 4; i++) {
 373         if (s->segmentation.enabled) {
 374             base_qi = s->segmentation.base_quant[i];
 375             if (!s->segmentation.absolute_vals)
 376                 base_qi += s->quant.yac_qi;
 377         } else
 378             base_qi = s->quant.yac_qi;
 379
 380         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 381         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 382         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 383         /* 101581>>16 is equivalent to 155/100 */
 384         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 385         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 386         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 387
 388         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 389         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 390     }
 391 }
 392
 393 /**
 394  * Determine which buffers golden and altref should be updated with after this frame.
 395  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 396  *
 397  * Intra frames update all 3 references
 398  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 399  * If the update (golden|altref) flag is set, it's updated with the current frame
 400  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 401  * If the flag is not set, the number read means:
 402  *      0: no update
 403  *      1: VP56_FRAME_PREVIOUS
 404  *      2: update golden with altref, or update altref with golden
 405  */
 406 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 407 {
 408     VP56RangeCoder *c = &s->c;
 409
 410     if (update)
 411         return VP56_FRAME_CURRENT;
 412
 413     switch (vp8_rac_get_uint(c, 2)) {
 414     case 1:
 415         return VP56_FRAME_PREVIOUS;
 416     case 2:
 417         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 418     }
 419     return VP56_FRAME_NONE;
 420 }
 421
 422 static void vp78_reset_probability_tables(VP8Context *s)
 423 {
 424     int i, j;
 425     for (i = 0; i < 4; i++)
 426         for (j = 0; j < 16; j++)
 427             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 428                    sizeof(s->prob->token[i][j]));
 429 }
 430
 431 static void vp78_update_probability_tables(VP8Context *s)
 432 {
 433     VP56RangeCoder *c = &s->c;
 434     int i, j, k, l, m;
 435
 436     for (i = 0; i < 4; i++)
 437         for (j = 0; j < 8; j++)
 438             for (k = 0; k < 3; k++)
 439                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 440                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 441                         int prob = vp8_rac_get_uint(c, 8);
 442                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 443                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 444                     }
 445 }
 446
 447 #define VP7_MVC_SIZE 17
 448 #define VP8_MVC_SIZE 19
 449
 450 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 451                                                             int mvc_size)
 452 {
 453     VP56RangeCoder *c = &s->c;
 454     int i, j;
 455
 456     if (vp8_rac_get(c))
 457         for (i = 0; i < 4; i++)
 458             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 459     if (vp8_rac_get(c))
 460         for (i = 0; i < 3; i++)
 461             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 462
 463     // 17.2 MV probability update
 464     for (i = 0; i < 2; i++)
 465         for (j = 0; j < mvc_size; j++)
 466             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 467                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 468 }
 469
 470 static void update_refs(VP8Context *s)
 471 {
 472     VP56RangeCoder *c = &s->c;
 473
 474     int update_golden = vp8_rac_get(c);
 475     int update_altref = vp8_rac_get(c);
 476
 477     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 478     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 479 }
 480
 481 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 482 {
 483     int i, j;
 484
 485     for (j = 1; j < 3; j++) {
 486         for (i = 0; i < height / 2; i++)
 487             memcpy(dst->data[j] + i * dst->linesize[j],
 488                    src->data[j] + i * src->linesize[j], width / 2);
 489     }
 490 }
 491
 492 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 493                  const uint8_t *src, ptrdiff_t src_linesize,
 494                  int width, int height,
 495                  int alpha, int beta)
 496 {
 497     int i, j;
 498     for (j = 0; j < height; j++) {
 499         const uint8_t *src2 = src + j * src_linesize;
 500         uint8_t *dst2 = dst + j * dst_linesize;
 501         for (i = 0; i < width; i++) {
 502             uint8_t y = src2[i];
 503             dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 504         }
 505     }
 506 }
 507
 508 static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
 509 {
 510     int ret;
 511
 512     if (!s->keyframe && (alpha || beta)) {
 513         int width  = s->mb_width * 16;
 514         int height = s->mb_height * 16;
 515         AVFrame *src, *dst;
 516
 517         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 518             !s->framep[VP56_FRAME_GOLDEN]) {
 519             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 520             return AVERROR_INVALIDDATA;
 521         }
 522
 523         dst =
 524         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 525
 526         /* preserve the golden frame, write a new previous frame */
 527         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 528             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 529             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 530                 return ret;
 531
 532             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 533
 534             copy_chroma(dst, src, width, height);
 535         }
 536
 537         fade(dst->data[0], dst->linesize[0],
 538              src->data[0], src->linesize[0],
 539              width, height, alpha, beta);
 540     }
 541
 542     return 0;
 543 }
 544
 545 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 546 {
 547     VP56RangeCoder *c = &s->c;
 548     int part1_size, hscale, vscale, i, j, ret;
 549     int width  = s->avctx->width;
 550     int height = s->avctx->height;
 551     int alpha = 0;
 552     int beta  = 0;
 553
 554     if (buf_size < 4) {
 555         return AVERROR_INVALIDDATA;
 556     }
 557
 558     s->profile = (buf[0] >> 1) & 7;
 559     if (s->profile > 1) {
 560         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 561         return AVERROR_INVALIDDATA;
 562     }
 563
 564     s->keyframe  = !(buf[0] & 1);
 565     s->invisible = 0;
 566     part1_size   = AV_RL24(buf) >> 4;
 567
 568     if (buf_size < 4 - s->profile + part1_size) {
 569         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 570         return AVERROR_INVALIDDATA;
 571     }
 572
 573     buf      += 4 - s->profile;
 574     buf_size -= 4 - s->profile;
 575
 576     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 577
 578     ret = ff_vp56_init_range_decoder(c, buf, part1_size);
 579     if (ret < 0)
 580         return ret;
 581     buf      += part1_size;
 582     buf_size -= part1_size;
 583
 584     /* A. Dimension information (keyframes only) */
 585     if (s->keyframe) {
 586         width  = vp8_rac_get_uint(c, 12);
 587         height = vp8_rac_get_uint(c, 12);
 588         hscale = vp8_rac_get_uint(c, 2);
 589         vscale = vp8_rac_get_uint(c, 2);
 590         if (hscale || vscale)
 591             avpriv_request_sample(s->avctx, "Upscaling");
 592
 593         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 594         vp78_reset_probability_tables(s);
 595         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 596                sizeof(s->prob->pred16x16));
 597         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 598                sizeof(s->prob->pred8x8c));
 599         for (i = 0; i < 2; i++)
 600             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 601                    sizeof(vp7_mv_default_prob[i]));
 602         memset(&s->segmentation, 0, sizeof(s->segmentation));
 603         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 604         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 605     }
 606
 607     if (s->keyframe || s->profile > 0)
 608         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 609
 610     /* B. Decoding information for all four macroblock-level features */
 611     for (i = 0; i < 4; i++) {
 612         s->feature_enabled[i] = vp8_rac_get(c);
 613         if (s->feature_enabled[i]) {
 614              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 615
 616              for (j = 0; j < 3; j++)
 617                  s->feature_index_prob[i][j] =
 618                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 619
 620              if (vp7_feature_value_size[s->profile][i])
 621                  for (j = 0; j < 4; j++)
 622                      s->feature_value[i][j] =
 623                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 624         }
 625     }
 626
 627     s->segmentation.enabled    = 0;
 628     s->segmentation.update_map = 0;
 629     s->lf_delta.enabled        = 0;
 630
 631     s->num_coeff_partitions = 1;
 632     ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 633     if (ret < 0)
 634         return ret;
 635
 636     if (!s->macroblocks_base || /* first frame */
 637         width != s->avctx->width || height != s->avctx->height ||
 638         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 639         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 640             return ret;
 641     }
 642
 643     /* C. Dequantization indices */
 644     vp7_get_quants(s);
 645
 646     /* D. Golden frame update flag (a Flag) for interframes only */
 647     if (!s->keyframe) {
 648         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 649         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 650     }
 651
 652     s->update_last          = 1;
 653     s->update_probabilities = 1;
 654     s->fade_present         = 1;
 655
 656     if (s->profile > 0) {
 657         s->update_probabilities = vp8_rac_get(c);
 658         if (!s->update_probabilities)
 659             s->prob[1] = s->prob[0];
 660
 661         if (!s->keyframe)
 662             s->fade_present = vp8_rac_get(c);
 663     }
 664
 665     if (vpX_rac_is_end(c))
 666         return AVERROR_INVALIDDATA;
 667     /* E. Fading information for previous frame */
 668     if (s->fade_present && vp8_rac_get(c)) {
 669         alpha = (int8_t) vp8_rac_get_uint(c, 8);
 670         beta  = (int8_t) vp8_rac_get_uint(c, 8);
 671     }
 672
 673     /* F. Loop filter type */
 674     if (!s->profile)
 675         s->filter.simple = vp8_rac_get(c);
 676
 677     /* G. DCT coefficient ordering specification */
 678     if (vp8_rac_get(c))
 679         for (i = 1; i < 16; i++)
 680             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 681
 682     /* H. Loop filter levels  */
 683     if (s->profile > 0)
 684         s->filter.simple = vp8_rac_get(c);
 685     s->filter.level     = vp8_rac_get_uint(c, 6);
 686     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 687
 688     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 689     vp78_update_probability_tables(s);
 690
 691     s->mbskip_enabled = 0;
 692
 693     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 694     if (!s->keyframe) {
 695         s->prob->intra  = vp8_rac_get_uint(c, 8);
 696         s->prob->last   = vp8_rac_get_uint(c, 8);
 697         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 698     }
 699
 700     if (vpX_rac_is_end(c))
 701         return AVERROR_INVALIDDATA;
 702
 703     if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
 704         return ret;
 705
 706     return 0;
 707 }
 708
 709 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 710 {
 711     VP56RangeCoder *c = &s->c;
 712     int header_size, hscale, vscale, ret;
 713     int width  = s->avctx->width;
 714     int height = s->avctx->height;
 715
 716     if (buf_size < 3) {
 717         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 718         return AVERROR_INVALIDDATA;
 719     }
 720
 721     s->keyframe  = !(buf[0] & 1);
 722     s->profile   =  (buf[0]>>1) & 7;
 723     s->invisible = !(buf[0] & 0x10);
 724     header_size  = AV_RL24(buf) >> 5;
 725     buf      += 3;
 726     buf_size -= 3;
 727
 728     s->header_partition_size = header_size;
 729
 730     if (s->profile > 3)
 731         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 732
 733     if (!s->profile)
 734         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 735                sizeof(s->put_pixels_tab));
 736     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 737         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 738                sizeof(s->put_pixels_tab));
 739
 740     if (header_size > buf_size - 7 * s->keyframe) {
 741         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 742         return AVERROR_INVALIDDATA;
 743     }
 744
 745     if (s->keyframe) {
 746         if (AV_RL24(buf) != 0x2a019d) {
 747             av_log(s->avctx, AV_LOG_ERROR,
 748                    "Invalid start code 0x%x\n", AV_RL24(buf));
 749             return AVERROR_INVALIDDATA;
 750         }
 751         width     = AV_RL16(buf + 3) & 0x3fff;
 752         height    = AV_RL16(buf + 5) & 0x3fff;
 753         hscale    = buf[4] >> 6;
 754         vscale    = buf[6] >> 6;
 755         buf      += 7;
 756         buf_size -= 7;
 757
 758         if (hscale || vscale)
 759             avpriv_request_sample(s->avctx, "Upscaling");
 760
 761         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 762         vp78_reset_probability_tables(s);
 763         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 764                sizeof(s->prob->pred16x16));
 765         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 766                sizeof(s->prob->pred8x8c));
 767         memcpy(s->prob->mvc, vp8_mv_default_prob,
 768                sizeof(s->prob->mvc));
 769         memset(&s->segmentation, 0, sizeof(s->segmentation));
 770         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 771     }
 772
 773     ret = ff_vp56_init_range_decoder(c, buf, header_size);
 774     if (ret < 0)
 775         return ret;
 776     buf      += header_size;
 777     buf_size -= header_size;
 778
 779     if (s->keyframe) {
 780         s->colorspace = vp8_rac_get(c);
 781         if (s->colorspace)
 782             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 783         s->fullrange = vp8_rac_get(c);
 784     }
 785
 786     if ((s->segmentation.enabled = vp8_rac_get(c)))
 787         parse_segment_info(s);
 788     else
 789         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 790
 791     s->filter.simple    = vp8_rac_get(c);
 792     s->filter.level     = vp8_rac_get_uint(c, 6);
 793     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 794
 795     if ((s->lf_delta.enabled = vp8_rac_get(c))) {
 796         s->lf_delta.update = vp8_rac_get(c);
 797         if (s->lf_delta.update)
 798             update_lf_deltas(s);
 799     }
 800
 801     if (setup_partitions(s, buf, buf_size)) {
 802         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 803         return AVERROR_INVALIDDATA;
 804     }
 805
 806     if (!s->macroblocks_base || /* first frame */
 807         width != s->avctx->width || height != s->avctx->height ||
 808         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 809         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 810             return ret;
 811
 812     vp8_get_quants(s);
 813
 814     if (!s->keyframe) {
 815         update_refs(s);
 816         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 817         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 818     }
 819
 820     // if we aren't saving this frame's probabilities for future frames,
 821     // make a copy of the current probabilities
 822     if (!(s->update_probabilities = vp8_rac_get(c)))
 823         s->prob[1] = s->prob[0];
 824
 825     s->update_last = s->keyframe || vp8_rac_get(c);
 826
 827     vp78_update_probability_tables(s);
 828
 829     if ((s->mbskip_enabled = vp8_rac_get(c)))
 830         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 831
 832     if (!s->keyframe) {
 833         s->prob->intra  = vp8_rac_get_uint(c, 8);
 834         s->prob->last   = vp8_rac_get_uint(c, 8);
 835         s->prob->golden = vp8_rac_get_uint(c, 8);
 836         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 837     }
 838
 839     // Record the entropy coder state here so that hwaccels can use it.
 840     s->c.code_word = vp56_rac_renorm(&s->c);
 841     s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
 842     s->coder_state_at_header_end.range     = s->c.high;
 843     s->coder_state_at_header_end.value     = s->c.code_word >> 16;
 844     s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
 845
 846     return 0;
 847 }
 848
 849 static av_always_inline
 850 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 851 {
 852     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 853                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 854     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 855                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 856 }
 857
 858 /**
 859  * Motion vector coding, 17.1.
 860  */
 861 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 862 {
 863     int bit, x = 0;
 864
 865     if (vp56_rac_get_prob_branchy(c, p[0])) {
 866         int i;
 867
 868         for (i = 0; i < 3; i++)
 869             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 870         for (i = (vp7 ? 7 : 9); i > 3; i--)
 871             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 872         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 873             x += 8;
 874     } else {
 875         // small_mvtree
 876         const uint8_t *ps = p + 2;
 877         bit = vp56_rac_get_prob(c, *ps);
 878         ps += 1 + 3 * bit;
 879         x  += 4 * bit;
 880         bit = vp56_rac_get_prob(c, *ps);
 881         ps += 1 + bit;
 882         x  += 2 * bit;
 883         x  += vp56_rac_get_prob(c, *ps);
 884     }
 885
 886     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 887 }
 888
 889 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 890 {
 891     return read_mv_component(c, p, 1);
 892 }
 893
 894 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 895 {
 896     return read_mv_component(c, p, 0);
 897 }
 898
 899 static av_always_inline
 900 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 901 {
 902     if (is_vp7)
 903         return vp7_submv_prob;
 904
 905     if (left == top)
 906         return vp8_submv_prob[4 - !!left];
 907     if (!top)
 908         return vp8_submv_prob[2];
 909     return vp8_submv_prob[1 - !!left];
 910 }
 911
 912 /**
 913  * Split motion vector prediction, 16.4.
 914  * @returns the number of motion vectors parsed (2, 4 or 16)
 915  */
 916 static av_always_inline
 917 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 918                     int layout, int is_vp7)
 919 {
 920     int part_idx;
 921     int n, num;
 922     VP8Macroblock *top_mb;
 923     VP8Macroblock *left_mb = &mb[-1];
 924     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 925     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 926     VP56mv *top_mv;
 927     VP56mv *left_mv = left_mb->bmv;
 928     VP56mv *cur_mv  = mb->bmv;
 929
 930     if (!layout) // layout is inlined, s->mb_layout is not
 931         top_mb = &mb[2];
 932     else
 933         top_mb = &mb[-s->mb_width - 1];
 934     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 935     top_mv       = top_mb->bmv;
 936
 937     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 938         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 939             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 940         else
 941             part_idx = VP8_SPLITMVMODE_8x8;
 942     } else {
 943         part_idx = VP8_SPLITMVMODE_4x4;
 944     }
 945
 946     num              = vp8_mbsplit_count[part_idx];
 947     mbsplits_cur     = vp8_mbsplits[part_idx],
 948     firstidx         = vp8_mbfirstidx[part_idx];
 949     mb->partitioning = part_idx;
 950
 951     for (n = 0; n < num; n++) {
 952         int k = firstidx[n];
 953         uint32_t left, above;
 954         const uint8_t *submv_prob;
 955
 956         if (!(k & 3))
 957             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 958         else
 959             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 960         if (k <= 3)
 961             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 962         else
 963             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 964
 965         submv_prob = get_submv_prob(left, above, is_vp7);
 966
 967         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 968             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 969                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 970                     mb->bmv[n].y = mb->mv.y +
 971                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 972                     mb->bmv[n].x = mb->mv.x +
 973                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 974                 } else {
 975                     AV_ZERO32(&mb->bmv[n]);
 976                 }
 977             } else {
 978                 AV_WN32A(&mb->bmv[n], above);
 979             }
 980         } else {
 981             AV_WN32A(&mb->bmv[n], left);
 982         }
 983     }
 984
 985     return num;
 986 }
 987
 988 /**
 989  * The vp7 reference decoder uses a padding macroblock column (added to right
 990  * edge of the frame) to guard against illegal macroblock offsets. The
 991  * algorithm has bugs that permit offsets to straddle the padding column.
 992  * This function replicates those bugs.
 993  *
 994  * @param[out] edge_x macroblock x address
 995  * @param[out] edge_y macroblock y address
 996  *
 997  * @return macroblock offset legal (boolean)
 998  */
 999 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
1000                                    int xoffset, int yoffset, int boundary,
1001                                    int *edge_x, int *edge_y)
1002 {
1003     int vwidth = mb_width + 1;
1004     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
1005     if (new < boundary || new % vwidth == vwidth - 1)
1006         return 0;
1007     *edge_y = new / vwidth;
1008     *edge_x = new % vwidth;
1009     return 1;
1010 }
1011
1012 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1013 {
1014     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1015 }
1016
1017 static av_always_inline
1018 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1019                     int mb_x, int mb_y, int layout)
1020 {
1021     VP8Macroblock *mb_edge[12];
1022     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1023     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1024     int idx = CNT_ZERO;
1025     VP56mv near_mv[3];
1026     uint8_t cnt[3] = { 0 };
1027     VP56RangeCoder *c = &s->c;
1028     int i;
1029
1030     AV_ZERO32(&near_mv[0]);
1031     AV_ZERO32(&near_mv[1]);
1032     AV_ZERO32(&near_mv[2]);
1033
1034     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1035         const VP7MVPred * pred = &vp7_mv_pred[i];
1036         int edge_x, edge_y;
1037
1038         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1039                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1040             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1041                                              ? s->macroblocks_base + 1 + edge_x +
1042                                                (s->mb_width + 1) * (edge_y + 1)
1043                                              : s->macroblocks + edge_x +
1044                                                (s->mb_height - edge_y - 1) * 2;
1045             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1046             if (mv) {
1047                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1048                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1049                         idx = CNT_NEAREST;
1050                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1051                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1052                             continue;
1053                         idx = CNT_NEAR;
1054                     } else {
1055                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1056                         idx = CNT_NEAR;
1057                     }
1058                 } else {
1059                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1060                     idx = CNT_NEAREST;
1061                 }
1062             } else {
1063                 idx = CNT_ZERO;
1064             }
1065         } else {
1066             idx = CNT_ZERO;
1067         }
1068         cnt[idx] += vp7_mv_pred[i].score;
1069     }
1070
1071     mb->partitioning = VP8_SPLITMVMODE_NONE;
1072
1073     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1074         mb->mode = VP8_MVMODE_MV;
1075
1076         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1077
1078             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1079
1080                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1081                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1082                 else
1083                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1084
1085                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1086                     mb->mode = VP8_MVMODE_SPLIT;
1087                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1088                 } else {
1089                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1090                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1091                     mb->bmv[0] = mb->mv;
1092                 }
1093             } else {
1094                 mb->mv = near_mv[CNT_NEAR];
1095                 mb->bmv[0] = mb->mv;
1096             }
1097         } else {
1098             mb->mv = near_mv[CNT_NEAREST];
1099             mb->bmv[0] = mb->mv;
1100         }
1101     } else {
1102         mb->mode = VP8_MVMODE_ZERO;
1103         AV_ZERO32(&mb->mv);
1104         mb->bmv[0] = mb->mv;
1105     }
1106 }
1107
/**
 * Decode the inter prediction mode and motion vector(s) of a VP8 macroblock.
 *
 * The top, left and top-left neighbours are examined to build a list of
 * candidate vectors (near_mv[]) with weights (cnt[]). The weights index
 * vp8_mode_contexts to read the mode from the bitstream. The result is
 * stored in mb->mode, mb->partitioning, mb->mv and mb->bmv[].
 *
 * @param s         decoder context
 * @param mv_bounds bounds used to clamp predicted vectors
 * @param mb        macroblock to fill in; mb->ref_frame must already be set
 * @param mb_x      macroblock x address (not referenced in this body)
 * @param mb_y      macroblock y address (not referenced in this body)
 * @param layout    macroblock array layout; selects where the top and
 *                  top-left neighbours are found relative to mb
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    // near_mv[3] is left uninitialised here; it is only read below when
    // cnt[CNT_SPLITMV] is non-zero, which requires it to have been written
    // by MV_EDGE_CHECK.
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For neighbour n: neighbours whose ref_frame is VP56_FRAME_CURRENT are
     * ignored. A zero vector adds to cnt[CNT_ZERO]. A non-zero vector is
     * negated if the neighbour's sign bias differs from the current one,
     * starts a new near_mv[] entry if it is the first neighbour or differs
     * from the current entry, and adds to that entry's count. Top and left
     * weigh 2, top-left weighs 1. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                // cnt[CNT_SPLITMV] is reused as the split-mode context:
                // left and top neighbours in split mode count 2 each,
                // top-left counts 1.
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    // the macroblock vector becomes the last split vector
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    // new vector: delta on top of the clamped best vector
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1207
1208 static av_always_inline
1209 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1210                            int mb_x, int keyframe, int layout)
1211 {
1212     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1213
1214     if (layout) {
1215         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1216         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1217     }
1218     if (keyframe) {
1219         int x, y;
1220         uint8_t *top;
1221         uint8_t *const left = s->intra4x4_pred_mode_left;
1222         if (layout)
1223             top = mb->intra4x4_pred_mode_top;
1224         else
1225             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1226         for (y = 0; y < 4; y++) {
1227             for (x = 0; x < 4; x++) {
1228                 const uint8_t *ctx;
1229                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1230                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1231                 left[y]   = top[x] = *intra4x4;
1232                 intra4x4++;
1233             }
1234         }
1235     } else {
1236         int i;
1237         for (i = 0; i < 16; i++)
1238             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1239                                            vp8_pred4x4_prob_inter);
1240     }
1241 }
1242
/**
 * Decode the per-macroblock header: segment id, skip flag, prediction mode,
 * reference frame and (for inter macroblocks) motion vectors.
 *
 * @param s         decoder context
 * @param mv_bounds motion vector clamping bounds, forwarded to
 *                  vp8_decode_mvs()
 * @param mb        macroblock to fill in
 * @param mb_x      macroblock x address
 * @param mb_y      macroblock y address
 * @param segment   in/out segment id for this macroblock; the final value
 *                  is copied to mb->segment
 * @param ref       optional segment id to fall back to when segmentation is
 *                  enabled but the map is not updated; may be NULL
 * @param layout    macroblock array layout, forwarded to the helpers
 * @param is_vp7    non-zero when decoding VP7
 */
static av_always_inline
void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
                    VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
{
    VP56RangeCoder *c = &s->c;
    static const char * const vp7_feature_name[] = { "q-index",
                                                     "lf-delta",
                                                     "partial-golden-update",
                                                     "blit-pitch" };
    if (is_vp7) {
        int i;
        *segment = 0;
        // Per-macroblock features are parsed (to keep the bitstream position
        // right) but only reported with a warning; the value is not applied
        // here.
        for (i = 0; i < 4; i++) {
            if (s->feature_enabled[i]) {
                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                      int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                   s->feature_index_prob[i]);
                      av_log(s->avctx, AV_LOG_WARNING,
                             "Feature %s present in macroblock (value 0x%x)\n",
                             vp7_feature_name[i], s->feature_value[i][index]);
                }
           }
        }
    } else if (s->segmentation.update_map) {
        // two-level tree: the first bit selects which second probability is used
        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
                                    vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            // Fill the 4x4 mode contexts with the 4x4 mode that corresponds
            // to the chosen 16x16 mode, replicated into all four bytes.
            const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                           : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
            // NOTE(review): this tests s->mb_layout while the rest of the
            // function uses the layout parameter - presumably they always
            // agree; confirm against the callers.
            if (s->mb_layout)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                vp8_pred8x8c_prob_intra);
        mb->ref_frame        = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        // the golden/altref selection bit is only read for VP8
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame =
                (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
                                                                   : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame - 1]++;

        // motion vectors, 16.3
        if (is_vp7)
            vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
        else
            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
1324
/**
 * Decode the coefficient tokens of one 4x4 block after the caller has
 * already read (and passed) the first end-of-block check.
 *
 * The range coder is copied into a local, used for all reads, and written
 * back to *r on return.
 *
 * @param r          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token; the end-of-block
 *                   decision at token_prob[0] is not read again here
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 * @param scan       maps the coefficient index to its position in block
 * @param vp7        non-zero when decoding VP7; an end-of-block check is
 *                   then also made after a zero token
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    VP56RangeCoder c = *r;
    // Enter the loop past the EOB test: decode_block_coeffs() has already
    // consumed it for the first token.
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            // after a zero, VP7 re-checks EOB while VP8 goes straight to the
            // next token
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    // two bits select the category (0..3); the base value is
                    // 3 + (8 << cat) and the extra bits come from
                    // ff_vp8_dct_cat_prob[cat]
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i + 1][2];
        }
        // read the sign, then dequantise: qmul[0] for index 0, qmul[1] otherwise
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1394
/**
 * Apply and update the running DC predictor of an inter macroblock.
 *
 * pred[0] holds the last DC value seen and pred[1] counts how often it
 * repeated. Once the counter exceeds 3, pred[0] is added to the block's DC.
 * The counter is reset when the old predictor or the new DC is zero or when
 * their signs differ; it is incremented when they are equal. The new DC is
 * then stored both in block[0] and in pred[0].
 *
 * @param block coefficient block whose first entry is the DC value
 * @param pred  predictor state: { last DC, repeat counter }
 * @return 1 if the predictor was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int16_t previous = pred[0];
    const int     applied  = pred[1] > 3;
    int16_t dc = block[0];

    if (applied)
        dc += previous;

    if (!previous || !dc || (previous < 0) != (dc < 0))
        pred[1] = 0;
    else if (previous == dc)
        pred[1]++;

    block[0] = pred[0] = dc;

    return applied;
}
1417
1418 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1419                                             int16_t block[16],
1420                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1421                                             int i, uint8_t *token_prob,
1422                                             int16_t qmul[2],
1423                                             const uint8_t scan[16])
1424 {
1425     return decode_block_coeffs_internal(r, block, probs, i,
1426                                         token_prob, qmul, scan, IS_VP7);
1427 }
1428
1429 #ifndef vp8_decode_block_coeffs_internal
1430 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1431                                             int16_t block[16],
1432                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1433                                             int i, uint8_t *token_prob,
1434                                             int16_t qmul[2])
1435 {
1436     return decode_block_coeffs_internal(r, block, probs, i,
1437                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1438 }
1439 #endif
1440
1441 /**
1442  * @param c          arithmetic bitstream reader context
1443  * @param block      destination for block coefficients
1444  * @param probs      probabilities to use when reading trees from the bitstream
1445  * @param i          initial coeff index, 0 unless a separate DC block is coded
1446  * @param zero_nhood the initial prediction context for number of surrounding
1447  *                   all-zero blocks (only left/top, so 0-2)
1448  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1449  * @param scan       scan pattern (VP7 only)
1450  *
1451  * @return 0 if no coeffs were decoded
1452  *         otherwise, the index of the last coeff decoded plus one
1453  */
1454 static av_always_inline
1455 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1456                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1457                         int i, int zero_nhood, int16_t qmul[2],
1458                         const uint8_t scan[16], int vp7)
1459 {
1460     uint8_t *token_prob = probs[i][zero_nhood];
1461     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1462         return 0;
1463     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1464                                                   token_prob, qmul, scan)
1465                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1466                                                   token_prob, qmul);
1467 }
1468
/**
 * Decode all coefficient blocks of one macroblock: the optional separate
 * luma DC block, the 16 luma blocks and the 2x4 chroma blocks.
 *
 * Non-zero counts are stored in td->non_zero_count_cache, and the top/left
 * context arrays are updated with whether each block had any coefficients.
 * If nothing at all was coded, mb->skip is set.
 *
 * @param s      decoder context (probabilities, dequant tables, DSP)
 * @param td     per-thread data receiving the coefficients
 * @param c      range coder to read from
 * @param mb     current macroblock
 * @param t_nnz  top non-zero context: entries 0-3 luma, 4-7 chroma,
 *               8 luma DC block
 * @param l_nnz  left non-zero context, same layout as t_nnz
 * @param is_vp7 non-zero when decoding VP7
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    // A separate DC block is coded unless the mode is I4x4 or, for VP8 only,
    // split MV.
    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  ff_zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        // VP7: modes above MODE_I4x4 additionally run the DC through the
        // per-reference-frame predictor; its return value is OR-ed into nnz
        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |=  inter_predict_dc(td->block_dc,
                                     s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            // nnz == 1 uses the DC-only transform variant
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        // luma blocks then start at coefficient 1 and use token set 0
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
1543
1544 static av_always_inline
1545 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1546                       uint8_t *src_cb, uint8_t *src_cr,
1547                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1548 {
1549     AV_COPY128(top_border, src_y + 15 * linesize);
1550     if (!simple) {
1551         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1552         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1553     }
1554 }
1555
1556 static av_always_inline
1557 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1558                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1559                     int mb_y, int mb_width, int simple, int xchg)
1560 {
1561     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1562     src_y  -= linesize;
1563     src_cb -= uvlinesize;
1564     src_cr -= uvlinesize;
1565
1566 #define XCHG(a, b, xchg)                                                      \
1567     do {                                                                      \
1568         if (xchg)                                                             \
1569             AV_SWAP64(b, a);                                                  \
1570         else                                                                  \
1571             AV_COPY64(b, a);                                                  \
1572     } while (0)
1573
1574     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1575     XCHG(top_border, src_y, xchg);
1576     XCHG(top_border + 8, src_y + 8, 1);
1577     if (mb_x < mb_width - 1)
1578         XCHG(top_border + 32, src_y + 16, 1);
1579
1580     // only copy chroma for normal loop filter
1581     // or to initialize the top row to 127
1582     if (!simple || !mb_y) {
1583         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1584         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1585         XCHG(top_border + 16, src_cb, 1);
1586         XCHG(top_border + 24, src_cr, 1);
1587     }
1588 }
1589
1590 static av_always_inline
1591 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1592 {
1593     if (!mb_x)
1594         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1595     else
1596         return mb_y ? mode : LEFT_DC_PRED8x8;
1597 }
1598
1599 static av_always_inline
1600 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1601 {
1602     if (!mb_x)
1603         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1604     else
1605         return mb_y ? mode : HOR_PRED8x8;
1606 }
1607
1608 static av_always_inline
1609 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1610 {
1611     switch (mode) {
1612     case DC_PRED8x8:
1613         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1614     case VERT_PRED8x8:
1615         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1616     case HOR_PRED8x8:
1617         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1618     case PLANE_PRED8x8: /* TM */
1619         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1620     }
1621     return mode;
1622 }
1623
1624 static av_always_inline
1625 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1626 {
1627     if (!mb_x) {
1628         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1629     } else {
1630         return mb_y ? mode : HOR_VP8_PRED;
1631     }
1632 }
1633
1634 static av_always_inline
1635 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1636                                      int *copy_buf, int vp7)
1637 {
1638     switch (mode) {
1639     case VERT_PRED:
1640         if (!mb_x && mb_y) {
1641             *copy_buf = 1;
1642             return mode;
1643         }
1644         /* fall-through */
1645     case DIAG_DOWN_LEFT_PRED:
1646     case VERT_LEFT_PRED:
1647         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1648     case HOR_PRED:
1649         if (!mb_y) {
1650             *copy_buf = 1;
1651             return mode;
1652         }
1653         /* fall-through */
1654     case HOR_UP_PRED:
1655         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1656     case TM_VP8_PRED:
1657         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1658     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1659                    * as 16x16/8x8 DC */
1660     case DIAG_DOWN_RIGHT_PRED:
1661     case VERT_RIGHT_PRED:
1662     case HOR_DOWN_PRED:
1663         if (!mb_y || !mb_x)
1664             *copy_buf = 1;
1665         return mode;
1666     }
1667     return mode;
1668 }
1669
1670 static av_always_inline
1671 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1672                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1673 {
1674     int x, y, mode, nnz;
1675     uint32_t tr;
1676
1677     /* for the first row, we need to run xchg_mb_border to init the top edge
1678      * to 127 otherwise, skip it if we aren't going to deblock */
1679     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1680         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1681                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1682                        s->filter.simple, 1);
1683
1684     if (mb->mode < MODE_I4x4) {
1685         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1686         s->hpc.pred16x16[mode](dst[0], s->linesize);
1687     } else {
1688         uint8_t *ptr = dst[0];
1689         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1690         const uint8_t lo = is_vp7 ? 128 : 127;
1691         const uint8_t hi = is_vp7 ? 128 : 129;
1692         uint8_t tr_top[4] = { lo, lo, lo, lo };
1693
1694         // all blocks on the right edge of the macroblock use bottom edge
1695         // the top macroblock for their topright edge
1696         uint8_t *tr_right = ptr - s->linesize + 16;
1697
1698         // if we're on the right edge of the frame, said edge is extended
1699         // from the top macroblock
1700         if (mb_y && mb_x == s->mb_width - 1) {
1701             tr       = tr_right[-1] * 0x01010101u;
1702             tr_right = (uint8_t *) &tr;
1703         }
1704
1705         if (mb->skip)
1706             AV_ZERO128(td->non_zero_count_cache);
1707
1708         for (y = 0; y < 4; y++) {
1709             uint8_t *topright = ptr + 4 - s->linesize;
1710             for (x = 0; x < 4; x++) {
1711                 int copy = 0;
1712                 ptrdiff_t linesize = s->linesize;
1713                 uint8_t *dst = ptr + 4 * x;
1714                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1715
1716                 if ((y == 0 || x == 3) && mb_y == 0) {
1717                     topright = tr_top;
1718                 } else if (x == 3)
1719                     topright = tr_right;
1720
1721                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1722                                                         mb_y + y, &copy, is_vp7);
1723                 if (copy) {
1724                     dst      = copy_dst + 12;
1725                     linesize = 8;
1726                     if (!(mb_y + y)) {
1727                         copy_dst[3] = lo;
1728                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1729                     } else {
1730                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1731                         if (!(mb_x + x)) {
1732                             copy_dst[3] = hi;
1733                         } else {
1734                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1735                         }
1736                     }
1737                     if (!(mb_x + x)) {
1738                         copy_dst[11] =
1739                         copy_dst[19] =
1740                         copy_dst[27] =
1741                         copy_dst[35] = hi;
1742                     } else {
1743                         copy_dst[11] = ptr[4 * x                   - 1];
1744                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1745                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1746                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1747                     }
1748                 }
1749                 s->hpc.pred4x4[mode](dst, topright, linesize);
1750                 if (copy) {
1751                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1752                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1753                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1754                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1755                 }
1756
1757                 nnz = td->non_zero_count_cache[y][x];
1758                 if (nnz) {
1759                     if (nnz == 1)
1760                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1761                                                   td->block[y][x], s->linesize);
1762                     else
1763                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1764                                                td->block[y][x], s->linesize);
1765                 }
1766                 topright += 4;
1767             }
1768
1769             ptr      += 4 * s->linesize;
1770             intra4x4 += 4;
1771         }
1772     }
1773
1774     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1775                                             mb_x, mb_y, is_vp7);
1776     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1777     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1778
1779     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1780         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1781                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1782                        s->filter.simple, 0);
1783 }
1784
/* Extra source pixels needed around a block, indexed by [kind][subpel pos]. */
static const uint8_t subpel_idx[3][8] = {
    /* nr. of left extra pixels, also function pointer index */
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* nr. of extra pixels required */
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* nr. of right extra pixels */
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1791
1792 /**
1793  * luma MC function
1794  *
1795  * @param s        VP8 decoding context
1796  * @param dst      target buffer for block data at block position
1797  * @param ref      reference picture buffer at origin (0, 0)
1798  * @param mv       motion vector (relative to block position) to get pixel data from
1799  * @param x_off    horizontal position of block from origin (0, 0)
1800  * @param y_off    vertical position of block from origin (0, 0)
1801  * @param block_w  width of block (16, 8 or 4)
1802  * @param block_h  height of block (always same as block_w)
1803  * @param width    width of src/dst plane data
1804  * @param height   height of src/dst plane data
1805  * @param linesize size of a single line of plane data, including padding
1806  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1807  */
1808 static av_always_inline
1809 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1810                  ThreadFrame *ref, const VP56mv *mv,
1811                  int x_off, int y_off, int block_w, int block_h,
1812                  int width, int height, ptrdiff_t linesize,
1813                  vp8_mc_func mc_func[3][3])
1814 {
1815     uint8_t *src = ref->f->data[0];
1816
1817     if (AV_RN32A(mv)) {
1818         ptrdiff_t src_linesize = linesize;
1819
1820         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1821         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1822
1823         x_off += mv->x >> 2;
1824         y_off += mv->y >> 2;
1825
1826         // edge emulation
1827         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1828         src += y_off * linesize + x_off;
1829         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1830             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1831             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1832                                      src - my_idx * linesize - mx_idx,
1833                                      EDGE_EMU_LINESIZE, linesize,
1834                                      block_w + subpel_idx[1][mx],
1835                                      block_h + subpel_idx[1][my],
1836                                      x_off - mx_idx, y_off - my_idx,
1837                                      width, height);
1838             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1839             src_linesize = EDGE_EMU_LINESIZE;
1840         }
1841         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1842     } else {
1843         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1844         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1845                       linesize, block_h, 0, 0);
1846     }
1847 }
1848
1849 /**
1850  * chroma MC function
1851  *
1852  * @param s        VP8 decoding context
1853  * @param dst1     target buffer for block data at block position (U plane)
1854  * @param dst2     target buffer for block data at block position (V plane)
1855  * @param ref      reference picture buffer at origin (0, 0)
1856  * @param mv       motion vector (relative to block position) to get pixel data from
1857  * @param x_off    horizontal position of block from origin (0, 0)
1858  * @param y_off    vertical position of block from origin (0, 0)
1859  * @param block_w  width of block (16, 8 or 4)
1860  * @param block_h  height of block (always same as block_w)
1861  * @param width    width of src/dst plane data
1862  * @param height   height of src/dst plane data
1863  * @param linesize size of a single line of plane data, including padding
1864  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1865  */
1866 static av_always_inline
1867 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1868                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1869                    int x_off, int y_off, int block_w, int block_h,
1870                    int width, int height, ptrdiff_t linesize,
1871                    vp8_mc_func mc_func[3][3])
1872 {
1873     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1874
1875     if (AV_RN32A(mv)) {
1876         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1877         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1878
1879         x_off += mv->x >> 3;
1880         y_off += mv->y >> 3;
1881
1882         // edge emulation
1883         src1 += y_off * linesize + x_off;
1884         src2 += y_off * linesize + x_off;
1885         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1886         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1887             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1888             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1889                                      src1 - my_idx * linesize - mx_idx,
1890                                      EDGE_EMU_LINESIZE, linesize,
1891                                      block_w + subpel_idx[1][mx],
1892                                      block_h + subpel_idx[1][my],
1893                                      x_off - mx_idx, y_off - my_idx, width, height);
1894             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1895             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1896
1897             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1898                                      src2 - my_idx * linesize - mx_idx,
1899                                      EDGE_EMU_LINESIZE, linesize,
1900                                      block_w + subpel_idx[1][mx],
1901                                      block_h + subpel_idx[1][my],
1902                                      x_off - mx_idx, y_off - my_idx, width, height);
1903             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1904             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1905         } else {
1906             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1907             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1908         }
1909     } else {
1910         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1911         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1912         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1913     }
1914 }
1915
1916 static av_always_inline
1917 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1918                  ThreadFrame *ref_frame, int x_off, int y_off,
1919                  int bx_off, int by_off, int block_w, int block_h,
1920                  int width, int height, VP56mv *mv)
1921 {
1922     VP56mv uvmv = *mv;
1923
1924     /* Y */
1925     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1926                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1927                 block_w, block_h, width, height, s->linesize,
1928                 s->put_pixels_tab[block_w == 8]);
1929
1930     /* U/V */
1931     if (s->profile == 3) {
1932         /* this block only applies VP8; it is safe to check
1933          * only the profile, as VP7 profile <= 1 */
1934         uvmv.x &= ~7;
1935         uvmv.y &= ~7;
1936     }
1937     x_off   >>= 1;
1938     y_off   >>= 1;
1939     bx_off  >>= 1;
1940     by_off  >>= 1;
1941     width   >>= 1;
1942     height  >>= 1;
1943     block_w >>= 1;
1944     block_h >>= 1;
1945     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1946                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1947                   &uvmv, x_off + bx_off, y_off + by_off,
1948                   block_w, block_h, width, height, s->uvlinesize,
1949                   s->put_pixels_tab[1 + (block_w == 4)]);
1950 }
1951
1952 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1953  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1954 static av_always_inline
1955 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1956                      int mb_xy, int ref)
1957 {
1958     /* Don't prefetch refs that haven't been used very often this frame. */
1959     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1960         int x_off = mb_x << 4, y_off = mb_y << 4;
1961         int mx = (mb->mv.x >> 2) + x_off + 8;
1962         int my = (mb->mv.y >> 2) + y_off;
1963         uint8_t **src = s->framep[ref]->tf.f->data;
1964         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1965         /* For threading, a ff_thread_await_progress here might be useful, but
1966          * it actually slows down the decoder. Since a bad prefetch doesn't
1967          * generate bad decoder output, we don't run it here. */
1968         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1969         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1970         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1971     }
1972 }
1973
1974 /**
1975  * Apply motion vectors to prediction buffer, chapter 18.
1976  */
1977 static av_always_inline
1978 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1979                    VP8Macroblock *mb, int mb_x, int mb_y)
1980 {
1981     int x_off = mb_x << 4, y_off = mb_y << 4;
1982     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1983     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1984     VP56mv *bmv = mb->bmv;
1985
1986     switch (mb->partitioning) {
1987     case VP8_SPLITMVMODE_NONE:
1988         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1989                     0, 0, 16, 16, width, height, &mb->mv);
1990         break;
1991     case VP8_SPLITMVMODE_4x4: {
1992         int x, y;
1993         VP56mv uvmv;
1994
1995         /* Y */
1996         for (y = 0; y < 4; y++) {
1997             for (x = 0; x < 4; x++) {
1998                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1999                             ref, &bmv[4 * y + x],
2000                             4 * x + x_off, 4 * y + y_off, 4, 4,
2001                             width, height, s->linesize,
2002                             s->put_pixels_tab[2]);
2003             }
2004         }
2005
2006         /* U/V */
2007         x_off  >>= 1;
2008         y_off  >>= 1;
2009         width  >>= 1;
2010         height >>= 1;
2011         for (y = 0; y < 2; y++) {
2012             for (x = 0; x < 2; x++) {
2013                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2014                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2015                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2016                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2017                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2018                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2019                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2020                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2021                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2022                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2023                 if (s->profile == 3) {
2024                     uvmv.x &= ~7;
2025                     uvmv.y &= ~7;
2026                 }
2027                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2028                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2029                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2030                               width, height, s->uvlinesize,
2031                               s->put_pixels_tab[2]);
2032             }
2033         }
2034         break;
2035     }
2036     case VP8_SPLITMVMODE_16x8:
2037         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2038                     0, 0, 16, 8, width, height, &bmv[0]);
2039         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2040                     0, 8, 16, 8, width, height, &bmv[1]);
2041         break;
2042     case VP8_SPLITMVMODE_8x16:
2043         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2044                     0, 0, 8, 16, width, height, &bmv[0]);
2045         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2046                     8, 0, 8, 16, width, height, &bmv[1]);
2047         break;
2048     case VP8_SPLITMVMODE_8x8:
2049         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2050                     0, 0, 8, 8, width, height, &bmv[0]);
2051         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2052                     8, 0, 8, 8, width, height, &bmv[1]);
2053         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2054                     0, 8, 8, 8, width, height, &bmv[2]);
2055         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2056                     8, 8, 8, 8, width, height, &bmv[3]);
2057         break;
2058     }
2059 }
2060
2061 static av_always_inline
2062 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2063 {
2064     int x, y, ch;
2065
2066     if (mb->mode != MODE_I4x4) {
2067         uint8_t *y_dst = dst[0];
2068         for (y = 0; y < 4; y++) {
2069             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2070             if (nnz4) {
2071                 if (nnz4 & ~0x01010101) {
2072                     for (x = 0; x < 4; x++) {
2073                         if ((uint8_t) nnz4 == 1)
2074                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2075                                                       td->block[y][x],
2076                                                       s->linesize);
2077                         else if ((uint8_t) nnz4 > 1)
2078                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2079                                                    td->block[y][x],
2080                                                    s->linesize);
2081                         nnz4 >>= 8;
2082                         if (!nnz4)
2083                             break;
2084                     }
2085                 } else {
2086                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2087                 }
2088             }
2089             y_dst += 4 * s->linesize;
2090         }
2091     }
2092
2093     for (ch = 0; ch < 2; ch++) {
2094         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2095         if (nnz4) {
2096             uint8_t *ch_dst = dst[1 + ch];
2097             if (nnz4 & ~0x01010101) {
2098                 for (y = 0; y < 2; y++) {
2099                     for (x = 0; x < 2; x++) {
2100                         if ((uint8_t) nnz4 == 1)
2101                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2102                                                       td->block[4 + ch][(y << 1) + x],
2103                                                       s->uvlinesize);
2104                         else if ((uint8_t) nnz4 > 1)
2105                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2106                                                    td->block[4 + ch][(y << 1) + x],
2107                                                    s->uvlinesize);
2108                         nnz4 >>= 8;
2109                         if (!nnz4)
2110                             goto chroma_idct_end;
2111                     }
2112                     ch_dst += 4 * s->uvlinesize;
2113                 }
2114             } else {
2115                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2116             }
2117         }
2118 chroma_idct_end:
2119         ;
2120     }
2121 }
2122
2123 static av_always_inline
2124 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2125                          VP8FilterStrength *f, int is_vp7)
2126 {
2127     int interior_limit, filter_level;
2128
2129     if (s->segmentation.enabled) {
2130         filter_level = s->segmentation.filter_level[mb->segment];
2131         if (!s->segmentation.absolute_vals)
2132             filter_level += s->filter.level;
2133     } else
2134         filter_level = s->filter.level;
2135
2136     if (s->lf_delta.enabled) {
2137         filter_level += s->lf_delta.ref[mb->ref_frame];
2138         filter_level += s->lf_delta.mode[mb->mode];
2139     }
2140
2141     filter_level = av_clip_uintp2(filter_level, 6);
2142
2143     interior_limit = filter_level;
2144     if (s->filter.sharpness) {
2145         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2146         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2147     }
2148     interior_limit = FFMAX(interior_limit, 1);
2149
2150     f->filter_level = filter_level;
2151     f->inner_limit = interior_limit;
2152     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2153                       mb->mode == VP8_MVMODE_SPLIT;
2154 }
2155
2156 static av_always_inline
2157 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2158                int mb_x, int mb_y, int is_vp7)
2159 {
2160     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2161     int filter_level = f->filter_level;
2162     int inner_limit = f->inner_limit;
2163     int inner_filter = f->inner_filter;
2164     ptrdiff_t linesize   = s->linesize;
2165     ptrdiff_t uvlinesize = s->uvlinesize;
2166     static const uint8_t hev_thresh_lut[2][64] = {
2167         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2168           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2169           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2170           3, 3, 3, 3 },
2171         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2172           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2173           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2174           2, 2, 2, 2 }
2175     };
2176
2177     if (!filter_level)
2178         return;
2179
2180     if (is_vp7) {
2181         bedge_lim_y  = filter_level;
2182         bedge_lim_uv = filter_level * 2;
2183         mbedge_lim   = filter_level + 2;
2184     } else {
2185         bedge_lim_y  =
2186         bedge_lim_uv = filter_level * 2 + inner_limit;
2187         mbedge_lim   = bedge_lim_y + 4;
2188     }
2189
2190     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2191
2192     if (mb_x) {
2193         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2194                                        mbedge_lim, inner_limit, hev_thresh);
2195         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2196                                        mbedge_lim, inner_limit, hev_thresh);
2197     }
2198
2199 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2200     if (cond && inner_filter) {                                               \
2201         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2202                                              bedge_lim_y, inner_limit,        \
2203                                              hev_thresh);                     \
2204         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2205                                              bedge_lim_y, inner_limit,        \
2206                                              hev_thresh);                     \
2207         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2208                                              bedge_lim_y, inner_limit,        \
2209                                              hev_thresh);                     \
2210         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2211                                              uvlinesize,  bedge_lim_uv,       \
2212                                              inner_limit, hev_thresh);        \
2213     }
2214
2215     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2216
2217     if (mb_y) {
2218         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2219                                        mbedge_lim, inner_limit, hev_thresh);
2220         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2221                                        mbedge_lim, inner_limit, hev_thresh);
2222     }
2223
2224     if (inner_filter) {
2225         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2226                                              linesize, bedge_lim_y,
2227                                              inner_limit, hev_thresh);
2228         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2229                                              linesize, bedge_lim_y,
2230                                              inner_limit, hev_thresh);
2231         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2232                                              linesize, bedge_lim_y,
2233                                              inner_limit, hev_thresh);
2234         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2235                                              dst[2] +  4 * uvlinesize,
2236                                              uvlinesize, bedge_lim_uv,
2237                                              inner_limit, hev_thresh);
2238     }
2239
2240     H_LOOP_FILTER_16Y_INNER(is_vp7)
2241 }
2242
2243 static av_always_inline
2244 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2245                       int mb_x, int mb_y)
2246 {
2247     int mbedge_lim, bedge_lim;
2248     int filter_level = f->filter_level;
2249     int inner_limit  = f->inner_limit;
2250     int inner_filter = f->inner_filter;
2251     ptrdiff_t linesize = s->linesize;
2252
2253     if (!filter_level)
2254         return;
2255
2256     bedge_lim  = 2 * filter_level + inner_limit;
2257     mbedge_lim = bedge_lim + 4;
2258
2259     if (mb_x)
2260         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2261     if (inner_filter) {
2262         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2263         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2264         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2265     }
2266
2267     if (mb_y)
2268         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2269     if (inner_filter) {
2270         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2271         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2272         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2273     }
2274 }
2275
2276 #define MARGIN (16 << 2)
2277 static av_always_inline
2278 int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2279                                     VP8Frame *prev_frame, int is_vp7)
2280 {
2281     VP8Context *s = avctx->priv_data;
2282     int mb_x, mb_y;
2283
2284     s->mv_bounds.mv_min.y = -MARGIN;
2285     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2286     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2287         VP8Macroblock *mb = s->macroblocks_base +
2288                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2289         int mb_xy = mb_y * s->mb_width;
2290
2291         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2292
2293         s->mv_bounds.mv_min.x = -MARGIN;
2294         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2295
2296         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2297             if (vpX_rac_is_end(&s->c)) {
2298                 return AVERROR_INVALIDDATA;
2299             }
2300             if (mb_y == 0)
2301                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2302                          DC_PRED * 0x01010101);
2303             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2304                            prev_frame && prev_frame->seg_map ?
2305                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2306             s->mv_bounds.mv_min.x -= 64;
2307             s->mv_bounds.mv_max.x -= 64;
2308         }
2309         s->mv_bounds.mv_min.y -= 64;
2310         s->mv_bounds.mv_max.y -= 64;
2311     }
2312     return 0;
2313 }
2314
2315 static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2316                                    VP8Frame *prev_frame)
2317 {
2318     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2319 }
2320
2321 static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2322                                    VP8Frame *prev_frame)
2323 {
2324     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2325 }
2326
/*
 * Row-level synchronisation helpers for sliced decoding.
 *
 * A position is packed into one int as (mb_y << 16) | (mb_x & 0xFFFF), so
 * positions compare in raster order.
 *
 * check_thread_pos(td, otd, mb_x_check, mb_y_check)
 *     Block the calling thread until otd->thread_mb_pos has reached the given
 *     position. While waiting, the awaited position is published in
 *     td->wait_mb_pos; it is set back to INT_MAX afterwards.
 *
 * update_pos(td, mb_y, mb_x)
 *     Publish the given position in td->thread_mb_pos. With slice threading
 *     and more than one job, broadcast on td->cond if a neighbouring thread is
 *     waiting for a position at or before it (or if next_td/prev_td is NULL).
 *     This macro reads the variables avctx, num_jobs, next_td and prev_td from
 *     the scope it is expanded in.
 *
 * Every macro argument is parenthesized in the expansion so that expressions
 * such as "mb_y - 1" or "a | b" cannot regroup with the surrounding shift and
 * mask operators.
 */
#if HAVE_THREADS
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without thread support both helpers expand to an empty statement. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x) do { } while (0)
#endif
2364
/**
 * Decode one row of macroblocks without applying the loop filter: modes
 * (unless they were decoded up front), coefficients, prediction and inverse
 * transform.
 *
 * The row index is read from the upper 16 bits of
 * s->thread_data[threadnr].thread_mb_pos.
 *
 * @param avctx    codec context; its priv_data is the VP8Context
 * @param tdata    not used by this function
 * @param jobnr    job index, used to locate the neighbouring jobs' thread data
 * @param threadnr index of the VP8ThreadData used for this row
 * @param is_vp7   nonzero for VP7, zero for VP8
 * @return 0 on success, AVERROR_INVALIDDATA when vpX_rac_is_end() triggers on
 *         the coefficient partition used for this row
 */
static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    // prev_td, next_td, num_jobs and avctx are also read by name inside the
    // check_thread_pos()/update_pos() macros.
    VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
    int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
    int mb_x, mb_xy = mb_y * s->mb_width;
    int num_jobs = s->num_jobs;
    VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
    // Rows are spread over the coefficient partitions by masking the row index.
    VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
    VP8Macroblock *mb;
    // Start of this macroblock row in the Y, U and V planes.
    uint8_t *dst[3] = {
        curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
        curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (vpX_rac_is_end(c))
         return AVERROR_INVALIDDATA;

    // The first row has no row above and the last row none below; in those
    // cases the neighbour pointer is set to td itself, which disables waiting.
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];
    if (s->mb_layout == 1)
        // Modes were already decoded for the whole frame; index the full array.
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else {
        // Make sure the previous frame has read its segmentation map,
        // if we re-use the same map.
        if (prev_frame && s->segmentation.enabled &&
            !s->segmentation.update_map)
            ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
        memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
    }

    // VP8 clears the left non-zero context on every row, VP7 only on row 0.
    if (!is_vp7 || mb_y == 0)
        memset(td->left_nnz, 0, sizeof(td->left_nnz));

    td->mv_bounds.mv_min.x = -MARGIN;
    td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
        if (vpX_rac_is_end(c))
            return AVERROR_INVALIDDATA;
        // Wait for previous thread to read mb_x+2, and reach mb_y-1.
        // Thread 0 waits on a position offset by mb_width + 3, the same
        // offset filter_mb_row() adds to the positions it publishes.
        if (prev_td != td) {
            if (threadnr != 0) {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1),
                                 mb_y - (is_vp7 ? 2 : 1));
            } else {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
                                 mb_y - (is_vp7 ? 2 : 1));
            }
        }

        s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
                         s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
                         dst[2] - dst[1], 2);

        // With mb_layout == 0 the modes are decoded here, one macroblock at a
        // time, instead of in a separate pass over the frame.
        if (!s->mb_layout)
            decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

        if (!mb->skip)
            decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

            /* Reset DC block predictors if they would exist
             * if the mb had coefficients */
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);

        // With several jobs, only the thread with the highest index backs up
        // the macroblock border here; with a single job filter_mb_row() does it.
        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

        dst[0]      += 16;
        dst[1]      += 8;
        dst[2]      += 8;
        td->mv_bounds.mv_min.x -= 64;
        td->mv_bounds.mv_max.x -= 64;

        // NOTE(review): mb_x is always < s->mb_width inside this loop, so the
        // first branch below can never be taken -- confirm the intended test.
        if (mb_x == s->mb_width + 1) {
            update_pos(td, mb_y, s->mb_width + 3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }
    return 0;
}
2492
2493 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2494                                         int jobnr, int threadnr)
2495 {
2496     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2497 }
2498
2499 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2500                                         int jobnr, int threadnr)
2501 {
2502     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2503 }
2504
/**
 * Apply the loop filter to one row of macroblocks.
 *
 * The row index is read from the upper 16 bits of
 * s->thread_data[threadnr].thread_mb_pos, and the per-macroblock filter
 * strengths are taken from that thread data's filter_strength array.
 *
 * @param avctx    codec context; its priv_data is the VP8Context
 * @param tdata    not used by this function
 * @param jobnr    job index, used to locate the neighbouring jobs' thread data
 * @param threadnr index of the VP8ThreadData used for this row
 * @param is_vp7   passed through to filter_mb()
 */
static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[threadnr];
    int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
    AVFrame *curframe = s->curframe->tf.f;
    VP8Macroblock *mb;
    // prev_td and next_td (like num_jobs and avctx) are also read by name
    // inside the check_thread_pos()/update_pos() macros.
    VP8ThreadData *prev_td, *next_td;
    // Start of this macroblock row in the Y, U and V planes.
    uint8_t *dst[3] = {
        curframe->data[0] + 16 * mb_y * s->linesize,
        curframe->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;

    // First and last rows have no neighbour on one side; pointing at td
    // itself disables the corresponding wait below.
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
        VP8FilterStrength *f = &td->filter_strength[mb_x];
        // Wait until the row above has published a filter-stage position
        // (those carry the mb_width + 3 offset) past this macroblock.
        if (prev_td != td)
            check_thread_pos(td, prev_td,
                             (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
        // Wait for the row below to pass this macroblock, except when that
        // row is handled by thread_data[0].
        if (next_td != td)
            if (next_td != &s->thread_data[0])
                check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);

        // With a single job the border backup happens here; with several jobs
        // decode_mb_row_no_filter() performs it instead.
        if (num_jobs == 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        // The simple filter is only given the luma plane pointer.
        if (s->filter.simple)
            filter_mb_simple(s, dst[0], f, mb_x, mb_y);
        else
            filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;

        // Filter-stage positions are published with an mb_width + 3 offset.
        update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
    }
}
2563
2564 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2565                               int jobnr, int threadnr)
2566 {
2567     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2568 }
2569
2570 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2571                               int jobnr, int threadnr)
2572 {
2573     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2574 }
2575
2576 static av_always_inline
2577 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2578                               int threadnr, int is_vp7)
2579 {
2580     VP8Context *s = avctx->priv_data;
2581     VP8ThreadData *td = &s->thread_data[jobnr];
2582     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2583     VP8Frame *curframe = s->curframe;
2584     int mb_y, num_jobs = s->num_jobs;
2585     int ret;
2586
2587     td->thread_nr = threadnr;
2588     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2589     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2590     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2591         atomic_store(&td->thread_mb_pos, mb_y << 16);
2592         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2593         if (ret < 0) {
2594             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2595             return ret;
2596         }
2597         if (s->deblock_filter)
2598             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2599         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2600
2601         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2602         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2603
2604         if (avctx->active_thread_type == FF_THREAD_FRAME)
2605             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2606     }
2607
2608     return 0;
2609 }
2610
2611 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2612                                     int jobnr, int threadnr)
2613 {
2614     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2615 }
2616
2617 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2618                                     int jobnr, int threadnr)
2619 {
2620     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2621 }
2622
/**
 * Decode one VP7/VP8 packet.
 *
 * Parses the frame header, applies the skip_frame policy, picks a buffer for
 * the new frame, works out which reference slots it will replace, and then
 * decodes either through the hwaccel callbacks or in software via
 * avctx->execute2().
 *
 * @param avctx     codec context; its priv_data is the VP8Context
 * @param data      destination AVFrame, filled with av_frame_ref() when the
 *                  frame is not invisible
 * @param got_frame set to 1 when a frame was written to data
 * @param avpkt     input packet
 * @param is_vp7    nonzero for VP7, zero for VP8
 * @return avpkt->size on success, a negative error code otherwise
 */
static av_always_inline
int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                      const AVPacket *avpkt, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    // curframe is only assigned once the skip check has passed; the skip path
    // sets s->invisible, so it is not read there.
    VP8Frame *av_uninit(curframe), *prev_frame;

    if (is_vp7)
        ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
    else
        ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);

    if (ret < 0)
        goto err;

    if (s->actually_webp) {
        // avctx->pix_fmt already set in caller.
    } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
        // VP8 only: pick the pixel format once, while it is still unset.
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0) {
            ret = AVERROR(EINVAL);
            goto err;
        }
        avctx->pix_fmt = s->pix_fmt;
    }

    prev_frame = s->framep[VP56_FRAME_CURRENT];

    // The frame is "referenced" if it will become the last, golden or altref
    // reference.
    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
                 s->update_altref == VP56_FRAME_CURRENT;

    // Lowest discard level at which this frame is dropped: NONREF for
    // unreferenced frames, NONKEY for referenced inter frames, ALL for
    // keyframes.
    skip_thresh = !referenced ? AVDISCARD_NONREF
                              : !s->keyframe ? AVDISCARD_NONKEY
                                             : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        // Keep the four reference pointers unchanged for the next frame.
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f->buf[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i]);

    curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);

    if (!s->colorspace)
        avctx->colorspace = AVCOL_SPC_BT470BG;
    if (s->fullrange)
        avctx->color_range = AVCOL_RANGE_JPEG;
    else
        avctx->color_range = AVCOL_RANGE_MPEG;

    /* Given that arithmetic probabilities are updated every frame, it's quite
     * likely that the values we have on a random interframe are complete
     * junk if we didn't start decode on a keyframe. So just don't display
     * anything rather than junk. */
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN]   ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING,
               "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    curframe->tf.f->key_frame = s->keyframe;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                            : AV_PICTURE_TYPE_P;
    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
        goto err;

    // check if golden and altref are swapped
    if (s->update_altref != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
    else
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];

    if (s->update_golden != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
    else
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];

    if (s->update_last)
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    else
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];

    s->next_framep[VP56_FRAME_CURRENT] = curframe;

    // Only codecs that provide update_thread_context signal setup completion.
    if (avctx->codec->update_thread_context)
        ff_thread_finish_setup(avctx);

    if (avctx->hwaccel) {
        // Hardware path: the whole packet is handed over as a single slice.
        ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->end_frame(avctx);
        if (ret < 0)
            goto err;

    } else {
        s->linesize   = curframe->tf.f->linesize[0];
        s->uvlinesize = curframe->tf.f->linesize[1];

        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
        /* Zero macroblock structures for top/top-left prediction
         * from outside the frame. */
        if (!s->mb_layout)
            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
                   (s->mb_width + 1) * sizeof(*s->macroblocks));
        if (!s->mb_layout && s->keyframe)
            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);

        memset(s->ref_count, 0, sizeof(s->ref_count));

        if (s->mb_layout == 1) {
            // Make sure the previous frame has read its segmentation map,
            // if we re-use the same map.
            if (prev_frame && s->segmentation.enabled &&
                !s->segmentation.update_map)
                ff_thread_await_progress(&prev_frame->tf, 1, 0);
            // With this layout all modes/MVs are decoded before the rows.
            if (is_vp7)
                ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
            else
                ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
            if (ret < 0)
                goto err;
        }

        // Frame threading uses a single job per frame; otherwise there are at
        // most as many jobs as coefficient partitions.
        if (avctx->active_thread_type == FF_THREAD_FRAME)
            num_jobs = 1;
        else
            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
        s->num_jobs   = num_jobs;
        s->curframe   = curframe;
        s->prev_frame = prev_frame;
        s->mv_bounds.mv_min.y   = -MARGIN;
        s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
        // Reset every thread's published and awaited positions.
        for (i = 0; i < MAX_THREADS; i++) {
            VP8ThreadData *td = &s->thread_data[i];
            atomic_init(&td->thread_mb_pos, 0);
            atomic_init(&td->wait_mb_pos, INT_MAX);
        }
        // NOTE(review): the return value of execute2() is not checked here.
        if (is_vp7)
            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
        else
            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
    }

    ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
    // Commit the reference updates computed above.
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    if (!s->invisible) {
        if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    // Undo any pending reference updates before reporting the error.
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}
2809
2810 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2811                         AVPacket *avpkt)
2812 {
2813     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2814 }
2815
#if CONFIG_VP7_DECODER
/** VP7 decode entry point; see vp78_decode_frame() for the contract. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    const int is_vp7 = IS_VP7;

    return vp78_decode_frame(avctx, data, got_frame, avpkt, is_vp7);
}
#endif /* CONFIG_VP7_DECODER */
2823
2824 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2825 {
2826     VP8Context *s = avctx->priv_data;
2827     int i;
2828
2829     if (!s)
2830         return 0;
2831
2832     vp8_decode_flush_impl(avctx, 1);
2833     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2834         av_frame_free(&s->frames[i].tf.f);
2835
2836     return 0;
2837 }
2838
2839 static av_cold int vp8_init_frames(VP8Context *s)
2840 {
2841     int i;
2842     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2843         s->frames[i].tf.f = av_frame_alloc();
2844         if (!s->frames[i].tf.f)
2845             return AVERROR(ENOMEM);
2846     }
2847     return 0;
2848 }
2849
2850 static av_always_inline
2851 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2852 {
2853     VP8Context *s = avctx->priv_data;
2854     int ret;
2855
2856     s->avctx = avctx;
2857     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2858     s->pix_fmt = AV_PIX_FMT_NONE;
2859     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2860
2861     ff_videodsp_init(&s->vdsp, 8);
2862
2863     ff_vp78dsp_init(&s->vp8dsp);
2864     if (CONFIG_VP7_DECODER && is_vp7) {
2865         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2866         ff_vp7dsp_init(&s->vp8dsp);
2867         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2868         s->filter_mb_row           = vp7_filter_mb_row;
2869     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2870         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2871         ff_vp8dsp_init(&s->vp8dsp);
2872         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2873         s->filter_mb_row           = vp8_filter_mb_row;
2874     }
2875
2876     /* does not change for VP8 */
2877     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2878
2879     if ((ret = vp8_init_frames(s)) < 0) {
2880         ff_vp8_decode_free(avctx);
2881         return ret;
2882     }
2883
2884     return 0;
2885 }
2886
#if CONFIG_VP7_DECODER
/** VP7 init entry point; see vp78_decode_init(). */
static int vp7_decode_init(AVCodecContext *avctx)
{
    const int is_vp7 = IS_VP7;

    return vp78_decode_init(avctx, is_vp7);
}
#endif /* CONFIG_VP7_DECODER */
2893
2894 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2895 {
2896     return vp78_decode_init(avctx, IS_VP8);
2897 }
2898
#if CONFIG_VP8_DECODER
#if HAVE_THREADS
/* Translate a frame pointer into s_src->frames[] to the same slot of
 * s->frames[]; NULL stays NULL. */
#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)

/**
 * Copy the decoding state a frame thread needs from the source context.
 *
 * @param dst destination codec context
 * @param src source codec context
 * @return 0 on success, or the negative value returned by vp8_ref_frame()
 */
static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s     = dst->priv_data;
    VP8Context *s_src = src->priv_data;
    int i;

    /* Drop buffers that were allocated for different dimensions. */
    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }

    s->pix_fmt      = s_src->pix_fmt;
    /* Take prob[1] instead of prob[0] when the source frame did not set
     * update_probabilities. */
    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    /* Take a reference on every frame the source currently holds. */
    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        int ret;

        if (!s_src->frames[i].tf.f->buf[0])
            continue;

        ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
        if (ret < 0)
            return ret;
    }

    /* Our current references are the source's references for its next frame. */
    for (i = 0; i < 4; i++)
        s->framep[i] = REBASE(s_src->next_framep[i]);

    return 0;
}
#endif /* HAVE_THREADS */
#endif /* CONFIG_VP8_DECODER */
2939
#if CONFIG_VP7_DECODER
/* Codec registration for the VP7 decoder. It shares VP8Context and the
 * close/flush callbacks with VP8, and advertises only AV_CODEC_CAP_DR1
 * (no frame or slice threading capability). */
const AVCodec ff_vp7_decoder = {
    .name                  = "vp7",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = vp7_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1,
    .flush                 = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */
2954
#if CONFIG_VP8_DECODER
/* Codec registration for the VP8 decoder. It advertises frame and slice
 * threading in addition to AV_CODEC_CAP_DR1, and lists the VAAPI and NVDEC
 * hardware configurations when those are enabled at build time. */
const AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    /* NULL-terminated list of supported hardware configurations. */
    .hw_configs            = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
    .caps_internal         = FF_CODEC_CAP_ALLOCATE_PROGRESS,
};
#endif /* CONFIG_VP8_DECODER */