git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "hwaccel.h"
  31 #include "internal.h"
  32 #include "mathops.h"
  33 #include "rectangle.h"
  34 #include "thread.h"
  35 #include "vp8.h"
  36 #include "vp8data.h"
  37
  38 #if ARCH_ARM
  39 #   include "arm/vp8.h"
  40 #endif
  41
/*
 * VPX(vp7, f) names the vp7_- or vp8_-prefixed variant of f.
 * When both decoders are compiled in, the choice is made from the vp7
 * argument. When only one decoder is enabled, the macro always expands to
 * that decoder's symbol and the vp7 argument is not used.
 */
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
#elif CONFIG_VP7_DECODER
#define VPX(vp7, f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(vp7, f) vp8_ ## f
#endif
  49
  50 static void free_buffers(VP8Context *s)
  51 {
  52     int i;
  53     if (s->thread_data)
  54         for (i = 0; i < MAX_THREADS; i++) {
  55 #if HAVE_THREADS
  56             pthread_cond_destroy(&s->thread_data[i].cond);
  57             pthread_mutex_destroy(&s->thread_data[i].lock);
  58 #endif
  59             av_freep(&s->thread_data[i].filter_strength);
  60         }
  61     av_freep(&s->thread_data);
  62     av_freep(&s->macroblocks_base);
  63     av_freep(&s->intra4x4_pred_mode_top);
  64     av_freep(&s->top_nnz);
  65     av_freep(&s->top_border);
  66
  67     s->macroblocks = NULL;
  68 }
  69
  70 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  71 {
  72     int ret;
  73     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  74                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  75         return ret;
  76     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  77         goto fail;
  78     if (s->avctx->hwaccel) {
  79         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  80         if (hwaccel->frame_priv_data_size) {
  81             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  82             if (!f->hwaccel_priv_buf)
  83                 goto fail;
  84             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  85         }
  86     }
  87     return 0;
  88
  89 fail:
  90     av_buffer_unref(&f->seg_map);
  91     ff_thread_release_buffer(s->avctx, &f->tf);
  92     return AVERROR(ENOMEM);
  93 }
  94
  95 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  96 {
  97     av_buffer_unref(&f->seg_map);
  98     av_buffer_unref(&f->hwaccel_priv_buf);
  99     f->hwaccel_picture_private = NULL;
 100     ff_thread_release_buffer(s->avctx, &f->tf);
 101 }
 102
 103 #if CONFIG_VP8_DECODER
 104 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 105 {
 106     int ret;
 107
 108     vp8_release_frame(s, dst);
 109
 110     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 111         return ret;
 112     if (src->seg_map &&
 113         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 114         vp8_release_frame(s, dst);
 115         return AVERROR(ENOMEM);
 116     }
 117     if (src->hwaccel_picture_private) {
 118         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 119         if (!dst->hwaccel_priv_buf)
 120             return AVERROR(ENOMEM);
 121         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 122     }
 123
 124     return 0;
 125 }
 126 #endif /* CONFIG_VP8_DECODER */
 127
 128 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 129 {
 130     VP8Context *s = avctx->priv_data;
 131     int i;
 132
 133     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 134         vp8_release_frame(s, &s->frames[i]);
 135     memset(s->framep, 0, sizeof(s->framep));
 136
 137     if (free_mem)
 138         free_buffers(s);
 139 }
 140
/**
 * Release all frames and clear the reference slots, but keep the work
 * buffers allocated (vp8_decode_flush_impl() with free_mem set to 0).
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
 145
 146 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 147 {
 148     VP8Frame *frame = NULL;
 149     int i;
 150
 151     // find a free buffer
 152     for (i = 0; i < 5; i++)
 153         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 154             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 155             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 157             frame = &s->frames[i];
 158             break;
 159         }
 160     if (i == 5) {
 161         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 162         abort();
 163     }
 164     if (frame->tf.f->buf[0])
 165         vp8_release_frame(s, frame);
 166
 167     return frame;
 168 }
 169
 170 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 171 {
 172     enum AVPixelFormat pix_fmts[] = {
 173 #if CONFIG_VP8_VAAPI_HWACCEL
 174         AV_PIX_FMT_VAAPI,
 175 #endif
 176 #if CONFIG_VP8_NVDEC_HWACCEL
 177         AV_PIX_FMT_CUDA,
 178 #endif
 179         AV_PIX_FMT_YUV420P,
 180         AV_PIX_FMT_NONE,
 181     };
 182
 183     return ff_get_format(s->avctx, pix_fmts);
 184 }
 185
 186 static av_always_inline
 187 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 188 {
 189     AVCodecContext *avctx = s->avctx;
 190     int i, ret;
 191
 192     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 193         height != s->avctx->height) {
 194         vp8_decode_flush_impl(s->avctx, 1);
 195
 196         ret = ff_set_dimensions(s->avctx, width, height);
 197         if (ret < 0)
 198             return ret;
 199     }
 200
 201     if (!s->actually_webp && !is_vp7) {
 202         s->pix_fmt = get_pixel_format(s);
 203         if (s->pix_fmt < 0)
 204             return AVERROR(EINVAL);
 205         avctx->pix_fmt = s->pix_fmt;
 206     }
 207
 208     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 209     s->mb_height = (s->avctx->coded_height + 15) / 16;
 210
 211     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 212                    avctx->thread_count > 1;
 213     if (!s->mb_layout) { // Frame threading and one thread
 214         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 215                                                sizeof(*s->macroblocks));
 216         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 217     } else // Sliced threading
 218         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 219                                          sizeof(*s->macroblocks));
 220     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 221     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 222     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 223
 224     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 225         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 226         free_buffers(s);
 227         return AVERROR(ENOMEM);
 228     }
 229
 230     for (i = 0; i < MAX_THREADS; i++) {
 231         s->thread_data[i].filter_strength =
 232             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 233         if (!s->thread_data[i].filter_strength) {
 234             free_buffers(s);
 235             return AVERROR(ENOMEM);
 236         }
 237 #if HAVE_THREADS
 238         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 239         pthread_cond_init(&s->thread_data[i].cond, NULL);
 240 #endif
 241     }
 242
 243     s->macroblocks = s->macroblocks_base + 1;
 244
 245     return 0;
 246 }
 247
/** VP7 variant: forwards to update_dimensions() with IS_VP7 as is_vp7. */
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}
 252
/** VP8 variant: forwards to update_dimensions() with IS_VP8 as is_vp7. */
static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
 257
 258
 259 static void parse_segment_info(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262     int i;
 263
 264     s->segmentation.update_map = vp8_rac_get(c);
 265     s->segmentation.update_feature_data = vp8_rac_get(c);
 266
 267     if (s->segmentation.update_feature_data) {
 268         s->segmentation.absolute_vals = vp8_rac_get(c);
 269
 270         for (i = 0; i < 4; i++)
 271             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 272
 273         for (i = 0; i < 4; i++)
 274             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 275     }
 276     if (s->segmentation.update_map)
 277         for (i = 0; i < 3; i++)
 278             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 279 }
 280
 281 static void update_lf_deltas(VP8Context *s)
 282 {
 283     VP56RangeCoder *c = &s->c;
 284     int i;
 285
 286     for (i = 0; i < 4; i++) {
 287         if (vp8_rac_get(c)) {
 288             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 289
 290             if (vp8_rac_get(c))
 291                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 292         }
 293     }
 294
 295     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 296         if (vp8_rac_get(c)) {
 297             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 298
 299             if (vp8_rac_get(c))
 300                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 301         }
 302     }
 303 }
 304
 305 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 306 {
 307     const uint8_t *sizes = buf;
 308     int i;
 309     int ret;
 310
 311     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 312
 313     buf      += 3 * (s->num_coeff_partitions - 1);
 314     buf_size -= 3 * (s->num_coeff_partitions - 1);
 315     if (buf_size < 0)
 316         return -1;
 317
 318     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 319         int size = AV_RL24(sizes + 3 * i);
 320         if (buf_size - size < 0)
 321             return -1;
 322         s->coeff_partition_size[i] = size;
 323
 324         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 325         if (ret < 0)
 326             return ret;
 327         buf      += size;
 328         buf_size -= size;
 329     }
 330
 331     s->coeff_partition_size[i] = buf_size;
 332     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 333
 334     return 0;
 335 }
 336
 337 static void vp7_get_quants(VP8Context *s)
 338 {
 339     VP56RangeCoder *c = &s->c;
 340
 341     int yac_qi  = vp8_rac_get_uint(c, 7);
 342     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 343     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 344     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 345     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 346     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 347
 348     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 349     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 350     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 351     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 352     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 353     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 354 }
 355
 356 static void vp8_get_quants(VP8Context *s)
 357 {
 358     VP56RangeCoder *c = &s->c;
 359     int i, base_qi;
 360
 361     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 362     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 363     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 364     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 365     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 366     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 367
 368     for (i = 0; i < 4; i++) {
 369         if (s->segmentation.enabled) {
 370             base_qi = s->segmentation.base_quant[i];
 371             if (!s->segmentation.absolute_vals)
 372                 base_qi += s->quant.yac_qi;
 373         } else
 374             base_qi = s->quant.yac_qi;
 375
 376         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 377         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 378         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 379         /* 101581>>16 is equivalent to 155/100 */
 380         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 381         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 382         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 383
 384         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 385         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 386     }
 387 }
 388
 389 /**
 390  * Determine which buffers golden and altref should be updated with after this frame.
 391  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 392  *
 393  * Intra frames update all 3 references
 394  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 395  * If the update (golden|altref) flag is set, it's updated with the current frame
 396  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 397  * If the flag is not set, the number read means:
 398  *      0: no update
 399  *      1: VP56_FRAME_PREVIOUS
 400  *      2: update golden with altref, or update altref with golden
 401  */
 402 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 403 {
 404     VP56RangeCoder *c = &s->c;
 405
 406     if (update)
 407         return VP56_FRAME_CURRENT;
 408
 409     switch (vp8_rac_get_uint(c, 2)) {
 410     case 1:
 411         return VP56_FRAME_PREVIOUS;
 412     case 2:
 413         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 414     }
 415     return VP56_FRAME_NONE;
 416 }
 417
 418 static void vp78_reset_probability_tables(VP8Context *s)
 419 {
 420     int i, j;
 421     for (i = 0; i < 4; i++)
 422         for (j = 0; j < 16; j++)
 423             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 424                    sizeof(s->prob->token[i][j]));
 425 }
 426
 427 static void vp78_update_probability_tables(VP8Context *s)
 428 {
 429     VP56RangeCoder *c = &s->c;
 430     int i, j, k, l, m;
 431
 432     for (i = 0; i < 4; i++)
 433         for (j = 0; j < 8; j++)
 434             for (k = 0; k < 3; k++)
 435                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 436                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 437                         int prob = vp8_rac_get_uint(c, 8);
 438                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 439                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 440                     }
 441 }
 442
 443 #define VP7_MVC_SIZE 17
 444 #define VP8_MVC_SIZE 19
 445
 446 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 447                                                             int mvc_size)
 448 {
 449     VP56RangeCoder *c = &s->c;
 450     int i, j;
 451
 452     if (vp8_rac_get(c))
 453         for (i = 0; i < 4; i++)
 454             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 455     if (vp8_rac_get(c))
 456         for (i = 0; i < 3; i++)
 457             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 458
 459     // 17.2 MV probability update
 460     for (i = 0; i < 2; i++)
 461         for (j = 0; j < mvc_size; j++)
 462             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 463                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 464 }
 465
 466 static void update_refs(VP8Context *s)
 467 {
 468     VP56RangeCoder *c = &s->c;
 469
 470     int update_golden = vp8_rac_get(c);
 471     int update_altref = vp8_rac_get(c);
 472
 473     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 474     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 475 }
 476
 477 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 478 {
 479     int i, j;
 480
 481     for (j = 1; j < 3; j++) {
 482         for (i = 0; i < height / 2; i++)
 483             memcpy(dst->data[j] + i * dst->linesize[j],
 484                    src->data[j] + i * src->linesize[j], width / 2);
 485     }
 486 }
 487
 488 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 489                  const uint8_t *src, ptrdiff_t src_linesize,
 490                  int width, int height,
 491                  int alpha, int beta)
 492 {
 493     int i, j;
 494     for (j = 0; j < height; j++) {
 495         const uint8_t *src2 = src + j * src_linesize;
 496         uint8_t *dst2 = dst + j * dst_linesize;
 497         for (i = 0; i < width; i++) {
 498             uint8_t y = src2[i];
 499             dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 500         }
 501     }
 502 }
 503
 504 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 505 {
 506     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 507     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 508     int ret;
 509
 510     if (!s->keyframe && (alpha || beta)) {
 511         int width  = s->mb_width * 16;
 512         int height = s->mb_height * 16;
 513         AVFrame *src, *dst;
 514
 515         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 516             !s->framep[VP56_FRAME_GOLDEN]) {
 517             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 518             return AVERROR_INVALIDDATA;
 519         }
 520
 521         dst =
 522         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 523
 524         /* preserve the golden frame, write a new previous frame */
 525         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 526             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 527             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 528                 return ret;
 529
 530             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 531
 532             copy_chroma(dst, src, width, height);
 533         }
 534
 535         fade(dst->data[0], dst->linesize[0],
 536              src->data[0], src->linesize[0],
 537              width, height, alpha, beta);
 538     }
 539
 540     return 0;
 541 }
 542
/**
 * Parse a VP7 frame header.
 *
 * Decodes the uncompressed frame tag (keyframe flag, profile, size of the
 * first partition) and then the range-coded header fields, in bitstream
 * order: dimensions (keyframes), macroblock features, quantizers, golden
 * update flag, fading, loop filter settings, scan order and probability
 * updates. Work buffers are (re)allocated when the frame size changes or
 * on the first frame.
 *
 * @param s        decoder context, updated with the parsed header state
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, a negative AVERROR code on invalid or truncated
 *         data or when a helper fails
 */
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    /* Inter frames carry no dimensions; keep the current ones by default. */
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 4) {
        return AVERROR_INVALIDDATA;
    }

    /* Frame tag: bit 0 is clear on keyframes, bits 1-3 hold the profile and
     * the 24-bit little-endian value shifted right by 4 is the size of the
     * first partition. */
    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->invisible = 0;
    part1_size   = AV_RL24(buf) >> 4;

    /* The tag takes 4 bytes for profile 0 and 3 bytes for profile 1. */
    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    /* VP7 always uses the epel pixel functions here. */
    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    /* The header fields are range coded in the first partition. */
    ret = ff_vp56_init_range_decoder(c, buf, part1_size);
    if (ret < 0)
        return ret;
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp8_rac_get_uint(c, 12);
        height = vp8_rac_get_uint(c, 12);
        hscale = vp8_rac_get_uint(c, 2);
        vscale = vp8_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* Keyframes refresh golden and altref and reset all probabilities,
         * segmentation, loop filter deltas and the scan order. */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
    }

    /* The inter DC predictor state is cleared on keyframes and on every
     * profile 1 frame. */
    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp8_rac_get(c);
        if (s->feature_enabled[i]) {
             s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);

             /* Each tree probability is either coded or defaults to 255. */
             for (j = 0; j < 3; j++)
                 s->feature_index_prob[i][j] =
                     vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;

             /* Feature values are only coded when the profile assigns this
              * feature a non-zero bit width. */
             if (vp7_feature_value_size[s->profile][i])
                 for (j = 0; j < 4; j++)
                     s->feature_value[i][j] =
                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    /* These VP8 header features are always disabled for VP7. */
    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    /* Everything after the first partition is one coefficient partition. */
    s->num_coeff_partitions = 1;
    ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
    if (ret < 0)
        return ret;

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
        s->sign_bias[VP56_FRAME_GOLDEN] = 0;
    }

    /* Defaults; profile 1 may override the last two from the bitstream. */
    s->update_last          = 1;
    s->update_probabilities = 1;
    s->fade_present         = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp8_rac_get(c);
        /* Probabilities are not kept: back up the current set in prob[1]. */
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            s->fade_present = vp8_rac_get(c);
    }

    /* Reject the frame if the header data is already exhausted, before
     * fading can modify a reference frame. */
    if (c->end <= c->buffer && c->bits >= 0)
        return AVERROR_INVALIDDATA;
    /* E. Fading information for previous frame */
    if (s->fade_present && vp8_rac_get(c)) {
        if ((ret = vp7_fade_frame(s ,c)) < 0)
            return ret;
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp8_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp8_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];

    /* H. Loop filter levels  */
    if (s->profile > 0)
        s->filter.simple = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    return 0;
}
 698
/**
 * Parse a VP8 frame header.
 *
 * Decodes the 3-byte frame tag, the keyframe start code and dimensions,
 * and then the range-coded header partition: colorspace, segmentation,
 * loop filter, coefficient partitions, quantizers, reference updates and
 * probability updates. Work buffers are (re)allocated when the frame size
 * changes or on the first frame. The range coder state at the end of the
 * header is saved in s->coder_state_at_header_end.
 *
 * @param s        decoder context, updated with the parsed header state
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, a negative AVERROR code on invalid or truncated
 *         data or when a helper fails
 */
static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, ret;
    /* Inter frames carry no dimensions; keep the current ones by default. */
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 3) {
        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
        return AVERROR_INVALIDDATA;
    }

    /* Frame tag: bit 0 is clear on keyframes, bits 1-3 hold the profile,
     * bit 4 is the show-frame flag and the 24-bit little-endian value
     * shifted right by 5 is the size of the header partition. */
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    s->header_partition_size = header_size;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
               sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
               sizeof(s->put_pixels_tab));

    /* Keyframes carry 7 extra bytes (start code and dimensions) before the
     * header partition. */
    if (header_size > buf_size - 7 * s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR,
                   "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        /* 14-bit width and height; the top 2 bits of each 16-bit field are
         * the scaling factors. */
        width     = AV_RL16(buf + 3) & 0x3fff;
        height    = AV_RL16(buf + 5) & 0x3fff;
        hscale    = buf[4] >> 6;
        vscale    = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* Keyframes refresh golden and altref and reset all probabilities,
         * segmentation and loop filter deltas. */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc, vp8_mv_default_prob,
               sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    /* The remaining header fields are range coded in the header partition. */
    ret = ff_vp56_init_range_decoder(c, buf, header_size);
    if (ret < 0)
        return ret;
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        s->colorspace = vp8_rac_get(c);
        if (s->colorspace)
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        s->fullrange = vp8_rac_get(c);
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c))) {
        s->lf_delta.update = vp8_rac_get(c);
        if (s->lf_delta.update)
            update_lf_deltas(s);
    }

    /* buf now points past the header partition, at the partition size table. */
    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
            return ret;

    vp8_get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    /* Keyframes always update the last frame; the flag is only coded on
     * inter frames. */
    s->update_last = s->keyframe || vp8_rac_get(c);

    vp78_update_probability_tables(s);

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
    }

    // Record the entropy coder state here so that hwaccels can use it.
    s->c.code_word = vp56_rac_renorm(&s->c);
    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
    s->coder_state_at_header_end.range     = s->c.high;
    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;

    return 0;
}
 838
 839 static av_always_inline
 840 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 841 {
 842     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 843                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 844     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 845                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 846 }
 847
 848 /**
 849  * Motion vector coding, 17.1.
 850  */
 851 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 852 {
 853     int bit, x = 0;
 854
 855     if (vp56_rac_get_prob_branchy(c, p[0])) {
 856         int i;
 857
 858         for (i = 0; i < 3; i++)
 859             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 860         for (i = (vp7 ? 7 : 9); i > 3; i--)
 861             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 862         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 863             x += 8;
 864     } else {
 865         // small_mvtree
 866         const uint8_t *ps = p + 2;
 867         bit = vp56_rac_get_prob(c, *ps);
 868         ps += 1 + 3 * bit;
 869         x  += 4 * bit;
 870         bit = vp56_rac_get_prob(c, *ps);
 871         ps += 1 + bit;
 872         x  += 2 * bit;
 873         x  += vp56_rac_get_prob(c, *ps);
 874     }
 875
 876     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 877 }
 878
 879 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 880 {
 881     return read_mv_component(c, p, 1);
 882 }
 883
 884 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 885 {
 886     return read_mv_component(c, p, 0);
 887 }
 888
 889 static av_always_inline
 890 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 891 {
 892     if (is_vp7)
 893         return vp7_submv_prob;
 894
 895     if (left == top)
 896         return vp8_submv_prob[4 - !!left];
 897     if (!top)
 898         return vp8_submv_prob[2];
 899     return vp8_submv_prob[1 - !!left];
 900 }
 901
 902 /**
 903  * Split motion vector prediction, 16.4.
 904  * @returns the number of motion vectors parsed (2, 4 or 16)
 905  */
 906 static av_always_inline
 907 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 908                     int layout, int is_vp7)
 909 {
 910     int part_idx;
 911     int n, num;
 912     VP8Macroblock *top_mb;
 913     VP8Macroblock *left_mb = &mb[-1];
 914     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 915     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 916     VP56mv *top_mv;
 917     VP56mv *left_mv = left_mb->bmv;
 918     VP56mv *cur_mv  = mb->bmv;
 919
 920     if (!layout) // layout is inlined, s->mb_layout is not
 921         top_mb = &mb[2];
 922     else
 923         top_mb = &mb[-s->mb_width - 1];
 924     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 925     top_mv       = top_mb->bmv;
 926
 927     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 928         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 929             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 930         else
 931             part_idx = VP8_SPLITMVMODE_8x8;
 932     } else {
 933         part_idx = VP8_SPLITMVMODE_4x4;
 934     }
 935
 936     num              = vp8_mbsplit_count[part_idx];
 937     mbsplits_cur     = vp8_mbsplits[part_idx],
 938     firstidx         = vp8_mbfirstidx[part_idx];
 939     mb->partitioning = part_idx;
 940
 941     for (n = 0; n < num; n++) {
 942         int k = firstidx[n];
 943         uint32_t left, above;
 944         const uint8_t *submv_prob;
 945
 946         if (!(k & 3))
 947             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 948         else
 949             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 950         if (k <= 3)
 951             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 952         else
 953             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 954
 955         submv_prob = get_submv_prob(left, above, is_vp7);
 956
 957         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 958             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 959                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 960                     mb->bmv[n].y = mb->mv.y +
 961                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 962                     mb->bmv[n].x = mb->mv.x +
 963                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 964                 } else {
 965                     AV_ZERO32(&mb->bmv[n]);
 966                 }
 967             } else {
 968                 AV_WN32A(&mb->bmv[n], above);
 969             }
 970         } else {
 971             AV_WN32A(&mb->bmv[n], left);
 972         }
 973     }
 974
 975     return num;
 976 }
 977
 978 /**
 979  * The vp7 reference decoder uses a padding macroblock column (added to right
 980  * edge of the frame) to guard against illegal macroblock offsets. The
 981  * algorithm has bugs that permit offsets to straddle the padding column.
 982  * This function replicates those bugs.
 983  *
 984  * @param[out] edge_x macroblock x address
 985  * @param[out] edge_y macroblock y address
 986  *
 987  * @return macroblock offset legal (boolean)
 988  */
 989 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 990                                    int xoffset, int yoffset, int boundary,
 991                                    int *edge_x, int *edge_y)
 992 {
 993     int vwidth = mb_width + 1;
 994     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 995     if (new < boundary || new % vwidth == vwidth - 1)
 996         return 0;
 997     *edge_y = new / vwidth;
 998     *edge_x = new % vwidth;
 999     return 1;
1000 }
1001
1002 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1003 {
1004     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1005 }
1006
1007 static av_always_inline
1008 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1009                     int mb_x, int mb_y, int layout)
1010 {
1011     VP8Macroblock *mb_edge[12];
1012     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1013     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1014     int idx = CNT_ZERO;
1015     VP56mv near_mv[3];
1016     uint8_t cnt[3] = { 0 };
1017     VP56RangeCoder *c = &s->c;
1018     int i;
1019
1020     AV_ZERO32(&near_mv[0]);
1021     AV_ZERO32(&near_mv[1]);
1022     AV_ZERO32(&near_mv[2]);
1023
1024     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1025         const VP7MVPred * pred = &vp7_mv_pred[i];
1026         int edge_x, edge_y;
1027
1028         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1029                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1030             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1031                                              ? s->macroblocks_base + 1 + edge_x +
1032                                                (s->mb_width + 1) * (edge_y + 1)
1033                                              : s->macroblocks + edge_x +
1034                                                (s->mb_height - edge_y - 1) * 2;
1035             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1036             if (mv) {
1037                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1038                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1039                         idx = CNT_NEAREST;
1040                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1041                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1042                             continue;
1043                         idx = CNT_NEAR;
1044                     } else {
1045                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1046                         idx = CNT_NEAR;
1047                     }
1048                 } else {
1049                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1050                     idx = CNT_NEAREST;
1051                 }
1052             } else {
1053                 idx = CNT_ZERO;
1054             }
1055         } else {
1056             idx = CNT_ZERO;
1057         }
1058         cnt[idx] += vp7_mv_pred[i].score;
1059     }
1060
1061     mb->partitioning = VP8_SPLITMVMODE_NONE;
1062
1063     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1064         mb->mode = VP8_MVMODE_MV;
1065
1066         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1067
1068             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1069
1070                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1071                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1072                 else
1073                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1074
1075                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1076                     mb->mode = VP8_MVMODE_SPLIT;
1077                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1078                 } else {
1079                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1080                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1081                     mb->bmv[0] = mb->mv;
1082                 }
1083             } else {
1084                 mb->mv = near_mv[CNT_NEAR];
1085                 mb->bmv[0] = mb->mv;
1086             }
1087         } else {
1088             mb->mv = near_mv[CNT_NEAREST];
1089             mb->bmv[0] = mb->mv;
1090         }
1091     } else {
1092         mb->mode = VP8_MVMODE_ZERO;
1093         AV_ZERO32(&mb->mv);
1094         mb->bmv[0] = mb->mv;
1095     }
1096 }
1097
1098 static av_always_inline
1099 void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
1100                     int mb_x, int mb_y, int layout)
1101 {
1102     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1103                                   mb - 1 /* left */,
1104                                   0      /* top-left */ };
1105     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1106     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1107     int idx = CNT_ZERO;
1108     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1109     int8_t *sign_bias = s->sign_bias;
1110     VP56mv near_mv[4];
1111     uint8_t cnt[4] = { 0 };
1112     VP56RangeCoder *c = &s->c;
1113
1114     if (!layout) { // layout is inlined (s->mb_layout is not)
1115         mb_edge[0] = mb + 2;
1116         mb_edge[2] = mb + 1;
1117     } else {
1118         mb_edge[0] = mb - s->mb_width - 1;
1119         mb_edge[2] = mb - s->mb_width - 2;
1120     }
1121
1122     AV_ZERO32(&near_mv[0]);
1123     AV_ZERO32(&near_mv[1]);
1124     AV_ZERO32(&near_mv[2]);
1125
1126     /* Process MB on top, left and top-left */
1127 #define MV_EDGE_CHECK(n)                                                      \
1128     {                                                                         \
1129         VP8Macroblock *edge = mb_edge[n];                                     \
1130         int edge_ref = edge->ref_frame;                                       \
1131         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1132             uint32_t mv = AV_RN32A(&edge->mv);                                \
1133             if (mv) {                                                         \
1134                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1135                     /* SWAR negate of the values in mv. */                    \
1136                     mv = ~mv;                                                 \
1137                     mv = ((mv & 0x7fff7fff) +                                 \
1138                           0x00010001) ^ (mv & 0x80008000);                    \
1139                 }                                                             \
1140                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1141                     AV_WN32A(&near_mv[++idx], mv);                            \
1142                 cnt[idx] += 1 + (n != 2);                                     \
1143             } else                                                            \
1144                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1145         }                                                                     \
1146     }
1147
1148     MV_EDGE_CHECK(0)
1149     MV_EDGE_CHECK(1)
1150     MV_EDGE_CHECK(2)
1151
1152     mb->partitioning = VP8_SPLITMVMODE_NONE;
1153     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1154         mb->mode = VP8_MVMODE_MV;
1155
1156         /* If we have three distinct MVs, merge first and last if they're the same */
1157         if (cnt[CNT_SPLITMV] &&
1158             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1159             cnt[CNT_NEAREST] += 1;
1160
1161         /* Swap near and nearest if necessary */
1162         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1163             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1164             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1165         }
1166
1167         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1168             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1169                 /* Choose the best mv out of 0,0 and the nearest mv */
1170                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1171                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1172                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1173                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1174
1175                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1176                     mb->mode = VP8_MVMODE_SPLIT;
1177                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1178                 } else {
1179                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1180                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1181                     mb->bmv[0] = mb->mv;
1182                 }
1183             } else {
1184                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
1185                 mb->bmv[0] = mb->mv;
1186             }
1187         } else {
1188             clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
1189             mb->bmv[0] = mb->mv;
1190         }
1191     } else {
1192         mb->mode = VP8_MVMODE_ZERO;
1193         AV_ZERO32(&mb->mv);
1194         mb->bmv[0] = mb->mv;
1195     }
1196 }
1197
1198 static av_always_inline
1199 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1200                            int mb_x, int keyframe, int layout)
1201 {
1202     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1203
1204     if (layout) {
1205         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1206         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1207     }
1208     if (keyframe) {
1209         int x, y;
1210         uint8_t *top;
1211         uint8_t *const left = s->intra4x4_pred_mode_left;
1212         if (layout)
1213             top = mb->intra4x4_pred_mode_top;
1214         else
1215             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1216         for (y = 0; y < 4; y++) {
1217             for (x = 0; x < 4; x++) {
1218                 const uint8_t *ctx;
1219                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1220                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1221                 left[y]   = top[x] = *intra4x4;
1222                 intra4x4++;
1223             }
1224         }
1225     } else {
1226         int i;
1227         for (i = 0; i < 16; i++)
1228             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1229                                            vp8_pred4x4_prob_inter);
1230     }
1231 }
1232
1233 static av_always_inline
1234 void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
1235                     VP8Macroblock *mb, int mb_x, int mb_y,
1236                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1237 {
1238     VP56RangeCoder *c = &s->c;
1239     static const char * const vp7_feature_name[] = { "q-index",
1240                                                      "lf-delta",
1241                                                      "partial-golden-update",
1242                                                      "blit-pitch" };
1243     if (is_vp7) {
1244         int i;
1245         *segment = 0;
1246         for (i = 0; i < 4; i++) {
1247             if (s->feature_enabled[i]) {
1248                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1249                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1250                                                    s->feature_index_prob[i]);
1251                       av_log(s->avctx, AV_LOG_WARNING,
1252                              "Feature %s present in macroblock (value 0x%x)\n",
1253                              vp7_feature_name[i], s->feature_value[i][index]);
1254                 }
1255            }
1256         }
1257     } else if (s->segmentation.update_map) {
1258         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1259         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1260     } else if (s->segmentation.enabled)
1261         *segment = ref ? *ref : *segment;
1262     mb->segment = *segment;
1263
1264     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1265
1266     if (s->keyframe) {
1267         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1268                                     vp8_pred16x16_prob_intra);
1269
1270         if (mb->mode == MODE_I4x4) {
1271             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1272         } else {
1273             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1274                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1275             if (s->mb_layout)
1276                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1277             else
1278                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1279             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1280         }
1281
1282         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1283                                                 vp8_pred8x8c_prob_intra);
1284         mb->ref_frame        = VP56_FRAME_CURRENT;
1285     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1286         // inter MB, 16.2
1287         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1288             mb->ref_frame =
1289                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1290                                                                    : VP56_FRAME_GOLDEN;
1291         else
1292             mb->ref_frame = VP56_FRAME_PREVIOUS;
1293         s->ref_count[mb->ref_frame - 1]++;
1294
1295         // motion vectors, 16.3
1296         if (is_vp7)
1297             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1298         else
1299             vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
1300     } else {
1301         // intra MB, 16.1
1302         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1303
1304         if (mb->mode == MODE_I4x4)
1305             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1306
1307         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1308                                                 s->prob->pred8x8c);
1309         mb->ref_frame        = VP56_FRAME_CURRENT;
1310         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1311         AV_ZERO32(&mb->bmv[0]);
1312     }
1313 }
1314
1315 /**
1316  * @param r     arithmetic bitstream reader context
1317  * @param block destination for block coefficients
1318  * @param probs probabilities to use when reading trees from the bitstream
1319  * @param i     initial coeff index, 0 unless a separate DC block is coded
1320  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1321  *
1322  * @return 0 if no coeffs were decoded
1323  *         otherwise, the index of the last coeff decoded plus one
1324  */
1325 static av_always_inline
1326 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1327                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1328                                  int i, uint8_t *token_prob, int16_t qmul[2],
1329                                  const uint8_t scan[16], int vp7)
1330 {
1331     VP56RangeCoder c = *r;
1332     goto skip_eob;
1333     do {
1334         int coeff;
1335 restart:
1336         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1337             break;
1338
1339 skip_eob:
1340         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1341             if (++i == 16)
1342                 break; // invalid input; blocks should end with EOB
1343             token_prob = probs[i][0];
1344             if (vp7)
1345                 goto restart;
1346             goto skip_eob;
1347         }
1348
1349         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1350             coeff = 1;
1351             token_prob = probs[i + 1][1];
1352         } else {
1353             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1354                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1355                 if (coeff)
1356                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1357                 coeff += 2;
1358             } else {
1359                 // DCT_CAT*
1360                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1361                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1362                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1363                     } else {                                    // DCT_CAT2
1364                         coeff  = 7;
1365                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1366                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1367                     }
1368                 } else {    // DCT_CAT3 and up
1369                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1370                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1371                     int cat = (a << 1) + b;
1372                     coeff  = 3 + (8 << cat);
1373                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1374                 }
1375             }
1376             token_prob = probs[i + 1][2];
1377         }
1378         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1379     } while (++i < 16);
1380
1381     *r = c;
1382     return i;
1383 }
1384
1385 static av_always_inline
1386 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1387 {
1388     int16_t dc = block[0];
1389     int ret = 0;
1390
1391     if (pred[1] > 3) {
1392         dc += pred[0];
1393         ret = 1;
1394     }
1395
1396     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1397         block[0] = pred[0] = dc;
1398         pred[1] = 0;
1399     } else {
1400         if (pred[0] == dc)
1401             pred[1]++;
1402         block[0] = pred[0] = dc;
1403     }
1404
1405     return ret;
1406 }
1407
1408 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1409                                             int16_t block[16],
1410                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1411                                             int i, uint8_t *token_prob,
1412                                             int16_t qmul[2],
1413                                             const uint8_t scan[16])
1414 {
1415     return decode_block_coeffs_internal(r, block, probs, i,
1416                                         token_prob, qmul, scan, IS_VP7);
1417 }
1418
1419 #ifndef vp8_decode_block_coeffs_internal
1420 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1421                                             int16_t block[16],
1422                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1423                                             int i, uint8_t *token_prob,
1424                                             int16_t qmul[2])
1425 {
1426     return decode_block_coeffs_internal(r, block, probs, i,
1427                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1428 }
1429 #endif
1430
1431 /**
1432  * @param c          arithmetic bitstream reader context
1433  * @param block      destination for block coefficients
1434  * @param probs      probabilities to use when reading trees from the bitstream
1435  * @param i          initial coeff index, 0 unless a separate DC block is coded
1436  * @param zero_nhood the initial prediction context for number of surrounding
1437  *                   all-zero blocks (only left/top, so 0-2)
1438  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1439  * @param scan       scan pattern (VP7 only)
1440  *
1441  * @return 0 if no coeffs were decoded
1442  *         otherwise, the index of the last coeff decoded plus one
1443  */
1444 static av_always_inline
1445 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1446                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1447                         int i, int zero_nhood, int16_t qmul[2],
1448                         const uint8_t scan[16], int vp7)
1449 {
1450     uint8_t *token_prob = probs[i][zero_nhood];
1451     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1452         return 0;
1453     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1454                                                   token_prob, qmul, scan)
1455                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1456                                                   token_prob, qmul);
1457 }
1458
1459 static av_always_inline
1460 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1461                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1462                       int is_vp7)
1463 {
1464     int i, x, y, luma_start = 0, luma_ctx = 3;
1465     int nnz_pred, nnz, nnz_total = 0;
1466     int segment = mb->segment;
1467     int block_dc = 0;
1468
1469     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1470         nnz_pred = t_nnz[8] + l_nnz[8];
1471
1472         // decode DC values and do hadamard
1473         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1474                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1475                                   ff_zigzag_scan, is_vp7);
1476         l_nnz[8] = t_nnz[8] = !!nnz;
1477
1478         if (is_vp7 && mb->mode > MODE_I4x4) {
1479             nnz |=  inter_predict_dc(td->block_dc,
1480                                      s->inter_dc_pred[mb->ref_frame - 1]);
1481         }
1482
1483         if (nnz) {
1484             nnz_total += nnz;
1485             block_dc   = 1;
1486             if (nnz == 1)
1487                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1488             else
1489                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1490         }
1491         luma_start = 1;
1492         luma_ctx   = 0;
1493     }
1494
1495     // luma blocks
1496     for (y = 0; y < 4; y++)
1497         for (x = 0; x < 4; x++) {
1498             nnz_pred = l_nnz[y] + t_nnz[x];
1499             nnz = decode_block_coeffs(c, td->block[y][x],
1500                                       s->prob->token[luma_ctx],
1501                                       luma_start, nnz_pred,
1502                                       s->qmat[segment].luma_qmul,
1503                                       s->prob[0].scan, is_vp7);
1504             /* nnz+block_dc may be one more than the actual last index,
1505              * but we don't care */
1506             td->non_zero_count_cache[y][x] = nnz + block_dc;
1507             t_nnz[x] = l_nnz[y] = !!nnz;
1508             nnz_total += nnz;
1509         }
1510
1511     // chroma blocks
1512     // TODO: what to do about dimensions? 2nd dim for luma is x,
1513     // but for chroma it's (y<<1)|x
1514     for (i = 4; i < 6; i++)
1515         for (y = 0; y < 2; y++)
1516             for (x = 0; x < 2; x++) {
1517                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1518                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1519                                           s->prob->token[2], 0, nnz_pred,
1520                                           s->qmat[segment].chroma_qmul,
1521                                           s->prob[0].scan, is_vp7);
1522                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1523                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1524                 nnz_total += nnz;
1525             }
1526
1527     // if there were no coded coeffs despite the macroblock not being marked skip,
1528     // we MUST not do the inner loop filter and should not do IDCT
1529     // Since skip isn't used for bitstream prediction, just manually set it.
1530     if (!nnz_total)
1531         mb->skip = 1;
1532 }
1533
1534 static av_always_inline
1535 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1536                       uint8_t *src_cb, uint8_t *src_cr,
1537                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1538 {
1539     AV_COPY128(top_border, src_y + 15 * linesize);
1540     if (!simple) {
1541         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1542         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1543     }
1544 }
1545
1546 static av_always_inline
1547 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1548                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1549                     int mb_y, int mb_width, int simple, int xchg)
1550 {
1551     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1552     src_y  -= linesize;
1553     src_cb -= uvlinesize;
1554     src_cr -= uvlinesize;
1555
1556 #define XCHG(a, b, xchg)                                                      \
1557     do {                                                                      \
1558         if (xchg)                                                             \
1559             AV_SWAP64(b, a);                                                  \
1560         else                                                                  \
1561             AV_COPY64(b, a);                                                  \
1562     } while (0)
1563
1564     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1565     XCHG(top_border, src_y, xchg);
1566     XCHG(top_border + 8, src_y + 8, 1);
1567     if (mb_x < mb_width - 1)
1568         XCHG(top_border + 32, src_y + 16, 1);
1569
1570     // only copy chroma for normal loop filter
1571     // or to initialize the top row to 127
1572     if (!simple || !mb_y) {
1573         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1574         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1575         XCHG(top_border + 16, src_cb, 1);
1576         XCHG(top_border + 24, src_cr, 1);
1577     }
1578 }
1579
1580 static av_always_inline
1581 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1582 {
1583     if (!mb_x)
1584         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1585     else
1586         return mb_y ? mode : LEFT_DC_PRED8x8;
1587 }
1588
1589 static av_always_inline
1590 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1591 {
1592     if (!mb_x)
1593         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1594     else
1595         return mb_y ? mode : HOR_PRED8x8;
1596 }
1597
1598 static av_always_inline
1599 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1600 {
1601     switch (mode) {
1602     case DC_PRED8x8:
1603         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1604     case VERT_PRED8x8:
1605         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1606     case HOR_PRED8x8:
1607         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1608     case PLANE_PRED8x8: /* TM */
1609         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1610     }
1611     return mode;
1612 }
1613
1614 static av_always_inline
1615 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1616 {
1617     if (!mb_x) {
1618         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1619     } else {
1620         return mb_y ? mode : HOR_VP8_PRED;
1621     }
1622 }
1623
1624 static av_always_inline
1625 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1626                                      int *copy_buf, int vp7)
1627 {
1628     switch (mode) {
1629     case VERT_PRED:
1630         if (!mb_x && mb_y) {
1631             *copy_buf = 1;
1632             return mode;
1633         }
1634         /* fall-through */
1635     case DIAG_DOWN_LEFT_PRED:
1636     case VERT_LEFT_PRED:
1637         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1638     case HOR_PRED:
1639         if (!mb_y) {
1640             *copy_buf = 1;
1641             return mode;
1642         }
1643         /* fall-through */
1644     case HOR_UP_PRED:
1645         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1646     case TM_VP8_PRED:
1647         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1648     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1649                    * as 16x16/8x8 DC */
1650     case DIAG_DOWN_RIGHT_PRED:
1651     case VERT_RIGHT_PRED:
1652     case HOR_DOWN_PRED:
1653         if (!mb_y || !mb_x)
1654             *copy_buf = 1;
1655         return mode;
1656     }
1657     return mode;
1658 }
1659
1660 static av_always_inline
1661 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1662                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1663 {
1664     int x, y, mode, nnz;
1665     uint32_t tr;
1666
1667     /* for the first row, we need to run xchg_mb_border to init the top edge
1668      * to 127 otherwise, skip it if we aren't going to deblock */
1669     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1670         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1671                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1672                        s->filter.simple, 1);
1673
1674     if (mb->mode < MODE_I4x4) {
1675         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1676         s->hpc.pred16x16[mode](dst[0], s->linesize);
1677     } else {
1678         uint8_t *ptr = dst[0];
1679         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1680         const uint8_t lo = is_vp7 ? 128 : 127;
1681         const uint8_t hi = is_vp7 ? 128 : 129;
1682         uint8_t tr_top[4] = { lo, lo, lo, lo };
1683
1684         // all blocks on the right edge of the macroblock use bottom edge
1685         // the top macroblock for their topright edge
1686         uint8_t *tr_right = ptr - s->linesize + 16;
1687
1688         // if we're on the right edge of the frame, said edge is extended
1689         // from the top macroblock
1690         if (mb_y && mb_x == s->mb_width - 1) {
1691             tr       = tr_right[-1] * 0x01010101u;
1692             tr_right = (uint8_t *) &tr;
1693         }
1694
1695         if (mb->skip)
1696             AV_ZERO128(td->non_zero_count_cache);
1697
1698         for (y = 0; y < 4; y++) {
1699             uint8_t *topright = ptr + 4 - s->linesize;
1700             for (x = 0; x < 4; x++) {
1701                 int copy = 0;
1702                 ptrdiff_t linesize = s->linesize;
1703                 uint8_t *dst = ptr + 4 * x;
1704                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1705
1706                 if ((y == 0 || x == 3) && mb_y == 0) {
1707                     topright = tr_top;
1708                 } else if (x == 3)
1709                     topright = tr_right;
1710
1711                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1712                                                         mb_y + y, &copy, is_vp7);
1713                 if (copy) {
1714                     dst      = copy_dst + 12;
1715                     linesize = 8;
1716                     if (!(mb_y + y)) {
1717                         copy_dst[3] = lo;
1718                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1719                     } else {
1720                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1721                         if (!(mb_x + x)) {
1722                             copy_dst[3] = hi;
1723                         } else {
1724                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1725                         }
1726                     }
1727                     if (!(mb_x + x)) {
1728                         copy_dst[11] =
1729                         copy_dst[19] =
1730                         copy_dst[27] =
1731                         copy_dst[35] = hi;
1732                     } else {
1733                         copy_dst[11] = ptr[4 * x                   - 1];
1734                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1735                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1736                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1737                     }
1738                 }
1739                 s->hpc.pred4x4[mode](dst, topright, linesize);
1740                 if (copy) {
1741                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1742                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1743                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1744                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1745                 }
1746
1747                 nnz = td->non_zero_count_cache[y][x];
1748                 if (nnz) {
1749                     if (nnz == 1)
1750                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1751                                                   td->block[y][x], s->linesize);
1752                     else
1753                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1754                                                td->block[y][x], s->linesize);
1755                 }
1756                 topright += 4;
1757             }
1758
1759             ptr      += 4 * s->linesize;
1760             intra4x4 += 4;
1761         }
1762     }
1763
1764     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1765                                             mb_x, mb_y, is_vp7);
1766     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1767     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1768
1769     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1770         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1771                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1772                        s->filter.simple, 0);
1773 }
1774
/* Per-subpel-position filter footprint, indexed by the 0..7 fractional
 * motion vector value. Row 1 is the sum of rows 0 and 2. */
static const uint8_t subpel_idx[3][8] = {
    /* [0] nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1] nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2] nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1781
1782 /**
1783  * luma MC function
1784  *
1785  * @param s        VP8 decoding context
1786  * @param dst      target buffer for block data at block position
1787  * @param ref      reference picture buffer at origin (0, 0)
1788  * @param mv       motion vector (relative to block position) to get pixel data from
1789  * @param x_off    horizontal position of block from origin (0, 0)
1790  * @param y_off    vertical position of block from origin (0, 0)
1791  * @param block_w  width of block (16, 8 or 4)
1792  * @param block_h  height of block (always same as block_w)
1793  * @param width    width of src/dst plane data
1794  * @param height   height of src/dst plane data
1795  * @param linesize size of a single line of plane data, including padding
1796  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1797  */
1798 static av_always_inline
1799 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1800                  ThreadFrame *ref, const VP56mv *mv,
1801                  int x_off, int y_off, int block_w, int block_h,
1802                  int width, int height, ptrdiff_t linesize,
1803                  vp8_mc_func mc_func[3][3])
1804 {
1805     uint8_t *src = ref->f->data[0];
1806
1807     if (AV_RN32A(mv)) {
1808         ptrdiff_t src_linesize = linesize;
1809
1810         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1811         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1812
1813         x_off += mv->x >> 2;
1814         y_off += mv->y >> 2;
1815
1816         // edge emulation
1817         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1818         src += y_off * linesize + x_off;
1819         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1820             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1821             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1822                                      src - my_idx * linesize - mx_idx,
1823                                      EDGE_EMU_LINESIZE, linesize,
1824                                      block_w + subpel_idx[1][mx],
1825                                      block_h + subpel_idx[1][my],
1826                                      x_off - mx_idx, y_off - my_idx,
1827                                      width, height);
1828             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1829             src_linesize = EDGE_EMU_LINESIZE;
1830         }
1831         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1832     } else {
1833         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1834         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1835                       linesize, block_h, 0, 0);
1836     }
1837 }
1838
1839 /**
1840  * chroma MC function
1841  *
1842  * @param s        VP8 decoding context
1843  * @param dst1     target buffer for block data at block position (U plane)
1844  * @param dst2     target buffer for block data at block position (V plane)
1845  * @param ref      reference picture buffer at origin (0, 0)
1846  * @param mv       motion vector (relative to block position) to get pixel data from
1847  * @param x_off    horizontal position of block from origin (0, 0)
1848  * @param y_off    vertical position of block from origin (0, 0)
1849  * @param block_w  width of block (16, 8 or 4)
1850  * @param block_h  height of block (always same as block_w)
1851  * @param width    width of src/dst plane data
1852  * @param height   height of src/dst plane data
1853  * @param linesize size of a single line of plane data, including padding
1854  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1855  */
1856 static av_always_inline
1857 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1858                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1859                    int x_off, int y_off, int block_w, int block_h,
1860                    int width, int height, ptrdiff_t linesize,
1861                    vp8_mc_func mc_func[3][3])
1862 {
1863     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1864
1865     if (AV_RN32A(mv)) {
1866         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1867         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1868
1869         x_off += mv->x >> 3;
1870         y_off += mv->y >> 3;
1871
1872         // edge emulation
1873         src1 += y_off * linesize + x_off;
1874         src2 += y_off * linesize + x_off;
1875         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1876         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1877             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1878             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1879                                      src1 - my_idx * linesize - mx_idx,
1880                                      EDGE_EMU_LINESIZE, linesize,
1881                                      block_w + subpel_idx[1][mx],
1882                                      block_h + subpel_idx[1][my],
1883                                      x_off - mx_idx, y_off - my_idx, width, height);
1884             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1885             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1886
1887             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1888                                      src2 - my_idx * linesize - mx_idx,
1889                                      EDGE_EMU_LINESIZE, linesize,
1890                                      block_w + subpel_idx[1][mx],
1891                                      block_h + subpel_idx[1][my],
1892                                      x_off - mx_idx, y_off - my_idx, width, height);
1893             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1894             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1895         } else {
1896             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1897             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1898         }
1899     } else {
1900         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1901         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1902         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1903     }
1904 }
1905
1906 static av_always_inline
1907 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1908                  ThreadFrame *ref_frame, int x_off, int y_off,
1909                  int bx_off, int by_off, int block_w, int block_h,
1910                  int width, int height, VP56mv *mv)
1911 {
1912     VP56mv uvmv = *mv;
1913
1914     /* Y */
1915     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1916                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1917                 block_w, block_h, width, height, s->linesize,
1918                 s->put_pixels_tab[block_w == 8]);
1919
1920     /* U/V */
1921     if (s->profile == 3) {
1922         /* this block only applies VP8; it is safe to check
1923          * only the profile, as VP7 profile <= 1 */
1924         uvmv.x &= ~7;
1925         uvmv.y &= ~7;
1926     }
1927     x_off   >>= 1;
1928     y_off   >>= 1;
1929     bx_off  >>= 1;
1930     by_off  >>= 1;
1931     width   >>= 1;
1932     height  >>= 1;
1933     block_w >>= 1;
1934     block_h >>= 1;
1935     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1936                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1937                   &uvmv, x_off + bx_off, y_off + by_off,
1938                   block_w, block_h, width, height, s->uvlinesize,
1939                   s->put_pixels_tab[1 + (block_w == 4)]);
1940 }
1941
1942 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1943  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1944 static av_always_inline
1945 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1946                      int mb_xy, int ref)
1947 {
1948     /* Don't prefetch refs that haven't been used very often this frame. */
1949     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1950         int x_off = mb_x << 4, y_off = mb_y << 4;
1951         int mx = (mb->mv.x >> 2) + x_off + 8;
1952         int my = (mb->mv.y >> 2) + y_off;
1953         uint8_t **src = s->framep[ref]->tf.f->data;
1954         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1955         /* For threading, a ff_thread_await_progress here might be useful, but
1956          * it actually slows down the decoder. Since a bad prefetch doesn't
1957          * generate bad decoder output, we don't run it here. */
1958         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1959         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1960         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1961     }
1962 }
1963
1964 /**
1965  * Apply motion vectors to prediction buffer, chapter 18.
1966  */
1967 static av_always_inline
1968 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1969                    VP8Macroblock *mb, int mb_x, int mb_y)
1970 {
1971     int x_off = mb_x << 4, y_off = mb_y << 4;
1972     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1973     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1974     VP56mv *bmv = mb->bmv;
1975
1976     switch (mb->partitioning) {
1977     case VP8_SPLITMVMODE_NONE:
1978         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1979                     0, 0, 16, 16, width, height, &mb->mv);
1980         break;
1981     case VP8_SPLITMVMODE_4x4: {
1982         int x, y;
1983         VP56mv uvmv;
1984
1985         /* Y */
1986         for (y = 0; y < 4; y++) {
1987             for (x = 0; x < 4; x++) {
1988                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1989                             ref, &bmv[4 * y + x],
1990                             4 * x + x_off, 4 * y + y_off, 4, 4,
1991                             width, height, s->linesize,
1992                             s->put_pixels_tab[2]);
1993             }
1994         }
1995
1996         /* U/V */
1997         x_off  >>= 1;
1998         y_off  >>= 1;
1999         width  >>= 1;
2000         height >>= 1;
2001         for (y = 0; y < 2; y++) {
2002             for (x = 0; x < 2; x++) {
2003                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2004                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2005                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2006                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2007                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2008                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2009                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2010                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2011                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2012                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2013                 if (s->profile == 3) {
2014                     uvmv.x &= ~7;
2015                     uvmv.y &= ~7;
2016                 }
2017                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2018                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2019                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2020                               width, height, s->uvlinesize,
2021                               s->put_pixels_tab[2]);
2022             }
2023         }
2024         break;
2025     }
2026     case VP8_SPLITMVMODE_16x8:
2027         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2028                     0, 0, 16, 8, width, height, &bmv[0]);
2029         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2030                     0, 8, 16, 8, width, height, &bmv[1]);
2031         break;
2032     case VP8_SPLITMVMODE_8x16:
2033         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2034                     0, 0, 8, 16, width, height, &bmv[0]);
2035         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2036                     8, 0, 8, 16, width, height, &bmv[1]);
2037         break;
2038     case VP8_SPLITMVMODE_8x8:
2039         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2040                     0, 0, 8, 8, width, height, &bmv[0]);
2041         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2042                     8, 0, 8, 8, width, height, &bmv[1]);
2043         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2044                     0, 8, 8, 8, width, height, &bmv[2]);
2045         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2046                     8, 8, 8, 8, width, height, &bmv[3]);
2047         break;
2048     }
2049 }
2050
2051 static av_always_inline
2052 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2053 {
2054     int x, y, ch;
2055
2056     if (mb->mode != MODE_I4x4) {
2057         uint8_t *y_dst = dst[0];
2058         for (y = 0; y < 4; y++) {
2059             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2060             if (nnz4) {
2061                 if (nnz4 & ~0x01010101) {
2062                     for (x = 0; x < 4; x++) {
2063                         if ((uint8_t) nnz4 == 1)
2064                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2065                                                       td->block[y][x],
2066                                                       s->linesize);
2067                         else if ((uint8_t) nnz4 > 1)
2068                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2069                                                    td->block[y][x],
2070                                                    s->linesize);
2071                         nnz4 >>= 8;
2072                         if (!nnz4)
2073                             break;
2074                     }
2075                 } else {
2076                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2077                 }
2078             }
2079             y_dst += 4 * s->linesize;
2080         }
2081     }
2082
2083     for (ch = 0; ch < 2; ch++) {
2084         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2085         if (nnz4) {
2086             uint8_t *ch_dst = dst[1 + ch];
2087             if (nnz4 & ~0x01010101) {
2088                 for (y = 0; y < 2; y++) {
2089                     for (x = 0; x < 2; x++) {
2090                         if ((uint8_t) nnz4 == 1)
2091                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2092                                                       td->block[4 + ch][(y << 1) + x],
2093                                                       s->uvlinesize);
2094                         else if ((uint8_t) nnz4 > 1)
2095                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2096                                                    td->block[4 + ch][(y << 1) + x],
2097                                                    s->uvlinesize);
2098                         nnz4 >>= 8;
2099                         if (!nnz4)
2100                             goto chroma_idct_end;
2101                     }
2102                     ch_dst += 4 * s->uvlinesize;
2103                 }
2104             } else {
2105                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2106             }
2107         }
2108 chroma_idct_end:
2109         ;
2110     }
2111 }
2112
2113 static av_always_inline
2114 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2115                          VP8FilterStrength *f, int is_vp7)
2116 {
2117     int interior_limit, filter_level;
2118
2119     if (s->segmentation.enabled) {
2120         filter_level = s->segmentation.filter_level[mb->segment];
2121         if (!s->segmentation.absolute_vals)
2122             filter_level += s->filter.level;
2123     } else
2124         filter_level = s->filter.level;
2125
2126     if (s->lf_delta.enabled) {
2127         filter_level += s->lf_delta.ref[mb->ref_frame];
2128         filter_level += s->lf_delta.mode[mb->mode];
2129     }
2130
2131     filter_level = av_clip_uintp2(filter_level, 6);
2132
2133     interior_limit = filter_level;
2134     if (s->filter.sharpness) {
2135         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2136         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2137     }
2138     interior_limit = FFMAX(interior_limit, 1);
2139
2140     f->filter_level = filter_level;
2141     f->inner_limit = interior_limit;
2142     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2143                       mb->mode == VP8_MVMODE_SPLIT;
2144 }
2145
2146 static av_always_inline
2147 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2148                int mb_x, int mb_y, int is_vp7)
2149 {
2150     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2151     int filter_level = f->filter_level;
2152     int inner_limit = f->inner_limit;
2153     int inner_filter = f->inner_filter;
2154     ptrdiff_t linesize   = s->linesize;
2155     ptrdiff_t uvlinesize = s->uvlinesize;
2156     static const uint8_t hev_thresh_lut[2][64] = {
2157         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2158           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2159           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2160           3, 3, 3, 3 },
2161         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2162           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2163           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2164           2, 2, 2, 2 }
2165     };
2166
2167     if (!filter_level)
2168         return;
2169
2170     if (is_vp7) {
2171         bedge_lim_y  = filter_level;
2172         bedge_lim_uv = filter_level * 2;
2173         mbedge_lim   = filter_level + 2;
2174     } else {
2175         bedge_lim_y  =
2176         bedge_lim_uv = filter_level * 2 + inner_limit;
2177         mbedge_lim   = bedge_lim_y + 4;
2178     }
2179
2180     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2181
2182     if (mb_x) {
2183         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2184                                        mbedge_lim, inner_limit, hev_thresh);
2185         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2186                                        mbedge_lim, inner_limit, hev_thresh);
2187     }
2188
2189 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2190     if (cond && inner_filter) {                                               \
2191         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2192                                              bedge_lim_y, inner_limit,        \
2193                                              hev_thresh);                     \
2194         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2195                                              bedge_lim_y, inner_limit,        \
2196                                              hev_thresh);                     \
2197         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2198                                              bedge_lim_y, inner_limit,        \
2199                                              hev_thresh);                     \
2200         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2201                                              uvlinesize,  bedge_lim_uv,       \
2202                                              inner_limit, hev_thresh);        \
2203     }
2204
2205     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2206
2207     if (mb_y) {
2208         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2209                                        mbedge_lim, inner_limit, hev_thresh);
2210         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2211                                        mbedge_lim, inner_limit, hev_thresh);
2212     }
2213
2214     if (inner_filter) {
2215         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2216                                              linesize, bedge_lim_y,
2217                                              inner_limit, hev_thresh);
2218         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2219                                              linesize, bedge_lim_y,
2220                                              inner_limit, hev_thresh);
2221         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2222                                              linesize, bedge_lim_y,
2223                                              inner_limit, hev_thresh);
2224         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2225                                              dst[2] +  4 * uvlinesize,
2226                                              uvlinesize, bedge_lim_uv,
2227                                              inner_limit, hev_thresh);
2228     }
2229
2230     H_LOOP_FILTER_16Y_INNER(is_vp7)
2231 }
2232
2233 static av_always_inline
2234 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2235                       int mb_x, int mb_y)
2236 {
2237     int mbedge_lim, bedge_lim;
2238     int filter_level = f->filter_level;
2239     int inner_limit  = f->inner_limit;
2240     int inner_filter = f->inner_filter;
2241     ptrdiff_t linesize = s->linesize;
2242
2243     if (!filter_level)
2244         return;
2245
2246     bedge_lim  = 2 * filter_level + inner_limit;
2247     mbedge_lim = bedge_lim + 4;
2248
2249     if (mb_x)
2250         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2251     if (inner_filter) {
2252         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2253         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2254         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2255     }
2256
2257     if (mb_y)
2258         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2259     if (inner_filter) {
2260         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2261         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2262         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2263     }
2264 }
2265
2266 #define MARGIN (16 << 2)
2267 static av_always_inline
2268 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2269                                     VP8Frame *prev_frame, int is_vp7)
2270 {
2271     VP8Context *s = avctx->priv_data;
2272     int mb_x, mb_y;
2273
2274     s->mv_bounds.mv_min.y = -MARGIN;
2275     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2276     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2277         VP8Macroblock *mb = s->macroblocks_base +
2278                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2279         int mb_xy = mb_y * s->mb_width;
2280
2281         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2282
2283         s->mv_bounds.mv_min.x = -MARGIN;
2284         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2285         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2286             if (mb_y == 0)
2287                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2288                          DC_PRED * 0x01010101);
2289             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2290                            prev_frame && prev_frame->seg_map ?
2291                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2292             s->mv_bounds.mv_min.x -= 64;
2293             s->mv_bounds.mv_max.x -= 64;
2294         }
2295         s->mv_bounds.mv_min.y -= 64;
2296         s->mv_bounds.mv_max.y -= 64;
2297     }
2298 }
2299
/* VP7 instantiation of vp78_decode_mv_mb_modes(): forwards with is_vp7 = IS_VP7. */
static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
}
2305
/* VP8 instantiation of vp78_decode_mv_mb_modes(): forwards with is_vp7 = IS_VP8. */
static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
}
2311
#if HAVE_THREADS
/*
 * Row synchronisation helpers for the sliced decoder.
 *
 * A position is packed into one int as (mb_y << 16) | (mb_x & 0xFFFF).
 *
 * check_thread_pos(td, otd, mb_x_check, mb_y_check)
 *     Returns once otd->thread_mb_pos is >= the packed position. If it is
 *     not there yet, the caller takes otd->lock, publishes the position it
 *     is waiting for in td->wait_mb_pos, sleeps on otd->cond until the
 *     position is reached, then resets td->wait_mb_pos to INT_MAX.
 *
 * update_pos(td, mb_y, mb_x)
 *     Stores the packed position in td->thread_mb_pos. When slice threading
 *     is active with more than one job, and either a neighbour pointer is
 *     NULL or a neighbour other than td is waiting for a position <= the
 *     new one, td->cond is broadcast under td->lock.
 *     This macro reads `avctx`, `num_jobs`, `next_td` and `prev_td` from
 *     the scope of the caller; they are not parameters.
 *
 * All macro arguments are parenthesized so that arbitrary expressions can
 * be passed; td and otd are still evaluated more than once.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/*
 * Without thread support both helpers are no-ops. They expand to a complete
 * do/while statement so that a missing trailing semicolon is a compile error
 * instead of silently turning the next statement into a dead loop body.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x) do { } while (0)
#endif
2349
2350 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2351                                         int jobnr, int threadnr, int is_vp7)
2352 {
2353     VP8Context *s = avctx->priv_data;
2354     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2355     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2356     int mb_x, mb_xy = mb_y * s->mb_width;
2357     int num_jobs = s->num_jobs;
2358     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2359     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2360     VP8Macroblock *mb;
2361     uint8_t *dst[3] = {
2362         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2363         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2364         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2365     };
2366
2367     if (c->end <= c->buffer && c->bits >= 0)
2368          return AVERROR_INVALIDDATA;
2369
2370     if (mb_y == 0)
2371         prev_td = td;
2372     else
2373         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2374     if (mb_y == s->mb_height - 1)
2375         next_td = td;
2376     else
2377         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2378     if (s->mb_layout == 1)
2379         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2380     else {
2381         // Make sure the previous frame has read its segmentation map,
2382         // if we re-use the same map.
2383         if (prev_frame && s->segmentation.enabled &&
2384             !s->segmentation.update_map)
2385             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2386         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2387         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2388         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2389     }
2390
2391     if (!is_vp7 || mb_y == 0)
2392         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2393
2394     td->mv_bounds.mv_min.x = -MARGIN;
2395     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2396
2397     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2398         if (c->end <= c->buffer && c->bits >= 0)
2399             return AVERROR_INVALIDDATA;
2400         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2401         if (prev_td != td) {
2402             if (threadnr != 0) {
2403                 check_thread_pos(td, prev_td,
2404                                  mb_x + (is_vp7 ? 2 : 1),
2405                                  mb_y - (is_vp7 ? 2 : 1));
2406             } else {
2407                 check_thread_pos(td, prev_td,
2408                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2409                                  mb_y - (is_vp7 ? 2 : 1));
2410             }
2411         }
2412
2413         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2414                          s->linesize, 4);
2415         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2416                          dst[2] - dst[1], 2);
2417
2418         if (!s->mb_layout)
2419             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2420                            prev_frame && prev_frame->seg_map ?
2421                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2422
2423         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2424
2425         if (!mb->skip)
2426             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2427
2428         if (mb->mode <= MODE_I4x4)
2429             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2430         else
2431             inter_predict(s, td, dst, mb, mb_x, mb_y);
2432
2433         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2434
2435         if (!mb->skip) {
2436             idct_mb(s, td, dst, mb);
2437         } else {
2438             AV_ZERO64(td->left_nnz);
2439             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2440
2441             /* Reset DC block predictors if they would exist
2442              * if the mb had coefficients */
2443             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2444                 td->left_nnz[8]     = 0;
2445                 s->top_nnz[mb_x][8] = 0;
2446             }
2447         }
2448
2449         if (s->deblock_filter)
2450             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2451
2452         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2453             if (s->filter.simple)
2454                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2455                                  NULL, NULL, s->linesize, 0, 1);
2456             else
2457                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2458                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2459         }
2460
2461         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2462
2463         dst[0]      += 16;
2464         dst[1]      += 8;
2465         dst[2]      += 8;
2466         td->mv_bounds.mv_min.x -= 64;
2467         td->mv_bounds.mv_max.x -= 64;
2468
2469         if (mb_x == s->mb_width + 1) {
2470             update_pos(td, mb_y, s->mb_width + 3);
2471         } else {
2472             update_pos(td, mb_y, mb_x);
2473         }
2474     }
2475     return 0;
2476 }
2477
/* VP7 row decoder: decode_mb_row_no_filter() with is_vp7 = 1.
 * Installed as s->decode_mb_row_no_filter by vp78_decode_init(). */
static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr)
{
    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
}
2483
/* VP8 row decoder: decode_mb_row_no_filter() with is_vp7 = 0.
 * Installed as s->decode_mb_row_no_filter by vp78_decode_init(). */
static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr)
{
    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
}
2489
2490 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2491                               int jobnr, int threadnr, int is_vp7)
2492 {
2493     VP8Context *s = avctx->priv_data;
2494     VP8ThreadData *td = &s->thread_data[threadnr];
2495     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2496     AVFrame *curframe = s->curframe->tf.f;
2497     VP8Macroblock *mb;
2498     VP8ThreadData *prev_td, *next_td;
2499     uint8_t *dst[3] = {
2500         curframe->data[0] + 16 * mb_y * s->linesize,
2501         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2502         curframe->data[2] +  8 * mb_y * s->uvlinesize
2503     };
2504
2505     if (s->mb_layout == 1)
2506         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2507     else
2508         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2509
2510     if (mb_y == 0)
2511         prev_td = td;
2512     else
2513         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2514     if (mb_y == s->mb_height - 1)
2515         next_td = td;
2516     else
2517         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2518
2519     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2520         VP8FilterStrength *f = &td->filter_strength[mb_x];
2521         if (prev_td != td)
2522             check_thread_pos(td, prev_td,
2523                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2524         if (next_td != td)
2525             if (next_td != &s->thread_data[0])
2526                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2527
2528         if (num_jobs == 1) {
2529             if (s->filter.simple)
2530                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2531                                  NULL, NULL, s->linesize, 0, 1);
2532             else
2533                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2534                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2535         }
2536
2537         if (s->filter.simple)
2538             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2539         else
2540             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2541         dst[0] += 16;
2542         dst[1] += 8;
2543         dst[2] += 8;
2544
2545         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2546     }
2547 }
2548
/* VP7 row loop filter: filter_mb_row() with is_vp7 = 1.
 * Installed as s->filter_mb_row by vp78_decode_init(). */
static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
}
2554
/* VP8 row loop filter: filter_mb_row() with is_vp7 = 0.
 * Installed as s->filter_mb_row by vp78_decode_init(). */
static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
}
2560
2561 static av_always_inline
2562 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2563                               int threadnr, int is_vp7)
2564 {
2565     VP8Context *s = avctx->priv_data;
2566     VP8ThreadData *td = &s->thread_data[jobnr];
2567     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2568     VP8Frame *curframe = s->curframe;
2569     int mb_y, num_jobs = s->num_jobs;
2570     int ret;
2571
2572     td->thread_nr = threadnr;
2573     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2574     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2575     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2576         atomic_store(&td->thread_mb_pos, mb_y << 16);
2577         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2578         if (ret < 0) {
2579             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2580             return ret;
2581         }
2582         if (s->deblock_filter)
2583             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2584         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2585
2586         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2587         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2588
2589         if (avctx->active_thread_type == FF_THREAD_FRAME)
2590             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2591     }
2592
2593     return 0;
2594 }
2595
/* VP7 job function for execute2(): vp78_decode_mb_row_sliced() with IS_VP7. */
static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
}
2601
/* VP8 job function for execute2(): vp78_decode_mb_row_sliced() with IS_VP8. */
static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
}
2607
2608 static av_always_inline
2609 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2610                       AVPacket *avpkt, int is_vp7)
2611 {
2612     VP8Context *s = avctx->priv_data;
2613     int ret, i, referenced, num_jobs;
2614     enum AVDiscard skip_thresh;
2615     VP8Frame *av_uninit(curframe), *prev_frame;
2616
2617     if (is_vp7)
2618         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2619     else
2620         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2621
2622     if (ret < 0)
2623         goto err;
2624
2625     if (s->actually_webp) {
2626         // avctx->pix_fmt already set in caller.
2627     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2628         s->pix_fmt = get_pixel_format(s);
2629         if (s->pix_fmt < 0) {
2630             ret = AVERROR(EINVAL);
2631             goto err;
2632         }
2633         avctx->pix_fmt = s->pix_fmt;
2634     }
2635
2636     prev_frame = s->framep[VP56_FRAME_CURRENT];
2637
2638     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2639                  s->update_altref == VP56_FRAME_CURRENT;
2640
2641     skip_thresh = !referenced ? AVDISCARD_NONREF
2642                               : !s->keyframe ? AVDISCARD_NONKEY
2643                                              : AVDISCARD_ALL;
2644
2645     if (avctx->skip_frame >= skip_thresh) {
2646         s->invisible = 1;
2647         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2648         goto skip_decode;
2649     }
2650     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2651
2652     // release no longer referenced frames
2653     for (i = 0; i < 5; i++)
2654         if (s->frames[i].tf.f->buf[0] &&
2655             &s->frames[i] != prev_frame &&
2656             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2657             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2658             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2659             vp8_release_frame(s, &s->frames[i]);
2660
2661     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2662
2663     if (!s->colorspace)
2664         avctx->colorspace = AVCOL_SPC_BT470BG;
2665     if (s->fullrange)
2666         avctx->color_range = AVCOL_RANGE_JPEG;
2667     else
2668         avctx->color_range = AVCOL_RANGE_MPEG;
2669
2670     /* Given that arithmetic probabilities are updated every frame, it's quite
2671      * likely that the values we have on a random interframe are complete
2672      * junk if we didn't start decode on a keyframe. So just don't display
2673      * anything rather than junk. */
2674     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2675                          !s->framep[VP56_FRAME_GOLDEN]   ||
2676                          !s->framep[VP56_FRAME_GOLDEN2])) {
2677         av_log(avctx, AV_LOG_WARNING,
2678                "Discarding interframe without a prior keyframe!\n");
2679         ret = AVERROR_INVALIDDATA;
2680         goto err;
2681     }
2682
2683     curframe->tf.f->key_frame = s->keyframe;
2684     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2685                                             : AV_PICTURE_TYPE_P;
2686     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2687         goto err;
2688
2689     // check if golden and altref are swapped
2690     if (s->update_altref != VP56_FRAME_NONE)
2691         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2692     else
2693         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2694
2695     if (s->update_golden != VP56_FRAME_NONE)
2696         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2697     else
2698         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2699
2700     if (s->update_last)
2701         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2702     else
2703         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2704
2705     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2706
2707     ff_thread_finish_setup(avctx);
2708
2709     if (avctx->hwaccel) {
2710         ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2711         if (ret < 0)
2712             goto err;
2713
2714         ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2715         if (ret < 0)
2716             goto err;
2717
2718         ret = avctx->hwaccel->end_frame(avctx);
2719         if (ret < 0)
2720             goto err;
2721
2722     } else {
2723         s->linesize   = curframe->tf.f->linesize[0];
2724         s->uvlinesize = curframe->tf.f->linesize[1];
2725
2726         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2727         /* Zero macroblock structures for top/top-left prediction
2728          * from outside the frame. */
2729         if (!s->mb_layout)
2730             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2731                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2732         if (!s->mb_layout && s->keyframe)
2733             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2734
2735         memset(s->ref_count, 0, sizeof(s->ref_count));
2736
2737         if (s->mb_layout == 1) {
2738             // Make sure the previous frame has read its segmentation map,
2739             // if we re-use the same map.
2740             if (prev_frame && s->segmentation.enabled &&
2741                 !s->segmentation.update_map)
2742                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2743             if (is_vp7)
2744                 vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2745             else
2746                 vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2747         }
2748
2749         if (avctx->active_thread_type == FF_THREAD_FRAME)
2750             num_jobs = 1;
2751         else
2752             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2753         s->num_jobs   = num_jobs;
2754         s->curframe   = curframe;
2755         s->prev_frame = prev_frame;
2756         s->mv_bounds.mv_min.y   = -MARGIN;
2757         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2758         for (i = 0; i < MAX_THREADS; i++) {
2759             VP8ThreadData *td = &s->thread_data[i];
2760             atomic_init(&td->thread_mb_pos, 0);
2761             atomic_init(&td->wait_mb_pos, INT_MAX);
2762         }
2763         if (is_vp7)
2764             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2765                             num_jobs);
2766         else
2767             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2768                             num_jobs);
2769     }
2770
2771     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2772     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2773
2774 skip_decode:
2775     // if future frames don't use the updated probabilities,
2776     // reset them to the values we saved
2777     if (!s->update_probabilities)
2778         s->prob[0] = s->prob[1];
2779
2780     if (!s->invisible) {
2781         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2782             return ret;
2783         *got_frame = 1;
2784     }
2785
2786     return avpkt->size;
2787 err:
2788     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2789     return ret;
2790 }
2791
/* VP8 decode entry point (the .decode callback of ff_vp8_decoder):
 * vp78_decode_frame() with is_vp7 = IS_VP8. */
int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
}
2797
#if CONFIG_VP7_DECODER
/* VP7 decode entry point (the .decode callback of ff_vp7_decoder):
 * vp78_decode_frame() with is_vp7 = IS_VP7. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2805
2806 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2807 {
2808     VP8Context *s = avctx->priv_data;
2809     int i;
2810
2811     if (!s)
2812         return 0;
2813
2814     vp8_decode_flush_impl(avctx, 1);
2815     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2816         av_frame_free(&s->frames[i].tf.f);
2817
2818     return 0;
2819 }
2820
2821 static av_cold int vp8_init_frames(VP8Context *s)
2822 {
2823     int i;
2824     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2825         s->frames[i].tf.f = av_frame_alloc();
2826         if (!s->frames[i].tf.f)
2827             return AVERROR(ENOMEM);
2828     }
2829     return 0;
2830 }
2831
2832 static av_always_inline
2833 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2834 {
2835     VP8Context *s = avctx->priv_data;
2836     int ret;
2837
2838     s->avctx = avctx;
2839     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2840     s->pix_fmt = AV_PIX_FMT_NONE;
2841     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2842     avctx->internal->allocate_progress = 1;
2843
2844     ff_videodsp_init(&s->vdsp, 8);
2845
2846     ff_vp78dsp_init(&s->vp8dsp);
2847     if (CONFIG_VP7_DECODER && is_vp7) {
2848         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2849         ff_vp7dsp_init(&s->vp8dsp);
2850         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2851         s->filter_mb_row           = vp7_filter_mb_row;
2852     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2853         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2854         ff_vp8dsp_init(&s->vp8dsp);
2855         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2856         s->filter_mb_row           = vp8_filter_mb_row;
2857     }
2858
2859     /* does not change for VP8 */
2860     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2861
2862     if ((ret = vp8_init_frames(s)) < 0) {
2863         ff_vp8_decode_free(avctx);
2864         return ret;
2865     }
2866
2867     return 0;
2868 }
2869
#if CONFIG_VP7_DECODER
/* VP7 init callback (the .init of ff_vp7_decoder): vp78_decode_init() with IS_VP7. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2876
/* VP8 init callback (the .init of ff_vp8_decoder): vp78_decode_init() with IS_VP8. */
av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP8);
}
2881
2882 #if CONFIG_VP8_DECODER
2883 #if HAVE_THREADS
2884 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2885 {
2886     VP8Context *s = avctx->priv_data;
2887     int ret;
2888
2889     s->avctx = avctx;
2890
2891     if ((ret = vp8_init_frames(s)) < 0) {
2892         ff_vp8_decode_free(avctx);
2893         return ret;
2894     }
2895
2896     return 0;
2897 }
2898
2899 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2900
2901 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2902                                             const AVCodecContext *src)
2903 {
2904     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2905     int i;
2906
2907     if (s->macroblocks_base &&
2908         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2909         free_buffers(s);
2910         s->mb_width  = s_src->mb_width;
2911         s->mb_height = s_src->mb_height;
2912     }
2913
2914     s->pix_fmt      = s_src->pix_fmt;
2915     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2916     s->segmentation = s_src->segmentation;
2917     s->lf_delta     = s_src->lf_delta;
2918     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2919
2920     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2921         if (s_src->frames[i].tf.f->buf[0]) {
2922             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2923             if (ret < 0)
2924                 return ret;
2925         }
2926     }
2927
2928     s->framep[0] = REBASE(s_src->next_framep[0]);
2929     s->framep[1] = REBASE(s_src->next_framep[1]);
2930     s->framep[2] = REBASE(s_src->next_framep[2]);
2931     s->framep[3] = REBASE(s_src->next_framep[3]);
2932
2933     return 0;
2934 }
2935 #endif /* HAVE_THREADS */
2936 #endif /* CONFIG_VP8_DECODER */
2937
2938 #if CONFIG_VP7_DECODER
/* Codec descriptor for the VP7 decoder. It shares the context type and the
 * close/flush callbacks with VP8; only AV_CODEC_CAP_DR1 is advertised (no
 * threading capabilities). */
AVCodec ff_vp7_decoder = {
    .name                  = "vp7",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = vp7_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1,
    .flush                 = vp8_decode_flush,
};
2951 #endif /* CONFIG_VP7_DECODER */
2952
#if CONFIG_VP8_DECODER
/* Codec descriptor for the VP8 decoder: advertises direct rendering plus
 * frame and slice threading, and lists the VAAPI / NVDEC hardware
 * configurations that were enabled at build time. */
AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    /* NULL-terminated list of hardware configurations */
    .hw_configs            = (const AVCodecHWConfigInternal*[]) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */