git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "hwaccel.h"
  31 #include "internal.h"
  32 #include "mathops.h"
  33 #include "rectangle.h"
  34 #include "thread.h"
  35 #include "vp8.h"
  36 #include "vp8data.h"
  37
  38 #if ARCH_ARM
  39 #   include "arm/vp8.h"
  40 #endif
  41
  42 #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  43 #define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
  44 #elif CONFIG_VP7_DECODER
  45 #define VPX(vp7, f) vp7_ ## f
  46 #else // CONFIG_VP8_DECODER
  47 #define VPX(vp7, f) vp8_ ## f
  48 #endif
  49
  50 static void free_buffers(VP8Context *s)
  51 {
  52     int i;
  53     if (s->thread_data)
  54         for (i = 0; i < MAX_THREADS; i++) {
  55 #if HAVE_THREADS
  56             pthread_cond_destroy(&s->thread_data[i].cond);
  57             pthread_mutex_destroy(&s->thread_data[i].lock);
  58 #endif
  59             av_freep(&s->thread_data[i].filter_strength);
  60         }
  61     av_freep(&s->thread_data);
  62     av_freep(&s->macroblocks_base);
  63     av_freep(&s->intra4x4_pred_mode_top);
  64     av_freep(&s->top_nnz);
  65     av_freep(&s->top_border);
  66
  67     s->macroblocks = NULL;
  68 }
  69
  70 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  71 {
  72     int ret;
  73     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  74                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  75         return ret;
  76     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  77         goto fail;
  78     if (s->avctx->hwaccel) {
  79         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  80         if (hwaccel->frame_priv_data_size) {
  81             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  82             if (!f->hwaccel_priv_buf)
  83                 goto fail;
  84             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  85         }
  86     }
  87     return 0;
  88
  89 fail:
  90     av_buffer_unref(&f->seg_map);
  91     ff_thread_release_buffer(s->avctx, &f->tf);
  92     return AVERROR(ENOMEM);
  93 }
  94
  95 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  96 {
  97     av_buffer_unref(&f->seg_map);
  98     av_buffer_unref(&f->hwaccel_priv_buf);
  99     f->hwaccel_picture_private = NULL;
 100     ff_thread_release_buffer(s->avctx, &f->tf);
 101 }
 102
 103 #if CONFIG_VP8_DECODER
 104 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 105 {
 106     int ret;
 107
 108     vp8_release_frame(s, dst);
 109
 110     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 111         return ret;
 112     if (src->seg_map &&
 113         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 114         vp8_release_frame(s, dst);
 115         return AVERROR(ENOMEM);
 116     }
 117     if (src->hwaccel_picture_private) {
 118         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 119         if (!dst->hwaccel_priv_buf)
 120             return AVERROR(ENOMEM);
 121         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 122     }
 123
 124     return 0;
 125 }
 126 #endif /* CONFIG_VP8_DECODER */
 127
 128 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 129 {
 130     VP8Context *s = avctx->priv_data;
 131     int i;
 132
 133     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 134         vp8_release_frame(s, &s->frames[i]);
 135     memset(s->framep, 0, sizeof(s->framep));
 136
 137     if (free_mem)
 138         free_buffers(s);
 139 }
 140
 141 static void vp8_decode_flush(AVCodecContext *avctx)
 142 {
 143     vp8_decode_flush_impl(avctx, 0);
 144 }
 145
 146 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 147 {
 148     VP8Frame *frame = NULL;
 149     int i;
 150
 151     // find a free buffer
 152     for (i = 0; i < 5; i++)
 153         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 154             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 155             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 157             frame = &s->frames[i];
 158             break;
 159         }
 160     if (i == 5) {
 161         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 162         abort();
 163     }
 164     if (frame->tf.f->buf[0])
 165         vp8_release_frame(s, frame);
 166
 167     return frame;
 168 }
 169
 170 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 171 {
 172     enum AVPixelFormat pix_fmts[] = {
 173 #if CONFIG_VP8_VAAPI_HWACCEL
 174         AV_PIX_FMT_VAAPI,
 175 #endif
 176 #if CONFIG_VP8_NVDEC_HWACCEL
 177         AV_PIX_FMT_CUDA,
 178 #endif
 179         AV_PIX_FMT_YUV420P,
 180         AV_PIX_FMT_NONE,
 181     };
 182
 183     return ff_get_format(s->avctx, pix_fmts);
 184 }
 185
 186 static av_always_inline
 187 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 188 {
 189     AVCodecContext *avctx = s->avctx;
 190     int i, ret;
 191
 192     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 193         height != s->avctx->height) {
 194         vp8_decode_flush_impl(s->avctx, 1);
 195
 196         ret = ff_set_dimensions(s->avctx, width, height);
 197         if (ret < 0)
 198             return ret;
 199     }
 200
 201     if (!s->actually_webp && !is_vp7) {
 202         s->pix_fmt = get_pixel_format(s);
 203         if (s->pix_fmt < 0)
 204             return AVERROR(EINVAL);
 205         avctx->pix_fmt = s->pix_fmt;
 206     }
 207
 208     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 209     s->mb_height = (s->avctx->coded_height + 15) / 16;
 210
 211     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 212                    avctx->thread_count > 1;
 213     if (!s->mb_layout) { // Frame threading and one thread
 214         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 215                                                sizeof(*s->macroblocks));
 216         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 217     } else // Sliced threading
 218         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 219                                          sizeof(*s->macroblocks));
 220     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 221     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 222     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 223
 224     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 225         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 226         free_buffers(s);
 227         return AVERROR(ENOMEM);
 228     }
 229
 230     for (i = 0; i < MAX_THREADS; i++) {
 231         s->thread_data[i].filter_strength =
 232             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 233         if (!s->thread_data[i].filter_strength) {
 234             free_buffers(s);
 235             return AVERROR(ENOMEM);
 236         }
 237 #if HAVE_THREADS
 238         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 239         pthread_cond_init(&s->thread_data[i].cond, NULL);
 240 #endif
 241     }
 242
 243     s->macroblocks = s->macroblocks_base + 1;
 244
 245     return 0;
 246 }
 247
 248 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 249 {
 250     return update_dimensions(s, width, height, IS_VP7);
 251 }
 252
 253 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 254 {
 255     return update_dimensions(s, width, height, IS_VP8);
 256 }
 257
 258
 259 static void parse_segment_info(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262     int i;
 263
 264     s->segmentation.update_map = vp8_rac_get(c);
 265     s->segmentation.update_feature_data = vp8_rac_get(c);
 266
 267     if (s->segmentation.update_feature_data) {
 268         s->segmentation.absolute_vals = vp8_rac_get(c);
 269
 270         for (i = 0; i < 4; i++)
 271             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 272
 273         for (i = 0; i < 4; i++)
 274             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 275     }
 276     if (s->segmentation.update_map)
 277         for (i = 0; i < 3; i++)
 278             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 279 }
 280
 281 static void update_lf_deltas(VP8Context *s)
 282 {
 283     VP56RangeCoder *c = &s->c;
 284     int i;
 285
 286     for (i = 0; i < 4; i++) {
 287         if (vp8_rac_get(c)) {
 288             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 289
 290             if (vp8_rac_get(c))
 291                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 292         }
 293     }
 294
 295     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 296         if (vp8_rac_get(c)) {
 297             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 298
 299             if (vp8_rac_get(c))
 300                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 301         }
 302     }
 303 }
 304
 305 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 306 {
 307     const uint8_t *sizes = buf;
 308     int i;
 309     int ret;
 310
 311     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 312
 313     buf      += 3 * (s->num_coeff_partitions - 1);
 314     buf_size -= 3 * (s->num_coeff_partitions - 1);
 315     if (buf_size < 0)
 316         return -1;
 317
 318     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 319         int size = AV_RL24(sizes + 3 * i);
 320         if (buf_size - size < 0)
 321             return -1;
 322         s->coeff_partition_size[i] = size;
 323
 324         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 325         if (ret < 0)
 326             return ret;
 327         buf      += size;
 328         buf_size -= size;
 329     }
 330
 331     s->coeff_partition_size[i] = buf_size;
 332     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 333
 334     return 0;
 335 }
 336
 337 static void vp7_get_quants(VP8Context *s)
 338 {
 339     VP56RangeCoder *c = &s->c;
 340
 341     int yac_qi  = vp8_rac_get_uint(c, 7);
 342     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 343     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 344     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 345     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 346     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 347
 348     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 349     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 350     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 351     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 352     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 353     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 354 }
 355
 356 static void vp8_get_quants(VP8Context *s)
 357 {
 358     VP56RangeCoder *c = &s->c;
 359     int i, base_qi;
 360
 361     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 362     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 363     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 364     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 365     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 366     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 367
 368     for (i = 0; i < 4; i++) {
 369         if (s->segmentation.enabled) {
 370             base_qi = s->segmentation.base_quant[i];
 371             if (!s->segmentation.absolute_vals)
 372                 base_qi += s->quant.yac_qi;
 373         } else
 374             base_qi = s->quant.yac_qi;
 375
 376         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 377         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 378         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 379         /* 101581>>16 is equivalent to 155/100 */
 380         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 381         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 382         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 383
 384         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 385         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 386     }
 387 }
 388
 389 /**
 390  * Determine which buffers golden and altref should be updated with after this frame.
 391  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 392  *
 393  * Intra frames update all 3 references
 394  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 395  * If the update (golden|altref) flag is set, it's updated with the current frame
 396  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 397  * If the flag is not set, the number read means:
 398  *      0: no update
 399  *      1: VP56_FRAME_PREVIOUS
 400  *      2: update golden with altref, or update altref with golden
 401  */
 402 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 403 {
 404     VP56RangeCoder *c = &s->c;
 405
 406     if (update)
 407         return VP56_FRAME_CURRENT;
 408
 409     switch (vp8_rac_get_uint(c, 2)) {
 410     case 1:
 411         return VP56_FRAME_PREVIOUS;
 412     case 2:
 413         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 414     }
 415     return VP56_FRAME_NONE;
 416 }
 417
 418 static void vp78_reset_probability_tables(VP8Context *s)
 419 {
 420     int i, j;
 421     for (i = 0; i < 4; i++)
 422         for (j = 0; j < 16; j++)
 423             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 424                    sizeof(s->prob->token[i][j]));
 425 }
 426
 427 static void vp78_update_probability_tables(VP8Context *s)
 428 {
 429     VP56RangeCoder *c = &s->c;
 430     int i, j, k, l, m;
 431
 432     for (i = 0; i < 4; i++)
 433         for (j = 0; j < 8; j++)
 434             for (k = 0; k < 3; k++)
 435                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 436                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 437                         int prob = vp8_rac_get_uint(c, 8);
 438                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 439                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 440                     }
 441 }
 442
 443 #define VP7_MVC_SIZE 17
 444 #define VP8_MVC_SIZE 19
 445
 446 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 447                                                             int mvc_size)
 448 {
 449     VP56RangeCoder *c = &s->c;
 450     int i, j;
 451
 452     if (vp8_rac_get(c))
 453         for (i = 0; i < 4; i++)
 454             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 455     if (vp8_rac_get(c))
 456         for (i = 0; i < 3; i++)
 457             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 458
 459     // 17.2 MV probability update
 460     for (i = 0; i < 2; i++)
 461         for (j = 0; j < mvc_size; j++)
 462             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 463                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 464 }
 465
 466 static void update_refs(VP8Context *s)
 467 {
 468     VP56RangeCoder *c = &s->c;
 469
 470     int update_golden = vp8_rac_get(c);
 471     int update_altref = vp8_rac_get(c);
 472
 473     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 474     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 475 }
 476
 477 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 478 {
 479     int i, j;
 480
 481     for (j = 1; j < 3; j++) {
 482         for (i = 0; i < height / 2; i++)
 483             memcpy(dst->data[j] + i * dst->linesize[j],
 484                    src->data[j] + i * src->linesize[j], width / 2);
 485     }
 486 }
 487
 488 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 489                  const uint8_t *src, ptrdiff_t src_linesize,
 490                  int width, int height,
 491                  int alpha, int beta)
 492 {
 493     int i, j;
 494     for (j = 0; j < height; j++) {
 495         for (i = 0; i < width; i++) {
 496             uint8_t y = src[j * src_linesize + i];
 497             dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 498         }
 499     }
 500 }
 501
 502 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 503 {
 504     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 505     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 506     int ret;
 507
 508     if (!s->keyframe && (alpha || beta)) {
 509         int width  = s->mb_width * 16;
 510         int height = s->mb_height * 16;
 511         AVFrame *src, *dst;
 512
 513         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 514             !s->framep[VP56_FRAME_GOLDEN]) {
 515             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 516             return AVERROR_INVALIDDATA;
 517         }
 518
 519         dst =
 520         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 521
 522         /* preserve the golden frame, write a new previous frame */
 523         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 524             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 525             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 526                 return ret;
 527
 528             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 529
 530             copy_chroma(dst, src, width, height);
 531         }
 532
 533         fade(dst->data[0], dst->linesize[0],
 534              src->data[0], src->linesize[0],
 535              width, height, alpha, beta);
 536     }
 537
 538     return 0;
 539 }
 540
 541 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 542 {
 543     VP56RangeCoder *c = &s->c;
 544     int part1_size, hscale, vscale, i, j, ret;
 545     int width  = s->avctx->width;
 546     int height = s->avctx->height;
 547
 548     if (buf_size < 4) {
 549         return AVERROR_INVALIDDATA;
 550     }
 551
 552     s->profile = (buf[0] >> 1) & 7;
 553     if (s->profile > 1) {
 554         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 555         return AVERROR_INVALIDDATA;
 556     }
 557
 558     s->keyframe  = !(buf[0] & 1);
 559     s->invisible = 0;
 560     part1_size   = AV_RL24(buf) >> 4;
 561
 562     if (buf_size < 4 - s->profile + part1_size) {
 563         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 564         return AVERROR_INVALIDDATA;
 565     }
 566
 567     buf      += 4 - s->profile;
 568     buf_size -= 4 - s->profile;
 569
 570     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 571
 572     ret = ff_vp56_init_range_decoder(c, buf, part1_size);
 573     if (ret < 0)
 574         return ret;
 575     buf      += part1_size;
 576     buf_size -= part1_size;
 577
 578     /* A. Dimension information (keyframes only) */
 579     if (s->keyframe) {
 580         width  = vp8_rac_get_uint(c, 12);
 581         height = vp8_rac_get_uint(c, 12);
 582         hscale = vp8_rac_get_uint(c, 2);
 583         vscale = vp8_rac_get_uint(c, 2);
 584         if (hscale || vscale)
 585             avpriv_request_sample(s->avctx, "Upscaling");
 586
 587         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 588         vp78_reset_probability_tables(s);
 589         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 590                sizeof(s->prob->pred16x16));
 591         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 592                sizeof(s->prob->pred8x8c));
 593         for (i = 0; i < 2; i++)
 594             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 595                    sizeof(vp7_mv_default_prob[i]));
 596         memset(&s->segmentation, 0, sizeof(s->segmentation));
 597         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 598         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 599     }
 600
 601     if (s->keyframe || s->profile > 0)
 602         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 603
 604     /* B. Decoding information for all four macroblock-level features */
 605     for (i = 0; i < 4; i++) {
 606         s->feature_enabled[i] = vp8_rac_get(c);
 607         if (s->feature_enabled[i]) {
 608              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 609
 610              for (j = 0; j < 3; j++)
 611                  s->feature_index_prob[i][j] =
 612                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 613
 614              if (vp7_feature_value_size[s->profile][i])
 615                  for (j = 0; j < 4; j++)
 616                      s->feature_value[i][j] =
 617                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 618         }
 619     }
 620
 621     s->segmentation.enabled    = 0;
 622     s->segmentation.update_map = 0;
 623     s->lf_delta.enabled        = 0;
 624
 625     s->num_coeff_partitions = 1;
 626     ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 627     if (ret < 0)
 628         return ret;
 629
 630     if (!s->macroblocks_base || /* first frame */
 631         width != s->avctx->width || height != s->avctx->height ||
 632         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 633         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 634             return ret;
 635     }
 636
 637     /* C. Dequantization indices */
 638     vp7_get_quants(s);
 639
 640     /* D. Golden frame update flag (a Flag) for interframes only */
 641     if (!s->keyframe) {
 642         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 643         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 644     }
 645
 646     s->update_last          = 1;
 647     s->update_probabilities = 1;
 648     s->fade_present         = 1;
 649
 650     if (s->profile > 0) {
 651         s->update_probabilities = vp8_rac_get(c);
 652         if (!s->update_probabilities)
 653             s->prob[1] = s->prob[0];
 654
 655         if (!s->keyframe)
 656             s->fade_present = vp8_rac_get(c);
 657     }
 658
 659     /* E. Fading information for previous frame */
 660     if (s->fade_present && vp8_rac_get(c)) {
 661         if ((ret = vp7_fade_frame(s ,c)) < 0)
 662             return ret;
 663     }
 664
 665     /* F. Loop filter type */
 666     if (!s->profile)
 667         s->filter.simple = vp8_rac_get(c);
 668
 669     /* G. DCT coefficient ordering specification */
 670     if (vp8_rac_get(c))
 671         for (i = 1; i < 16; i++)
 672             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 673
 674     /* H. Loop filter levels  */
 675     if (s->profile > 0)
 676         s->filter.simple = vp8_rac_get(c);
 677     s->filter.level     = vp8_rac_get_uint(c, 6);
 678     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 679
 680     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 681     vp78_update_probability_tables(s);
 682
 683     s->mbskip_enabled = 0;
 684
 685     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 686     if (!s->keyframe) {
 687         s->prob->intra  = vp8_rac_get_uint(c, 8);
 688         s->prob->last   = vp8_rac_get_uint(c, 8);
 689         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 690     }
 691
 692     return 0;
 693 }
 694
 695 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 696 {
 697     VP56RangeCoder *c = &s->c;
 698     int header_size, hscale, vscale, ret;
 699     int width  = s->avctx->width;
 700     int height = s->avctx->height;
 701
 702     if (buf_size < 3) {
 703         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 704         return AVERROR_INVALIDDATA;
 705     }
 706
 707     s->keyframe  = !(buf[0] & 1);
 708     s->profile   =  (buf[0]>>1) & 7;
 709     s->invisible = !(buf[0] & 0x10);
 710     header_size  = AV_RL24(buf) >> 5;
 711     buf      += 3;
 712     buf_size -= 3;
 713
 714     s->header_partition_size = header_size;
 715
 716     if (s->profile > 3)
 717         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 718
 719     if (!s->profile)
 720         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 721                sizeof(s->put_pixels_tab));
 722     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 723         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 724                sizeof(s->put_pixels_tab));
 725
 726     if (header_size > buf_size - 7 * s->keyframe) {
 727         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 728         return AVERROR_INVALIDDATA;
 729     }
 730
 731     if (s->keyframe) {
 732         if (AV_RL24(buf) != 0x2a019d) {
 733             av_log(s->avctx, AV_LOG_ERROR,
 734                    "Invalid start code 0x%x\n", AV_RL24(buf));
 735             return AVERROR_INVALIDDATA;
 736         }
 737         width     = AV_RL16(buf + 3) & 0x3fff;
 738         height    = AV_RL16(buf + 5) & 0x3fff;
 739         hscale    = buf[4] >> 6;
 740         vscale    = buf[6] >> 6;
 741         buf      += 7;
 742         buf_size -= 7;
 743
 744         if (hscale || vscale)
 745             avpriv_request_sample(s->avctx, "Upscaling");
 746
 747         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 748         vp78_reset_probability_tables(s);
 749         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 750                sizeof(s->prob->pred16x16));
 751         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 752                sizeof(s->prob->pred8x8c));
 753         memcpy(s->prob->mvc, vp8_mv_default_prob,
 754                sizeof(s->prob->mvc));
 755         memset(&s->segmentation, 0, sizeof(s->segmentation));
 756         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 757     }
 758
 759     ret = ff_vp56_init_range_decoder(c, buf, header_size);
 760     if (ret < 0)
 761         return ret;
 762     buf      += header_size;
 763     buf_size -= header_size;
 764
 765     if (s->keyframe) {
 766         s->colorspace = vp8_rac_get(c);
 767         if (s->colorspace)
 768             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 769         s->fullrange = vp8_rac_get(c);
 770     }
 771
 772     if ((s->segmentation.enabled = vp8_rac_get(c)))
 773         parse_segment_info(s);
 774     else
 775         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 776
 777     s->filter.simple    = vp8_rac_get(c);
 778     s->filter.level     = vp8_rac_get_uint(c, 6);
 779     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 780
 781     if ((s->lf_delta.enabled = vp8_rac_get(c))) {
 782         s->lf_delta.update = vp8_rac_get(c);
 783         if (s->lf_delta.update)
 784             update_lf_deltas(s);
 785     }
 786
 787     if (setup_partitions(s, buf, buf_size)) {
 788         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 789         return AVERROR_INVALIDDATA;
 790     }
 791
 792     if (!s->macroblocks_base || /* first frame */
 793         width != s->avctx->width || height != s->avctx->height ||
 794         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 795         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 796             return ret;
 797
 798     vp8_get_quants(s);
 799
 800     if (!s->keyframe) {
 801         update_refs(s);
 802         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 803         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 804     }
 805
 806     // if we aren't saving this frame's probabilities for future frames,
 807     // make a copy of the current probabilities
 808     if (!(s->update_probabilities = vp8_rac_get(c)))
 809         s->prob[1] = s->prob[0];
 810
 811     s->update_last = s->keyframe || vp8_rac_get(c);
 812
 813     vp78_update_probability_tables(s);
 814
 815     if ((s->mbskip_enabled = vp8_rac_get(c)))
 816         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 817
 818     if (!s->keyframe) {
 819         s->prob->intra  = vp8_rac_get_uint(c, 8);
 820         s->prob->last   = vp8_rac_get_uint(c, 8);
 821         s->prob->golden = vp8_rac_get_uint(c, 8);
 822         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 823     }
 824
 825     // Record the entropy coder state here so that hwaccels can use it.
 826     s->c.code_word = vp56_rac_renorm(&s->c);
 827     s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
 828     s->coder_state_at_header_end.range     = s->c.high;
 829     s->coder_state_at_header_end.value     = s->c.code_word >> 16;
 830     s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
 831
 832     return 0;
 833 }
 834
 835 static av_always_inline
 836 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 837 {
 838     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 839                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 840     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 841                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 842 }
 843
 844 /**
 845  * Motion vector coding, 17.1.
 846  */
 847 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 848 {
 849     int bit, x = 0;
 850
 851     if (vp56_rac_get_prob_branchy(c, p[0])) {
 852         int i;
 853
 854         for (i = 0; i < 3; i++)
 855             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 856         for (i = (vp7 ? 7 : 9); i > 3; i--)
 857             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 858         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 859             x += 8;
 860     } else {
 861         // small_mvtree
 862         const uint8_t *ps = p + 2;
 863         bit = vp56_rac_get_prob(c, *ps);
 864         ps += 1 + 3 * bit;
 865         x  += 4 * bit;
 866         bit = vp56_rac_get_prob(c, *ps);
 867         ps += 1 + bit;
 868         x  += 2 * bit;
 869         x  += vp56_rac_get_prob(c, *ps);
 870     }
 871
 872     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 873 }
 874
 875 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 876 {
 877     return read_mv_component(c, p, 1);
 878 }
 879
 880 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 881 {
 882     return read_mv_component(c, p, 0);
 883 }
 884
 885 static av_always_inline
 886 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 887 {
 888     if (is_vp7)
 889         return vp7_submv_prob;
 890
 891     if (left == top)
 892         return vp8_submv_prob[4 - !!left];
 893     if (!top)
 894         return vp8_submv_prob[2];
 895     return vp8_submv_prob[1 - !!left];
 896 }
 897
 898 /**
 899  * Split motion vector prediction, 16.4.
 900  * @returns the number of motion vectors parsed (2, 4 or 16)
 901  */
 902 static av_always_inline
 903 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 904                     int layout, int is_vp7)
 905 {
 906     int part_idx;
 907     int n, num;
 908     VP8Macroblock *top_mb;
 909     VP8Macroblock *left_mb = &mb[-1];
 910     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 911     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 912     VP56mv *top_mv;
 913     VP56mv *left_mv = left_mb->bmv;
 914     VP56mv *cur_mv  = mb->bmv;
 915
 916     if (!layout) // layout is inlined, s->mb_layout is not
 917         top_mb = &mb[2];
 918     else
 919         top_mb = &mb[-s->mb_width - 1];
 920     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 921     top_mv       = top_mb->bmv;
 922
 923     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 924         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 925             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 926         else
 927             part_idx = VP8_SPLITMVMODE_8x8;
 928     } else {
 929         part_idx = VP8_SPLITMVMODE_4x4;
 930     }
 931
 932     num              = vp8_mbsplit_count[part_idx];
 933     mbsplits_cur     = vp8_mbsplits[part_idx],
 934     firstidx         = vp8_mbfirstidx[part_idx];
 935     mb->partitioning = part_idx;
 936
 937     for (n = 0; n < num; n++) {
 938         int k = firstidx[n];
 939         uint32_t left, above;
 940         const uint8_t *submv_prob;
 941
 942         if (!(k & 3))
 943             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 944         else
 945             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 946         if (k <= 3)
 947             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 948         else
 949             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 950
 951         submv_prob = get_submv_prob(left, above, is_vp7);
 952
 953         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 954             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 955                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 956                     mb->bmv[n].y = mb->mv.y +
 957                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 958                     mb->bmv[n].x = mb->mv.x +
 959                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 960                 } else {
 961                     AV_ZERO32(&mb->bmv[n]);
 962                 }
 963             } else {
 964                 AV_WN32A(&mb->bmv[n], above);
 965             }
 966         } else {
 967             AV_WN32A(&mb->bmv[n], left);
 968         }
 969     }
 970
 971     return num;
 972 }
 973
 974 /**
 975  * The vp7 reference decoder uses a padding macroblock column (added to right
 976  * edge of the frame) to guard against illegal macroblock offsets. The
 977  * algorithm has bugs that permit offsets to straddle the padding column.
 978  * This function replicates those bugs.
 979  *
 980  * @param[out] edge_x macroblock x address
 981  * @param[out] edge_y macroblock y address
 982  *
 983  * @return macroblock offset legal (boolean)
 984  */
 985 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 986                                    int xoffset, int yoffset, int boundary,
 987                                    int *edge_x, int *edge_y)
 988 {
 989     int vwidth = mb_width + 1;
 990     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 991     if (new < boundary || new % vwidth == vwidth - 1)
 992         return 0;
 993     *edge_y = new / vwidth;
 994     *edge_x = new % vwidth;
 995     return 1;
 996 }
 997
 998 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 999 {
1000     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1001 }
1002
1003 static av_always_inline
1004 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1005                     int mb_x, int mb_y, int layout)
1006 {
1007     VP8Macroblock *mb_edge[12];
1008     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1009     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1010     int idx = CNT_ZERO;
1011     VP56mv near_mv[3];
1012     uint8_t cnt[3] = { 0 };
1013     VP56RangeCoder *c = &s->c;
1014     int i;
1015
1016     AV_ZERO32(&near_mv[0]);
1017     AV_ZERO32(&near_mv[1]);
1018     AV_ZERO32(&near_mv[2]);
1019
1020     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1021         const VP7MVPred * pred = &vp7_mv_pred[i];
1022         int edge_x, edge_y;
1023
1024         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1025                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1026             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1027                                              ? s->macroblocks_base + 1 + edge_x +
1028                                                (s->mb_width + 1) * (edge_y + 1)
1029                                              : s->macroblocks + edge_x +
1030                                                (s->mb_height - edge_y - 1) * 2;
1031             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1032             if (mv) {
1033                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1034                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1035                         idx = CNT_NEAREST;
1036                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1037                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1038                             continue;
1039                         idx = CNT_NEAR;
1040                     } else {
1041                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1042                         idx = CNT_NEAR;
1043                     }
1044                 } else {
1045                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1046                     idx = CNT_NEAREST;
1047                 }
1048             } else {
1049                 idx = CNT_ZERO;
1050             }
1051         } else {
1052             idx = CNT_ZERO;
1053         }
1054         cnt[idx] += vp7_mv_pred[i].score;
1055     }
1056
1057     mb->partitioning = VP8_SPLITMVMODE_NONE;
1058
1059     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1060         mb->mode = VP8_MVMODE_MV;
1061
1062         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1063
1064             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1065
1066                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1067                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1068                 else
1069                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1070
1071                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1072                     mb->mode = VP8_MVMODE_SPLIT;
1073                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1074                 } else {
1075                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1076                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1077                     mb->bmv[0] = mb->mv;
1078                 }
1079             } else {
1080                 mb->mv = near_mv[CNT_NEAR];
1081                 mb->bmv[0] = mb->mv;
1082             }
1083         } else {
1084             mb->mv = near_mv[CNT_NEAREST];
1085             mb->bmv[0] = mb->mv;
1086         }
1087     } else {
1088         mb->mode = VP8_MVMODE_ZERO;
1089         AV_ZERO32(&mb->mv);
1090         mb->bmv[0] = mb->mv;
1091     }
1092 }
1093
1094 static av_always_inline
1095 void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
1096                     int mb_x, int mb_y, int layout)
1097 {
1098     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1099                                   mb - 1 /* left */,
1100                                   0      /* top-left */ };
1101     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1102     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1103     int idx = CNT_ZERO;
1104     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1105     int8_t *sign_bias = s->sign_bias;
1106     VP56mv near_mv[4];
1107     uint8_t cnt[4] = { 0 };
1108     VP56RangeCoder *c = &s->c;
1109
1110     if (!layout) { // layout is inlined (s->mb_layout is not)
1111         mb_edge[0] = mb + 2;
1112         mb_edge[2] = mb + 1;
1113     } else {
1114         mb_edge[0] = mb - s->mb_width - 1;
1115         mb_edge[2] = mb - s->mb_width - 2;
1116     }
1117
1118     AV_ZERO32(&near_mv[0]);
1119     AV_ZERO32(&near_mv[1]);
1120     AV_ZERO32(&near_mv[2]);
1121
1122     /* Process MB on top, left and top-left */
1123 #define MV_EDGE_CHECK(n)                                                      \
1124     {                                                                         \
1125         VP8Macroblock *edge = mb_edge[n];                                     \
1126         int edge_ref = edge->ref_frame;                                       \
1127         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1128             uint32_t mv = AV_RN32A(&edge->mv);                                \
1129             if (mv) {                                                         \
1130                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1131                     /* SWAR negate of the values in mv. */                    \
1132                     mv = ~mv;                                                 \
1133                     mv = ((mv & 0x7fff7fff) +                                 \
1134                           0x00010001) ^ (mv & 0x80008000);                    \
1135                 }                                                             \
1136                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1137                     AV_WN32A(&near_mv[++idx], mv);                            \
1138                 cnt[idx] += 1 + (n != 2);                                     \
1139             } else                                                            \
1140                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1141         }                                                                     \
1142     }
1143
1144     MV_EDGE_CHECK(0)
1145     MV_EDGE_CHECK(1)
1146     MV_EDGE_CHECK(2)
1147
1148     mb->partitioning = VP8_SPLITMVMODE_NONE;
1149     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1150         mb->mode = VP8_MVMODE_MV;
1151
1152         /* If we have three distinct MVs, merge first and last if they're the same */
1153         if (cnt[CNT_SPLITMV] &&
1154             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1155             cnt[CNT_NEAREST] += 1;
1156
1157         /* Swap near and nearest if necessary */
1158         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1159             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1160             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1161         }
1162
1163         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1164             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1165                 /* Choose the best mv out of 0,0 and the nearest mv */
1166                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1167                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1168                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1169                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1170
1171                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1172                     mb->mode = VP8_MVMODE_SPLIT;
1173                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1174                 } else {
1175                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1176                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1177                     mb->bmv[0] = mb->mv;
1178                 }
1179             } else {
1180                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
1181                 mb->bmv[0] = mb->mv;
1182             }
1183         } else {
1184             clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
1185             mb->bmv[0] = mb->mv;
1186         }
1187     } else {
1188         mb->mode = VP8_MVMODE_ZERO;
1189         AV_ZERO32(&mb->mv);
1190         mb->bmv[0] = mb->mv;
1191     }
1192 }
1193
1194 static av_always_inline
1195 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1196                            int mb_x, int keyframe, int layout)
1197 {
1198     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1199
1200     if (layout) {
1201         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1202         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1203     }
1204     if (keyframe) {
1205         int x, y;
1206         uint8_t *top;
1207         uint8_t *const left = s->intra4x4_pred_mode_left;
1208         if (layout)
1209             top = mb->intra4x4_pred_mode_top;
1210         else
1211             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1212         for (y = 0; y < 4; y++) {
1213             for (x = 0; x < 4; x++) {
1214                 const uint8_t *ctx;
1215                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1216                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1217                 left[y]   = top[x] = *intra4x4;
1218                 intra4x4++;
1219             }
1220         }
1221     } else {
1222         int i;
1223         for (i = 0; i < 16; i++)
1224             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1225                                            vp8_pred4x4_prob_inter);
1226     }
1227 }
1228
1229 static av_always_inline
1230 void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
1231                     VP8Macroblock *mb, int mb_x, int mb_y,
1232                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1233 {
1234     VP56RangeCoder *c = &s->c;
1235     static const char * const vp7_feature_name[] = { "q-index",
1236                                                      "lf-delta",
1237                                                      "partial-golden-update",
1238                                                      "blit-pitch" };
1239     if (is_vp7) {
1240         int i;
1241         *segment = 0;
1242         for (i = 0; i < 4; i++) {
1243             if (s->feature_enabled[i]) {
1244                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1245                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1246                                                    s->feature_index_prob[i]);
1247                       av_log(s->avctx, AV_LOG_WARNING,
1248                              "Feature %s present in macroblock (value 0x%x)\n",
1249                              vp7_feature_name[i], s->feature_value[i][index]);
1250                 }
1251            }
1252         }
1253     } else if (s->segmentation.update_map) {
1254         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1255         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1256     } else if (s->segmentation.enabled)
1257         *segment = ref ? *ref : *segment;
1258     mb->segment = *segment;
1259
1260     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1261
1262     if (s->keyframe) {
1263         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1264                                     vp8_pred16x16_prob_intra);
1265
1266         if (mb->mode == MODE_I4x4) {
1267             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1268         } else {
1269             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1270                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1271             if (s->mb_layout)
1272                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1273             else
1274                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1275             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1276         }
1277
1278         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1279                                                 vp8_pred8x8c_prob_intra);
1280         mb->ref_frame        = VP56_FRAME_CURRENT;
1281     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1282         // inter MB, 16.2
1283         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1284             mb->ref_frame =
1285                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1286                                                                    : VP56_FRAME_GOLDEN;
1287         else
1288             mb->ref_frame = VP56_FRAME_PREVIOUS;
1289         s->ref_count[mb->ref_frame - 1]++;
1290
1291         // motion vectors, 16.3
1292         if (is_vp7)
1293             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1294         else
1295             vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
1296     } else {
1297         // intra MB, 16.1
1298         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1299
1300         if (mb->mode == MODE_I4x4)
1301             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1302
1303         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1304                                                 s->prob->pred8x8c);
1305         mb->ref_frame        = VP56_FRAME_CURRENT;
1306         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1307         AV_ZERO32(&mb->bmv[0]);
1308     }
1309 }
1310
1311 /**
1312  * @param r     arithmetic bitstream reader context
1313  * @param block destination for block coefficients
1314  * @param probs probabilities to use when reading trees from the bitstream
1315  * @param i     initial coeff index, 0 unless a separate DC block is coded
1316  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1317  *
1318  * @return 0 if no coeffs were decoded
1319  *         otherwise, the index of the last coeff decoded plus one
1320  */
1321 static av_always_inline
1322 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1323                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1324                                  int i, uint8_t *token_prob, int16_t qmul[2],
1325                                  const uint8_t scan[16], int vp7)
1326 {
1327     VP56RangeCoder c = *r;
1328     goto skip_eob;
1329     do {
1330         int coeff;
1331 restart:
1332         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1333             break;
1334
1335 skip_eob:
1336         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1337             if (++i == 16)
1338                 break; // invalid input; blocks should end with EOB
1339             token_prob = probs[i][0];
1340             if (vp7)
1341                 goto restart;
1342             goto skip_eob;
1343         }
1344
1345         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1346             coeff = 1;
1347             token_prob = probs[i + 1][1];
1348         } else {
1349             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1350                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1351                 if (coeff)
1352                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1353                 coeff += 2;
1354             } else {
1355                 // DCT_CAT*
1356                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1357                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1358                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1359                     } else {                                    // DCT_CAT2
1360                         coeff  = 7;
1361                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1362                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1363                     }
1364                 } else {    // DCT_CAT3 and up
1365                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1366                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1367                     int cat = (a << 1) + b;
1368                     coeff  = 3 + (8 << cat);
1369                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1370                 }
1371             }
1372             token_prob = probs[i + 1][2];
1373         }
1374         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1375     } while (++i < 16);
1376
1377     *r = c;
1378     return i;
1379 }
1380
1381 static av_always_inline
1382 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1383 {
1384     int16_t dc = block[0];
1385     int ret = 0;
1386
1387     if (pred[1] > 3) {
1388         dc += pred[0];
1389         ret = 1;
1390     }
1391
1392     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1393         block[0] = pred[0] = dc;
1394         pred[1] = 0;
1395     } else {
1396         if (pred[0] == dc)
1397             pred[1]++;
1398         block[0] = pred[0] = dc;
1399     }
1400
1401     return ret;
1402 }
1403
1404 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1405                                             int16_t block[16],
1406                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1407                                             int i, uint8_t *token_prob,
1408                                             int16_t qmul[2],
1409                                             const uint8_t scan[16])
1410 {
1411     return decode_block_coeffs_internal(r, block, probs, i,
1412                                         token_prob, qmul, scan, IS_VP7);
1413 }
1414
1415 #ifndef vp8_decode_block_coeffs_internal
1416 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1417                                             int16_t block[16],
1418                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1419                                             int i, uint8_t *token_prob,
1420                                             int16_t qmul[2])
1421 {
1422     return decode_block_coeffs_internal(r, block, probs, i,
1423                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1424 }
1425 #endif
1426
1427 /**
1428  * @param c          arithmetic bitstream reader context
1429  * @param block      destination for block coefficients
1430  * @param probs      probabilities to use when reading trees from the bitstream
1431  * @param i          initial coeff index, 0 unless a separate DC block is coded
1432  * @param zero_nhood the initial prediction context for number of surrounding
1433  *                   all-zero blocks (only left/top, so 0-2)
1434  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1435  * @param scan       scan pattern (VP7 only)
1436  *
1437  * @return 0 if no coeffs were decoded
1438  *         otherwise, the index of the last coeff decoded plus one
1439  */
1440 static av_always_inline
1441 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1442                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1443                         int i, int zero_nhood, int16_t qmul[2],
1444                         const uint8_t scan[16], int vp7)
1445 {
1446     uint8_t *token_prob = probs[i][zero_nhood];
1447     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1448         return 0;
1449     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1450                                                   token_prob, qmul, scan)
1451                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1452                                                   token_prob, qmul);
1453 }
1454
1455 static av_always_inline
1456 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1457                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1458                       int is_vp7)
1459 {
1460     int i, x, y, luma_start = 0, luma_ctx = 3;
1461     int nnz_pred, nnz, nnz_total = 0;
1462     int segment = mb->segment;
1463     int block_dc = 0;
1464
1465     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1466         nnz_pred = t_nnz[8] + l_nnz[8];
1467
1468         // decode DC values and do hadamard
1469         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1470                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1471                                   ff_zigzag_scan, is_vp7);
1472         l_nnz[8] = t_nnz[8] = !!nnz;
1473
1474         if (is_vp7 && mb->mode > MODE_I4x4) {
1475             nnz |=  inter_predict_dc(td->block_dc,
1476                                      s->inter_dc_pred[mb->ref_frame - 1]);
1477         }
1478
1479         if (nnz) {
1480             nnz_total += nnz;
1481             block_dc   = 1;
1482             if (nnz == 1)
1483                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1484             else
1485                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1486         }
1487         luma_start = 1;
1488         luma_ctx   = 0;
1489     }
1490
1491     // luma blocks
1492     for (y = 0; y < 4; y++)
1493         for (x = 0; x < 4; x++) {
1494             nnz_pred = l_nnz[y] + t_nnz[x];
1495             nnz = decode_block_coeffs(c, td->block[y][x],
1496                                       s->prob->token[luma_ctx],
1497                                       luma_start, nnz_pred,
1498                                       s->qmat[segment].luma_qmul,
1499                                       s->prob[0].scan, is_vp7);
1500             /* nnz+block_dc may be one more than the actual last index,
1501              * but we don't care */
1502             td->non_zero_count_cache[y][x] = nnz + block_dc;
1503             t_nnz[x] = l_nnz[y] = !!nnz;
1504             nnz_total += nnz;
1505         }
1506
1507     // chroma blocks
1508     // TODO: what to do about dimensions? 2nd dim for luma is x,
1509     // but for chroma it's (y<<1)|x
1510     for (i = 4; i < 6; i++)
1511         for (y = 0; y < 2; y++)
1512             for (x = 0; x < 2; x++) {
1513                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1514                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1515                                           s->prob->token[2], 0, nnz_pred,
1516                                           s->qmat[segment].chroma_qmul,
1517                                           s->prob[0].scan, is_vp7);
1518                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1519                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1520                 nnz_total += nnz;
1521             }
1522
1523     // if there were no coded coeffs despite the macroblock not being marked skip,
1524     // we MUST not do the inner loop filter and should not do IDCT
1525     // Since skip isn't used for bitstream prediction, just manually set it.
1526     if (!nnz_total)
1527         mb->skip = 1;
1528 }
1529
1530 static av_always_inline
1531 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1532                       uint8_t *src_cb, uint8_t *src_cr,
1533                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1534 {
1535     AV_COPY128(top_border, src_y + 15 * linesize);
1536     if (!simple) {
1537         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1538         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1539     }
1540 }
1541
1542 static av_always_inline
1543 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1544                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1545                     int mb_y, int mb_width, int simple, int xchg)
1546 {
1547     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1548     src_y  -= linesize;
1549     src_cb -= uvlinesize;
1550     src_cr -= uvlinesize;
1551
1552 #define XCHG(a, b, xchg)                                                      \
1553     do {                                                                      \
1554         if (xchg)                                                             \
1555             AV_SWAP64(b, a);                                                  \
1556         else                                                                  \
1557             AV_COPY64(b, a);                                                  \
1558     } while (0)
1559
1560     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1561     XCHG(top_border, src_y, xchg);
1562     XCHG(top_border + 8, src_y + 8, 1);
1563     if (mb_x < mb_width - 1)
1564         XCHG(top_border + 32, src_y + 16, 1);
1565
1566     // only copy chroma for normal loop filter
1567     // or to initialize the top row to 127
1568     if (!simple || !mb_y) {
1569         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1570         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1571         XCHG(top_border + 16, src_cb, 1);
1572         XCHG(top_border + 24, src_cr, 1);
1573     }
1574 }
1575
1576 static av_always_inline
1577 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1578 {
1579     if (!mb_x)
1580         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1581     else
1582         return mb_y ? mode : LEFT_DC_PRED8x8;
1583 }
1584
1585 static av_always_inline
1586 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1587 {
1588     if (!mb_x)
1589         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1590     else
1591         return mb_y ? mode : HOR_PRED8x8;
1592 }
1593
1594 static av_always_inline
1595 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1596 {
1597     switch (mode) {
1598     case DC_PRED8x8:
1599         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1600     case VERT_PRED8x8:
1601         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1602     case HOR_PRED8x8:
1603         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1604     case PLANE_PRED8x8: /* TM */
1605         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1606     }
1607     return mode;
1608 }
1609
1610 static av_always_inline
1611 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1612 {
1613     if (!mb_x) {
1614         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1615     } else {
1616         return mb_y ? mode : HOR_VP8_PRED;
1617     }
1618 }
1619
1620 static av_always_inline
1621 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1622                                      int *copy_buf, int vp7)
1623 {
1624     switch (mode) {
1625     case VERT_PRED:
1626         if (!mb_x && mb_y) {
1627             *copy_buf = 1;
1628             return mode;
1629         }
1630         /* fall-through */
1631     case DIAG_DOWN_LEFT_PRED:
1632     case VERT_LEFT_PRED:
1633         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1634     case HOR_PRED:
1635         if (!mb_y) {
1636             *copy_buf = 1;
1637             return mode;
1638         }
1639         /* fall-through */
1640     case HOR_UP_PRED:
1641         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1642     case TM_VP8_PRED:
1643         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1644     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1645                    * as 16x16/8x8 DC */
1646     case DIAG_DOWN_RIGHT_PRED:
1647     case VERT_RIGHT_PRED:
1648     case HOR_DOWN_PRED:
1649         if (!mb_y || !mb_x)
1650             *copy_buf = 1;
1651         return mode;
1652     }
1653     return mode;
1654 }
1655
1656 static av_always_inline
1657 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1658                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1659 {
1660     int x, y, mode, nnz;
1661     uint32_t tr;
1662
1663     /* for the first row, we need to run xchg_mb_border to init the top edge
1664      * to 127 otherwise, skip it if we aren't going to deblock */
1665     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1666         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1667                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1668                        s->filter.simple, 1);
1669
1670     if (mb->mode < MODE_I4x4) {
1671         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1672         s->hpc.pred16x16[mode](dst[0], s->linesize);
1673     } else {
1674         uint8_t *ptr = dst[0];
1675         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1676         const uint8_t lo = is_vp7 ? 128 : 127;
1677         const uint8_t hi = is_vp7 ? 128 : 129;
1678         uint8_t tr_top[4] = { lo, lo, lo, lo };
1679
1680         // all blocks on the right edge of the macroblock use bottom edge
1681         // the top macroblock for their topright edge
1682         uint8_t *tr_right = ptr - s->linesize + 16;
1683
1684         // if we're on the right edge of the frame, said edge is extended
1685         // from the top macroblock
1686         if (mb_y && mb_x == s->mb_width - 1) {
1687             tr       = tr_right[-1] * 0x01010101u;
1688             tr_right = (uint8_t *) &tr;
1689         }
1690
1691         if (mb->skip)
1692             AV_ZERO128(td->non_zero_count_cache);
1693
1694         for (y = 0; y < 4; y++) {
1695             uint8_t *topright = ptr + 4 - s->linesize;
1696             for (x = 0; x < 4; x++) {
1697                 int copy = 0;
1698                 ptrdiff_t linesize = s->linesize;
1699                 uint8_t *dst = ptr + 4 * x;
1700                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1701
1702                 if ((y == 0 || x == 3) && mb_y == 0) {
1703                     topright = tr_top;
1704                 } else if (x == 3)
1705                     topright = tr_right;
1706
1707                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1708                                                         mb_y + y, &copy, is_vp7);
1709                 if (copy) {
1710                     dst      = copy_dst + 12;
1711                     linesize = 8;
1712                     if (!(mb_y + y)) {
1713                         copy_dst[3] = lo;
1714                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1715                     } else {
1716                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1717                         if (!(mb_x + x)) {
1718                             copy_dst[3] = hi;
1719                         } else {
1720                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1721                         }
1722                     }
1723                     if (!(mb_x + x)) {
1724                         copy_dst[11] =
1725                         copy_dst[19] =
1726                         copy_dst[27] =
1727                         copy_dst[35] = hi;
1728                     } else {
1729                         copy_dst[11] = ptr[4 * x                   - 1];
1730                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1731                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1732                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1733                     }
1734                 }
1735                 s->hpc.pred4x4[mode](dst, topright, linesize);
1736                 if (copy) {
1737                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1738                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1739                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1740                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1741                 }
1742
1743                 nnz = td->non_zero_count_cache[y][x];
1744                 if (nnz) {
1745                     if (nnz == 1)
1746                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1747                                                   td->block[y][x], s->linesize);
1748                     else
1749                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1750                                                td->block[y][x], s->linesize);
1751                 }
1752                 topright += 4;
1753             }
1754
1755             ptr      += 4 * s->linesize;
1756             intra4x4 += 4;
1757         }
1758     }
1759
1760     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1761                                             mb_x, mb_y, is_vp7);
1762     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1763     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1764
1765     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1766         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1767                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1768                        s->filter.simple, 0);
1769 }
1770
/**
 * Per-subpel-position lookup, indexed as [row][fractional position 0..7].
 * Row 1 is always row 0 + row 2.
 */
static const uint8_t subpel_idx[3][8] = {
    /* [0] nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1] nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2] nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1777
1778 /**
1779  * luma MC function
1780  *
1781  * @param s        VP8 decoding context
1782  * @param dst      target buffer for block data at block position
1783  * @param ref      reference picture buffer at origin (0, 0)
1784  * @param mv       motion vector (relative to block position) to get pixel data from
1785  * @param x_off    horizontal position of block from origin (0, 0)
1786  * @param y_off    vertical position of block from origin (0, 0)
1787  * @param block_w  width of block (16, 8 or 4)
1788  * @param block_h  height of block (always same as block_w)
1789  * @param width    width of src/dst plane data
1790  * @param height   height of src/dst plane data
1791  * @param linesize size of a single line of plane data, including padding
1792  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1793  */
1794 static av_always_inline
1795 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1796                  ThreadFrame *ref, const VP56mv *mv,
1797                  int x_off, int y_off, int block_w, int block_h,
1798                  int width, int height, ptrdiff_t linesize,
1799                  vp8_mc_func mc_func[3][3])
1800 {
1801     uint8_t *src = ref->f->data[0];
1802
1803     if (AV_RN32A(mv)) {
1804         ptrdiff_t src_linesize = linesize;
1805
1806         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1807         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1808
1809         x_off += mv->x >> 2;
1810         y_off += mv->y >> 2;
1811
1812         // edge emulation
1813         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1814         src += y_off * linesize + x_off;
1815         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1816             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1817             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1818                                      src - my_idx * linesize - mx_idx,
1819                                      EDGE_EMU_LINESIZE, linesize,
1820                                      block_w + subpel_idx[1][mx],
1821                                      block_h + subpel_idx[1][my],
1822                                      x_off - mx_idx, y_off - my_idx,
1823                                      width, height);
1824             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1825             src_linesize = EDGE_EMU_LINESIZE;
1826         }
1827         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1828     } else {
1829         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1830         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1831                       linesize, block_h, 0, 0);
1832     }
1833 }
1834
1835 /**
1836  * chroma MC function
1837  *
1838  * @param s        VP8 decoding context
1839  * @param dst1     target buffer for block data at block position (U plane)
1840  * @param dst2     target buffer for block data at block position (V plane)
1841  * @param ref      reference picture buffer at origin (0, 0)
1842  * @param mv       motion vector (relative to block position) to get pixel data from
1843  * @param x_off    horizontal position of block from origin (0, 0)
1844  * @param y_off    vertical position of block from origin (0, 0)
1845  * @param block_w  width of block (16, 8 or 4)
1846  * @param block_h  height of block (always same as block_w)
1847  * @param width    width of src/dst plane data
1848  * @param height   height of src/dst plane data
1849  * @param linesize size of a single line of plane data, including padding
1850  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1851  */
1852 static av_always_inline
1853 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1854                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1855                    int x_off, int y_off, int block_w, int block_h,
1856                    int width, int height, ptrdiff_t linesize,
1857                    vp8_mc_func mc_func[3][3])
1858 {
1859     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1860
1861     if (AV_RN32A(mv)) {
1862         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1863         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1864
1865         x_off += mv->x >> 3;
1866         y_off += mv->y >> 3;
1867
1868         // edge emulation
1869         src1 += y_off * linesize + x_off;
1870         src2 += y_off * linesize + x_off;
1871         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1872         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1873             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1874             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1875                                      src1 - my_idx * linesize - mx_idx,
1876                                      EDGE_EMU_LINESIZE, linesize,
1877                                      block_w + subpel_idx[1][mx],
1878                                      block_h + subpel_idx[1][my],
1879                                      x_off - mx_idx, y_off - my_idx, width, height);
1880             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1881             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1882
1883             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1884                                      src2 - my_idx * linesize - mx_idx,
1885                                      EDGE_EMU_LINESIZE, linesize,
1886                                      block_w + subpel_idx[1][mx],
1887                                      block_h + subpel_idx[1][my],
1888                                      x_off - mx_idx, y_off - my_idx, width, height);
1889             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1890             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1891         } else {
1892             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1893             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1894         }
1895     } else {
1896         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1897         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1898         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1899     }
1900 }
1901
1902 static av_always_inline
1903 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1904                  ThreadFrame *ref_frame, int x_off, int y_off,
1905                  int bx_off, int by_off, int block_w, int block_h,
1906                  int width, int height, VP56mv *mv)
1907 {
1908     VP56mv uvmv = *mv;
1909
1910     /* Y */
1911     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1912                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1913                 block_w, block_h, width, height, s->linesize,
1914                 s->put_pixels_tab[block_w == 8]);
1915
1916     /* U/V */
1917     if (s->profile == 3) {
1918         /* this block only applies VP8; it is safe to check
1919          * only the profile, as VP7 profile <= 1 */
1920         uvmv.x &= ~7;
1921         uvmv.y &= ~7;
1922     }
1923     x_off   >>= 1;
1924     y_off   >>= 1;
1925     bx_off  >>= 1;
1926     by_off  >>= 1;
1927     width   >>= 1;
1928     height  >>= 1;
1929     block_w >>= 1;
1930     block_h >>= 1;
1931     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1932                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1933                   &uvmv, x_off + bx_off, y_off + by_off,
1934                   block_w, block_h, width, height, s->uvlinesize,
1935                   s->put_pixels_tab[1 + (block_w == 4)]);
1936 }
1937
1938 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1939  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1940 static av_always_inline
1941 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1942                      int mb_xy, int ref)
1943 {
1944     /* Don't prefetch refs that haven't been used very often this frame. */
1945     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1946         int x_off = mb_x << 4, y_off = mb_y << 4;
1947         int mx = (mb->mv.x >> 2) + x_off + 8;
1948         int my = (mb->mv.y >> 2) + y_off;
1949         uint8_t **src = s->framep[ref]->tf.f->data;
1950         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1951         /* For threading, a ff_thread_await_progress here might be useful, but
1952          * it actually slows down the decoder. Since a bad prefetch doesn't
1953          * generate bad decoder output, we don't run it here. */
1954         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1955         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1956         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1957     }
1958 }
1959
1960 /**
1961  * Apply motion vectors to prediction buffer, chapter 18.
1962  */
1963 static av_always_inline
1964 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1965                    VP8Macroblock *mb, int mb_x, int mb_y)
1966 {
1967     int x_off = mb_x << 4, y_off = mb_y << 4;
1968     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1969     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1970     VP56mv *bmv = mb->bmv;
1971
1972     switch (mb->partitioning) {
1973     case VP8_SPLITMVMODE_NONE:
1974         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1975                     0, 0, 16, 16, width, height, &mb->mv);
1976         break;
1977     case VP8_SPLITMVMODE_4x4: {
1978         int x, y;
1979         VP56mv uvmv;
1980
1981         /* Y */
1982         for (y = 0; y < 4; y++) {
1983             for (x = 0; x < 4; x++) {
1984                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1985                             ref, &bmv[4 * y + x],
1986                             4 * x + x_off, 4 * y + y_off, 4, 4,
1987                             width, height, s->linesize,
1988                             s->put_pixels_tab[2]);
1989             }
1990         }
1991
1992         /* U/V */
1993         x_off  >>= 1;
1994         y_off  >>= 1;
1995         width  >>= 1;
1996         height >>= 1;
1997         for (y = 0; y < 2; y++) {
1998             for (x = 0; x < 2; x++) {
1999                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2000                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2001                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2002                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2003                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2004                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2005                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2006                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2007                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2008                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2009                 if (s->profile == 3) {
2010                     uvmv.x &= ~7;
2011                     uvmv.y &= ~7;
2012                 }
2013                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2014                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2015                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2016                               width, height, s->uvlinesize,
2017                               s->put_pixels_tab[2]);
2018             }
2019         }
2020         break;
2021     }
2022     case VP8_SPLITMVMODE_16x8:
2023         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2024                     0, 0, 16, 8, width, height, &bmv[0]);
2025         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2026                     0, 8, 16, 8, width, height, &bmv[1]);
2027         break;
2028     case VP8_SPLITMVMODE_8x16:
2029         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2030                     0, 0, 8, 16, width, height, &bmv[0]);
2031         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2032                     8, 0, 8, 16, width, height, &bmv[1]);
2033         break;
2034     case VP8_SPLITMVMODE_8x8:
2035         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2036                     0, 0, 8, 8, width, height, &bmv[0]);
2037         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2038                     8, 0, 8, 8, width, height, &bmv[1]);
2039         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2040                     0, 8, 8, 8, width, height, &bmv[2]);
2041         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2042                     8, 8, 8, 8, width, height, &bmv[3]);
2043         break;
2044     }
2045 }
2046
2047 static av_always_inline
2048 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2049 {
2050     int x, y, ch;
2051
2052     if (mb->mode != MODE_I4x4) {
2053         uint8_t *y_dst = dst[0];
2054         for (y = 0; y < 4; y++) {
2055             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2056             if (nnz4) {
2057                 if (nnz4 & ~0x01010101) {
2058                     for (x = 0; x < 4; x++) {
2059                         if ((uint8_t) nnz4 == 1)
2060                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2061                                                       td->block[y][x],
2062                                                       s->linesize);
2063                         else if ((uint8_t) nnz4 > 1)
2064                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2065                                                    td->block[y][x],
2066                                                    s->linesize);
2067                         nnz4 >>= 8;
2068                         if (!nnz4)
2069                             break;
2070                     }
2071                 } else {
2072                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2073                 }
2074             }
2075             y_dst += 4 * s->linesize;
2076         }
2077     }
2078
2079     for (ch = 0; ch < 2; ch++) {
2080         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2081         if (nnz4) {
2082             uint8_t *ch_dst = dst[1 + ch];
2083             if (nnz4 & ~0x01010101) {
2084                 for (y = 0; y < 2; y++) {
2085                     for (x = 0; x < 2; x++) {
2086                         if ((uint8_t) nnz4 == 1)
2087                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2088                                                       td->block[4 + ch][(y << 1) + x],
2089                                                       s->uvlinesize);
2090                         else if ((uint8_t) nnz4 > 1)
2091                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2092                                                    td->block[4 + ch][(y << 1) + x],
2093                                                    s->uvlinesize);
2094                         nnz4 >>= 8;
2095                         if (!nnz4)
2096                             goto chroma_idct_end;
2097                     }
2098                     ch_dst += 4 * s->uvlinesize;
2099                 }
2100             } else {
2101                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2102             }
2103         }
2104 chroma_idct_end:
2105         ;
2106     }
2107 }
2108
2109 static av_always_inline
2110 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2111                          VP8FilterStrength *f, int is_vp7)
2112 {
2113     int interior_limit, filter_level;
2114
2115     if (s->segmentation.enabled) {
2116         filter_level = s->segmentation.filter_level[mb->segment];
2117         if (!s->segmentation.absolute_vals)
2118             filter_level += s->filter.level;
2119     } else
2120         filter_level = s->filter.level;
2121
2122     if (s->lf_delta.enabled) {
2123         filter_level += s->lf_delta.ref[mb->ref_frame];
2124         filter_level += s->lf_delta.mode[mb->mode];
2125     }
2126
2127     filter_level = av_clip_uintp2(filter_level, 6);
2128
2129     interior_limit = filter_level;
2130     if (s->filter.sharpness) {
2131         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2132         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2133     }
2134     interior_limit = FFMAX(interior_limit, 1);
2135
2136     f->filter_level = filter_level;
2137     f->inner_limit = interior_limit;
2138     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2139                       mb->mode == VP8_MVMODE_SPLIT;
2140 }
2141
2142 static av_always_inline
2143 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2144                int mb_x, int mb_y, int is_vp7)
2145 {
2146     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2147     int filter_level = f->filter_level;
2148     int inner_limit = f->inner_limit;
2149     int inner_filter = f->inner_filter;
2150     ptrdiff_t linesize   = s->linesize;
2151     ptrdiff_t uvlinesize = s->uvlinesize;
2152     static const uint8_t hev_thresh_lut[2][64] = {
2153         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2154           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2155           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2156           3, 3, 3, 3 },
2157         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2158           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2159           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2160           2, 2, 2, 2 }
2161     };
2162
2163     if (!filter_level)
2164         return;
2165
2166     if (is_vp7) {
2167         bedge_lim_y  = filter_level;
2168         bedge_lim_uv = filter_level * 2;
2169         mbedge_lim   = filter_level + 2;
2170     } else {
2171         bedge_lim_y  =
2172         bedge_lim_uv = filter_level * 2 + inner_limit;
2173         mbedge_lim   = bedge_lim_y + 4;
2174     }
2175
2176     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2177
2178     if (mb_x) {
2179         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2180                                        mbedge_lim, inner_limit, hev_thresh);
2181         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2182                                        mbedge_lim, inner_limit, hev_thresh);
2183     }
2184
2185 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2186     if (cond && inner_filter) {                                               \
2187         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2188                                              bedge_lim_y, inner_limit,        \
2189                                              hev_thresh);                     \
2190         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2191                                              bedge_lim_y, inner_limit,        \
2192                                              hev_thresh);                     \
2193         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2194                                              bedge_lim_y, inner_limit,        \
2195                                              hev_thresh);                     \
2196         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2197                                              uvlinesize,  bedge_lim_uv,       \
2198                                              inner_limit, hev_thresh);        \
2199     }
2200
2201     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2202
2203     if (mb_y) {
2204         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2205                                        mbedge_lim, inner_limit, hev_thresh);
2206         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2207                                        mbedge_lim, inner_limit, hev_thresh);
2208     }
2209
2210     if (inner_filter) {
2211         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2212                                              linesize, bedge_lim_y,
2213                                              inner_limit, hev_thresh);
2214         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2215                                              linesize, bedge_lim_y,
2216                                              inner_limit, hev_thresh);
2217         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2218                                              linesize, bedge_lim_y,
2219                                              inner_limit, hev_thresh);
2220         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2221                                              dst[2] +  4 * uvlinesize,
2222                                              uvlinesize, bedge_lim_uv,
2223                                              inner_limit, hev_thresh);
2224     }
2225
2226     H_LOOP_FILTER_16Y_INNER(is_vp7)
2227 }
2228
2229 static av_always_inline
2230 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2231                       int mb_x, int mb_y)
2232 {
2233     int mbedge_lim, bedge_lim;
2234     int filter_level = f->filter_level;
2235     int inner_limit  = f->inner_limit;
2236     int inner_filter = f->inner_filter;
2237     ptrdiff_t linesize = s->linesize;
2238
2239     if (!filter_level)
2240         return;
2241
2242     bedge_lim  = 2 * filter_level + inner_limit;
2243     mbedge_lim = bedge_lim + 4;
2244
2245     if (mb_x)
2246         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2247     if (inner_filter) {
2248         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2249         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2250         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2251     }
2252
2253     if (mb_y)
2254         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2255     if (inner_filter) {
2256         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2257         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2258         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2259     }
2260 }
2261
2262 #define MARGIN (16 << 2)
2263 static av_always_inline
2264 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2265                                     VP8Frame *prev_frame, int is_vp7)
2266 {
2267     VP8Context *s = avctx->priv_data;
2268     int mb_x, mb_y;
2269
2270     s->mv_bounds.mv_min.y = -MARGIN;
2271     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2272     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2273         VP8Macroblock *mb = s->macroblocks_base +
2274                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2275         int mb_xy = mb_y * s->mb_width;
2276
2277         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2278
2279         s->mv_bounds.mv_min.x = -MARGIN;
2280         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2281         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2282             if (mb_y == 0)
2283                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2284                          DC_PRED * 0x01010101);
2285             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2286                            prev_frame && prev_frame->seg_map ?
2287                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2288             s->mv_bounds.mv_min.x -= 64;
2289             s->mv_bounds.mv_max.x -= 64;
2290         }
2291         s->mv_bounds.mv_min.y -= 64;
2292         s->mv_bounds.mv_max.y -= 64;
2293     }
2294 }
2295
/* VP7 entry point: forwards to vp78_decode_mv_mb_modes() with IS_VP7. */
static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
}
2301
/* VP8 entry point: forwards to vp78_decode_mv_mb_modes() with IS_VP8. */
static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
}
2307
#if HAVE_THREADS
/*
 * check_thread_pos(td, otd, mb_x_check, mb_y_check)
 *
 * Block the calling thread (whose data is `td`) until thread `otd` has
 * published a position >= the requested one. A position is packed as
 * (row << 16) | (column & 0xFFFF), so comparing the packed integers orders
 * positions by row first, then column.
 *
 * While waiting, the requested position is stored in td->wait_mb_pos so that
 * update_pos() on the other thread can tell that a wake-up is needed; it is
 * reset to INT_MAX once the wait is over. The wait itself happens on
 * otd->cond under otd->lock.
 *
 * All macro arguments are parenthesized in the expansion, so expressions
 * such as `mb_x + 1` or a conditional expression can be passed safely.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

/*
 * update_pos(td, mb_y, mb_x)
 *
 * Publish (mb_y, mb_x) as the current position of thread `td` and, when
 * needed, wake threads waiting on td->cond.
 *
 * NOTE: besides its arguments this macro reads the following names from the
 * scope it is expanded in: `avctx`, `num_jobs`, `next_td` and `prev_td`.
 *
 * A broadcast is only issued when slice threading is active with more than
 * one job, and either
 *   - next_td or prev_td is NULL (the caller does not track neighbours, so
 *     the wake-up is sent unconditionally), or
 *   - a neighbour other than `td` itself has announced, through its
 *     wait_mb_pos, a position that the new position satisfies.
 * The neighbours' wait positions are sampled before the new position is
 * stored.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without thread support both operations are no-ops that still consume the
 * trailing semicolon and remain valid as the body of an if/else. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x) do { } while (0)
#endif
2345
/**
 * Decode one macroblock row without applying the loop filter.
 *
 * The row index is taken from the upper 16 bits of the calling thread's
 * thread_mb_pos. For every macroblock of the row this decodes the mode
 * (only when s->mb_layout is 0), the residual coefficients, performs intra
 * or inter prediction, adds the inverse transform and, if deblocking is
 * enabled, stores the filter strength in td->filter_strength[] for a later
 * filtering pass.
 *
 * @param avctx    codec context; avctx->priv_data is the VP8Context
 * @param tdata    not used by this function
 * @param jobnr    job number, used to locate the neighbouring rows' thread data
 * @param threadnr index into s->thread_data for the calling thread
 * @param is_vp7   nonzero for VP7, zero for VP8
 * @return 0 on success, AVERROR_INVALIDDATA if the coefficient partition's
 *         range coder has run out of input
 */
static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
    int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
    int mb_x, mb_xy = mb_y * s->mb_width;
    int num_jobs = s->num_jobs;
    VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
    /* Rows are assigned to coefficient partitions by masking the row index;
     * NOTE(review): this presumes num_coeff_partitions is a power of two. */
    VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
    VP8Macroblock *mb;
    /* Start of this row in the luma plane and the two chroma planes
     * (16 luma lines / 8 chroma lines per macroblock row). */
    uint8_t *dst[3] = {
        curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
        curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
    };

    /* Bail out early if the range coder has already consumed its buffer. */
    if (c->end <= c->buffer && c->bits >= 0)
         return AVERROR_INVALIDDATA;

    /* The first/last row has no previous/next row to synchronise with; using
     * td itself disables the corresponding waits and wake-up checks. */
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];
    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else {
        // Make sure the previous frame has read its segmentation map,
        // if we re-use the same map.
        if (prev_frame && s->segmentation.enabled &&
            !s->segmentation.update_map)
            ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
        memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
    }

    /* VP8 clears the left non-zero context at the start of every row,
     * VP7 only at the start of the first row. */
    if (!is_vp7 || mb_y == 0)
        memset(td->left_nnz, 0, sizeof(td->left_nnz));

    td->mv_bounds.mv_min.x = -MARGIN;
    td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
        if (c->end <= c->buffer && c->bits >= 0)
            return AVERROR_INVALIDDATA;
        // Wait for previous thread to read mb_x+2, and reach mb_y-1.
        if (prev_td != td) {
            if (threadnr != 0) {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1),
                                 mb_y - (is_vp7 ? 2 : 1));
            } else {
                /* Thread 0 waits for a column offset by mb_width + 3, the
                 * same offset filter_mb_row() adds to the positions it
                 * publishes. */
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
                                 mb_y - (is_vp7 ? 2 : 1));
            }
        }

        s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
                         s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
                         dst[2] - dst[1], 2);

        /* With mb_layout == 1 the modes were already decoded for the whole
         * frame by vp78_decode_mv_mb_modes(). */
        if (!s->mb_layout)
            decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

        if (!mb->skip)
            decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            /* Skipped macroblock: clear the first eight left/top
             * non-zero flags. */
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

            /* Reset DC block predictors if they would exist
             * if the mb had coefficients */
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);

        /* With several jobs, only the thread with the highest index backs up
         * the border into s->top_border here; with a single job
         * filter_mb_row() does the backup instead. */
        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

        dst[0]      += 16;
        dst[1]      += 8;
        dst[2]      += 8;
        td->mv_bounds.mv_min.x -= 64;
        td->mv_bounds.mv_max.x -= 64;

        /* NOTE(review): mb_x is always < s->mb_width inside this loop, so the
         * first branch below can never be taken as written — confirm the
         * intended condition before changing it. */
        if (mb_x == s->mb_width + 1) {
            update_pos(td, mb_y, s->mb_width + 3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }
    return 0;
}
2473
2474 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2475                                         int jobnr, int threadnr)
2476 {
2477     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2478 }
2479
2480 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2481                                         int jobnr, int threadnr)
2482 {
2483     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2484 }
2485
2486 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2487                               int jobnr, int threadnr, int is_vp7)
2488 {
2489     VP8Context *s = avctx->priv_data;
2490     VP8ThreadData *td = &s->thread_data[threadnr];
2491     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2492     AVFrame *curframe = s->curframe->tf.f;
2493     VP8Macroblock *mb;
2494     VP8ThreadData *prev_td, *next_td;
2495     uint8_t *dst[3] = {
2496         curframe->data[0] + 16 * mb_y * s->linesize,
2497         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2498         curframe->data[2] +  8 * mb_y * s->uvlinesize
2499     };
2500
2501     if (s->mb_layout == 1)
2502         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2503     else
2504         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2505
2506     if (mb_y == 0)
2507         prev_td = td;
2508     else
2509         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2510     if (mb_y == s->mb_height - 1)
2511         next_td = td;
2512     else
2513         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2514
2515     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2516         VP8FilterStrength *f = &td->filter_strength[mb_x];
2517         if (prev_td != td)
2518             check_thread_pos(td, prev_td,
2519                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2520         if (next_td != td)
2521             if (next_td != &s->thread_data[0])
2522                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2523
2524         if (num_jobs == 1) {
2525             if (s->filter.simple)
2526                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2527                                  NULL, NULL, s->linesize, 0, 1);
2528             else
2529                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2530                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2531         }
2532
2533         if (s->filter.simple)
2534             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2535         else
2536             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2537         dst[0] += 16;
2538         dst[1] += 8;
2539         dst[2] += 8;
2540
2541         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2542     }
2543 }
2544
2545 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2546                               int jobnr, int threadnr)
2547 {
2548     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2549 }
2550
2551 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2552                               int jobnr, int threadnr)
2553 {
2554     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2555 }
2556
2557 static av_always_inline
2558 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2559                               int threadnr, int is_vp7)
2560 {
2561     VP8Context *s = avctx->priv_data;
2562     VP8ThreadData *td = &s->thread_data[jobnr];
2563     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2564     VP8Frame *curframe = s->curframe;
2565     int mb_y, num_jobs = s->num_jobs;
2566     int ret;
2567
2568     td->thread_nr = threadnr;
2569     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2570     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2571     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2572         atomic_store(&td->thread_mb_pos, mb_y << 16);
2573         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2574         if (ret < 0) {
2575             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2576             return ret;
2577         }
2578         if (s->deblock_filter)
2579             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2580         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2581
2582         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2583         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2584
2585         if (avctx->active_thread_type == FF_THREAD_FRAME)
2586             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2587     }
2588
2589     return 0;
2590 }
2591
/* VP7 job function passed to avctx->execute2(); forwards to
 * vp78_decode_mb_row_sliced() with IS_VP7. */
static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
}
2597
/* VP8 job function passed to avctx->execute2(); forwards to
 * vp78_decode_mb_row_sliced() with IS_VP8. */
static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
}
2603
/**
 * Decode one packet into a frame (shared VP7/VP8 implementation).
 *
 * Parses the frame header, decides whether the frame may be skipped,
 * picks and allocates an output buffer, updates the reference frame
 * pointers and then decodes the picture either through the hwaccel
 * callbacks or through the software row decoder run via avctx->execute2().
 *
 * @param avctx     codec context; avctx->priv_data is the VP8Context
 * @param data      destination passed to av_frame_ref() when a visible
 *                  frame was produced
 * @param got_frame set to 1 when a frame was stored in data
 * @param avpkt     input packet
 * @param is_vp7    nonzero for VP7, zero for VP8
 * @return avpkt->size on success, a negative error code on failure
 */
static av_always_inline
int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                      AVPacket *avpkt, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    /* curframe stays unset on the skip_decode path; that path forces
     * s->invisible = 1, so curframe is not read there. */
    VP8Frame *av_uninit(curframe), *prev_frame;

    if (is_vp7)
        ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
    else
        ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);

    if (ret < 0)
        goto err;

    if (s->actually_webp) {
        // avctx->pix_fmt already set in caller.
    } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
        /* VP8 only: negotiate the pixel format once, on the first frame. */
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0) {
            ret = AVERROR(EINVAL);
            goto err;
        }
        avctx->pix_fmt = s->pix_fmt;
    }

    prev_frame = s->framep[VP56_FRAME_CURRENT];

    /* The frame is a reference if it updates any of last/golden/altref. */
    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
                 s->update_altref == VP56_FRAME_CURRENT;

    /* Lowest skip_frame / skip_loop_filter setting that applies to a frame
     * of this kind (non-reference, non-key reference, or key frame). */
    skip_thresh = !referenced ? AVDISCARD_NONREF
                              : !s->keyframe ? AVDISCARD_NONKEY
                                             : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        /* Keep the four reference pointers unchanged for the next frame. */
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    /* NOTE(review): the bound 5 is hard-coded here while other functions use
     * FF_ARRAY_ELEMS(s->frames) — presumably the same value; confirm. */
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f->buf[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i]);

    curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);

    if (!s->colorspace)
        avctx->colorspace = AVCOL_SPC_BT470BG;
    if (s->fullrange)
        avctx->color_range = AVCOL_RANGE_JPEG;
    else
        avctx->color_range = AVCOL_RANGE_MPEG;

    /* Given that arithmetic probabilities are updated every frame, it's quite
     * likely that the values we have on a random interframe are complete
     * junk if we didn't start decode on a keyframe. So just don't display
     * anything rather than junk. */
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN]   ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING,
               "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    curframe->tf.f->key_frame = s->keyframe;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                            : AV_PICTURE_TYPE_P;
    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
        goto err;

    /* Compute the reference set for the NEXT frame into next_framep[];
     * framep[] keeps describing the references used by this frame. */
    // check if golden and altref are swapped
    if (s->update_altref != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
    else
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];

    if (s->update_golden != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
    else
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];

    if (s->update_last)
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    else
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];

    s->next_framep[VP56_FRAME_CURRENT] = curframe;

    ff_thread_finish_setup(avctx);

    if (avctx->hwaccel) {
        /* Hardware path: the whole packet is handed over as a single slice. */
        ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = avctx->hwaccel->end_frame(avctx);
        if (ret < 0)
            goto err;

    } else {
        s->linesize   = curframe->tf.f->linesize[0];
        s->uvlinesize = curframe->tf.f->linesize[1];

        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
        /* Zero macroblock structures for top/top-left prediction
         * from outside the frame. */
        if (!s->mb_layout)
            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
                   (s->mb_width + 1) * sizeof(*s->macroblocks));
        if (!s->mb_layout && s->keyframe)
            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);

        memset(s->ref_count, 0, sizeof(s->ref_count));

        /* With mb_layout == 1 all macroblock modes are decoded up front,
         * before the row decoder runs. */
        if (s->mb_layout == 1) {
            // Make sure the previous frame has read its segmentation map,
            // if we re-use the same map.
            if (prev_frame && s->segmentation.enabled &&
                !s->segmentation.update_map)
                ff_thread_await_progress(&prev_frame->tf, 1, 0);
            if (is_vp7)
                vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
            else
                vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
        }

        /* Frame threading decodes rows with a single job; otherwise the
         * number of jobs is limited by both the number of coefficient
         * partitions and the thread count. */
        if (avctx->active_thread_type == FF_THREAD_FRAME)
            num_jobs = 1;
        else
            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
        s->num_jobs   = num_jobs;
        s->curframe   = curframe;
        s->prev_frame = prev_frame;
        s->mv_bounds.mv_min.y   = -MARGIN;
        s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
        /* Reset every thread's published position and "waiting for" marker
         * (INT_MAX means not waiting). */
        for (i = 0; i < MAX_THREADS; i++) {
            VP8ThreadData *td = &s->thread_data[i];
            atomic_init(&td->thread_mb_pos, 0);
            atomic_init(&td->wait_mb_pos, INT_MAX);
        }
        /* NOTE(review): the return value of execute2() is not checked, so
         * errors from the row decoder are not reported from here. */
        if (is_vp7)
            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
        else
            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
    }

    ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
    /* Commit the reference set computed above. */
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    if (!s->invisible) {
        if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    /* On failure, discard any pending reference updates. */
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}
2787
/* VP8 decode callback (non-static, so also usable from other files):
 * forwards to vp78_decode_frame() with IS_VP8. */
int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
}
2793
#if CONFIG_VP7_DECODER
/* VP7 decode callback: forwards to vp78_decode_frame() with IS_VP7. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2801
2802 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2803 {
2804     VP8Context *s = avctx->priv_data;
2805     int i;
2806
2807     if (!s)
2808         return 0;
2809
2810     vp8_decode_flush_impl(avctx, 1);
2811     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2812         av_frame_free(&s->frames[i].tf.f);
2813
2814     return 0;
2815 }
2816
2817 static av_cold int vp8_init_frames(VP8Context *s)
2818 {
2819     int i;
2820     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2821         s->frames[i].tf.f = av_frame_alloc();
2822         if (!s->frames[i].tf.f)
2823             return AVERROR(ENOMEM);
2824     }
2825     return 0;
2826 }
2827
2828 static av_always_inline
2829 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2830 {
2831     VP8Context *s = avctx->priv_data;
2832     int ret;
2833
2834     s->avctx = avctx;
2835     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2836     s->pix_fmt = AV_PIX_FMT_NONE;
2837     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2838     avctx->internal->allocate_progress = 1;
2839
2840     ff_videodsp_init(&s->vdsp, 8);
2841
2842     ff_vp78dsp_init(&s->vp8dsp);
2843     if (CONFIG_VP7_DECODER && is_vp7) {
2844         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2845         ff_vp7dsp_init(&s->vp8dsp);
2846         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2847         s->filter_mb_row           = vp7_filter_mb_row;
2848     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2849         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2850         ff_vp8dsp_init(&s->vp8dsp);
2851         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2852         s->filter_mb_row           = vp8_filter_mb_row;
2853     }
2854
2855     /* does not change for VP8 */
2856     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2857
2858     if ((ret = vp8_init_frames(s)) < 0) {
2859         ff_vp8_decode_free(avctx);
2860         return ret;
2861     }
2862
2863     return 0;
2864 }
2865
#if CONFIG_VP7_DECODER
/* VP7 init callback: forwards to vp78_decode_init() with IS_VP7. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2872
/* VP8 init callback (non-static, so also usable from other files):
 * forwards to vp78_decode_init() with IS_VP8. */
av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP8);
}
2877
2878 #if CONFIG_VP8_DECODER
2879 #if HAVE_THREADS
2880 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2881 {
2882     VP8Context *s = avctx->priv_data;
2883     int ret;
2884
2885     s->avctx = avctx;
2886
2887     if ((ret = vp8_init_frames(s)) < 0) {
2888         ff_vp8_decode_free(avctx);
2889         return ret;
2890     }
2891
2892     return 0;
2893 }
2894
2895 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2896
2897 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2898                                             const AVCodecContext *src)
2899 {
2900     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2901     int i;
2902
2903     if (s->macroblocks_base &&
2904         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2905         free_buffers(s);
2906         s->mb_width  = s_src->mb_width;
2907         s->mb_height = s_src->mb_height;
2908     }
2909
2910     s->pix_fmt      = s_src->pix_fmt;
2911     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2912     s->segmentation = s_src->segmentation;
2913     s->lf_delta     = s_src->lf_delta;
2914     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2915
2916     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2917         if (s_src->frames[i].tf.f->buf[0]) {
2918             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2919             if (ret < 0)
2920                 return ret;
2921         }
2922     }
2923
2924     s->framep[0] = REBASE(s_src->next_framep[0]);
2925     s->framep[1] = REBASE(s_src->next_framep[1]);
2926     s->framep[2] = REBASE(s_src->next_framep[2]);
2927     s->framep[3] = REBASE(s_src->next_framep[3]);
2928
2929     return 0;
2930 }
2931 #endif /* HAVE_THREADS */
2932 #endif /* CONFIG_VP8_DECODER */
2933
#if CONFIG_VP7_DECODER
/* Codec descriptor for the VP7 decoder. It shares the close and flush
 * callbacks with VP8 but declares only AV_CODEC_CAP_DR1: no threading
 * capabilities, thread callbacks or hardware configurations are listed. */
AVCodec ff_vp7_decoder = {
    .name                  = "vp7",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = vp7_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1,
    .flush                 = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */
2948
#if CONFIG_VP8_DECODER
/* Codec descriptor for the VP8 decoder. Declares frame and slice threading
 * support; the thread callbacks are only installed when threads are enabled.
 * The hw_configs list is NULL-terminated and contains the VAAPI / NVDEC
 * entries only when the respective hwaccel is configured in. */
AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    .hw_configs            = (const AVCodecHWConfigInternal*[]) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */