git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "hwconfig.h"
  31 #include "internal.h"
  32 #include "mathops.h"
  33 #include "rectangle.h"
  34 #include "thread.h"
  35 #include "vp8.h"
  36 #include "vp8data.h"
  37
  38 #if ARCH_ARM
  39 #   include "arm/vp8.h"
  40 #endif
  41
  42 #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  43 #define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
  44 #elif CONFIG_VP7_DECODER
  45 #define VPX(vp7, f) vp7_ ## f
  46 #else // CONFIG_VP8_DECODER
  47 #define VPX(vp7, f) vp8_ ## f
  48 #endif
  49
  50 static void free_buffers(VP8Context *s)
  51 {
  52     int i;
  53     if (s->thread_data)
  54         for (i = 0; i < MAX_THREADS; i++) {
  55 #if HAVE_THREADS
  56             pthread_cond_destroy(&s->thread_data[i].cond);
  57             pthread_mutex_destroy(&s->thread_data[i].lock);
  58 #endif
  59             av_freep(&s->thread_data[i].filter_strength);
  60         }
  61     av_freep(&s->thread_data);
  62     av_freep(&s->macroblocks_base);
  63     av_freep(&s->intra4x4_pred_mode_top);
  64     av_freep(&s->top_nnz);
  65     av_freep(&s->top_border);
  66
  67     s->macroblocks = NULL;
  68 }
  69
  70 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  71 {
  72     int ret;
  73     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  74                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  75         return ret;
  76     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  77         goto fail;
  78     if (s->avctx->hwaccel) {
  79         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  80         if (hwaccel->frame_priv_data_size) {
  81             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  82             if (!f->hwaccel_priv_buf)
  83                 goto fail;
  84             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  85         }
  86     }
  87     return 0;
  88
  89 fail:
  90     av_buffer_unref(&f->seg_map);
  91     ff_thread_release_buffer(s->avctx, &f->tf);
  92     return AVERROR(ENOMEM);
  93 }
  94
  95 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  96 {
  97     av_buffer_unref(&f->seg_map);
  98     av_buffer_unref(&f->hwaccel_priv_buf);
  99     f->hwaccel_picture_private = NULL;
 100     ff_thread_release_buffer(s->avctx, &f->tf);
 101 }
 102
 103 #if CONFIG_VP8_DECODER
 104 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 105 {
 106     int ret;
 107
 108     vp8_release_frame(s, dst);
 109
 110     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 111         return ret;
 112     if (src->seg_map &&
 113         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 114         vp8_release_frame(s, dst);
 115         return AVERROR(ENOMEM);
 116     }
 117     if (src->hwaccel_picture_private) {
 118         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 119         if (!dst->hwaccel_priv_buf)
 120             return AVERROR(ENOMEM);
 121         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 122     }
 123
 124     return 0;
 125 }
 126 #endif /* CONFIG_VP8_DECODER */
 127
 128 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 129 {
 130     VP8Context *s = avctx->priv_data;
 131     int i;
 132
 133     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 134         vp8_release_frame(s, &s->frames[i]);
 135     memset(s->framep, 0, sizeof(s->framep));
 136
 137     if (free_mem)
 138         free_buffers(s);
 139 }
 140
 141 static void vp8_decode_flush(AVCodecContext *avctx)
 142 {
 143     vp8_decode_flush_impl(avctx, 0);
 144 }
 145
 146 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 147 {
 148     VP8Frame *frame = NULL;
 149     int i;
 150
 151     // find a free buffer
 152     for (i = 0; i < 5; i++)
 153         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 154             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 155             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 157             frame = &s->frames[i];
 158             break;
 159         }
 160     if (i == 5) {
 161         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 162         abort();
 163     }
 164     if (frame->tf.f->buf[0])
 165         vp8_release_frame(s, frame);
 166
 167     return frame;
 168 }
 169
 170 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 171 {
 172     enum AVPixelFormat pix_fmts[] = {
 173 #if CONFIG_VP8_VAAPI_HWACCEL
 174         AV_PIX_FMT_VAAPI,
 175 #endif
 176 #if CONFIG_VP8_NVDEC_HWACCEL
 177         AV_PIX_FMT_CUDA,
 178 #endif
 179         AV_PIX_FMT_YUV420P,
 180         AV_PIX_FMT_NONE,
 181     };
 182
 183     return ff_get_format(s->avctx, pix_fmts);
 184 }
 185
 186 static av_always_inline
 187 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 188 {
 189     AVCodecContext *avctx = s->avctx;
 190     int i, ret;
 191
 192     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 193         height != s->avctx->height) {
 194         vp8_decode_flush_impl(s->avctx, 1);
 195
 196         ret = ff_set_dimensions(s->avctx, width, height);
 197         if (ret < 0)
 198             return ret;
 199     }
 200
 201     if (!s->actually_webp && !is_vp7) {
 202         s->pix_fmt = get_pixel_format(s);
 203         if (s->pix_fmt < 0)
 204             return AVERROR(EINVAL);
 205         avctx->pix_fmt = s->pix_fmt;
 206     }
 207
 208     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 209     s->mb_height = (s->avctx->coded_height + 15) / 16;
 210
 211     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 212                    avctx->thread_count > 1;
 213     if (!s->mb_layout) { // Frame threading and one thread
 214         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 215                                                sizeof(*s->macroblocks));
 216         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 217     } else // Sliced threading
 218         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 219                                          sizeof(*s->macroblocks));
 220     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 221     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 222     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 223
 224     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 225         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 226         free_buffers(s);
 227         return AVERROR(ENOMEM);
 228     }
 229
 230     for (i = 0; i < MAX_THREADS; i++) {
 231         s->thread_data[i].filter_strength =
 232             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 233         if (!s->thread_data[i].filter_strength) {
 234             free_buffers(s);
 235             return AVERROR(ENOMEM);
 236         }
 237 #if HAVE_THREADS
 238         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 239         pthread_cond_init(&s->thread_data[i].cond, NULL);
 240 #endif
 241     }
 242
 243     s->macroblocks = s->macroblocks_base + 1;
 244
 245     return 0;
 246 }
 247
 248 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 249 {
 250     return update_dimensions(s, width, height, IS_VP7);
 251 }
 252
 253 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 254 {
 255     return update_dimensions(s, width, height, IS_VP8);
 256 }
 257
 258
 259 static void parse_segment_info(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262     int i;
 263
 264     s->segmentation.update_map = vp8_rac_get(c);
 265     s->segmentation.update_feature_data = vp8_rac_get(c);
 266
 267     if (s->segmentation.update_feature_data) {
 268         s->segmentation.absolute_vals = vp8_rac_get(c);
 269
 270         for (i = 0; i < 4; i++)
 271             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 272
 273         for (i = 0; i < 4; i++)
 274             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 275     }
 276     if (s->segmentation.update_map)
 277         for (i = 0; i < 3; i++)
 278             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 279 }
 280
 281 static void update_lf_deltas(VP8Context *s)
 282 {
 283     VP56RangeCoder *c = &s->c;
 284     int i;
 285
 286     for (i = 0; i < 4; i++) {
 287         if (vp8_rac_get(c)) {
 288             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 289
 290             if (vp8_rac_get(c))
 291                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 292         }
 293     }
 294
 295     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 296         if (vp8_rac_get(c)) {
 297             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 298
 299             if (vp8_rac_get(c))
 300                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 301         }
 302     }
 303 }
 304
 305 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 306 {
 307     const uint8_t *sizes = buf;
 308     int i;
 309     int ret;
 310
 311     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 312
 313     buf      += 3 * (s->num_coeff_partitions - 1);
 314     buf_size -= 3 * (s->num_coeff_partitions - 1);
 315     if (buf_size < 0)
 316         return -1;
 317
 318     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 319         int size = AV_RL24(sizes + 3 * i);
 320         if (buf_size - size < 0)
 321             return -1;
 322         s->coeff_partition_size[i] = size;
 323
 324         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 325         if (ret < 0)
 326             return ret;
 327         buf      += size;
 328         buf_size -= size;
 329     }
 330
 331     s->coeff_partition_size[i] = buf_size;
 332     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 333
 334     return 0;
 335 }
 336
 337 static void vp7_get_quants(VP8Context *s)
 338 {
 339     VP56RangeCoder *c = &s->c;
 340
 341     int yac_qi  = vp8_rac_get_uint(c, 7);
 342     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 343     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 344     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 345     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 346     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 347
 348     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 349     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 350     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 351     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 352     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 353     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 354 }
 355
 356 static void vp8_get_quants(VP8Context *s)
 357 {
 358     VP56RangeCoder *c = &s->c;
 359     int i, base_qi;
 360
 361     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 362     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 363     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 364     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 365     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 366     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 367
 368     for (i = 0; i < 4; i++) {
 369         if (s->segmentation.enabled) {
 370             base_qi = s->segmentation.base_quant[i];
 371             if (!s->segmentation.absolute_vals)
 372                 base_qi += s->quant.yac_qi;
 373         } else
 374             base_qi = s->quant.yac_qi;
 375
 376         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 377         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 378         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 379         /* 101581>>16 is equivalent to 155/100 */
 380         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 381         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 382         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 383
 384         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 385         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 386     }
 387 }
 388
 389 /**
 390  * Determine which buffers golden and altref should be updated with after this frame.
 391  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 392  *
 393  * Intra frames update all 3 references
 394  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 395  * If the update (golden|altref) flag is set, it's updated with the current frame
 396  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 397  * If the flag is not set, the number read means:
 398  *      0: no update
 399  *      1: VP56_FRAME_PREVIOUS
 400  *      2: update golden with altref, or update altref with golden
 401  */
 402 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 403 {
 404     VP56RangeCoder *c = &s->c;
 405
 406     if (update)
 407         return VP56_FRAME_CURRENT;
 408
 409     switch (vp8_rac_get_uint(c, 2)) {
 410     case 1:
 411         return VP56_FRAME_PREVIOUS;
 412     case 2:
 413         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 414     }
 415     return VP56_FRAME_NONE;
 416 }
 417
 418 static void vp78_reset_probability_tables(VP8Context *s)
 419 {
 420     int i, j;
 421     for (i = 0; i < 4; i++)
 422         for (j = 0; j < 16; j++)
 423             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 424                    sizeof(s->prob->token[i][j]));
 425 }
 426
 427 static void vp78_update_probability_tables(VP8Context *s)
 428 {
 429     VP56RangeCoder *c = &s->c;
 430     int i, j, k, l, m;
 431
 432     for (i = 0; i < 4; i++)
 433         for (j = 0; j < 8; j++)
 434             for (k = 0; k < 3; k++)
 435                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 436                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 437                         int prob = vp8_rac_get_uint(c, 8);
 438                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 439                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 440                     }
 441 }
 442
 443 #define VP7_MVC_SIZE 17
 444 #define VP8_MVC_SIZE 19
 445
 446 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 447                                                             int mvc_size)
 448 {
 449     VP56RangeCoder *c = &s->c;
 450     int i, j;
 451
 452     if (vp8_rac_get(c))
 453         for (i = 0; i < 4; i++)
 454             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 455     if (vp8_rac_get(c))
 456         for (i = 0; i < 3; i++)
 457             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 458
 459     // 17.2 MV probability update
 460     for (i = 0; i < 2; i++)
 461         for (j = 0; j < mvc_size; j++)
 462             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 463                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 464 }
 465
 466 static void update_refs(VP8Context *s)
 467 {
 468     VP56RangeCoder *c = &s->c;
 469
 470     int update_golden = vp8_rac_get(c);
 471     int update_altref = vp8_rac_get(c);
 472
 473     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 474     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 475 }
 476
 477 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 478 {
 479     int i, j;
 480
 481     for (j = 1; j < 3; j++) {
 482         for (i = 0; i < height / 2; i++)
 483             memcpy(dst->data[j] + i * dst->linesize[j],
 484                    src->data[j] + i * src->linesize[j], width / 2);
 485     }
 486 }
 487
 488 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 489                  const uint8_t *src, ptrdiff_t src_linesize,
 490                  int width, int height,
 491                  int alpha, int beta)
 492 {
 493     int i, j;
 494     for (j = 0; j < height; j++) {
 495         const uint8_t *src2 = src + j * src_linesize;
 496         uint8_t *dst2 = dst + j * dst_linesize;
 497         for (i = 0; i < width; i++) {
 498             uint8_t y = src2[i];
 499             dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 500         }
 501     }
 502 }
 503
 504 static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
 505 {
 506     int ret;
 507
 508     if (!s->keyframe && (alpha || beta)) {
 509         int width  = s->mb_width * 16;
 510         int height = s->mb_height * 16;
 511         AVFrame *src, *dst;
 512
 513         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 514             !s->framep[VP56_FRAME_GOLDEN]) {
 515             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 516             return AVERROR_INVALIDDATA;
 517         }
 518
 519         dst =
 520         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 521
 522         /* preserve the golden frame, write a new previous frame */
 523         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 524             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 525             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 526                 return ret;
 527
 528             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 529
 530             copy_chroma(dst, src, width, height);
 531         }
 532
 533         fade(dst->data[0], dst->linesize[0],
 534              src->data[0], src->linesize[0],
 535              width, height, alpha, beta);
 536     }
 537
 538     return 0;
 539 }
 540
 541 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 542 {
 543     VP56RangeCoder *c = &s->c;
 544     int part1_size, hscale, vscale, i, j, ret;
 545     int width  = s->avctx->width;
 546     int height = s->avctx->height;
 547     int alpha = 0;
 548     int beta  = 0;
 549
 550     if (buf_size < 4) {
 551         return AVERROR_INVALIDDATA;
 552     }
 553
 554     s->profile = (buf[0] >> 1) & 7;
 555     if (s->profile > 1) {
 556         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 557         return AVERROR_INVALIDDATA;
 558     }
 559
 560     s->keyframe  = !(buf[0] & 1);
 561     s->invisible = 0;
 562     part1_size   = AV_RL24(buf) >> 4;
 563
 564     if (buf_size < 4 - s->profile + part1_size) {
 565         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 566         return AVERROR_INVALIDDATA;
 567     }
 568
 569     buf      += 4 - s->profile;
 570     buf_size -= 4 - s->profile;
 571
 572     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 573
 574     ret = ff_vp56_init_range_decoder(c, buf, part1_size);
 575     if (ret < 0)
 576         return ret;
 577     buf      += part1_size;
 578     buf_size -= part1_size;
 579
 580     /* A. Dimension information (keyframes only) */
 581     if (s->keyframe) {
 582         width  = vp8_rac_get_uint(c, 12);
 583         height = vp8_rac_get_uint(c, 12);
 584         hscale = vp8_rac_get_uint(c, 2);
 585         vscale = vp8_rac_get_uint(c, 2);
 586         if (hscale || vscale)
 587             avpriv_request_sample(s->avctx, "Upscaling");
 588
 589         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 590         vp78_reset_probability_tables(s);
 591         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 592                sizeof(s->prob->pred16x16));
 593         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 594                sizeof(s->prob->pred8x8c));
 595         for (i = 0; i < 2; i++)
 596             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 597                    sizeof(vp7_mv_default_prob[i]));
 598         memset(&s->segmentation, 0, sizeof(s->segmentation));
 599         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 600         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 601     }
 602
 603     if (s->keyframe || s->profile > 0)
 604         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 605
 606     /* B. Decoding information for all four macroblock-level features */
 607     for (i = 0; i < 4; i++) {
 608         s->feature_enabled[i] = vp8_rac_get(c);
 609         if (s->feature_enabled[i]) {
 610              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 611
 612              for (j = 0; j < 3; j++)
 613                  s->feature_index_prob[i][j] =
 614                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 615
 616              if (vp7_feature_value_size[s->profile][i])
 617                  for (j = 0; j < 4; j++)
 618                      s->feature_value[i][j] =
 619                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 620         }
 621     }
 622
 623     s->segmentation.enabled    = 0;
 624     s->segmentation.update_map = 0;
 625     s->lf_delta.enabled        = 0;
 626
 627     s->num_coeff_partitions = 1;
 628     ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 629     if (ret < 0)
 630         return ret;
 631
 632     if (!s->macroblocks_base || /* first frame */
 633         width != s->avctx->width || height != s->avctx->height ||
 634         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 635         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 636             return ret;
 637     }
 638
 639     /* C. Dequantization indices */
 640     vp7_get_quants(s);
 641
 642     /* D. Golden frame update flag (a Flag) for interframes only */
 643     if (!s->keyframe) {
 644         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 645         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 646     }
 647
 648     s->update_last          = 1;
 649     s->update_probabilities = 1;
 650     s->fade_present         = 1;
 651
 652     if (s->profile > 0) {
 653         s->update_probabilities = vp8_rac_get(c);
 654         if (!s->update_probabilities)
 655             s->prob[1] = s->prob[0];
 656
 657         if (!s->keyframe)
 658             s->fade_present = vp8_rac_get(c);
 659     }
 660
 661     if (vpX_rac_is_end(c))
 662         return AVERROR_INVALIDDATA;
 663     /* E. Fading information for previous frame */
 664     if (s->fade_present && vp8_rac_get(c)) {
 665         alpha = (int8_t) vp8_rac_get_uint(c, 8);
 666         beta  = (int8_t) vp8_rac_get_uint(c, 8);
 667     }
 668
 669     /* F. Loop filter type */
 670     if (!s->profile)
 671         s->filter.simple = vp8_rac_get(c);
 672
 673     /* G. DCT coefficient ordering specification */
 674     if (vp8_rac_get(c))
 675         for (i = 1; i < 16; i++)
 676             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 677
 678     /* H. Loop filter levels  */
 679     if (s->profile > 0)
 680         s->filter.simple = vp8_rac_get(c);
 681     s->filter.level     = vp8_rac_get_uint(c, 6);
 682     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 683
 684     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 685     vp78_update_probability_tables(s);
 686
 687     s->mbskip_enabled = 0;
 688
 689     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 690     if (!s->keyframe) {
 691         s->prob->intra  = vp8_rac_get_uint(c, 8);
 692         s->prob->last   = vp8_rac_get_uint(c, 8);
 693         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 694     }
 695
 696     if (vpX_rac_is_end(c))
 697         return AVERROR_INVALIDDATA;
 698
 699     if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
 700         return ret;
 701
 702     return 0;
 703 }
 704
 705 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 706 {
 707     VP56RangeCoder *c = &s->c;
 708     int header_size, hscale, vscale, ret;
 709     int width  = s->avctx->width;
 710     int height = s->avctx->height;
 711
 712     if (buf_size < 3) {
 713         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 714         return AVERROR_INVALIDDATA;
 715     }
 716
 717     s->keyframe  = !(buf[0] & 1);
 718     s->profile   =  (buf[0]>>1) & 7;
 719     s->invisible = !(buf[0] & 0x10);
 720     header_size  = AV_RL24(buf) >> 5;
 721     buf      += 3;
 722     buf_size -= 3;
 723
 724     s->header_partition_size = header_size;
 725
 726     if (s->profile > 3)
 727         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 728
 729     if (!s->profile)
 730         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 731                sizeof(s->put_pixels_tab));
 732     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 733         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 734                sizeof(s->put_pixels_tab));
 735
 736     if (header_size > buf_size - 7 * s->keyframe) {
 737         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 738         return AVERROR_INVALIDDATA;
 739     }
 740
 741     if (s->keyframe) {
 742         if (AV_RL24(buf) != 0x2a019d) {
 743             av_log(s->avctx, AV_LOG_ERROR,
 744                    "Invalid start code 0x%x\n", AV_RL24(buf));
 745             return AVERROR_INVALIDDATA;
 746         }
 747         width     = AV_RL16(buf + 3) & 0x3fff;
 748         height    = AV_RL16(buf + 5) & 0x3fff;
 749         hscale    = buf[4] >> 6;
 750         vscale    = buf[6] >> 6;
 751         buf      += 7;
 752         buf_size -= 7;
 753
 754         if (hscale || vscale)
 755             avpriv_request_sample(s->avctx, "Upscaling");
 756
 757         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 758         vp78_reset_probability_tables(s);
 759         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 760                sizeof(s->prob->pred16x16));
 761         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 762                sizeof(s->prob->pred8x8c));
 763         memcpy(s->prob->mvc, vp8_mv_default_prob,
 764                sizeof(s->prob->mvc));
 765         memset(&s->segmentation, 0, sizeof(s->segmentation));
 766         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 767     }
 768
 769     ret = ff_vp56_init_range_decoder(c, buf, header_size);
 770     if (ret < 0)
 771         return ret;
 772     buf      += header_size;
 773     buf_size -= header_size;
 774
 775     if (s->keyframe) {
 776         s->colorspace = vp8_rac_get(c);
 777         if (s->colorspace)
 778             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 779         s->fullrange = vp8_rac_get(c);
 780     }
 781
 782     if ((s->segmentation.enabled = vp8_rac_get(c)))
 783         parse_segment_info(s);
 784     else
 785         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 786
 787     s->filter.simple    = vp8_rac_get(c);
 788     s->filter.level     = vp8_rac_get_uint(c, 6);
 789     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 790
 791     if ((s->lf_delta.enabled = vp8_rac_get(c))) {
 792         s->lf_delta.update = vp8_rac_get(c);
 793         if (s->lf_delta.update)
 794             update_lf_deltas(s);
 795     }
 796
 797     if (setup_partitions(s, buf, buf_size)) {
 798         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 799         return AVERROR_INVALIDDATA;
 800     }
 801
 802     if (!s->macroblocks_base || /* first frame */
 803         width != s->avctx->width || height != s->avctx->height ||
 804         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 805         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 806             return ret;
 807
 808     vp8_get_quants(s);
 809
 810     if (!s->keyframe) {
 811         update_refs(s);
 812         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 813         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 814     }
 815
 816     // if we aren't saving this frame's probabilities for future frames,
 817     // make a copy of the current probabilities
 818     if (!(s->update_probabilities = vp8_rac_get(c)))
 819         s->prob[1] = s->prob[0];
 820
 821     s->update_last = s->keyframe || vp8_rac_get(c);
 822
 823     vp78_update_probability_tables(s);
 824
 825     if ((s->mbskip_enabled = vp8_rac_get(c)))
 826         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 827
 828     if (!s->keyframe) {
 829         s->prob->intra  = vp8_rac_get_uint(c, 8);
 830         s->prob->last   = vp8_rac_get_uint(c, 8);
 831         s->prob->golden = vp8_rac_get_uint(c, 8);
 832         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 833     }
 834
 835     // Record the entropy coder state here so that hwaccels can use it.
 836     s->c.code_word = vp56_rac_renorm(&s->c);
 837     s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
 838     s->coder_state_at_header_end.range     = s->c.high;
 839     s->coder_state_at_header_end.value     = s->c.code_word >> 16;
 840     s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
 841
 842     return 0;
 843 }
 844
 845 static av_always_inline
 846 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 847 {
 848     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 849                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 850     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 851                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 852 }
 853
 854 /**
 855  * Motion vector coding, 17.1.
 856  */
 857 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 858 {
 859     int bit, x = 0;
 860
 861     if (vp56_rac_get_prob_branchy(c, p[0])) {
 862         int i;
 863
 864         for (i = 0; i < 3; i++)
 865             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 866         for (i = (vp7 ? 7 : 9); i > 3; i--)
 867             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 868         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 869             x += 8;
 870     } else {
 871         // small_mvtree
 872         const uint8_t *ps = p + 2;
 873         bit = vp56_rac_get_prob(c, *ps);
 874         ps += 1 + 3 * bit;
 875         x  += 4 * bit;
 876         bit = vp56_rac_get_prob(c, *ps);
 877         ps += 1 + bit;
 878         x  += 2 * bit;
 879         x  += vp56_rac_get_prob(c, *ps);
 880     }
 881
 882     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 883 }
 884
/**
 * Non-inlined VP7 entry point: read_mv_component() with the vp7 flag set.
 *
 * @param c range coder to read from
 * @param p probabilities for the component being read
 * @return the decoded signed motion vector component
 */
static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 1);
}
 889
/**
 * Non-inlined VP8 entry point: read_mv_component() with the vp7 flag cleared.
 *
 * @param c range coder to read from
 * @param p probabilities for the component being read
 * @return the decoded signed motion vector component
 */
static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 0);
}
 894
 895 static av_always_inline
 896 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 897 {
 898     if (is_vp7)
 899         return vp7_submv_prob;
 900
 901     if (left == top)
 902         return vp8_submv_prob[4 - !!left];
 903     if (!top)
 904         return vp8_submv_prob[2];
 905     return vp8_submv_prob[1 - !!left];
 906 }
 907
 908 /**
 909  * Split motion vector prediction, 16.4.
 910  * @returns the number of motion vectors parsed (2, 4 or 16)
 911  */
 912 static av_always_inline
 913 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 914                     int layout, int is_vp7)
 915 {
 916     int part_idx;
 917     int n, num;
 918     VP8Macroblock *top_mb;
 919     VP8Macroblock *left_mb = &mb[-1];
 920     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 921     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 922     VP56mv *top_mv;
 923     VP56mv *left_mv = left_mb->bmv;
 924     VP56mv *cur_mv  = mb->bmv;
 925
 926     if (!layout) // layout is inlined, s->mb_layout is not
 927         top_mb = &mb[2];
 928     else
 929         top_mb = &mb[-s->mb_width - 1];
 930     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 931     top_mv       = top_mb->bmv;
 932
 933     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 934         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 935             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 936         else
 937             part_idx = VP8_SPLITMVMODE_8x8;
 938     } else {
 939         part_idx = VP8_SPLITMVMODE_4x4;
 940     }
 941
 942     num              = vp8_mbsplit_count[part_idx];
 943     mbsplits_cur     = vp8_mbsplits[part_idx],
 944     firstidx         = vp8_mbfirstidx[part_idx];
 945     mb->partitioning = part_idx;
 946
 947     for (n = 0; n < num; n++) {
 948         int k = firstidx[n];
 949         uint32_t left, above;
 950         const uint8_t *submv_prob;
 951
 952         if (!(k & 3))
 953             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 954         else
 955             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 956         if (k <= 3)
 957             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 958         else
 959             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 960
 961         submv_prob = get_submv_prob(left, above, is_vp7);
 962
 963         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 964             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 965                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 966                     mb->bmv[n].y = mb->mv.y +
 967                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 968                     mb->bmv[n].x = mb->mv.x +
 969                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 970                 } else {
 971                     AV_ZERO32(&mb->bmv[n]);
 972                 }
 973             } else {
 974                 AV_WN32A(&mb->bmv[n], above);
 975             }
 976         } else {
 977             AV_WN32A(&mb->bmv[n], left);
 978         }
 979     }
 980
 981     return num;
 982 }
 983
 984 /**
 985  * The vp7 reference decoder uses a padding macroblock column (added to right
 986  * edge of the frame) to guard against illegal macroblock offsets. The
 987  * algorithm has bugs that permit offsets to straddle the padding column.
 988  * This function replicates those bugs.
 989  *
 990  * @param[out] edge_x macroblock x address
 991  * @param[out] edge_y macroblock y address
 992  *
 993  * @return macroblock offset legal (boolean)
 994  */
 995 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 996                                    int xoffset, int yoffset, int boundary,
 997                                    int *edge_x, int *edge_y)
 998 {
 999     int vwidth = mb_width + 1;
1000     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
1001     if (new < boundary || new % vwidth == vwidth - 1)
1002         return 0;
1003     *edge_y = new / vwidth;
1004     *edge_x = new % vwidth;
1005     return 1;
1006 }
1007
1008 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1009 {
1010     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1011 }
1012
1013 static av_always_inline
1014 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1015                     int mb_x, int mb_y, int layout)
1016 {
1017     VP8Macroblock *mb_edge[12];
1018     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1019     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1020     int idx = CNT_ZERO;
1021     VP56mv near_mv[3];
1022     uint8_t cnt[3] = { 0 };
1023     VP56RangeCoder *c = &s->c;
1024     int i;
1025
1026     AV_ZERO32(&near_mv[0]);
1027     AV_ZERO32(&near_mv[1]);
1028     AV_ZERO32(&near_mv[2]);
1029
1030     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1031         const VP7MVPred * pred = &vp7_mv_pred[i];
1032         int edge_x, edge_y;
1033
1034         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1035                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1036             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1037                                              ? s->macroblocks_base + 1 + edge_x +
1038                                                (s->mb_width + 1) * (edge_y + 1)
1039                                              : s->macroblocks + edge_x +
1040                                                (s->mb_height - edge_y - 1) * 2;
1041             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1042             if (mv) {
1043                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1044                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1045                         idx = CNT_NEAREST;
1046                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1047                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1048                             continue;
1049                         idx = CNT_NEAR;
1050                     } else {
1051                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1052                         idx = CNT_NEAR;
1053                     }
1054                 } else {
1055                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1056                     idx = CNT_NEAREST;
1057                 }
1058             } else {
1059                 idx = CNT_ZERO;
1060             }
1061         } else {
1062             idx = CNT_ZERO;
1063         }
1064         cnt[idx] += vp7_mv_pred[i].score;
1065     }
1066
1067     mb->partitioning = VP8_SPLITMVMODE_NONE;
1068
1069     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1070         mb->mode = VP8_MVMODE_MV;
1071
1072         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1073
1074             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1075
1076                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1077                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1078                 else
1079                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1080
1081                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1082                     mb->mode = VP8_MVMODE_SPLIT;
1083                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1084                 } else {
1085                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1086                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1087                     mb->bmv[0] = mb->mv;
1088                 }
1089             } else {
1090                 mb->mv = near_mv[CNT_NEAR];
1091                 mb->bmv[0] = mb->mv;
1092             }
1093         } else {
1094             mb->mv = near_mv[CNT_NEAREST];
1095             mb->bmv[0] = mb->mv;
1096         }
1097     } else {
1098         mb->mode = VP8_MVMODE_ZERO;
1099         AV_ZERO32(&mb->mv);
1100         mb->bmv[0] = mb->mv;
1101     }
1102 }
1103
/**
 * Decode the inter prediction mode and motion vector(s) of a VP8 macroblock.
 *
 * The top, left and top-left neighbours vote for candidate vectors;
 * neighbours whose ref_frame is VP56_FRAME_CURRENT are ignored. Top and left
 * add 2 to a candidate's count, top-left adds 1. The counts index
 * vp8_mode_contexts to obtain the probabilities used to read the mode.
 * Writes mb->mode, mb->mv, mb->bmv[] and mb->partitioning.
 *
 * @param s         decoder context
 * @param mv_bounds bounds the candidate vectors are clamped to
 * @param mb        macroblock being decoded; mb->ref_frame must already be set
 * @param mb_x      not referenced by this function
 * @param mb_y      not referenced by this function
 * @param layout    macroblock array layout; selects where the top and
 *                  top-left neighbours are found relative to mb
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    /* near_mv[3] is left uninitialized; it is only read once cnt[CNT_SPLITMV]
     * is non-zero, i.e. after MV_EDGE_CHECK has written it. */
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* From here on cnt[CNT_SPLITMV] is reused as the split-mode
                 * context: left and top split neighbours weigh 2, top-left 1. */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the macroblock vector becomes the last split vector */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    /* explicitly coded vector, relative to the clamped best mv */
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1203
1204 static av_always_inline
1205 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1206                            int mb_x, int keyframe, int layout)
1207 {
1208     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1209
1210     if (layout) {
1211         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1212         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1213     }
1214     if (keyframe) {
1215         int x, y;
1216         uint8_t *top;
1217         uint8_t *const left = s->intra4x4_pred_mode_left;
1218         if (layout)
1219             top = mb->intra4x4_pred_mode_top;
1220         else
1221             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1222         for (y = 0; y < 4; y++) {
1223             for (x = 0; x < 4; x++) {
1224                 const uint8_t *ctx;
1225                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1226                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1227                 left[y]   = top[x] = *intra4x4;
1228                 intra4x4++;
1229             }
1230         }
1231     } else {
1232         int i;
1233         for (i = 0; i < 16; i++)
1234             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1235                                            vp8_pred4x4_prob_inter);
1236     }
1237 }
1238
/**
 * Decode the per-macroblock header: segment, skip flag, prediction mode,
 * reference frame and (for inter macroblocks) motion vectors.
 *
 * @param s         decoder context; the range coder s->c is read from
 * @param mv_bounds motion vector bounds, forwarded to vp8_decode_mvs()
 * @param mb        macroblock to fill in
 * @param mb_x      x address of the macroblock
 * @param mb_y      y address of the macroblock
 * @param segment   in/out: segment id of this macroblock; also copied to
 *                  mb->segment
 * @param ref       if non-NULL, segment id to inherit when segmentation is
 *                  enabled but the map is not updated
 * @param layout    macroblock array layout, forwarded to the helpers
 * @param is_vp7    nonzero for VP7, zero for VP8
 */
static av_always_inline
void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
                    VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
{
    VP56RangeCoder *c = &s->c;
    static const char * const vp7_feature_name[] = { "q-index",
                                                     "lf-delta",
                                                     "partial-golden-update",
                                                     "blit-pitch" };
    if (is_vp7) {
        int i;
        *segment = 0;
        // VP7 per-macroblock features are parsed (to keep the range coder in
        // sync) but only reported; their values are not applied here
        for (i = 0; i < 4; i++) {
            if (s->feature_enabled[i]) {
                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                      int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                   s->feature_index_prob[i]);
                      av_log(s->avctx, AV_LOG_WARNING,
                             "Feature %s present in macroblock (value 0x%x)\n",
                             vp7_feature_name[i], s->feature_value[i][index]);
                }
           }
        }
    } else if (s->segmentation.update_map) {
        // two-level tree: the first bit selects the probability for the second
        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
                                    vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            // not 4x4-coded: fill the top/left 4x4 mode contexts with the
            // 4x4 mode that corresponds to the 16x16 mode
            const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                           : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
            if (s->mb_layout)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                vp8_pred8x8c_prob_intra);
        mb->ref_frame        = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        // the golden/altref bit is only read for VP8
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame =
                (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
                                                                   : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame - 1]++;

        // motion vectors, 16.3
        if (is_vp7)
            vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
        else
            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
                                                s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
1320
/**
 * Decode the tokens of one block after the caller has already consumed the
 * first "not end of block" decision.
 *
 * The range coder is copied into a local, used for all reads, and written
 * back to *r before returning.
 *
 * @param r          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token (selected by the caller)
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 * @param scan       maps the coefficient index to its position in block
 * @param vp7        nonzero for VP7: an end-of-block check follows a zero
 *                   token; for VP8 that check is skipped
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    VP56RangeCoder c = *r;
    // the first EOB decision is not read here, so enter after that check
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            // context 0: the previous token was a zero
            token_prob = probs[i][0];
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            // context 1: the previous token was a one
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    // base value of the category plus its extra bits
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            // context 2: the previous token was larger than one
            token_prob = probs[i + 1][2];
        }
        // sign bit, then dequantize: qmul[0] for index 0, qmul[1] otherwise
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1390
/**
 * Apply inter-frame DC prediction to a DC block and update the predictor.
 *
 * pred[0] holds the last DC value and pred[1] a run counter. When the
 * counter is above 3 on entry, pred[0] is added to the block's DC. The
 * counter is reset when the old predictor or the new DC is zero or when
 * their signs differ; it is incremented when both are equal. The new DC is
 * stored in both block[0] and pred[0].
 *
 * @param block DC block, block[0] is read and rewritten
 * @param pred  predictor state, updated in place
 *
 * @return 1 if the predictor was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int predicted = pred[1] > 3;
    int16_t dc = block[0];
    int restart_run;

    if (predicted)
        dc += pred[0];

    restart_run = !pred[0] || !dc || (pred[0] ^ dc) < 0;

    if (restart_run)
        pred[1] = 0;
    else if (pred[0] == dc)
        pred[1]++;

    pred[0]  = dc;
    block[0] = dc;

    return predicted;
}
1413
/**
 * Non-inlined VP7 instance of decode_block_coeffs_internal(); all arguments
 * are forwarded unchanged and the vp7 flag is set to IS_VP7.
 *
 * @return the value returned by decode_block_coeffs_internal()
 */
static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, uint8_t *token_prob,
                                            int16_t qmul[2],
                                            const uint8_t scan[16])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, scan, IS_VP7);
}
1424
/* Only compiled when no macro named vp8_decode_block_coeffs_internal exists;
 * presumably an architecture-specific header can supply a replacement that
 * way -- confirm against the included arch headers. */
#ifndef vp8_decode_block_coeffs_internal
/**
 * Non-inlined VP8 instance of decode_block_coeffs_internal(); it always uses
 * ff_zigzag_scan as scan order and sets the vp7 flag to IS_VP8.
 *
 * @return the value returned by decode_block_coeffs_internal()
 */
static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, uint8_t *token_prob,
                                            int16_t qmul[2])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, ff_zigzag_scan, IS_VP8);
}
#endif
1436
1437 /**
1438  * @param c          arithmetic bitstream reader context
1439  * @param block      destination for block coefficients
1440  * @param probs      probabilities to use when reading trees from the bitstream
1441  * @param i          initial coeff index, 0 unless a separate DC block is coded
1442  * @param zero_nhood the initial prediction context for number of surrounding
1443  *                   all-zero blocks (only left/top, so 0-2)
1444  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1445  * @param scan       scan pattern (VP7 only)
1446  *
1447  * @return 0 if no coeffs were decoded
1448  *         otherwise, the index of the last coeff decoded plus one
1449  */
1450 static av_always_inline
1451 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1452                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1453                         int i, int zero_nhood, int16_t qmul[2],
1454                         const uint8_t scan[16], int vp7)
1455 {
1456     uint8_t *token_prob = probs[i][zero_nhood];
1457     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1458         return 0;
1459     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1460                                                   token_prob, qmul, scan)
1461                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1462                                                   token_prob, qmul);
1463 }
1464
1465 static av_always_inline
1466 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1467                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1468                       int is_vp7)
1469 {
1470     int i, x, y, luma_start = 0, luma_ctx = 3;
1471     int nnz_pred, nnz, nnz_total = 0;
1472     int segment = mb->segment;
1473     int block_dc = 0;
1474
1475     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1476         nnz_pred = t_nnz[8] + l_nnz[8];
1477
1478         // decode DC values and do hadamard
1479         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1480                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1481                                   ff_zigzag_scan, is_vp7);
1482         l_nnz[8] = t_nnz[8] = !!nnz;
1483
1484         if (is_vp7 && mb->mode > MODE_I4x4) {
1485             nnz |=  inter_predict_dc(td->block_dc,
1486                                      s->inter_dc_pred[mb->ref_frame - 1]);
1487         }
1488
1489         if (nnz) {
1490             nnz_total += nnz;
1491             block_dc   = 1;
1492             if (nnz == 1)
1493                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1494             else
1495                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1496         }
1497         luma_start = 1;
1498         luma_ctx   = 0;
1499     }
1500
1501     // luma blocks
1502     for (y = 0; y < 4; y++)
1503         for (x = 0; x < 4; x++) {
1504             nnz_pred = l_nnz[y] + t_nnz[x];
1505             nnz = decode_block_coeffs(c, td->block[y][x],
1506                                       s->prob->token[luma_ctx],
1507                                       luma_start, nnz_pred,
1508                                       s->qmat[segment].luma_qmul,
1509                                       s->prob[0].scan, is_vp7);
1510             /* nnz+block_dc may be one more than the actual last index,
1511              * but we don't care */
1512             td->non_zero_count_cache[y][x] = nnz + block_dc;
1513             t_nnz[x] = l_nnz[y] = !!nnz;
1514             nnz_total += nnz;
1515         }
1516
1517     // chroma blocks
1518     // TODO: what to do about dimensions? 2nd dim for luma is x,
1519     // but for chroma it's (y<<1)|x
1520     for (i = 4; i < 6; i++)
1521         for (y = 0; y < 2; y++)
1522             for (x = 0; x < 2; x++) {
1523                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1524                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1525                                           s->prob->token[2], 0, nnz_pred,
1526                                           s->qmat[segment].chroma_qmul,
1527                                           s->prob[0].scan, is_vp7);
1528                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1529                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1530                 nnz_total += nnz;
1531             }
1532
1533     // if there were no coded coeffs despite the macroblock not being marked skip,
1534     // we MUST not do the inner loop filter and should not do IDCT
1535     // Since skip isn't used for bitstream prediction, just manually set it.
1536     if (!nnz_total)
1537         mb->skip = 1;
1538 }
1539
1540 static av_always_inline
1541 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1542                       uint8_t *src_cb, uint8_t *src_cr,
1543                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1544 {
1545     AV_COPY128(top_border, src_y + 15 * linesize);
1546     if (!simple) {
1547         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1548         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1549     }
1550 }
1551
1552 static av_always_inline
1553 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1554                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1555                     int mb_y, int mb_width, int simple, int xchg)
1556 {
1557     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1558     src_y  -= linesize;
1559     src_cb -= uvlinesize;
1560     src_cr -= uvlinesize;
1561
1562 #define XCHG(a, b, xchg)                                                      \
1563     do {                                                                      \
1564         if (xchg)                                                             \
1565             AV_SWAP64(b, a);                                                  \
1566         else                                                                  \
1567             AV_COPY64(b, a);                                                  \
1568     } while (0)
1569
1570     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1571     XCHG(top_border, src_y, xchg);
1572     XCHG(top_border + 8, src_y + 8, 1);
1573     if (mb_x < mb_width - 1)
1574         XCHG(top_border + 32, src_y + 16, 1);
1575
1576     // only copy chroma for normal loop filter
1577     // or to initialize the top row to 127
1578     if (!simple || !mb_y) {
1579         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1580         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1581         XCHG(top_border + 16, src_cb, 1);
1582         XCHG(top_border + 24, src_cr, 1);
1583     }
1584 }
1585
1586 static av_always_inline
1587 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1588 {
1589     if (!mb_x)
1590         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1591     else
1592         return mb_y ? mode : LEFT_DC_PRED8x8;
1593 }
1594
1595 static av_always_inline
1596 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1597 {
1598     if (!mb_x)
1599         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1600     else
1601         return mb_y ? mode : HOR_PRED8x8;
1602 }
1603
1604 static av_always_inline
1605 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1606 {
1607     switch (mode) {
1608     case DC_PRED8x8:
1609         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1610     case VERT_PRED8x8:
1611         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1612     case HOR_PRED8x8:
1613         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1614     case PLANE_PRED8x8: /* TM */
1615         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1616     }
1617     return mode;
1618 }
1619
1620 static av_always_inline
1621 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1622 {
1623     if (!mb_x) {
1624         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1625     } else {
1626         return mb_y ? mode : HOR_VP8_PRED;
1627     }
1628 }
1629
1630 static av_always_inline
1631 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1632                                      int *copy_buf, int vp7)
1633 {
1634     switch (mode) {
1635     case VERT_PRED:
1636         if (!mb_x && mb_y) {
1637             *copy_buf = 1;
1638             return mode;
1639         }
1640         /* fall-through */
1641     case DIAG_DOWN_LEFT_PRED:
1642     case VERT_LEFT_PRED:
1643         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1644     case HOR_PRED:
1645         if (!mb_y) {
1646             *copy_buf = 1;
1647             return mode;
1648         }
1649         /* fall-through */
1650     case HOR_UP_PRED:
1651         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1652     case TM_VP8_PRED:
1653         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1654     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1655                    * as 16x16/8x8 DC */
1656     case DIAG_DOWN_RIGHT_PRED:
1657     case VERT_RIGHT_PRED:
1658     case HOR_DOWN_PRED:
1659         if (!mb_y || !mb_x)
1660             *copy_buf = 1;
1661         return mode;
1662     }
1663     return mode;
1664 }
1665
1666 static av_always_inline
1667 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1668                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1669 {
1670     int x, y, mode, nnz;
1671     uint32_t tr;
1672
1673     /* for the first row, we need to run xchg_mb_border to init the top edge
1674      * to 127 otherwise, skip it if we aren't going to deblock */
1675     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1676         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1677                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1678                        s->filter.simple, 1);
1679
1680     if (mb->mode < MODE_I4x4) {
1681         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1682         s->hpc.pred16x16[mode](dst[0], s->linesize);
1683     } else {
1684         uint8_t *ptr = dst[0];
1685         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1686         const uint8_t lo = is_vp7 ? 128 : 127;
1687         const uint8_t hi = is_vp7 ? 128 : 129;
1688         uint8_t tr_top[4] = { lo, lo, lo, lo };
1689
1690         // all blocks on the right edge of the macroblock use bottom edge
1691         // the top macroblock for their topright edge
1692         uint8_t *tr_right = ptr - s->linesize + 16;
1693
1694         // if we're on the right edge of the frame, said edge is extended
1695         // from the top macroblock
1696         if (mb_y && mb_x == s->mb_width - 1) {
1697             tr       = tr_right[-1] * 0x01010101u;
1698             tr_right = (uint8_t *) &tr;
1699         }
1700
1701         if (mb->skip)
1702             AV_ZERO128(td->non_zero_count_cache);
1703
1704         for (y = 0; y < 4; y++) {
1705             uint8_t *topright = ptr + 4 - s->linesize;
1706             for (x = 0; x < 4; x++) {
1707                 int copy = 0;
1708                 ptrdiff_t linesize = s->linesize;
1709                 uint8_t *dst = ptr + 4 * x;
1710                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1711
1712                 if ((y == 0 || x == 3) && mb_y == 0) {
1713                     topright = tr_top;
1714                 } else if (x == 3)
1715                     topright = tr_right;
1716
1717                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1718                                                         mb_y + y, &copy, is_vp7);
1719                 if (copy) {
1720                     dst      = copy_dst + 12;
1721                     linesize = 8;
1722                     if (!(mb_y + y)) {
1723                         copy_dst[3] = lo;
1724                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1725                     } else {
1726                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1727                         if (!(mb_x + x)) {
1728                             copy_dst[3] = hi;
1729                         } else {
1730                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1731                         }
1732                     }
1733                     if (!(mb_x + x)) {
1734                         copy_dst[11] =
1735                         copy_dst[19] =
1736                         copy_dst[27] =
1737                         copy_dst[35] = hi;
1738                     } else {
1739                         copy_dst[11] = ptr[4 * x                   - 1];
1740                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1741                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1742                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1743                     }
1744                 }
1745                 s->hpc.pred4x4[mode](dst, topright, linesize);
1746                 if (copy) {
1747                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1748                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1749                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1750                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1751                 }
1752
1753                 nnz = td->non_zero_count_cache[y][x];
1754                 if (nnz) {
1755                     if (nnz == 1)
1756                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1757                                                   td->block[y][x], s->linesize);
1758                     else
1759                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1760                                                td->block[y][x], s->linesize);
1761                 }
1762                 topright += 4;
1763             }
1764
1765             ptr      += 4 * s->linesize;
1766             intra4x4 += 4;
1767         }
1768     }
1769
1770     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1771                                             mb_x, mb_y, is_vp7);
1772     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1773     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1774
1775     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1776         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1777                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1778                        s->filter.simple, 0);
1779 }
1780
/* Per-subpel-position pixel requirements, indexed [row][mx or my]. */
static const uint8_t subpel_idx[3][8] = {
    /* nr. of left extra pixels, also function pointer index */
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* nr. of extra pixels required */
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* nr. of right extra pixels */
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1787
1788 /**
1789  * luma MC function
1790  *
1791  * @param s        VP8 decoding context
1792  * @param dst      target buffer for block data at block position
1793  * @param ref      reference picture buffer at origin (0, 0)
1794  * @param mv       motion vector (relative to block position) to get pixel data from
1795  * @param x_off    horizontal position of block from origin (0, 0)
1796  * @param y_off    vertical position of block from origin (0, 0)
1797  * @param block_w  width of block (16, 8 or 4)
1798  * @param block_h  height of block (always same as block_w)
1799  * @param width    width of src/dst plane data
1800  * @param height   height of src/dst plane data
1801  * @param linesize size of a single line of plane data, including padding
1802  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1803  */
1804 static av_always_inline
1805 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1806                  ThreadFrame *ref, const VP56mv *mv,
1807                  int x_off, int y_off, int block_w, int block_h,
1808                  int width, int height, ptrdiff_t linesize,
1809                  vp8_mc_func mc_func[3][3])
1810 {
1811     uint8_t *src = ref->f->data[0];
1812
1813     if (AV_RN32A(mv)) {
1814         ptrdiff_t src_linesize = linesize;
1815
1816         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1817         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1818
1819         x_off += mv->x >> 2;
1820         y_off += mv->y >> 2;
1821
1822         // edge emulation
1823         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1824         src += y_off * linesize + x_off;
1825         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1826             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1827             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1828                                      src - my_idx * linesize - mx_idx,
1829                                      EDGE_EMU_LINESIZE, linesize,
1830                                      block_w + subpel_idx[1][mx],
1831                                      block_h + subpel_idx[1][my],
1832                                      x_off - mx_idx, y_off - my_idx,
1833                                      width, height);
1834             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1835             src_linesize = EDGE_EMU_LINESIZE;
1836         }
1837         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1838     } else {
1839         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1840         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1841                       linesize, block_h, 0, 0);
1842     }
1843 }
1844
1845 /**
1846  * chroma MC function
1847  *
1848  * @param s        VP8 decoding context
1849  * @param dst1     target buffer for block data at block position (U plane)
1850  * @param dst2     target buffer for block data at block position (V plane)
1851  * @param ref      reference picture buffer at origin (0, 0)
1852  * @param mv       motion vector (relative to block position) to get pixel data from
1853  * @param x_off    horizontal position of block from origin (0, 0)
1854  * @param y_off    vertical position of block from origin (0, 0)
1855  * @param block_w  width of block (16, 8 or 4)
1856  * @param block_h  height of block (always same as block_w)
1857  * @param width    width of src/dst plane data
1858  * @param height   height of src/dst plane data
1859  * @param linesize size of a single line of plane data, including padding
1860  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1861  */
1862 static av_always_inline
1863 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1864                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1865                    int x_off, int y_off, int block_w, int block_h,
1866                    int width, int height, ptrdiff_t linesize,
1867                    vp8_mc_func mc_func[3][3])
1868 {
1869     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1870
1871     if (AV_RN32A(mv)) {
1872         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1873         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1874
1875         x_off += mv->x >> 3;
1876         y_off += mv->y >> 3;
1877
1878         // edge emulation
1879         src1 += y_off * linesize + x_off;
1880         src2 += y_off * linesize + x_off;
1881         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1882         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1883             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1884             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1885                                      src1 - my_idx * linesize - mx_idx,
1886                                      EDGE_EMU_LINESIZE, linesize,
1887                                      block_w + subpel_idx[1][mx],
1888                                      block_h + subpel_idx[1][my],
1889                                      x_off - mx_idx, y_off - my_idx, width, height);
1890             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1891             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1892
1893             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1894                                      src2 - my_idx * linesize - mx_idx,
1895                                      EDGE_EMU_LINESIZE, linesize,
1896                                      block_w + subpel_idx[1][mx],
1897                                      block_h + subpel_idx[1][my],
1898                                      x_off - mx_idx, y_off - my_idx, width, height);
1899             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1900             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1901         } else {
1902             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1903             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1904         }
1905     } else {
1906         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1907         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1908         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1909     }
1910 }
1911
1912 static av_always_inline
1913 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1914                  ThreadFrame *ref_frame, int x_off, int y_off,
1915                  int bx_off, int by_off, int block_w, int block_h,
1916                  int width, int height, VP56mv *mv)
1917 {
1918     VP56mv uvmv = *mv;
1919
1920     /* Y */
1921     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1922                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1923                 block_w, block_h, width, height, s->linesize,
1924                 s->put_pixels_tab[block_w == 8]);
1925
1926     /* U/V */
1927     if (s->profile == 3) {
1928         /* this block only applies VP8; it is safe to check
1929          * only the profile, as VP7 profile <= 1 */
1930         uvmv.x &= ~7;
1931         uvmv.y &= ~7;
1932     }
1933     x_off   >>= 1;
1934     y_off   >>= 1;
1935     bx_off  >>= 1;
1936     by_off  >>= 1;
1937     width   >>= 1;
1938     height  >>= 1;
1939     block_w >>= 1;
1940     block_h >>= 1;
1941     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1942                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1943                   &uvmv, x_off + bx_off, y_off + by_off,
1944                   block_w, block_h, width, height, s->uvlinesize,
1945                   s->put_pixels_tab[1 + (block_w == 4)]);
1946 }
1947
1948 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1949  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1950 static av_always_inline
1951 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1952                      int mb_xy, int ref)
1953 {
1954     /* Don't prefetch refs that haven't been used very often this frame. */
1955     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1956         int x_off = mb_x << 4, y_off = mb_y << 4;
1957         int mx = (mb->mv.x >> 2) + x_off + 8;
1958         int my = (mb->mv.y >> 2) + y_off;
1959         uint8_t **src = s->framep[ref]->tf.f->data;
1960         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1961         /* For threading, a ff_thread_await_progress here might be useful, but
1962          * it actually slows down the decoder. Since a bad prefetch doesn't
1963          * generate bad decoder output, we don't run it here. */
1964         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1965         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1966         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1967     }
1968 }
1969
1970 /**
1971  * Apply motion vectors to prediction buffer, chapter 18.
1972  */
1973 static av_always_inline
1974 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1975                    VP8Macroblock *mb, int mb_x, int mb_y)
1976 {
1977     int x_off = mb_x << 4, y_off = mb_y << 4;
1978     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1979     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1980     VP56mv *bmv = mb->bmv;
1981
1982     switch (mb->partitioning) {
1983     case VP8_SPLITMVMODE_NONE:
1984         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1985                     0, 0, 16, 16, width, height, &mb->mv);
1986         break;
1987     case VP8_SPLITMVMODE_4x4: {
1988         int x, y;
1989         VP56mv uvmv;
1990
1991         /* Y */
1992         for (y = 0; y < 4; y++) {
1993             for (x = 0; x < 4; x++) {
1994                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1995                             ref, &bmv[4 * y + x],
1996                             4 * x + x_off, 4 * y + y_off, 4, 4,
1997                             width, height, s->linesize,
1998                             s->put_pixels_tab[2]);
1999             }
2000         }
2001
2002         /* U/V */
2003         x_off  >>= 1;
2004         y_off  >>= 1;
2005         width  >>= 1;
2006         height >>= 1;
2007         for (y = 0; y < 2; y++) {
2008             for (x = 0; x < 2; x++) {
2009                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2010                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2011                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2012                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2013                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2014                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2015                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2016                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2017                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2018                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2019                 if (s->profile == 3) {
2020                     uvmv.x &= ~7;
2021                     uvmv.y &= ~7;
2022                 }
2023                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2024                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2025                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2026                               width, height, s->uvlinesize,
2027                               s->put_pixels_tab[2]);
2028             }
2029         }
2030         break;
2031     }
2032     case VP8_SPLITMVMODE_16x8:
2033         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2034                     0, 0, 16, 8, width, height, &bmv[0]);
2035         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2036                     0, 8, 16, 8, width, height, &bmv[1]);
2037         break;
2038     case VP8_SPLITMVMODE_8x16:
2039         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2040                     0, 0, 8, 16, width, height, &bmv[0]);
2041         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2042                     8, 0, 8, 16, width, height, &bmv[1]);
2043         break;
2044     case VP8_SPLITMVMODE_8x8:
2045         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2046                     0, 0, 8, 8, width, height, &bmv[0]);
2047         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2048                     8, 0, 8, 8, width, height, &bmv[1]);
2049         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2050                     0, 8, 8, 8, width, height, &bmv[2]);
2051         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2052                     8, 8, 8, 8, width, height, &bmv[3]);
2053         break;
2054     }
2055 }
2056
2057 static av_always_inline
2058 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2059 {
2060     int x, y, ch;
2061
2062     if (mb->mode != MODE_I4x4) {
2063         uint8_t *y_dst = dst[0];
2064         for (y = 0; y < 4; y++) {
2065             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2066             if (nnz4) {
2067                 if (nnz4 & ~0x01010101) {
2068                     for (x = 0; x < 4; x++) {
2069                         if ((uint8_t) nnz4 == 1)
2070                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2071                                                       td->block[y][x],
2072                                                       s->linesize);
2073                         else if ((uint8_t) nnz4 > 1)
2074                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2075                                                    td->block[y][x],
2076                                                    s->linesize);
2077                         nnz4 >>= 8;
2078                         if (!nnz4)
2079                             break;
2080                     }
2081                 } else {
2082                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2083                 }
2084             }
2085             y_dst += 4 * s->linesize;
2086         }
2087     }
2088
2089     for (ch = 0; ch < 2; ch++) {
2090         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2091         if (nnz4) {
2092             uint8_t *ch_dst = dst[1 + ch];
2093             if (nnz4 & ~0x01010101) {
2094                 for (y = 0; y < 2; y++) {
2095                     for (x = 0; x < 2; x++) {
2096                         if ((uint8_t) nnz4 == 1)
2097                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2098                                                       td->block[4 + ch][(y << 1) + x],
2099                                                       s->uvlinesize);
2100                         else if ((uint8_t) nnz4 > 1)
2101                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2102                                                    td->block[4 + ch][(y << 1) + x],
2103                                                    s->uvlinesize);
2104                         nnz4 >>= 8;
2105                         if (!nnz4)
2106                             goto chroma_idct_end;
2107                     }
2108                     ch_dst += 4 * s->uvlinesize;
2109                 }
2110             } else {
2111                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2112             }
2113         }
2114 chroma_idct_end:
2115         ;
2116     }
2117 }
2118
2119 static av_always_inline
2120 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2121                          VP8FilterStrength *f, int is_vp7)
2122 {
2123     int interior_limit, filter_level;
2124
2125     if (s->segmentation.enabled) {
2126         filter_level = s->segmentation.filter_level[mb->segment];
2127         if (!s->segmentation.absolute_vals)
2128             filter_level += s->filter.level;
2129     } else
2130         filter_level = s->filter.level;
2131
2132     if (s->lf_delta.enabled) {
2133         filter_level += s->lf_delta.ref[mb->ref_frame];
2134         filter_level += s->lf_delta.mode[mb->mode];
2135     }
2136
2137     filter_level = av_clip_uintp2(filter_level, 6);
2138
2139     interior_limit = filter_level;
2140     if (s->filter.sharpness) {
2141         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2142         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2143     }
2144     interior_limit = FFMAX(interior_limit, 1);
2145
2146     f->filter_level = filter_level;
2147     f->inner_limit = interior_limit;
2148     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2149                       mb->mode == VP8_MVMODE_SPLIT;
2150 }
2151
2152 static av_always_inline
2153 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2154                int mb_x, int mb_y, int is_vp7)
2155 {
2156     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2157     int filter_level = f->filter_level;
2158     int inner_limit = f->inner_limit;
2159     int inner_filter = f->inner_filter;
2160     ptrdiff_t linesize   = s->linesize;
2161     ptrdiff_t uvlinesize = s->uvlinesize;
2162     static const uint8_t hev_thresh_lut[2][64] = {
2163         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2164           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2165           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2166           3, 3, 3, 3 },
2167         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2168           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2169           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2170           2, 2, 2, 2 }
2171     };
2172
2173     if (!filter_level)
2174         return;
2175
2176     if (is_vp7) {
2177         bedge_lim_y  = filter_level;
2178         bedge_lim_uv = filter_level * 2;
2179         mbedge_lim   = filter_level + 2;
2180     } else {
2181         bedge_lim_y  =
2182         bedge_lim_uv = filter_level * 2 + inner_limit;
2183         mbedge_lim   = bedge_lim_y + 4;
2184     }
2185
2186     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2187
2188     if (mb_x) {
2189         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2190                                        mbedge_lim, inner_limit, hev_thresh);
2191         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2192                                        mbedge_lim, inner_limit, hev_thresh);
2193     }
2194
2195 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2196     if (cond && inner_filter) {                                               \
2197         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2198                                              bedge_lim_y, inner_limit,        \
2199                                              hev_thresh);                     \
2200         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2201                                              bedge_lim_y, inner_limit,        \
2202                                              hev_thresh);                     \
2203         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2204                                              bedge_lim_y, inner_limit,        \
2205                                              hev_thresh);                     \
2206         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2207                                              uvlinesize,  bedge_lim_uv,       \
2208                                              inner_limit, hev_thresh);        \
2209     }
2210
2211     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2212
2213     if (mb_y) {
2214         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2215                                        mbedge_lim, inner_limit, hev_thresh);
2216         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2217                                        mbedge_lim, inner_limit, hev_thresh);
2218     }
2219
2220     if (inner_filter) {
2221         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2222                                              linesize, bedge_lim_y,
2223                                              inner_limit, hev_thresh);
2224         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2225                                              linesize, bedge_lim_y,
2226                                              inner_limit, hev_thresh);
2227         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2228                                              linesize, bedge_lim_y,
2229                                              inner_limit, hev_thresh);
2230         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2231                                              dst[2] +  4 * uvlinesize,
2232                                              uvlinesize, bedge_lim_uv,
2233                                              inner_limit, hev_thresh);
2234     }
2235
2236     H_LOOP_FILTER_16Y_INNER(is_vp7)
2237 }
2238
2239 static av_always_inline
2240 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2241                       int mb_x, int mb_y)
2242 {
2243     int mbedge_lim, bedge_lim;
2244     int filter_level = f->filter_level;
2245     int inner_limit  = f->inner_limit;
2246     int inner_filter = f->inner_filter;
2247     ptrdiff_t linesize = s->linesize;
2248
2249     if (!filter_level)
2250         return;
2251
2252     bedge_lim  = 2 * filter_level + inner_limit;
2253     mbedge_lim = bedge_lim + 4;
2254
2255     if (mb_x)
2256         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2257     if (inner_filter) {
2258         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2259         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2260         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2261     }
2262
2263     if (mb_y)
2264         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2265     if (inner_filter) {
2266         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2267         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2268         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2269     }
2270 }
2271
2272 #define MARGIN (16 << 2)
2273 static av_always_inline
2274 int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2275                                     VP8Frame *prev_frame, int is_vp7)
2276 {
2277     VP8Context *s = avctx->priv_data;
2278     int mb_x, mb_y;
2279
2280     s->mv_bounds.mv_min.y = -MARGIN;
2281     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2282     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2283         VP8Macroblock *mb = s->macroblocks_base +
2284                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2285         int mb_xy = mb_y * s->mb_width;
2286
2287         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2288
2289         s->mv_bounds.mv_min.x = -MARGIN;
2290         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2291
2292         if (vpX_rac_is_end(&s->c)) {
2293             return AVERROR_INVALIDDATA;
2294         }
2295         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2296             if (mb_y == 0)
2297                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2298                          DC_PRED * 0x01010101);
2299             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2300                            prev_frame && prev_frame->seg_map ?
2301                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2302             s->mv_bounds.mv_min.x -= 64;
2303             s->mv_bounds.mv_max.x -= 64;
2304         }
2305         s->mv_bounds.mv_min.y -= 64;
2306         s->mv_bounds.mv_max.y -= 64;
2307     }
2308     return 0;
2309 }
2310
2311 static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2312                                    VP8Frame *prev_frame)
2313 {
2314     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2315 }
2316
2317 static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2318                                    VP8Frame *prev_frame)
2319 {
2320     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2321 }
2322
#if HAVE_THREADS
/**
 * Wait until another row worker has progressed far enough.
 *
 * Positions are packed as (mb_y << 16) | (mb_x & 0xFFFF).  If the position
 * published by @p otd is still below the requested one, the requested
 * position is stored in td->wait_mb_pos and the caller sleeps on otd's
 * condition variable (holding otd's lock) until otd->thread_mb_pos has
 * reached it; td->wait_mb_pos is then reset to INT_MAX.
 *
 * @param td         thread data of the waiting worker
 * @param otd        thread data of the worker being waited for
 * @param mb_x_check horizontal position that must have been reached
 * @param mb_y_check row that must have been reached
 *
 * Arguments are parenthesised in the expansion but may be evaluated more
 * than once, so they must be free of side effects.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

/**
 * Publish the position reached by @p td and wake up waiters if needed.
 *
 * The packed position is stored in td->thread_mb_pos.  td->cond is broadcast
 * (under td->lock) only when slice threading is active with more than one
 * job and either one of the neighbour pointers is NULL, or a neighbour other
 * than @p td itself has announced a wait position that is now reached.
 *
 * Besides its arguments, the expansion uses the identifiers `avctx`,
 * `num_jobs`, `next_td` and `prev_td`, which must be in scope at the call
 * site.  Arguments may be evaluated more than once.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without thread support both operations are no-ops; the do/while form keeps
 * them usable as single statements in if/else bodies. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x) do { } while (0)
#endif
2360
2361 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2362                                         int jobnr, int threadnr, int is_vp7)
2363 {
2364     VP8Context *s = avctx->priv_data;
2365     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2366     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2367     int mb_x, mb_xy = mb_y * s->mb_width;
2368     int num_jobs = s->num_jobs;
2369     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2370     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2371     VP8Macroblock *mb;
2372     uint8_t *dst[3] = {
2373         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2374         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2375         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2376     };
2377
2378     if (vpX_rac_is_end(c))
2379          return AVERROR_INVALIDDATA;
2380
2381     if (mb_y == 0)
2382         prev_td = td;
2383     else
2384         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2385     if (mb_y == s->mb_height - 1)
2386         next_td = td;
2387     else
2388         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2389     if (s->mb_layout == 1)
2390         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2391     else {
2392         // Make sure the previous frame has read its segmentation map,
2393         // if we re-use the same map.
2394         if (prev_frame && s->segmentation.enabled &&
2395             !s->segmentation.update_map)
2396             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2397         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2398         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2399         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2400     }
2401
2402     if (!is_vp7 || mb_y == 0)
2403         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2404
2405     td->mv_bounds.mv_min.x = -MARGIN;
2406     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2407
2408     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2409         if (vpX_rac_is_end(c))
2410             return AVERROR_INVALIDDATA;
2411         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2412         if (prev_td != td) {
2413             if (threadnr != 0) {
2414                 check_thread_pos(td, prev_td,
2415                                  mb_x + (is_vp7 ? 2 : 1),
2416                                  mb_y - (is_vp7 ? 2 : 1));
2417             } else {
2418                 check_thread_pos(td, prev_td,
2419                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2420                                  mb_y - (is_vp7 ? 2 : 1));
2421             }
2422         }
2423
2424         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2425                          s->linesize, 4);
2426         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2427                          dst[2] - dst[1], 2);
2428
2429         if (!s->mb_layout)
2430             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2431                            prev_frame && prev_frame->seg_map ?
2432                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2433
2434         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2435
2436         if (!mb->skip)
2437             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2438
2439         if (mb->mode <= MODE_I4x4)
2440             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2441         else
2442             inter_predict(s, td, dst, mb, mb_x, mb_y);
2443
2444         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2445
2446         if (!mb->skip) {
2447             idct_mb(s, td, dst, mb);
2448         } else {
2449             AV_ZERO64(td->left_nnz);
2450             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2451
2452             /* Reset DC block predictors if they would exist
2453              * if the mb had coefficients */
2454             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2455                 td->left_nnz[8]     = 0;
2456                 s->top_nnz[mb_x][8] = 0;
2457             }
2458         }
2459
2460         if (s->deblock_filter)
2461             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2462
2463         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2464             if (s->filter.simple)
2465                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2466                                  NULL, NULL, s->linesize, 0, 1);
2467             else
2468                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2469                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2470         }
2471
2472         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2473
2474         dst[0]      += 16;
2475         dst[1]      += 8;
2476         dst[2]      += 8;
2477         td->mv_bounds.mv_min.x -= 64;
2478         td->mv_bounds.mv_max.x -= 64;
2479
2480         if (mb_x == s->mb_width + 1) {
2481             update_pos(td, mb_y, s->mb_width + 3);
2482         } else {
2483             update_pos(td, mb_y, mb_x);
2484         }
2485     }
2486     return 0;
2487 }
2488
2489 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2490                                         int jobnr, int threadnr)
2491 {
2492     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2493 }
2494
2495 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2496                                         int jobnr, int threadnr)
2497 {
2498     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2499 }
2500
2501 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2502                               int jobnr, int threadnr, int is_vp7)
2503 {
2504     VP8Context *s = avctx->priv_data;
2505     VP8ThreadData *td = &s->thread_data[threadnr];
2506     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2507     AVFrame *curframe = s->curframe->tf.f;
2508     VP8Macroblock *mb;
2509     VP8ThreadData *prev_td, *next_td;
2510     uint8_t *dst[3] = {
2511         curframe->data[0] + 16 * mb_y * s->linesize,
2512         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2513         curframe->data[2] +  8 * mb_y * s->uvlinesize
2514     };
2515
2516     if (s->mb_layout == 1)
2517         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2518     else
2519         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2520
2521     if (mb_y == 0)
2522         prev_td = td;
2523     else
2524         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2525     if (mb_y == s->mb_height - 1)
2526         next_td = td;
2527     else
2528         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2529
2530     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2531         VP8FilterStrength *f = &td->filter_strength[mb_x];
2532         if (prev_td != td)
2533             check_thread_pos(td, prev_td,
2534                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2535         if (next_td != td)
2536             if (next_td != &s->thread_data[0])
2537                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2538
2539         if (num_jobs == 1) {
2540             if (s->filter.simple)
2541                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2542                                  NULL, NULL, s->linesize, 0, 1);
2543             else
2544                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2545                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2546         }
2547
2548         if (s->filter.simple)
2549             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2550         else
2551             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2552         dst[0] += 16;
2553         dst[1] += 8;
2554         dst[2] += 8;
2555
2556         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2557     }
2558 }
2559
2560 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2561                               int jobnr, int threadnr)
2562 {
2563     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2564 }
2565
2566 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2567                               int jobnr, int threadnr)
2568 {
2569     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2570 }
2571
2572 static av_always_inline
2573 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2574                               int threadnr, int is_vp7)
2575 {
2576     VP8Context *s = avctx->priv_data;
2577     VP8ThreadData *td = &s->thread_data[jobnr];
2578     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2579     VP8Frame *curframe = s->curframe;
2580     int mb_y, num_jobs = s->num_jobs;
2581     int ret;
2582
2583     td->thread_nr = threadnr;
2584     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2585     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2586     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2587         atomic_store(&td->thread_mb_pos, mb_y << 16);
2588         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2589         if (ret < 0) {
2590             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2591             return ret;
2592         }
2593         if (s->deblock_filter)
2594             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2595         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2596
2597         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2598         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2599
2600         if (avctx->active_thread_type == FF_THREAD_FRAME)
2601             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2602     }
2603
2604     return 0;
2605 }
2606
2607 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2608                                     int jobnr, int threadnr)
2609 {
2610     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2611 }
2612
2613 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2614                                     int jobnr, int threadnr)
2615 {
2616     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2617 }
2618
2619 static av_always_inline
2620 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2621                       AVPacket *avpkt, int is_vp7)
2622 {
2623     VP8Context *s = avctx->priv_data;
2624     int ret, i, referenced, num_jobs;
2625     enum AVDiscard skip_thresh;
2626     VP8Frame *av_uninit(curframe), *prev_frame;
2627
2628     if (is_vp7)
2629         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2630     else
2631         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2632
2633     if (ret < 0)
2634         goto err;
2635
2636     if (s->actually_webp) {
2637         // avctx->pix_fmt already set in caller.
2638     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2639         s->pix_fmt = get_pixel_format(s);
2640         if (s->pix_fmt < 0) {
2641             ret = AVERROR(EINVAL);
2642             goto err;
2643         }
2644         avctx->pix_fmt = s->pix_fmt;
2645     }
2646
2647     prev_frame = s->framep[VP56_FRAME_CURRENT];
2648
2649     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2650                  s->update_altref == VP56_FRAME_CURRENT;
2651
2652     skip_thresh = !referenced ? AVDISCARD_NONREF
2653                               : !s->keyframe ? AVDISCARD_NONKEY
2654                                              : AVDISCARD_ALL;
2655
2656     if (avctx->skip_frame >= skip_thresh) {
2657         s->invisible = 1;
2658         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2659         goto skip_decode;
2660     }
2661     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2662
2663     // release no longer referenced frames
2664     for (i = 0; i < 5; i++)
2665         if (s->frames[i].tf.f->buf[0] &&
2666             &s->frames[i] != prev_frame &&
2667             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2668             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2669             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2670             vp8_release_frame(s, &s->frames[i]);
2671
2672     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2673
2674     if (!s->colorspace)
2675         avctx->colorspace = AVCOL_SPC_BT470BG;
2676     if (s->fullrange)
2677         avctx->color_range = AVCOL_RANGE_JPEG;
2678     else
2679         avctx->color_range = AVCOL_RANGE_MPEG;
2680
2681     /* Given that arithmetic probabilities are updated every frame, it's quite
2682      * likely that the values we have on a random interframe are complete
2683      * junk if we didn't start decode on a keyframe. So just don't display
2684      * anything rather than junk. */
2685     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2686                          !s->framep[VP56_FRAME_GOLDEN]   ||
2687                          !s->framep[VP56_FRAME_GOLDEN2])) {
2688         av_log(avctx, AV_LOG_WARNING,
2689                "Discarding interframe without a prior keyframe!\n");
2690         ret = AVERROR_INVALIDDATA;
2691         goto err;
2692     }
2693
2694     curframe->tf.f->key_frame = s->keyframe;
2695     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2696                                             : AV_PICTURE_TYPE_P;
2697     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2698         goto err;
2699
2700     // check if golden and altref are swapped
2701     if (s->update_altref != VP56_FRAME_NONE)
2702         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2703     else
2704         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2705
2706     if (s->update_golden != VP56_FRAME_NONE)
2707         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2708     else
2709         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2710
2711     if (s->update_last)
2712         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2713     else
2714         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2715
2716     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2717
2718     if (avctx->codec->update_thread_context)
2719         ff_thread_finish_setup(avctx);
2720
2721     if (avctx->hwaccel) {
2722         ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2723         if (ret < 0)
2724             goto err;
2725
2726         ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2727         if (ret < 0)
2728             goto err;
2729
2730         ret = avctx->hwaccel->end_frame(avctx);
2731         if (ret < 0)
2732             goto err;
2733
2734     } else {
2735         s->linesize   = curframe->tf.f->linesize[0];
2736         s->uvlinesize = curframe->tf.f->linesize[1];
2737
2738         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2739         /* Zero macroblock structures for top/top-left prediction
2740          * from outside the frame. */
2741         if (!s->mb_layout)
2742             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2743                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2744         if (!s->mb_layout && s->keyframe)
2745             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2746
2747         memset(s->ref_count, 0, sizeof(s->ref_count));
2748
2749         if (s->mb_layout == 1) {
2750             // Make sure the previous frame has read its segmentation map,
2751             // if we re-use the same map.
2752             if (prev_frame && s->segmentation.enabled &&
2753                 !s->segmentation.update_map)
2754                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2755             if (is_vp7)
2756                 ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2757             else
2758                 ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2759             if (ret < 0)
2760                 goto err;
2761         }
2762
2763         if (avctx->active_thread_type == FF_THREAD_FRAME)
2764             num_jobs = 1;
2765         else
2766             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2767         s->num_jobs   = num_jobs;
2768         s->curframe   = curframe;
2769         s->prev_frame = prev_frame;
2770         s->mv_bounds.mv_min.y   = -MARGIN;
2771         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2772         for (i = 0; i < MAX_THREADS; i++) {
2773             VP8ThreadData *td = &s->thread_data[i];
2774             atomic_init(&td->thread_mb_pos, 0);
2775             atomic_init(&td->wait_mb_pos, INT_MAX);
2776         }
2777         if (is_vp7)
2778             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2779                             num_jobs);
2780         else
2781             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2782                             num_jobs);
2783     }
2784
2785     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2786     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2787
2788 skip_decode:
2789     // if future frames don't use the updated probabilities,
2790     // reset them to the values we saved
2791     if (!s->update_probabilities)
2792         s->prob[0] = s->prob[1];
2793
2794     if (!s->invisible) {
2795         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2796             return ret;
2797         *got_frame = 1;
2798     }
2799
2800     return avpkt->size;
2801 err:
2802     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2803     return ret;
2804 }
2805
2806 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2807                         AVPacket *avpkt)
2808 {
2809     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2810 }
2811
#if CONFIG_VP7_DECODER
/** VP7 decode callback: vp78_decode_frame() with is_vp7 = IS_VP7. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt,
                             IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2819
2820 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2821 {
2822     VP8Context *s = avctx->priv_data;
2823     int i;
2824
2825     if (!s)
2826         return 0;
2827
2828     vp8_decode_flush_impl(avctx, 1);
2829     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2830         av_frame_free(&s->frames[i].tf.f);
2831
2832     return 0;
2833 }
2834
2835 static av_cold int vp8_init_frames(VP8Context *s)
2836 {
2837     int i;
2838     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2839         s->frames[i].tf.f = av_frame_alloc();
2840         if (!s->frames[i].tf.f)
2841             return AVERROR(ENOMEM);
2842     }
2843     return 0;
2844 }
2845
2846 static av_always_inline
2847 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2848 {
2849     VP8Context *s = avctx->priv_data;
2850     int ret;
2851
2852     s->avctx = avctx;
2853     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2854     s->pix_fmt = AV_PIX_FMT_NONE;
2855     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2856
2857     ff_videodsp_init(&s->vdsp, 8);
2858
2859     ff_vp78dsp_init(&s->vp8dsp);
2860     if (CONFIG_VP7_DECODER && is_vp7) {
2861         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2862         ff_vp7dsp_init(&s->vp8dsp);
2863         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2864         s->filter_mb_row           = vp7_filter_mb_row;
2865     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2866         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2867         ff_vp8dsp_init(&s->vp8dsp);
2868         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2869         s->filter_mb_row           = vp8_filter_mb_row;
2870     }
2871
2872     /* does not change for VP8 */
2873     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2874
2875     if ((ret = vp8_init_frames(s)) < 0) {
2876         ff_vp8_decode_free(avctx);
2877         return ret;
2878     }
2879
2880     return 0;
2881 }
2882
#if CONFIG_VP7_DECODER
/** VP7 init callback: vp78_decode_init() with is_vp7 = IS_VP7. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx,
                            IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2889
2890 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2891 {
2892     return vp78_decode_init(avctx, IS_VP8);
2893 }
2894
#if CONFIG_VP8_DECODER
#if HAVE_THREADS
/* Map a pointer into s_src->frames[] onto the same slot of s->frames[];
 * NULL stays NULL.  Uses `s` and `s_src` from the enclosing scope. */
#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)

/**
 * Frame-threading callback: copy the decoder state a following frame depends
 * on from the source context into the destination context.
 *
 * Copies pixel format, probabilities, segmentation, loop-filter deltas and
 * sign bias, takes references on every allocated source frame and rebases
 * the source's next_framep pointers into the destination's frame array.
 *
 * @param dst destination codec context
 * @param src source codec context
 * @return 0 on success, or the negative error returned by vp8_ref_frame()
 */
static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s     = dst->priv_data;
    VP8Context *s_src = src->priv_data;
    int idx;

    /* Existing buffers are dropped when the macroblock dimensions differ. */
    if (s->macroblocks_base) {
        if (s->mb_width  != s_src->mb_width ||
            s->mb_height != s_src->mb_height) {
            free_buffers(s);
            s->mb_width  = s_src->mb_width;
            s->mb_height = s_src->mb_height;
        }
    }

    s->pix_fmt      = s_src->pix_fmt;
    /* Take prob[1] of the source when it does not keep its updated
     * probabilities, prob[0] otherwise. */
    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    for (idx = 0; idx < FF_ARRAY_ELEMS(s_src->frames); idx++) {
        int ret;

        if (!s_src->frames[idx].tf.f->buf[0])
            continue;

        ret = vp8_ref_frame(s, &s->frames[idx], &s_src->frames[idx]);
        if (ret < 0)
            return ret;
    }

    for (idx = 0; idx < 4; idx++)
        s->framep[idx] = REBASE(s_src->next_framep[idx]);

    return 0;
}
#endif /* HAVE_THREADS */
#endif /* CONFIG_VP8_DECODER */
2935
#if CONFIG_VP7_DECODER
/* Codec descriptor for VP7.  Only AV_CODEC_CAP_DR1 is advertised; no
 * threading capabilities, update_thread_context or hw_configs are set. */
AVCodec ff_vp7_decoder = {
    /* identification */
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    .capabilities   = AV_CODEC_CAP_DR1,
    /* private context and callbacks */
    .priv_data_size = sizeof(VP8Context),
    .init           = vp7_decode_init,
    .decode         = vp7_decode_frame,
    .flush          = vp8_decode_flush,
    .close          = ff_vp8_decode_free,
};
#endif /* CONFIG_VP7_DECODER */
2950
#if CONFIG_VP8_DECODER
/* Codec descriptor for VP8.  Advertises DR1 plus frame and slice threading,
 * and lists the VAAPI / NVDEC hwaccel configurations that are compiled in. */
AVCodec ff_vp8_decoder = {
    /* identification */
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    /* capabilities */
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .caps_internal         = FF_CODEC_CAP_ALLOCATE_PROGRESS,
    /* private context and callbacks */
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .decode                = ff_vp8_decode_frame,
    .flush                 = vp8_decode_flush,
    .close                 = ff_vp8_decode_free,
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    /* NULL-terminated list of hardware configurations */
    .hw_configs            = (const AVCodecHWConfigInternal*[]) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */