git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "hwaccel.h"
  31 #include "internal.h"
  32 #include "mathops.h"
  33 #include "rectangle.h"
  34 #include "thread.h"
  35 #include "vp8.h"
  36 #include "vp8data.h"
  37
  38 #if ARCH_ARM
  39 #   include "arm/vp8.h"
  40 #endif
  41
  /*
   * VPX(vp7, f): select the VP7 or VP8 flavour of the symbol f.
   * When both decoders are compiled in, the choice is made by the value of
   * the vp7 expression; otherwise the macro collapses to the only flavour
   * that is available. The selector is parenthesized so that any expression
   * can be passed safely.
   */
  #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  #define VPX(vp7, f) ((vp7) ? vp7_ ## f : vp8_ ## f)
  #elif CONFIG_VP7_DECODER
  #define VPX(vp7, f) vp7_ ## f
  #else // CONFIG_VP8_DECODER
  #define VPX(vp7, f) vp8_ ## f
  #endif
  49
  50 static void free_buffers(VP8Context *s)
  51 {
  52     int i;
  53     if (s->thread_data)
  54         for (i = 0; i < MAX_THREADS; i++) {
  55 #if HAVE_THREADS
  56             pthread_cond_destroy(&s->thread_data[i].cond);
  57             pthread_mutex_destroy(&s->thread_data[i].lock);
  58 #endif
  59             av_freep(&s->thread_data[i].filter_strength);
  60         }
  61     av_freep(&s->thread_data);
  62     av_freep(&s->macroblocks_base);
  63     av_freep(&s->intra4x4_pred_mode_top);
  64     av_freep(&s->top_nnz);
  65     av_freep(&s->top_border);
  66
  67     s->macroblocks = NULL;
  68 }
  69
  70 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  71 {
  72     int ret;
  73     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  74                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  75         return ret;
  76     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  77         goto fail;
  78     if (s->avctx->hwaccel) {
  79         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  80         if (hwaccel->frame_priv_data_size) {
  81             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  82             if (!f->hwaccel_priv_buf)
  83                 goto fail;
  84             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  85         }
  86     }
  87     return 0;
  88
  89 fail:
  90     av_buffer_unref(&f->seg_map);
  91     ff_thread_release_buffer(s->avctx, &f->tf);
  92     return AVERROR(ENOMEM);
  93 }
  94
  95 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  96 {
  97     av_buffer_unref(&f->seg_map);
  98     av_buffer_unref(&f->hwaccel_priv_buf);
  99     f->hwaccel_picture_private = NULL;
 100     ff_thread_release_buffer(s->avctx, &f->tf);
 101 }
 102
 103 #if CONFIG_VP8_DECODER
 104 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 105 {
 106     int ret;
 107
 108     vp8_release_frame(s, dst);
 109
 110     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 111         return ret;
 112     if (src->seg_map &&
 113         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 114         vp8_release_frame(s, dst);
 115         return AVERROR(ENOMEM);
 116     }
 117     if (src->hwaccel_picture_private) {
 118         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 119         if (!dst->hwaccel_priv_buf)
 120             return AVERROR(ENOMEM);
 121         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 122     }
 123
 124     return 0;
 125 }
 126 #endif /* CONFIG_VP8_DECODER */
 127
 128 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 129 {
 130     VP8Context *s = avctx->priv_data;
 131     int i;
 132
 133     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 134         vp8_release_frame(s, &s->frames[i]);
 135     memset(s->framep, 0, sizeof(s->framep));
 136
 137     if (free_mem)
 138         free_buffers(s);
 139 }
 140
 141 static void vp8_decode_flush(AVCodecContext *avctx)
 142 {
 143     vp8_decode_flush_impl(avctx, 0);
 144 }
 145
 146 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 147 {
 148     VP8Frame *frame = NULL;
 149     int i;
 150
 151     // find a free buffer
 152     for (i = 0; i < 5; i++)
 153         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 154             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 155             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 157             frame = &s->frames[i];
 158             break;
 159         }
 160     if (i == 5) {
 161         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 162         abort();
 163     }
 164     if (frame->tf.f->buf[0])
 165         vp8_release_frame(s, frame);
 166
 167     return frame;
 168 }
 169
 170 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 171 {
 172     enum AVPixelFormat pix_fmts[] = {
 173 #if CONFIG_VP8_VAAPI_HWACCEL
 174         AV_PIX_FMT_VAAPI,
 175 #endif
 176 #if CONFIG_VP8_NVDEC_HWACCEL
 177         AV_PIX_FMT_CUDA,
 178 #endif
 179         AV_PIX_FMT_YUV420P,
 180         AV_PIX_FMT_NONE,
 181     };
 182
 183     return ff_get_format(s->avctx, pix_fmts);
 184 }
 185
 186 static av_always_inline
 187 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 188 {
 189     AVCodecContext *avctx = s->avctx;
 190     int i, ret;
 191
 192     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 193         height != s->avctx->height) {
 194         vp8_decode_flush_impl(s->avctx, 1);
 195
 196         ret = ff_set_dimensions(s->avctx, width, height);
 197         if (ret < 0)
 198             return ret;
 199     }
 200
 201     if (!s->actually_webp && !is_vp7) {
 202         s->pix_fmt = get_pixel_format(s);
 203         if (s->pix_fmt < 0)
 204             return AVERROR(EINVAL);
 205         avctx->pix_fmt = s->pix_fmt;
 206     }
 207
 208     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 209     s->mb_height = (s->avctx->coded_height + 15) / 16;
 210
 211     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 212                    avctx->thread_count > 1;
 213     if (!s->mb_layout) { // Frame threading and one thread
 214         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 215                                                sizeof(*s->macroblocks));
 216         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 217     } else // Sliced threading
 218         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 219                                          sizeof(*s->macroblocks));
 220     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 221     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 222     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 223
 224     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 225         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 226         free_buffers(s);
 227         return AVERROR(ENOMEM);
 228     }
 229
 230     for (i = 0; i < MAX_THREADS; i++) {
 231         s->thread_data[i].filter_strength =
 232             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 233         if (!s->thread_data[i].filter_strength) {
 234             free_buffers(s);
 235             return AVERROR(ENOMEM);
 236         }
 237 #if HAVE_THREADS
 238         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 239         pthread_cond_init(&s->thread_data[i].cond, NULL);
 240 #endif
 241     }
 242
 243     s->macroblocks = s->macroblocks_base + 1;
 244
 245     return 0;
 246 }
 247
 248 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 249 {
 250     return update_dimensions(s, width, height, IS_VP7);
 251 }
 252
 253 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 254 {
 255     return update_dimensions(s, width, height, IS_VP8);
 256 }
 257
 258
 259 static void parse_segment_info(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262     int i;
 263
 264     s->segmentation.update_map = vp8_rac_get(c);
 265     s->segmentation.update_feature_data = vp8_rac_get(c);
 266
 267     if (s->segmentation.update_feature_data) {
 268         s->segmentation.absolute_vals = vp8_rac_get(c);
 269
 270         for (i = 0; i < 4; i++)
 271             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 272
 273         for (i = 0; i < 4; i++)
 274             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 275     }
 276     if (s->segmentation.update_map)
 277         for (i = 0; i < 3; i++)
 278             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 279 }
 280
 281 static void update_lf_deltas(VP8Context *s)
 282 {
 283     VP56RangeCoder *c = &s->c;
 284     int i;
 285
 286     for (i = 0; i < 4; i++) {
 287         if (vp8_rac_get(c)) {
 288             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 289
 290             if (vp8_rac_get(c))
 291                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 292         }
 293     }
 294
 295     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 296         if (vp8_rac_get(c)) {
 297             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 298
 299             if (vp8_rac_get(c))
 300                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 301         }
 302     }
 303 }
 304
 305 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 306 {
 307     const uint8_t *sizes = buf;
 308     int i;
 309     int ret;
 310
 311     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 312
 313     buf      += 3 * (s->num_coeff_partitions - 1);
 314     buf_size -= 3 * (s->num_coeff_partitions - 1);
 315     if (buf_size < 0)
 316         return -1;
 317
 318     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 319         int size = AV_RL24(sizes + 3 * i);
 320         if (buf_size - size < 0)
 321             return -1;
 322         s->coeff_partition_size[i] = size;
 323
 324         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 325         if (ret < 0)
 326             return ret;
 327         buf      += size;
 328         buf_size -= size;
 329     }
 330
 331     s->coeff_partition_size[i] = buf_size;
 332     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 333
 334     return 0;
 335 }
 336
 337 static void vp7_get_quants(VP8Context *s)
 338 {
 339     VP56RangeCoder *c = &s->c;
 340
 341     int yac_qi  = vp8_rac_get_uint(c, 7);
 342     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 343     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 344     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 345     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 346     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 347
 348     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 349     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 350     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 351     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 352     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 353     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 354 }
 355
 356 static void vp8_get_quants(VP8Context *s)
 357 {
 358     VP56RangeCoder *c = &s->c;
 359     int i, base_qi;
 360
 361     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 362     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 363     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 364     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 365     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 366     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 367
 368     for (i = 0; i < 4; i++) {
 369         if (s->segmentation.enabled) {
 370             base_qi = s->segmentation.base_quant[i];
 371             if (!s->segmentation.absolute_vals)
 372                 base_qi += s->quant.yac_qi;
 373         } else
 374             base_qi = s->quant.yac_qi;
 375
 376         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 377         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 378         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 379         /* 101581>>16 is equivalent to 155/100 */
 380         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 381         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 382         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 383
 384         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 385         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 386     }
 387 }
 388
 389 /**
 390  * Determine which buffers golden and altref should be updated with after this frame.
 391  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 392  *
 393  * Intra frames update all 3 references
 394  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 395  * If the update (golden|altref) flag is set, it's updated with the current frame
 396  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 397  * If the flag is not set, the number read means:
 398  *      0: no update
 399  *      1: VP56_FRAME_PREVIOUS
 400  *      2: update golden with altref, or update altref with golden
 401  */
 402 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 403 {
 404     VP56RangeCoder *c = &s->c;
 405
 406     if (update)
 407         return VP56_FRAME_CURRENT;
 408
 409     switch (vp8_rac_get_uint(c, 2)) {
 410     case 1:
 411         return VP56_FRAME_PREVIOUS;
 412     case 2:
 413         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 414     }
 415     return VP56_FRAME_NONE;
 416 }
 417
 418 static void vp78_reset_probability_tables(VP8Context *s)
 419 {
 420     int i, j;
 421     for (i = 0; i < 4; i++)
 422         for (j = 0; j < 16; j++)
 423             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 424                    sizeof(s->prob->token[i][j]));
 425 }
 426
 427 static void vp78_update_probability_tables(VP8Context *s)
 428 {
 429     VP56RangeCoder *c = &s->c;
 430     int i, j, k, l, m;
 431
 432     for (i = 0; i < 4; i++)
 433         for (j = 0; j < 8; j++)
 434             for (k = 0; k < 3; k++)
 435                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 436                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 437                         int prob = vp8_rac_get_uint(c, 8);
 438                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 439                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 440                     }
 441 }
 442
 443 #define VP7_MVC_SIZE 17
 444 #define VP8_MVC_SIZE 19
 445
 446 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 447                                                             int mvc_size)
 448 {
 449     VP56RangeCoder *c = &s->c;
 450     int i, j;
 451
 452     if (vp8_rac_get(c))
 453         for (i = 0; i < 4; i++)
 454             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 455     if (vp8_rac_get(c))
 456         for (i = 0; i < 3; i++)
 457             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 458
 459     // 17.2 MV probability update
 460     for (i = 0; i < 2; i++)
 461         for (j = 0; j < mvc_size; j++)
 462             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 463                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 464 }
 465
 466 static void update_refs(VP8Context *s)
 467 {
 468     VP56RangeCoder *c = &s->c;
 469
 470     int update_golden = vp8_rac_get(c);
 471     int update_altref = vp8_rac_get(c);
 472
 473     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 474     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 475 }
 476
 477 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 478 {
 479     int i, j;
 480
 481     for (j = 1; j < 3; j++) {
 482         for (i = 0; i < height / 2; i++)
 483             memcpy(dst->data[j] + i * dst->linesize[j],
 484                    src->data[j] + i * src->linesize[j], width / 2);
 485     }
 486 }
 487
 /**
  * Apply a linear fade to a plane of 8-bit samples:
  * out = clip_uint8(in + ((in * beta) >> 8) + alpha).
  *
  * @param dst          destination plane
  * @param dst_linesize distance in bytes between destination rows
  * @param src          source plane
  * @param src_linesize distance in bytes between source rows
  * @param width        samples per row
  * @param height       number of rows
  * @param alpha        additive term
  * @param beta         multiplicative term, in 1/256 units
  */
 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
                  const uint8_t *src, ptrdiff_t src_linesize,
                  int width, int height,
                  int alpha, int beta)
 {
     int row, col;

     for (row = 0; row < height; row++) {
         const uint8_t *in = src + row * src_linesize;
         uint8_t *out      = dst + row * dst_linesize;

         for (col = 0; col < width; col++) {
             const int sample = in[col];
             const int scaled = (sample * beta) >> 8;

             out[col] = av_clip_uint8(sample + scaled + alpha);
         }
     }
 }
 503
 504 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 505 {
 506     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 507     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 508     int ret;
 509
 510     if (c->end <= c->buffer && c->bits >= 0)
 511         return AVERROR_INVALIDDATA;
 512
 513     if (!s->keyframe && (alpha || beta)) {
 514         int width  = s->mb_width * 16;
 515         int height = s->mb_height * 16;
 516         AVFrame *src, *dst;
 517
 518         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 519             !s->framep[VP56_FRAME_GOLDEN]) {
 520             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 521             return AVERROR_INVALIDDATA;
 522         }
 523
 524         dst =
 525         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 526
 527         /* preserve the golden frame, write a new previous frame */
 528         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 529             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 530             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 531                 return ret;
 532
 533             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 534
 535             copy_chroma(dst, src, width, height);
 536         }
 537
 538         fade(dst->data[0], dst->linesize[0],
 539              src->data[0], src->linesize[0],
 540              width, height, alpha, beta);
 541     }
 542
 543     return 0;
 544 }
 545
 546 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 547 {
 548     VP56RangeCoder *c = &s->c;
 549     int part1_size, hscale, vscale, i, j, ret;
 550     int width  = s->avctx->width;
 551     int height = s->avctx->height;
 552
 553     if (buf_size < 4) {
 554         return AVERROR_INVALIDDATA;
 555     }
 556
 557     s->profile = (buf[0] >> 1) & 7;
 558     if (s->profile > 1) {
 559         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 560         return AVERROR_INVALIDDATA;
 561     }
 562
 563     s->keyframe  = !(buf[0] & 1);
 564     s->invisible = 0;
 565     part1_size   = AV_RL24(buf) >> 4;
 566
 567     if (buf_size < 4 - s->profile + part1_size) {
 568         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 569         return AVERROR_INVALIDDATA;
 570     }
 571
 572     buf      += 4 - s->profile;
 573     buf_size -= 4 - s->profile;
 574
 575     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 576
 577     ret = ff_vp56_init_range_decoder(c, buf, part1_size);
 578     if (ret < 0)
 579         return ret;
 580     buf      += part1_size;
 581     buf_size -= part1_size;
 582
 583     /* A. Dimension information (keyframes only) */
 584     if (s->keyframe) {
 585         width  = vp8_rac_get_uint(c, 12);
 586         height = vp8_rac_get_uint(c, 12);
 587         hscale = vp8_rac_get_uint(c, 2);
 588         vscale = vp8_rac_get_uint(c, 2);
 589         if (hscale || vscale)
 590             avpriv_request_sample(s->avctx, "Upscaling");
 591
 592         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 593         vp78_reset_probability_tables(s);
 594         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 595                sizeof(s->prob->pred16x16));
 596         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 597                sizeof(s->prob->pred8x8c));
 598         for (i = 0; i < 2; i++)
 599             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 600                    sizeof(vp7_mv_default_prob[i]));
 601         memset(&s->segmentation, 0, sizeof(s->segmentation));
 602         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 603         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 604     }
 605
 606     if (s->keyframe || s->profile > 0)
 607         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 608
 609     /* B. Decoding information for all four macroblock-level features */
 610     for (i = 0; i < 4; i++) {
 611         s->feature_enabled[i] = vp8_rac_get(c);
 612         if (s->feature_enabled[i]) {
 613              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 614
 615              for (j = 0; j < 3; j++)
 616                  s->feature_index_prob[i][j] =
 617                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 618
 619              if (vp7_feature_value_size[s->profile][i])
 620                  for (j = 0; j < 4; j++)
 621                      s->feature_value[i][j] =
 622                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 623         }
 624     }
 625
 626     s->segmentation.enabled    = 0;
 627     s->segmentation.update_map = 0;
 628     s->lf_delta.enabled        = 0;
 629
 630     s->num_coeff_partitions = 1;
 631     ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 632     if (ret < 0)
 633         return ret;
 634
 635     if (!s->macroblocks_base || /* first frame */
 636         width != s->avctx->width || height != s->avctx->height ||
 637         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 638         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 639             return ret;
 640     }
 641
 642     /* C. Dequantization indices */
 643     vp7_get_quants(s);
 644
 645     /* D. Golden frame update flag (a Flag) for interframes only */
 646     if (!s->keyframe) {
 647         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 648         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 649     }
 650
 651     s->update_last          = 1;
 652     s->update_probabilities = 1;
 653     s->fade_present         = 1;
 654
 655     if (s->profile > 0) {
 656         s->update_probabilities = vp8_rac_get(c);
 657         if (!s->update_probabilities)
 658             s->prob[1] = s->prob[0];
 659
 660         if (!s->keyframe)
 661             s->fade_present = vp8_rac_get(c);
 662     }
 663
 664     if (c->end <= c->buffer && c->bits >= 0)
 665         return AVERROR_INVALIDDATA;
 666     /* E. Fading information for previous frame */
 667     if (s->fade_present && vp8_rac_get(c)) {
 668         if ((ret = vp7_fade_frame(s ,c)) < 0)
 669             return ret;
 670     }
 671
 672     /* F. Loop filter type */
 673     if (!s->profile)
 674         s->filter.simple = vp8_rac_get(c);
 675
 676     /* G. DCT coefficient ordering specification */
 677     if (vp8_rac_get(c))
 678         for (i = 1; i < 16; i++)
 679             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 680
 681     /* H. Loop filter levels  */
 682     if (s->profile > 0)
 683         s->filter.simple = vp8_rac_get(c);
 684     s->filter.level     = vp8_rac_get_uint(c, 6);
 685     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 686
 687     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 688     vp78_update_probability_tables(s);
 689
 690     s->mbskip_enabled = 0;
 691
 692     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 693     if (!s->keyframe) {
 694         s->prob->intra  = vp8_rac_get_uint(c, 8);
 695         s->prob->last   = vp8_rac_get_uint(c, 8);
 696         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 697     }
 698
 699     return 0;
 700 }
 701
 702 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 703 {
 704     VP56RangeCoder *c = &s->c;
 705     int header_size, hscale, vscale, ret;
 706     int width  = s->avctx->width;
 707     int height = s->avctx->height;
 708
 709     if (buf_size < 3) {
 710         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 711         return AVERROR_INVALIDDATA;
 712     }
 713
 714     s->keyframe  = !(buf[0] & 1);
 715     s->profile   =  (buf[0]>>1) & 7;
 716     s->invisible = !(buf[0] & 0x10);
 717     header_size  = AV_RL24(buf) >> 5;
 718     buf      += 3;
 719     buf_size -= 3;
 720
 721     s->header_partition_size = header_size;
 722
 723     if (s->profile > 3)
 724         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 725
 726     if (!s->profile)
 727         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 728                sizeof(s->put_pixels_tab));
 729     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 730         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 731                sizeof(s->put_pixels_tab));
 732
 733     if (header_size > buf_size - 7 * s->keyframe) {
 734         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 735         return AVERROR_INVALIDDATA;
 736     }
 737
 738     if (s->keyframe) {
 739         if (AV_RL24(buf) != 0x2a019d) {
 740             av_log(s->avctx, AV_LOG_ERROR,
 741                    "Invalid start code 0x%x\n", AV_RL24(buf));
 742             return AVERROR_INVALIDDATA;
 743         }
 744         width     = AV_RL16(buf + 3) & 0x3fff;
 745         height    = AV_RL16(buf + 5) & 0x3fff;
 746         hscale    = buf[4] >> 6;
 747         vscale    = buf[6] >> 6;
 748         buf      += 7;
 749         buf_size -= 7;
 750
 751         if (hscale || vscale)
 752             avpriv_request_sample(s->avctx, "Upscaling");
 753
 754         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 755         vp78_reset_probability_tables(s);
 756         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 757                sizeof(s->prob->pred16x16));
 758         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 759                sizeof(s->prob->pred8x8c));
 760         memcpy(s->prob->mvc, vp8_mv_default_prob,
 761                sizeof(s->prob->mvc));
 762         memset(&s->segmentation, 0, sizeof(s->segmentation));
 763         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 764     }
 765
 766     ret = ff_vp56_init_range_decoder(c, buf, header_size);
 767     if (ret < 0)
 768         return ret;
 769     buf      += header_size;
 770     buf_size -= header_size;
 771
 772     if (s->keyframe) {
 773         s->colorspace = vp8_rac_get(c);
 774         if (s->colorspace)
 775             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 776         s->fullrange = vp8_rac_get(c);
 777     }
 778
 779     if ((s->segmentation.enabled = vp8_rac_get(c)))
 780         parse_segment_info(s);
 781     else
 782         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 783
 784     s->filter.simple    = vp8_rac_get(c);
 785     s->filter.level     = vp8_rac_get_uint(c, 6);
 786     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 787
 788     if ((s->lf_delta.enabled = vp8_rac_get(c))) {
 789         s->lf_delta.update = vp8_rac_get(c);
 790         if (s->lf_delta.update)
 791             update_lf_deltas(s);
 792     }
 793
 794     if (setup_partitions(s, buf, buf_size)) {
 795         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 796         return AVERROR_INVALIDDATA;
 797     }
 798
 799     if (!s->macroblocks_base || /* first frame */
 800         width != s->avctx->width || height != s->avctx->height ||
 801         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 802         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 803             return ret;
 804
 805     vp8_get_quants(s);
 806
 807     if (!s->keyframe) {
 808         update_refs(s);
 809         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 810         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 811     }
 812
 813     // if we aren't saving this frame's probabilities for future frames,
 814     // make a copy of the current probabilities
 815     if (!(s->update_probabilities = vp8_rac_get(c)))
 816         s->prob[1] = s->prob[0];
 817
 818     s->update_last = s->keyframe || vp8_rac_get(c);
 819
 820     vp78_update_probability_tables(s);
 821
 822     if ((s->mbskip_enabled = vp8_rac_get(c)))
 823         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 824
 825     if (!s->keyframe) {
 826         s->prob->intra  = vp8_rac_get_uint(c, 8);
 827         s->prob->last   = vp8_rac_get_uint(c, 8);
 828         s->prob->golden = vp8_rac_get_uint(c, 8);
 829         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 830     }
 831
 832     // Record the entropy coder state here so that hwaccels can use it.
 833     s->c.code_word = vp56_rac_renorm(&s->c);
 834     s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
 835     s->coder_state_at_header_end.range     = s->c.high;
 836     s->coder_state_at_header_end.value     = s->c.code_word >> 16;
 837     s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
 838
 839     return 0;
 840 }
 841
 842 static av_always_inline
 843 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 844 {
 845     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 846                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 847     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 848                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 849 }
 850
 851 /**
 852  * Motion vector coding, 17.1.
 853  */
 854 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 855 {
 856     int bit, x = 0;
 857
 858     if (vp56_rac_get_prob_branchy(c, p[0])) {
 859         int i;
 860
 861         for (i = 0; i < 3; i++)
 862             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 863         for (i = (vp7 ? 7 : 9); i > 3; i--)
 864             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 865         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 866             x += 8;
 867     } else {
 868         // small_mvtree
 869         const uint8_t *ps = p + 2;
 870         bit = vp56_rac_get_prob(c, *ps);
 871         ps += 1 + 3 * bit;
 872         x  += 4 * bit;
 873         bit = vp56_rac_get_prob(c, *ps);
 874         ps += 1 + bit;
 875         x  += 2 * bit;
 876         x  += vp56_rac_get_prob(c, *ps);
 877     }
 878
 879     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 880 }
 881
 882 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 883 {
 884     return read_mv_component(c, p, 1);
 885 }
 886
 887 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 888 {
 889     return read_mv_component(c, p, 0);
 890 }
 891
 892 static av_always_inline
 893 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 894 {
 895     if (is_vp7)
 896         return vp7_submv_prob;
 897
 898     if (left == top)
 899         return vp8_submv_prob[4 - !!left];
 900     if (!top)
 901         return vp8_submv_prob[2];
 902     return vp8_submv_prob[1 - !!left];
 903 }
 904
 905 /**
 906  * Split motion vector prediction, 16.4.
 907  * @returns the number of motion vectors parsed (2, 4 or 16)
 908  */
 909 static av_always_inline
 910 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 911                     int layout, int is_vp7)
 912 {
 913     int part_idx;
 914     int n, num;
 915     VP8Macroblock *top_mb;
 916     VP8Macroblock *left_mb = &mb[-1];
 917     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 918     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 919     VP56mv *top_mv;
 920     VP56mv *left_mv = left_mb->bmv;
 921     VP56mv *cur_mv  = mb->bmv;
 922
 923     if (!layout) // layout is inlined, s->mb_layout is not
 924         top_mb = &mb[2];
 925     else
 926         top_mb = &mb[-s->mb_width - 1];
 927     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 928     top_mv       = top_mb->bmv;
 929
 930     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 931         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 932             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 933         else
 934             part_idx = VP8_SPLITMVMODE_8x8;
 935     } else {
 936         part_idx = VP8_SPLITMVMODE_4x4;
 937     }
 938
 939     num              = vp8_mbsplit_count[part_idx];
 940     mbsplits_cur     = vp8_mbsplits[part_idx],
 941     firstidx         = vp8_mbfirstidx[part_idx];
 942     mb->partitioning = part_idx;
 943
 944     for (n = 0; n < num; n++) {
 945         int k = firstidx[n];
 946         uint32_t left, above;
 947         const uint8_t *submv_prob;
 948
 949         if (!(k & 3))
 950             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 951         else
 952             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 953         if (k <= 3)
 954             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 955         else
 956             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 957
 958         submv_prob = get_submv_prob(left, above, is_vp7);
 959
 960         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 961             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 962                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 963                     mb->bmv[n].y = mb->mv.y +
 964                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 965                     mb->bmv[n].x = mb->mv.x +
 966                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 967                 } else {
 968                     AV_ZERO32(&mb->bmv[n]);
 969                 }
 970             } else {
 971                 AV_WN32A(&mb->bmv[n], above);
 972             }
 973         } else {
 974             AV_WN32A(&mb->bmv[n], left);
 975         }
 976     }
 977
 978     return num;
 979 }
 980
 981 /**
 982  * The vp7 reference decoder uses a padding macroblock column (added to right
 983  * edge of the frame) to guard against illegal macroblock offsets. The
 984  * algorithm has bugs that permit offsets to straddle the padding column.
 985  * This function replicates those bugs.
 986  *
 987  * @param[out] edge_x macroblock x address
 988  * @param[out] edge_y macroblock y address
 989  *
 990  * @return macroblock offset legal (boolean)
 991  */
 992 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 993                                    int xoffset, int yoffset, int boundary,
 994                                    int *edge_x, int *edge_y)
 995 {
 996     int vwidth = mb_width + 1;
 997     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 998     if (new < boundary || new % vwidth == vwidth - 1)
 999         return 0;
1000     *edge_y = new / vwidth;
1001     *edge_x = new % vwidth;
1002     return 1;
1003 }
1004
1005 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1006 {
1007     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1008 }
1009
1010 static av_always_inline
1011 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1012                     int mb_x, int mb_y, int layout)
1013 {
1014     VP8Macroblock *mb_edge[12];
1015     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1016     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1017     int idx = CNT_ZERO;
1018     VP56mv near_mv[3];
1019     uint8_t cnt[3] = { 0 };
1020     VP56RangeCoder *c = &s->c;
1021     int i;
1022
1023     AV_ZERO32(&near_mv[0]);
1024     AV_ZERO32(&near_mv[1]);
1025     AV_ZERO32(&near_mv[2]);
1026
1027     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1028         const VP7MVPred * pred = &vp7_mv_pred[i];
1029         int edge_x, edge_y;
1030
1031         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1032                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1033             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1034                                              ? s->macroblocks_base + 1 + edge_x +
1035                                                (s->mb_width + 1) * (edge_y + 1)
1036                                              : s->macroblocks + edge_x +
1037                                                (s->mb_height - edge_y - 1) * 2;
1038             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1039             if (mv) {
1040                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1041                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1042                         idx = CNT_NEAREST;
1043                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1044                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1045                             continue;
1046                         idx = CNT_NEAR;
1047                     } else {
1048                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1049                         idx = CNT_NEAR;
1050                     }
1051                 } else {
1052                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1053                     idx = CNT_NEAREST;
1054                 }
1055             } else {
1056                 idx = CNT_ZERO;
1057             }
1058         } else {
1059             idx = CNT_ZERO;
1060         }
1061         cnt[idx] += vp7_mv_pred[i].score;
1062     }
1063
1064     mb->partitioning = VP8_SPLITMVMODE_NONE;
1065
1066     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1067         mb->mode = VP8_MVMODE_MV;
1068
1069         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1070
1071             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1072
1073                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1074                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1075                 else
1076                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1077
1078                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1079                     mb->mode = VP8_MVMODE_SPLIT;
1080                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1081                 } else {
1082                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1083                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1084                     mb->bmv[0] = mb->mv;
1085                 }
1086             } else {
1087                 mb->mv = near_mv[CNT_NEAR];
1088                 mb->bmv[0] = mb->mv;
1089             }
1090         } else {
1091             mb->mv = near_mv[CNT_NEAREST];
1092             mb->bmv[0] = mb->mv;
1093         }
1094     } else {
1095         mb->mode = VP8_MVMODE_ZERO;
1096         AV_ZERO32(&mb->mv);
1097         mb->bmv[0] = mb->mv;
1098     }
1099 }
1100
1101 static av_always_inline
1102 void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
1103                     int mb_x, int mb_y, int layout)
1104 {
1105     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1106                                   mb - 1 /* left */,
1107                                   0      /* top-left */ };
1108     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1109     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1110     int idx = CNT_ZERO;
1111     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1112     int8_t *sign_bias = s->sign_bias;
1113     VP56mv near_mv[4];
1114     uint8_t cnt[4] = { 0 };
1115     VP56RangeCoder *c = &s->c;
1116
1117     if (!layout) { // layout is inlined (s->mb_layout is not)
1118         mb_edge[0] = mb + 2;
1119         mb_edge[2] = mb + 1;
1120     } else {
1121         mb_edge[0] = mb - s->mb_width - 1;
1122         mb_edge[2] = mb - s->mb_width - 2;
1123     }
1124
1125     AV_ZERO32(&near_mv[0]);
1126     AV_ZERO32(&near_mv[1]);
1127     AV_ZERO32(&near_mv[2]);
1128
1129     /* Process MB on top, left and top-left */
1130 #define MV_EDGE_CHECK(n)                                                      \
1131     {                                                                         \
1132         VP8Macroblock *edge = mb_edge[n];                                     \
1133         int edge_ref = edge->ref_frame;                                       \
1134         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1135             uint32_t mv = AV_RN32A(&edge->mv);                                \
1136             if (mv) {                                                         \
1137                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1138                     /* SWAR negate of the values in mv. */                    \
1139                     mv = ~mv;                                                 \
1140                     mv = ((mv & 0x7fff7fff) +                                 \
1141                           0x00010001) ^ (mv & 0x80008000);                    \
1142                 }                                                             \
1143                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1144                     AV_WN32A(&near_mv[++idx], mv);                            \
1145                 cnt[idx] += 1 + (n != 2);                                     \
1146             } else                                                            \
1147                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1148         }                                                                     \
1149     }
1150
1151     MV_EDGE_CHECK(0)
1152     MV_EDGE_CHECK(1)
1153     MV_EDGE_CHECK(2)
1154
1155     mb->partitioning = VP8_SPLITMVMODE_NONE;
1156     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1157         mb->mode = VP8_MVMODE_MV;
1158
1159         /* If we have three distinct MVs, merge first and last if they're the same */
1160         if (cnt[CNT_SPLITMV] &&
1161             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1162             cnt[CNT_NEAREST] += 1;
1163
1164         /* Swap near and nearest if necessary */
1165         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1166             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1167             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1168         }
1169
1170         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1171             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1172                 /* Choose the best mv out of 0,0 and the nearest mv */
1173                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1174                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1175                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1176                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1177
1178                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1179                     mb->mode = VP8_MVMODE_SPLIT;
1180                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1181                 } else {
1182                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1183                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1184                     mb->bmv[0] = mb->mv;
1185                 }
1186             } else {
1187                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
1188                 mb->bmv[0] = mb->mv;
1189             }
1190         } else {
1191             clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
1192             mb->bmv[0] = mb->mv;
1193         }
1194     } else {
1195         mb->mode = VP8_MVMODE_ZERO;
1196         AV_ZERO32(&mb->mv);
1197         mb->bmv[0] = mb->mv;
1198     }
1199 }
1200
1201 static av_always_inline
1202 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1203                            int mb_x, int keyframe, int layout)
1204 {
1205     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1206
1207     if (layout) {
1208         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1209         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1210     }
1211     if (keyframe) {
1212         int x, y;
1213         uint8_t *top;
1214         uint8_t *const left = s->intra4x4_pred_mode_left;
1215         if (layout)
1216             top = mb->intra4x4_pred_mode_top;
1217         else
1218             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1219         for (y = 0; y < 4; y++) {
1220             for (x = 0; x < 4; x++) {
1221                 const uint8_t *ctx;
1222                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1223                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1224                 left[y]   = top[x] = *intra4x4;
1225                 intra4x4++;
1226             }
1227         }
1228     } else {
1229         int i;
1230         for (i = 0; i < 16; i++)
1231             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1232                                            vp8_pred4x4_prob_inter);
1233     }
1234 }
1235
1236 static av_always_inline
1237 void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
1238                     VP8Macroblock *mb, int mb_x, int mb_y,
1239                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1240 {
1241     VP56RangeCoder *c = &s->c;
1242     static const char * const vp7_feature_name[] = { "q-index",
1243                                                      "lf-delta",
1244                                                      "partial-golden-update",
1245                                                      "blit-pitch" };
1246     if (is_vp7) {
1247         int i;
1248         *segment = 0;
1249         for (i = 0; i < 4; i++) {
1250             if (s->feature_enabled[i]) {
1251                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1252                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1253                                                    s->feature_index_prob[i]);
1254                       av_log(s->avctx, AV_LOG_WARNING,
1255                              "Feature %s present in macroblock (value 0x%x)\n",
1256                              vp7_feature_name[i], s->feature_value[i][index]);
1257                 }
1258            }
1259         }
1260     } else if (s->segmentation.update_map) {
1261         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1262         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1263     } else if (s->segmentation.enabled)
1264         *segment = ref ? *ref : *segment;
1265     mb->segment = *segment;
1266
1267     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1268
1269     if (s->keyframe) {
1270         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1271                                     vp8_pred16x16_prob_intra);
1272
1273         if (mb->mode == MODE_I4x4) {
1274             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1275         } else {
1276             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1277                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1278             if (s->mb_layout)
1279                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1280             else
1281                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1282             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1283         }
1284
1285         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1286                                                 vp8_pred8x8c_prob_intra);
1287         mb->ref_frame        = VP56_FRAME_CURRENT;
1288     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1289         // inter MB, 16.2
1290         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1291             mb->ref_frame =
1292                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1293                                                                    : VP56_FRAME_GOLDEN;
1294         else
1295             mb->ref_frame = VP56_FRAME_PREVIOUS;
1296         s->ref_count[mb->ref_frame - 1]++;
1297
1298         // motion vectors, 16.3
1299         if (is_vp7)
1300             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1301         else
1302             vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
1303     } else {
1304         // intra MB, 16.1
1305         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1306
1307         if (mb->mode == MODE_I4x4)
1308             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1309
1310         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1311                                                 s->prob->pred8x8c);
1312         mb->ref_frame        = VP56_FRAME_CURRENT;
1313         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1314         AV_ZERO32(&mb->bmv[0]);
1315     }
1316 }
1317
1318 /**
1319  * @param r     arithmetic bitstream reader context
1320  * @param block destination for block coefficients
1321  * @param probs probabilities to use when reading trees from the bitstream
1322  * @param i     initial coeff index, 0 unless a separate DC block is coded
1323  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1324  *
1325  * @return 0 if no coeffs were decoded
1326  *         otherwise, the index of the last coeff decoded plus one
1327  */
1328 static av_always_inline
1329 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1330                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1331                                  int i, uint8_t *token_prob, int16_t qmul[2],
1332                                  const uint8_t scan[16], int vp7)
1333 {
1334     VP56RangeCoder c = *r;
1335     goto skip_eob;
1336     do {
1337         int coeff;
1338 restart:
1339         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1340             break;
1341
1342 skip_eob:
1343         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1344             if (++i == 16)
1345                 break; // invalid input; blocks should end with EOB
1346             token_prob = probs[i][0];
1347             if (vp7)
1348                 goto restart;
1349             goto skip_eob;
1350         }
1351
1352         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1353             coeff = 1;
1354             token_prob = probs[i + 1][1];
1355         } else {
1356             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1357                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1358                 if (coeff)
1359                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1360                 coeff += 2;
1361             } else {
1362                 // DCT_CAT*
1363                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1364                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1365                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1366                     } else {                                    // DCT_CAT2
1367                         coeff  = 7;
1368                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1369                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1370                     }
1371                 } else {    // DCT_CAT3 and up
1372                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1373                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1374                     int cat = (a << 1) + b;
1375                     coeff  = 3 + (8 << cat);
1376                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1377                 }
1378             }
1379             token_prob = probs[i + 1][2];
1380         }
1381         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1382     } while (++i < 16);
1383
1384     *r = c;
1385     return i;
1386 }
1387
/**
 * Apply and update the running DC predictor.
 *
 * pred[0] holds the last DC value and pred[1] a counter of how often that
 * value repeated. Once the counter exceeds 3 the stored value is added to the
 * decoded DC. The counter is reset when the stored or the resulting DC is
 * zero or their signs differ, and incremented when both are equal.
 *
 * @param block coefficient block; block[0] is read and replaced
 * @param pred  predictor state, updated in place
 *
 * @return 1 if the predictor was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    int16_t dc = block[0];
    const int applied = pred[1] > 3;
    int reset;

    if (applied)
        dc += pred[0];

    reset = !pred[0] || !dc || (((int32_t)pred[0] ^ (int32_t)dc) < 0);

    if (reset)
        pred[1] = 0;
    else if (pred[0] == dc)
        pred[1]++;

    pred[0]  = dc;
    block[0] = dc;

    return applied;
}
1410
1411 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1412                                             int16_t block[16],
1413                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1414                                             int i, uint8_t *token_prob,
1415                                             int16_t qmul[2],
1416                                             const uint8_t scan[16])
1417 {
1418     return decode_block_coeffs_internal(r, block, probs, i,
1419                                         token_prob, qmul, scan, IS_VP7);
1420 }
1421
1422 #ifndef vp8_decode_block_coeffs_internal
1423 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1424                                             int16_t block[16],
1425                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1426                                             int i, uint8_t *token_prob,
1427                                             int16_t qmul[2])
1428 {
1429     return decode_block_coeffs_internal(r, block, probs, i,
1430                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1431 }
1432 #endif
1433
1434 /**
1435  * @param c          arithmetic bitstream reader context
1436  * @param block      destination for block coefficients
1437  * @param probs      probabilities to use when reading trees from the bitstream
1438  * @param i          initial coeff index, 0 unless a separate DC block is coded
1439  * @param zero_nhood the initial prediction context for number of surrounding
1440  *                   all-zero blocks (only left/top, so 0-2)
1441  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1442  * @param scan       scan pattern (VP7 only)
1443  *
1444  * @return 0 if no coeffs were decoded
1445  *         otherwise, the index of the last coeff decoded plus one
1446  */
1447 static av_always_inline
1448 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1449                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1450                         int i, int zero_nhood, int16_t qmul[2],
1451                         const uint8_t scan[16], int vp7)
1452 {
1453     uint8_t *token_prob = probs[i][zero_nhood];
1454     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1455         return 0;
1456     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1457                                                   token_prob, qmul, scan)
1458                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1459                                                   token_prob, qmul);
1460 }
1461
1462 static av_always_inline
1463 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1464                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1465                       int is_vp7)
1466 {
1467     int i, x, y, luma_start = 0, luma_ctx = 3;
1468     int nnz_pred, nnz, nnz_total = 0;
1469     int segment = mb->segment;
1470     int block_dc = 0;
1471
1472     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1473         nnz_pred = t_nnz[8] + l_nnz[8];
1474
1475         // decode DC values and do hadamard
1476         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1477                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1478                                   ff_zigzag_scan, is_vp7);
1479         l_nnz[8] = t_nnz[8] = !!nnz;
1480
1481         if (is_vp7 && mb->mode > MODE_I4x4) {
1482             nnz |=  inter_predict_dc(td->block_dc,
1483                                      s->inter_dc_pred[mb->ref_frame - 1]);
1484         }
1485
1486         if (nnz) {
1487             nnz_total += nnz;
1488             block_dc   = 1;
1489             if (nnz == 1)
1490                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1491             else
1492                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1493         }
1494         luma_start = 1;
1495         luma_ctx   = 0;
1496     }
1497
1498     // luma blocks
1499     for (y = 0; y < 4; y++)
1500         for (x = 0; x < 4; x++) {
1501             nnz_pred = l_nnz[y] + t_nnz[x];
1502             nnz = decode_block_coeffs(c, td->block[y][x],
1503                                       s->prob->token[luma_ctx],
1504                                       luma_start, nnz_pred,
1505                                       s->qmat[segment].luma_qmul,
1506                                       s->prob[0].scan, is_vp7);
1507             /* nnz+block_dc may be one more than the actual last index,
1508              * but we don't care */
1509             td->non_zero_count_cache[y][x] = nnz + block_dc;
1510             t_nnz[x] = l_nnz[y] = !!nnz;
1511             nnz_total += nnz;
1512         }
1513
1514     // chroma blocks
1515     // TODO: what to do about dimensions? 2nd dim for luma is x,
1516     // but for chroma it's (y<<1)|x
1517     for (i = 4; i < 6; i++)
1518         for (y = 0; y < 2; y++)
1519             for (x = 0; x < 2; x++) {
1520                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1521                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1522                                           s->prob->token[2], 0, nnz_pred,
1523                                           s->qmat[segment].chroma_qmul,
1524                                           s->prob[0].scan, is_vp7);
1525                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1526                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1527                 nnz_total += nnz;
1528             }
1529
1530     // if there were no coded coeffs despite the macroblock not being marked skip,
1531     // we MUST not do the inner loop filter and should not do IDCT
1532     // Since skip isn't used for bitstream prediction, just manually set it.
1533     if (!nnz_total)
1534         mb->skip = 1;
1535 }
1536
1537 static av_always_inline
1538 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1539                       uint8_t *src_cb, uint8_t *src_cr,
1540                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1541 {
1542     AV_COPY128(top_border, src_y + 15 * linesize);
1543     if (!simple) {
1544         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1545         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1546     }
1547 }
1548
1549 static av_always_inline
1550 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1551                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1552                     int mb_y, int mb_width, int simple, int xchg)
1553 {
1554     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1555     src_y  -= linesize;
1556     src_cb -= uvlinesize;
1557     src_cr -= uvlinesize;
1558
1559 #define XCHG(a, b, xchg)                                                      \
1560     do {                                                                      \
1561         if (xchg)                                                             \
1562             AV_SWAP64(b, a);                                                  \
1563         else                                                                  \
1564             AV_COPY64(b, a);                                                  \
1565     } while (0)
1566
1567     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1568     XCHG(top_border, src_y, xchg);
1569     XCHG(top_border + 8, src_y + 8, 1);
1570     if (mb_x < mb_width - 1)
1571         XCHG(top_border + 32, src_y + 16, 1);
1572
1573     // only copy chroma for normal loop filter
1574     // or to initialize the top row to 127
1575     if (!simple || !mb_y) {
1576         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1577         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1578         XCHG(top_border + 16, src_cb, 1);
1579         XCHG(top_border + 24, src_cr, 1);
1580     }
1581 }
1582
1583 static av_always_inline
1584 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1585 {
1586     if (!mb_x)
1587         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1588     else
1589         return mb_y ? mode : LEFT_DC_PRED8x8;
1590 }
1591
1592 static av_always_inline
1593 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1594 {
1595     if (!mb_x)
1596         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1597     else
1598         return mb_y ? mode : HOR_PRED8x8;
1599 }
1600
1601 static av_always_inline
1602 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1603 {
1604     switch (mode) {
1605     case DC_PRED8x8:
1606         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1607     case VERT_PRED8x8:
1608         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1609     case HOR_PRED8x8:
1610         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1611     case PLANE_PRED8x8: /* TM */
1612         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1613     }
1614     return mode;
1615 }
1616
1617 static av_always_inline
1618 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1619 {
1620     if (!mb_x) {
1621         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1622     } else {
1623         return mb_y ? mode : HOR_VP8_PRED;
1624     }
1625 }
1626
1627 static av_always_inline
1628 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1629                                      int *copy_buf, int vp7)
1630 {
1631     switch (mode) {
1632     case VERT_PRED:
1633         if (!mb_x && mb_y) {
1634             *copy_buf = 1;
1635             return mode;
1636         }
1637         /* fall-through */
1638     case DIAG_DOWN_LEFT_PRED:
1639     case VERT_LEFT_PRED:
1640         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1641     case HOR_PRED:
1642         if (!mb_y) {
1643             *copy_buf = 1;
1644             return mode;
1645         }
1646         /* fall-through */
1647     case HOR_UP_PRED:
1648         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1649     case TM_VP8_PRED:
1650         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1651     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1652                    * as 16x16/8x8 DC */
1653     case DIAG_DOWN_RIGHT_PRED:
1654     case VERT_RIGHT_PRED:
1655     case HOR_DOWN_PRED:
1656         if (!mb_y || !mb_x)
1657             *copy_buf = 1;
1658         return mode;
1659     }
1660     return mode;
1661 }
1662
1663 static av_always_inline
1664 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1665                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1666 {
1667     int x, y, mode, nnz;
1668     uint32_t tr;
1669
1670     /* for the first row, we need to run xchg_mb_border to init the top edge
1671      * to 127 otherwise, skip it if we aren't going to deblock */
1672     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1673         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1674                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1675                        s->filter.simple, 1);
1676
1677     if (mb->mode < MODE_I4x4) {
1678         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1679         s->hpc.pred16x16[mode](dst[0], s->linesize);
1680     } else {
1681         uint8_t *ptr = dst[0];
1682         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1683         const uint8_t lo = is_vp7 ? 128 : 127;
1684         const uint8_t hi = is_vp7 ? 128 : 129;
1685         uint8_t tr_top[4] = { lo, lo, lo, lo };
1686
1687         // all blocks on the right edge of the macroblock use bottom edge
1688         // the top macroblock for their topright edge
1689         uint8_t *tr_right = ptr - s->linesize + 16;
1690
1691         // if we're on the right edge of the frame, said edge is extended
1692         // from the top macroblock
1693         if (mb_y && mb_x == s->mb_width - 1) {
1694             tr       = tr_right[-1] * 0x01010101u;
1695             tr_right = (uint8_t *) &tr;
1696         }
1697
1698         if (mb->skip)
1699             AV_ZERO128(td->non_zero_count_cache);
1700
1701         for (y = 0; y < 4; y++) {
1702             uint8_t *topright = ptr + 4 - s->linesize;
1703             for (x = 0; x < 4; x++) {
1704                 int copy = 0;
1705                 ptrdiff_t linesize = s->linesize;
1706                 uint8_t *dst = ptr + 4 * x;
1707                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1708
1709                 if ((y == 0 || x == 3) && mb_y == 0) {
1710                     topright = tr_top;
1711                 } else if (x == 3)
1712                     topright = tr_right;
1713
1714                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1715                                                         mb_y + y, &copy, is_vp7);
1716                 if (copy) {
1717                     dst      = copy_dst + 12;
1718                     linesize = 8;
1719                     if (!(mb_y + y)) {
1720                         copy_dst[3] = lo;
1721                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1722                     } else {
1723                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1724                         if (!(mb_x + x)) {
1725                             copy_dst[3] = hi;
1726                         } else {
1727                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1728                         }
1729                     }
1730                     if (!(mb_x + x)) {
1731                         copy_dst[11] =
1732                         copy_dst[19] =
1733                         copy_dst[27] =
1734                         copy_dst[35] = hi;
1735                     } else {
1736                         copy_dst[11] = ptr[4 * x                   - 1];
1737                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1738                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1739                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1740                     }
1741                 }
1742                 s->hpc.pred4x4[mode](dst, topright, linesize);
1743                 if (copy) {
1744                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1745                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1746                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1747                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1748                 }
1749
1750                 nnz = td->non_zero_count_cache[y][x];
1751                 if (nnz) {
1752                     if (nnz == 1)
1753                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1754                                                   td->block[y][x], s->linesize);
1755                     else
1756                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1757                                                td->block[y][x], s->linesize);
1758                 }
1759                 topright += 4;
1760             }
1761
1762             ptr      += 4 * s->linesize;
1763             intra4x4 += 4;
1764         }
1765     }
1766
1767     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1768                                             mb_x, mb_y, is_vp7);
1769     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1770     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1771
1772     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1773         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1774                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1775                        s->filter.simple, 0);
1776 }
1777
/* Lookup tables indexed by the fractional motion-vector position (0..7):
 *   row 0: nr. of left extra pixels, also function pointer index
 *   row 1: nr. of extra pixels required
 *   row 2: nr. of right extra pixels
 */
static const uint8_t subpel_idx[3][8] = {
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1784
1785 /**
1786  * luma MC function
1787  *
1788  * @param s        VP8 decoding context
1789  * @param dst      target buffer for block data at block position
1790  * @param ref      reference picture buffer at origin (0, 0)
1791  * @param mv       motion vector (relative to block position) to get pixel data from
1792  * @param x_off    horizontal position of block from origin (0, 0)
1793  * @param y_off    vertical position of block from origin (0, 0)
1794  * @param block_w  width of block (16, 8 or 4)
1795  * @param block_h  height of block (always same as block_w)
1796  * @param width    width of src/dst plane data
1797  * @param height   height of src/dst plane data
1798  * @param linesize size of a single line of plane data, including padding
1799  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1800  */
1801 static av_always_inline
1802 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1803                  ThreadFrame *ref, const VP56mv *mv,
1804                  int x_off, int y_off, int block_w, int block_h,
1805                  int width, int height, ptrdiff_t linesize,
1806                  vp8_mc_func mc_func[3][3])
1807 {
1808     uint8_t *src = ref->f->data[0];
1809
1810     if (AV_RN32A(mv)) {
1811         ptrdiff_t src_linesize = linesize;
1812
1813         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1814         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1815
1816         x_off += mv->x >> 2;
1817         y_off += mv->y >> 2;
1818
1819         // edge emulation
1820         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1821         src += y_off * linesize + x_off;
1822         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1823             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1824             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1825                                      src - my_idx * linesize - mx_idx,
1826                                      EDGE_EMU_LINESIZE, linesize,
1827                                      block_w + subpel_idx[1][mx],
1828                                      block_h + subpel_idx[1][my],
1829                                      x_off - mx_idx, y_off - my_idx,
1830                                      width, height);
1831             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1832             src_linesize = EDGE_EMU_LINESIZE;
1833         }
1834         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1835     } else {
1836         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1837         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1838                       linesize, block_h, 0, 0);
1839     }
1840 }
1841
1842 /**
1843  * chroma MC function
1844  *
1845  * @param s        VP8 decoding context
1846  * @param dst1     target buffer for block data at block position (U plane)
1847  * @param dst2     target buffer for block data at block position (V plane)
1848  * @param ref      reference picture buffer at origin (0, 0)
1849  * @param mv       motion vector (relative to block position) to get pixel data from
1850  * @param x_off    horizontal position of block from origin (0, 0)
1851  * @param y_off    vertical position of block from origin (0, 0)
1852  * @param block_w  width of block (16, 8 or 4)
1853  * @param block_h  height of block (always same as block_w)
1854  * @param width    width of src/dst plane data
1855  * @param height   height of src/dst plane data
1856  * @param linesize size of a single line of plane data, including padding
1857  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1858  */
1859 static av_always_inline
1860 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1861                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1862                    int x_off, int y_off, int block_w, int block_h,
1863                    int width, int height, ptrdiff_t linesize,
1864                    vp8_mc_func mc_func[3][3])
1865 {
1866     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1867
1868     if (AV_RN32A(mv)) {
1869         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1870         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1871
1872         x_off += mv->x >> 3;
1873         y_off += mv->y >> 3;
1874
1875         // edge emulation
1876         src1 += y_off * linesize + x_off;
1877         src2 += y_off * linesize + x_off;
1878         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1879         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1880             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1881             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1882                                      src1 - my_idx * linesize - mx_idx,
1883                                      EDGE_EMU_LINESIZE, linesize,
1884                                      block_w + subpel_idx[1][mx],
1885                                      block_h + subpel_idx[1][my],
1886                                      x_off - mx_idx, y_off - my_idx, width, height);
1887             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1888             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1889
1890             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1891                                      src2 - my_idx * linesize - mx_idx,
1892                                      EDGE_EMU_LINESIZE, linesize,
1893                                      block_w + subpel_idx[1][mx],
1894                                      block_h + subpel_idx[1][my],
1895                                      x_off - mx_idx, y_off - my_idx, width, height);
1896             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1897             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1898         } else {
1899             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1900             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1901         }
1902     } else {
1903         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1904         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1905         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1906     }
1907 }
1908
1909 static av_always_inline
1910 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1911                  ThreadFrame *ref_frame, int x_off, int y_off,
1912                  int bx_off, int by_off, int block_w, int block_h,
1913                  int width, int height, VP56mv *mv)
1914 {
1915     VP56mv uvmv = *mv;
1916
1917     /* Y */
1918     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1919                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1920                 block_w, block_h, width, height, s->linesize,
1921                 s->put_pixels_tab[block_w == 8]);
1922
1923     /* U/V */
1924     if (s->profile == 3) {
1925         /* this block only applies VP8; it is safe to check
1926          * only the profile, as VP7 profile <= 1 */
1927         uvmv.x &= ~7;
1928         uvmv.y &= ~7;
1929     }
1930     x_off   >>= 1;
1931     y_off   >>= 1;
1932     bx_off  >>= 1;
1933     by_off  >>= 1;
1934     width   >>= 1;
1935     height  >>= 1;
1936     block_w >>= 1;
1937     block_h >>= 1;
1938     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1939                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1940                   &uvmv, x_off + bx_off, y_off + by_off,
1941                   block_w, block_h, width, height, s->uvlinesize,
1942                   s->put_pixels_tab[1 + (block_w == 4)]);
1943 }
1944
1945 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1946  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1947 static av_always_inline
1948 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1949                      int mb_xy, int ref)
1950 {
1951     /* Don't prefetch refs that haven't been used very often this frame. */
1952     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1953         int x_off = mb_x << 4, y_off = mb_y << 4;
1954         int mx = (mb->mv.x >> 2) + x_off + 8;
1955         int my = (mb->mv.y >> 2) + y_off;
1956         uint8_t **src = s->framep[ref]->tf.f->data;
1957         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1958         /* For threading, a ff_thread_await_progress here might be useful, but
1959          * it actually slows down the decoder. Since a bad prefetch doesn't
1960          * generate bad decoder output, we don't run it here. */
1961         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1962         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1963         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1964     }
1965 }
1966
1967 /**
1968  * Apply motion vectors to prediction buffer, chapter 18.
1969  */
1970 static av_always_inline
1971 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1972                    VP8Macroblock *mb, int mb_x, int mb_y)
1973 {
1974     int x_off = mb_x << 4, y_off = mb_y << 4;
1975     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1976     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1977     VP56mv *bmv = mb->bmv;
1978
1979     switch (mb->partitioning) {
1980     case VP8_SPLITMVMODE_NONE:
1981         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1982                     0, 0, 16, 16, width, height, &mb->mv);
1983         break;
1984     case VP8_SPLITMVMODE_4x4: {
1985         int x, y;
1986         VP56mv uvmv;
1987
1988         /* Y */
1989         for (y = 0; y < 4; y++) {
1990             for (x = 0; x < 4; x++) {
1991                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1992                             ref, &bmv[4 * y + x],
1993                             4 * x + x_off, 4 * y + y_off, 4, 4,
1994                             width, height, s->linesize,
1995                             s->put_pixels_tab[2]);
1996             }
1997         }
1998
1999         /* U/V */
2000         x_off  >>= 1;
2001         y_off  >>= 1;
2002         width  >>= 1;
2003         height >>= 1;
2004         for (y = 0; y < 2; y++) {
2005             for (x = 0; x < 2; x++) {
2006                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2007                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2008                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2009                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2010                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2011                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2012                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2013                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2014                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2015                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2016                 if (s->profile == 3) {
2017                     uvmv.x &= ~7;
2018                     uvmv.y &= ~7;
2019                 }
2020                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2021                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2022                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2023                               width, height, s->uvlinesize,
2024                               s->put_pixels_tab[2]);
2025             }
2026         }
2027         break;
2028     }
2029     case VP8_SPLITMVMODE_16x8:
2030         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2031                     0, 0, 16, 8, width, height, &bmv[0]);
2032         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2033                     0, 8, 16, 8, width, height, &bmv[1]);
2034         break;
2035     case VP8_SPLITMVMODE_8x16:
2036         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2037                     0, 0, 8, 16, width, height, &bmv[0]);
2038         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2039                     8, 0, 8, 16, width, height, &bmv[1]);
2040         break;
2041     case VP8_SPLITMVMODE_8x8:
2042         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2043                     0, 0, 8, 8, width, height, &bmv[0]);
2044         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2045                     8, 0, 8, 8, width, height, &bmv[1]);
2046         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2047                     0, 8, 8, 8, width, height, &bmv[2]);
2048         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2049                     8, 8, 8, 8, width, height, &bmv[3]);
2050         break;
2051     }
2052 }
2053
2054 static av_always_inline
2055 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2056 {
2057     int x, y, ch;
2058
2059     if (mb->mode != MODE_I4x4) {
2060         uint8_t *y_dst = dst[0];
2061         for (y = 0; y < 4; y++) {
2062             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2063             if (nnz4) {
2064                 if (nnz4 & ~0x01010101) {
2065                     for (x = 0; x < 4; x++) {
2066                         if ((uint8_t) nnz4 == 1)
2067                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2068                                                       td->block[y][x],
2069                                                       s->linesize);
2070                         else if ((uint8_t) nnz4 > 1)
2071                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2072                                                    td->block[y][x],
2073                                                    s->linesize);
2074                         nnz4 >>= 8;
2075                         if (!nnz4)
2076                             break;
2077                     }
2078                 } else {
2079                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2080                 }
2081             }
2082             y_dst += 4 * s->linesize;
2083         }
2084     }
2085
2086     for (ch = 0; ch < 2; ch++) {
2087         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2088         if (nnz4) {
2089             uint8_t *ch_dst = dst[1 + ch];
2090             if (nnz4 & ~0x01010101) {
2091                 for (y = 0; y < 2; y++) {
2092                     for (x = 0; x < 2; x++) {
2093                         if ((uint8_t) nnz4 == 1)
2094                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2095                                                       td->block[4 + ch][(y << 1) + x],
2096                                                       s->uvlinesize);
2097                         else if ((uint8_t) nnz4 > 1)
2098                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2099                                                    td->block[4 + ch][(y << 1) + x],
2100                                                    s->uvlinesize);
2101                         nnz4 >>= 8;
2102                         if (!nnz4)
2103                             goto chroma_idct_end;
2104                     }
2105                     ch_dst += 4 * s->uvlinesize;
2106                 }
2107             } else {
2108                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2109             }
2110         }
2111 chroma_idct_end:
2112         ;
2113     }
2114 }
2115
2116 static av_always_inline
2117 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2118                          VP8FilterStrength *f, int is_vp7)
2119 {
2120     int interior_limit, filter_level;
2121
2122     if (s->segmentation.enabled) {
2123         filter_level = s->segmentation.filter_level[mb->segment];
2124         if (!s->segmentation.absolute_vals)
2125             filter_level += s->filter.level;
2126     } else
2127         filter_level = s->filter.level;
2128
2129     if (s->lf_delta.enabled) {
2130         filter_level += s->lf_delta.ref[mb->ref_frame];
2131         filter_level += s->lf_delta.mode[mb->mode];
2132     }
2133
2134     filter_level = av_clip_uintp2(filter_level, 6);
2135
2136     interior_limit = filter_level;
2137     if (s->filter.sharpness) {
2138         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2139         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2140     }
2141     interior_limit = FFMAX(interior_limit, 1);
2142
2143     f->filter_level = filter_level;
2144     f->inner_limit = interior_limit;
2145     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2146                       mb->mode == VP8_MVMODE_SPLIT;
2147 }
2148
/**
 * Apply the normal (non-simple) loop filter to one macroblock, on the luma
 * plane and both chroma planes.
 *
 * @param s       VP8 decoding context (DSP function table, line sizes,
 *                keyframe flag)
 * @param dst     pointers to the macroblock in the Y, U and V planes
 * @param f       filter strength for this macroblock; nothing is done
 *                when f->filter_level is 0
 * @param mb_x    macroblock column; the left macroblock edge is only
 *                filtered when non-zero
 * @param mb_y    macroblock row; the top macroblock edge is only filtered
 *                when non-zero
 * @param is_vp7  non-zero selects the VP7 edge limits and filter ordering
 */
2149 static av_always_inline
2150 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2151                int mb_x, int mb_y, int is_vp7)
2152 {
2153     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2154     int filter_level = f->filter_level;
2155     int inner_limit = f->inner_limit;
2156     int inner_filter = f->inner_filter;
2157     ptrdiff_t linesize   = s->linesize;
2158     ptrdiff_t uvlinesize = s->uvlinesize;
    /* hev_thresh lookup, indexed by [s->keyframe][filter_level] */
2159     static const uint8_t hev_thresh_lut[2][64] = {
2160         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2161           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2162           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2163           3, 3, 3, 3 },
2164         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2165           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2166           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2167           2, 2, 2, 2 }
2168     };
2169
2170     if (!filter_level)
2171         return;
2172
    /* VP7 uses separate luma/chroma inner-edge limits that ignore
     * inner_limit; VP8 uses one limit for both planes */
2173     if (is_vp7) {
2174         bedge_lim_y  = filter_level;
2175         bedge_lim_uv = filter_level * 2;
2176         mbedge_lim   = filter_level + 2;
2177     } else {
2178         bedge_lim_y  =
2179         bedge_lim_uv = filter_level * 2 + inner_limit;
2180         mbedge_lim   = bedge_lim_y + 4;
2181     }
2182
2183     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2184
    /* left macroblock edge */
2185     if (mb_x) {
2186         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2187                                        mbedge_lim, inner_limit, hev_thresh);
2188         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2189                                        mbedge_lim, inner_limit, hev_thresh);
2190     }
2191
/* Runs the h_loop_filter*_inner functions at luma column offsets 4, 8, 12
 * and chroma column offset 4, when cond and inner_filter are both set.
 * The macro uses the enclosing function's locals by name and is not
 * #undef'd afterwards. */
2192 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2193     if (cond && inner_filter) {                                               \
2194         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2195                                              bedge_lim_y, inner_limit,        \
2196                                              hev_thresh);                     \
2197         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2198                                              bedge_lim_y, inner_limit,        \
2199                                              hev_thresh);                     \
2200         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2201                                              bedge_lim_y, inner_limit,        \
2202                                              hev_thresh);                     \
2203         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2204                                              uvlinesize,  bedge_lim_uv,       \
2205                                              inner_limit, hev_thresh);        \
2206     }
2207
    /* VP8: the h_loop_filter*_inner pass runs here, before the
     * v_loop_filter calls below */
2208     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2209
    /* top macroblock edge */
2210     if (mb_y) {
2211         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2212                                        mbedge_lim, inner_limit, hev_thresh);
2213         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2214                                        mbedge_lim, inner_limit, hev_thresh);
2215     }
2216
    /* v_loop_filter*_inner at luma row offsets 4, 8, 12 and chroma row
     * offset 4 */
2217     if (inner_filter) {
2218         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2219                                              linesize, bedge_lim_y,
2220                                              inner_limit, hev_thresh);
2221         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2222                                              linesize, bedge_lim_y,
2223                                              inner_limit, hev_thresh);
2224         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2225                                              linesize, bedge_lim_y,
2226                                              inner_limit, hev_thresh);
2227         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2228                                              dst[2] +  4 * uvlinesize,
2229                                              uvlinesize, bedge_lim_uv,
2230                                              inner_limit, hev_thresh);
2231     }
2232
    /* VP7: the h_loop_filter*_inner pass runs last instead */
2233     H_LOOP_FILTER_16Y_INNER(is_vp7)
2234 }
2235
2236 static av_always_inline
2237 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2238                       int mb_x, int mb_y)
2239 {
2240     int mbedge_lim, bedge_lim;
2241     int filter_level = f->filter_level;
2242     int inner_limit  = f->inner_limit;
2243     int inner_filter = f->inner_filter;
2244     ptrdiff_t linesize = s->linesize;
2245
2246     if (!filter_level)
2247         return;
2248
2249     bedge_lim  = 2 * filter_level + inner_limit;
2250     mbedge_lim = bedge_lim + 4;
2251
2252     if (mb_x)
2253         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2254     if (inner_filter) {
2255         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2256         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2257         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2258     }
2259
2260     if (mb_y)
2261         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2262     if (inner_filter) {
2263         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2264         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2265         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2266     }
2267 }
2268
2269 #define MARGIN (16 << 2)
2270 static av_always_inline
2271 int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2272                                     VP8Frame *prev_frame, int is_vp7)
2273 {
2274     VP8Context *s = avctx->priv_data;
2275     int mb_x, mb_y;
2276
2277     s->mv_bounds.mv_min.y = -MARGIN;
2278     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2279     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2280         VP8Macroblock *mb = s->macroblocks_base +
2281                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2282         int mb_xy = mb_y * s->mb_width;
2283
2284         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2285
2286         s->mv_bounds.mv_min.x = -MARGIN;
2287         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2288
2289         if (vpX_rac_is_end(&s->c)) {
2290             return AVERROR_INVALIDDATA;
2291         }
2292         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2293             if (mb_y == 0)
2294                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2295                          DC_PRED * 0x01010101);
2296             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2297                            prev_frame && prev_frame->seg_map ?
2298                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2299             s->mv_bounds.mv_min.x -= 64;
2300             s->mv_bounds.mv_max.x -= 64;
2301         }
2302         s->mv_bounds.mv_min.y -= 64;
2303         s->mv_bounds.mv_max.y -= 64;
2304     }
2305     return 0;
2306 }
2307
2308 static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2309                                    VP8Frame *prev_frame)
2310 {
2311     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2312 }
2313
2314 static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2315                                    VP8Frame *prev_frame)
2316 {
2317     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2318 }
2319
#if HAVE_THREADS
/*
 * Wait until the thread data `otd` has published a position that is at
 * least (mb_y_check, mb_x_check).
 *
 * Positions are packed as (row << 16) | (column & 0xFFFF). While waiting,
 * the requested position is advertised in td->wait_mb_pos so that
 * update_pos() on the other side knows a broadcast is needed; it is reset
 * to INT_MAX once the wait is over.
 *
 * All macro arguments are parenthesized so that arbitrary expressions can
 * be passed safely.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

/*
 * Publish (mb_y, mb_x) as the current position of `td` and, when slice
 * threading with more than one job is active, wake waiters if a neighbour
 * is waiting for a position that has now been reached (or if either
 * neighbour pointer is NULL).
 *
 * NOTE: besides its arguments this macro reads `avctx`, `num_jobs`,
 * `next_td` and `prev_td` from the enclosing scope.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without threads both macros are no-ops. They expand to a complete
 * do/while so that a missing trailing semicolon is a compile error rather
 * than silently swallowing the following statement. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x) do { } while (0)
#endif
2357
2358 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2359                                         int jobnr, int threadnr, int is_vp7)
2360 {
2361     VP8Context *s = avctx->priv_data;
2362     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2363     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2364     int mb_x, mb_xy = mb_y * s->mb_width;
2365     int num_jobs = s->num_jobs;
2366     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2367     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2368     VP8Macroblock *mb;
2369     uint8_t *dst[3] = {
2370         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2371         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2372         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2373     };
2374
2375     if (c->end <= c->buffer && c->bits >= 0)
2376          return AVERROR_INVALIDDATA;
2377
2378     if (mb_y == 0)
2379         prev_td = td;
2380     else
2381         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2382     if (mb_y == s->mb_height - 1)
2383         next_td = td;
2384     else
2385         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2386     if (s->mb_layout == 1)
2387         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2388     else {
2389         // Make sure the previous frame has read its segmentation map,
2390         // if we re-use the same map.
2391         if (prev_frame && s->segmentation.enabled &&
2392             !s->segmentation.update_map)
2393             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2394         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2395         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2396         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2397     }
2398
2399     if (!is_vp7 || mb_y == 0)
2400         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2401
2402     td->mv_bounds.mv_min.x = -MARGIN;
2403     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2404
2405     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2406         if (c->end <= c->buffer && c->bits >= 0)
2407             return AVERROR_INVALIDDATA;
2408         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2409         if (prev_td != td) {
2410             if (threadnr != 0) {
2411                 check_thread_pos(td, prev_td,
2412                                  mb_x + (is_vp7 ? 2 : 1),
2413                                  mb_y - (is_vp7 ? 2 : 1));
2414             } else {
2415                 check_thread_pos(td, prev_td,
2416                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2417                                  mb_y - (is_vp7 ? 2 : 1));
2418             }
2419         }
2420
2421         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2422                          s->linesize, 4);
2423         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2424                          dst[2] - dst[1], 2);
2425
2426         if (!s->mb_layout)
2427             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2428                            prev_frame && prev_frame->seg_map ?
2429                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2430
2431         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2432
2433         if (!mb->skip)
2434             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2435
2436         if (mb->mode <= MODE_I4x4)
2437             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2438         else
2439             inter_predict(s, td, dst, mb, mb_x, mb_y);
2440
2441         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2442
2443         if (!mb->skip) {
2444             idct_mb(s, td, dst, mb);
2445         } else {
2446             AV_ZERO64(td->left_nnz);
2447             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2448
2449             /* Reset DC block predictors if they would exist
2450              * if the mb had coefficients */
2451             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2452                 td->left_nnz[8]     = 0;
2453                 s->top_nnz[mb_x][8] = 0;
2454             }
2455         }
2456
2457         if (s->deblock_filter)
2458             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2459
2460         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2461             if (s->filter.simple)
2462                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2463                                  NULL, NULL, s->linesize, 0, 1);
2464             else
2465                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2466                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2467         }
2468
2469         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2470
2471         dst[0]      += 16;
2472         dst[1]      += 8;
2473         dst[2]      += 8;
2474         td->mv_bounds.mv_min.x -= 64;
2475         td->mv_bounds.mv_max.x -= 64;
2476
2477         if (mb_x == s->mb_width + 1) {
2478             update_pos(td, mb_y, s->mb_width + 3);
2479         } else {
2480             update_pos(td, mb_y, mb_x);
2481         }
2482     }
2483     return 0;
2484 }
2485
2486 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2487                                         int jobnr, int threadnr)
2488 {
2489     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2490 }
2491
2492 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2493                                         int jobnr, int threadnr)
2494 {
2495     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2496 }
2497
2498 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2499                               int jobnr, int threadnr, int is_vp7)
2500 {
2501     VP8Context *s = avctx->priv_data;
2502     VP8ThreadData *td = &s->thread_data[threadnr];
2503     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2504     AVFrame *curframe = s->curframe->tf.f;
2505     VP8Macroblock *mb;
2506     VP8ThreadData *prev_td, *next_td;
2507     uint8_t *dst[3] = {
2508         curframe->data[0] + 16 * mb_y * s->linesize,
2509         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2510         curframe->data[2] +  8 * mb_y * s->uvlinesize
2511     };
2512
2513     if (s->mb_layout == 1)
2514         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2515     else
2516         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2517
2518     if (mb_y == 0)
2519         prev_td = td;
2520     else
2521         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2522     if (mb_y == s->mb_height - 1)
2523         next_td = td;
2524     else
2525         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2526
2527     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2528         VP8FilterStrength *f = &td->filter_strength[mb_x];
2529         if (prev_td != td)
2530             check_thread_pos(td, prev_td,
2531                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2532         if (next_td != td)
2533             if (next_td != &s->thread_data[0])
2534                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2535
2536         if (num_jobs == 1) {
2537             if (s->filter.simple)
2538                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2539                                  NULL, NULL, s->linesize, 0, 1);
2540             else
2541                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2542                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2543         }
2544
2545         if (s->filter.simple)
2546             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2547         else
2548             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2549         dst[0] += 16;
2550         dst[1] += 8;
2551         dst[2] += 8;
2552
2553         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2554     }
2555 }
2556
2557 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2558                               int jobnr, int threadnr)
2559 {
2560     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2561 }
2562
2563 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2564                               int jobnr, int threadnr)
2565 {
2566     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2567 }
2568
2569 static av_always_inline
2570 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2571                               int threadnr, int is_vp7)
2572 {
2573     VP8Context *s = avctx->priv_data;
2574     VP8ThreadData *td = &s->thread_data[jobnr];
2575     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2576     VP8Frame *curframe = s->curframe;
2577     int mb_y, num_jobs = s->num_jobs;
2578     int ret;
2579
2580     td->thread_nr = threadnr;
2581     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2582     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2583     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2584         atomic_store(&td->thread_mb_pos, mb_y << 16);
2585         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2586         if (ret < 0) {
2587             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2588             return ret;
2589         }
2590         if (s->deblock_filter)
2591             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2592         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2593
2594         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2595         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2596
2597         if (avctx->active_thread_type == FF_THREAD_FRAME)
2598             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2599     }
2600
2601     return 0;
2602 }
2603
2604 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2605                                     int jobnr, int threadnr)
2606 {
2607     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2608 }
2609
2610 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2611                                     int jobnr, int threadnr)
2612 {
2613     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2614 }
2615
2616 static av_always_inline
2617 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2618                       AVPacket *avpkt, int is_vp7)
2619 {
2620     VP8Context *s = avctx->priv_data;
2621     int ret, i, referenced, num_jobs;
2622     enum AVDiscard skip_thresh;
2623     VP8Frame *av_uninit(curframe), *prev_frame;
2624
2625     if (is_vp7)
2626         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2627     else
2628         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2629
2630     if (ret < 0)
2631         goto err;
2632
2633     if (s->actually_webp) {
2634         // avctx->pix_fmt already set in caller.
2635     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2636         s->pix_fmt = get_pixel_format(s);
2637         if (s->pix_fmt < 0) {
2638             ret = AVERROR(EINVAL);
2639             goto err;
2640         }
2641         avctx->pix_fmt = s->pix_fmt;
2642     }
2643
2644     prev_frame = s->framep[VP56_FRAME_CURRENT];
2645
2646     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2647                  s->update_altref == VP56_FRAME_CURRENT;
2648
2649     skip_thresh = !referenced ? AVDISCARD_NONREF
2650                               : !s->keyframe ? AVDISCARD_NONKEY
2651                                              : AVDISCARD_ALL;
2652
2653     if (avctx->skip_frame >= skip_thresh) {
2654         s->invisible = 1;
2655         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2656         goto skip_decode;
2657     }
2658     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2659
2660     // release no longer referenced frames
2661     for (i = 0; i < 5; i++)
2662         if (s->frames[i].tf.f->buf[0] &&
2663             &s->frames[i] != prev_frame &&
2664             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2665             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2666             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2667             vp8_release_frame(s, &s->frames[i]);
2668
2669     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2670
2671     if (!s->colorspace)
2672         avctx->colorspace = AVCOL_SPC_BT470BG;
2673     if (s->fullrange)
2674         avctx->color_range = AVCOL_RANGE_JPEG;
2675     else
2676         avctx->color_range = AVCOL_RANGE_MPEG;
2677
2678     /* Given that arithmetic probabilities are updated every frame, it's quite
2679      * likely that the values we have on a random interframe are complete
2680      * junk if we didn't start decode on a keyframe. So just don't display
2681      * anything rather than junk. */
2682     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2683                          !s->framep[VP56_FRAME_GOLDEN]   ||
2684                          !s->framep[VP56_FRAME_GOLDEN2])) {
2685         av_log(avctx, AV_LOG_WARNING,
2686                "Discarding interframe without a prior keyframe!\n");
2687         ret = AVERROR_INVALIDDATA;
2688         goto err;
2689     }
2690
2691     curframe->tf.f->key_frame = s->keyframe;
2692     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2693                                             : AV_PICTURE_TYPE_P;
2694     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2695         goto err;
2696
2697     // check if golden and altref are swapped
2698     if (s->update_altref != VP56_FRAME_NONE)
2699         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2700     else
2701         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2702
2703     if (s->update_golden != VP56_FRAME_NONE)
2704         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2705     else
2706         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2707
2708     if (s->update_last)
2709         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2710     else
2711         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2712
2713     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2714
2715     ff_thread_finish_setup(avctx);
2716
2717     if (avctx->hwaccel) {
2718         ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2719         if (ret < 0)
2720             goto err;
2721
2722         ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2723         if (ret < 0)
2724             goto err;
2725
2726         ret = avctx->hwaccel->end_frame(avctx);
2727         if (ret < 0)
2728             goto err;
2729
2730     } else {
2731         s->linesize   = curframe->tf.f->linesize[0];
2732         s->uvlinesize = curframe->tf.f->linesize[1];
2733
2734         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2735         /* Zero macroblock structures for top/top-left prediction
2736          * from outside the frame. */
2737         if (!s->mb_layout)
2738             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2739                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2740         if (!s->mb_layout && s->keyframe)
2741             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2742
2743         memset(s->ref_count, 0, sizeof(s->ref_count));
2744
2745         if (s->mb_layout == 1) {
2746             // Make sure the previous frame has read its segmentation map,
2747             // if we re-use the same map.
2748             if (prev_frame && s->segmentation.enabled &&
2749                 !s->segmentation.update_map)
2750                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2751             if (is_vp7)
2752                 ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2753             else
2754                 ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2755             if (ret < 0)
2756                 goto err;
2757         }
2758
2759         if (avctx->active_thread_type == FF_THREAD_FRAME)
2760             num_jobs = 1;
2761         else
2762             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2763         s->num_jobs   = num_jobs;
2764         s->curframe   = curframe;
2765         s->prev_frame = prev_frame;
2766         s->mv_bounds.mv_min.y   = -MARGIN;
2767         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2768         for (i = 0; i < MAX_THREADS; i++) {
2769             VP8ThreadData *td = &s->thread_data[i];
2770             atomic_init(&td->thread_mb_pos, 0);
2771             atomic_init(&td->wait_mb_pos, INT_MAX);
2772         }
2773         if (is_vp7)
2774             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2775                             num_jobs);
2776         else
2777             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2778                             num_jobs);
2779     }
2780
2781     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2782     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2783
2784 skip_decode:
2785     // if future frames don't use the updated probabilities,
2786     // reset them to the values we saved
2787     if (!s->update_probabilities)
2788         s->prob[0] = s->prob[1];
2789
2790     if (!s->invisible) {
2791         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2792             return ret;
2793         *got_frame = 1;
2794     }
2795
2796     return avpkt->size;
2797 err:
2798     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2799     return ret;
2800 }
2801
2802 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2803                         AVPacket *avpkt)
2804 {
2805     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2806 }
2807
#if CONFIG_VP7_DECODER
/* Decode callback of the VP7 decoder: forwards to vp78_decode_frame()
 * with IS_VP7. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    const int ret = vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);

    return ret;
}
#endif /* CONFIG_VP7_DECODER */
2815
2816 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2817 {
2818     VP8Context *s = avctx->priv_data;
2819     int i;
2820
2821     if (!s)
2822         return 0;
2823
2824     vp8_decode_flush_impl(avctx, 1);
2825     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2826         av_frame_free(&s->frames[i].tf.f);
2827
2828     return 0;
2829 }
2830
2831 static av_cold int vp8_init_frames(VP8Context *s)
2832 {
2833     int i;
2834     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2835         s->frames[i].tf.f = av_frame_alloc();
2836         if (!s->frames[i].tf.f)
2837             return AVERROR(ENOMEM);
2838     }
2839     return 0;
2840 }
2841
2842 static av_always_inline
2843 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2844 {
2845     VP8Context *s = avctx->priv_data;
2846     int ret;
2847
2848     s->avctx = avctx;
2849     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2850     s->pix_fmt = AV_PIX_FMT_NONE;
2851     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2852     avctx->internal->allocate_progress = 1;
2853
2854     ff_videodsp_init(&s->vdsp, 8);
2855
2856     ff_vp78dsp_init(&s->vp8dsp);
2857     if (CONFIG_VP7_DECODER && is_vp7) {
2858         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2859         ff_vp7dsp_init(&s->vp8dsp);
2860         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2861         s->filter_mb_row           = vp7_filter_mb_row;
2862     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2863         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2864         ff_vp8dsp_init(&s->vp8dsp);
2865         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2866         s->filter_mb_row           = vp8_filter_mb_row;
2867     }
2868
2869     /* does not change for VP8 */
2870     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2871
2872     if ((ret = vp8_init_frames(s)) < 0) {
2873         ff_vp8_decode_free(avctx);
2874         return ret;
2875     }
2876
2877     return 0;
2878 }
2879
#if CONFIG_VP7_DECODER
/* Init callback of the VP7 decoder. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    const int ret = vp78_decode_init(avctx, IS_VP7);

    return ret;
}
#endif /* CONFIG_VP7_DECODER */
2886
2887 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2888 {
2889     return vp78_decode_init(avctx, IS_VP8);
2890 }
2891
2892 #if CONFIG_VP8_DECODER
2893 #if HAVE_THREADS
2894 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2895 {
2896     VP8Context *s = avctx->priv_data;
2897     int ret;
2898
2899     s->avctx = avctx;
2900
2901     if ((ret = vp8_init_frames(s)) < 0) {
2902         ff_vp8_decode_free(avctx);
2903         return ret;
2904     }
2905
2906     return 0;
2907 }
2908
2909 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2910
2911 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2912                                             const AVCodecContext *src)
2913 {
2914     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2915     int i;
2916
2917     if (s->macroblocks_base &&
2918         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2919         free_buffers(s);
2920         s->mb_width  = s_src->mb_width;
2921         s->mb_height = s_src->mb_height;
2922     }
2923
2924     s->pix_fmt      = s_src->pix_fmt;
2925     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2926     s->segmentation = s_src->segmentation;
2927     s->lf_delta     = s_src->lf_delta;
2928     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2929
2930     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2931         if (s_src->frames[i].tf.f->buf[0]) {
2932             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2933             if (ret < 0)
2934                 return ret;
2935         }
2936     }
2937
2938     s->framep[0] = REBASE(s_src->next_framep[0]);
2939     s->framep[1] = REBASE(s_src->next_framep[1]);
2940     s->framep[2] = REBASE(s_src->next_framep[2]);
2941     s->framep[3] = REBASE(s_src->next_framep[3]);
2942
2943     return 0;
2944 }
2945 #endif /* HAVE_THREADS */
2946 #endif /* CONFIG_VP8_DECODER */
2947
#if CONFIG_VP7_DECODER
/* Codec descriptor of the VP7 decoder. Unlike the VP8 descriptor it only
 * advertises AV_CODEC_CAP_DR1 (no threading capabilities). */
AVCodec ff_vp7_decoder = {
    /* identification */
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    .capabilities   = AV_CODEC_CAP_DR1,
    /* private context and callbacks */
    .priv_data_size = sizeof(VP8Context),
    .init           = vp7_decode_init,
    .decode         = vp7_decode_frame,
    .flush          = vp8_decode_flush,
    .close          = ff_vp8_decode_free,
};
#endif /* CONFIG_VP7_DECODER */
2962
#if CONFIG_VP8_DECODER
/* Codec descriptor of the VP8 decoder: supports direct rendering, frame
 * and slice threading, and the hardware configurations that are enabled
 * at build time. */
AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    /* NULL-terminated list of hardware configurations. */
    .hw_configs            = (const AVCodecHWConfigInternal*[]) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */