git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "hwconfig.h"
  31 #include "internal.h"
  32 #include "mathops.h"
  33 #include "rectangle.h"
  34 #include "thread.h"
  35 #include "vp8.h"
  36 #include "vp8data.h"
  37
  38 #if ARCH_ARM
  39 #   include "arm/vp8.h"
  40 #endif
  41
/*
 * VPX(vp7, f) names the vp7_- or vp8_-prefixed variant of f.
 * When both decoders are compiled in, the variant is chosen from the value
 * of the vp7 argument; when only one decoder is enabled the macro expands to
 * that decoder's variant and the vp7 argument is not evaluated.
 */
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
#elif CONFIG_VP7_DECODER
#define VPX(vp7, f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(vp7, f) vp8_ ## f
#endif
  49
  50 static void free_buffers(VP8Context *s)
  51 {
  52     int i;
  53     if (s->thread_data)
  54         for (i = 0; i < MAX_THREADS; i++) {
  55 #if HAVE_THREADS
  56             pthread_cond_destroy(&s->thread_data[i].cond);
  57             pthread_mutex_destroy(&s->thread_data[i].lock);
  58 #endif
  59             av_freep(&s->thread_data[i].filter_strength);
  60         }
  61     av_freep(&s->thread_data);
  62     av_freep(&s->macroblocks_base);
  63     av_freep(&s->intra4x4_pred_mode_top);
  64     av_freep(&s->top_nnz);
  65     av_freep(&s->top_border);
  66
  67     s->macroblocks = NULL;
  68 }
  69
  70 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  71 {
  72     int ret;
  73     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  74                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  75         return ret;
  76     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  77         goto fail;
  78     if (s->avctx->hwaccel) {
  79         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  80         if (hwaccel->frame_priv_data_size) {
  81             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  82             if (!f->hwaccel_priv_buf)
  83                 goto fail;
  84             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  85         }
  86     }
  87     return 0;
  88
  89 fail:
  90     av_buffer_unref(&f->seg_map);
  91     ff_thread_release_buffer(s->avctx, &f->tf);
  92     return AVERROR(ENOMEM);
  93 }
  94
/**
 * Drop everything a VP8Frame holds: its segmentation-map buffer reference,
 * its hwaccel private buffer reference (plus the cached pointer into that
 * buffer) and its thread-frame picture buffer.
 */
static void vp8_release_frame(VP8Context *s, VP8Frame *f)
{
    av_buffer_unref(&f->seg_map);
    av_buffer_unref(&f->hwaccel_priv_buf);
    /* this pointed into hwaccel_priv_buf, so it must not outlive it */
    f->hwaccel_picture_private = NULL;
    ff_thread_release_buffer(s->avctx, &f->tf);
}
 102
 103 #if CONFIG_VP8_DECODER
 104 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 105 {
 106     int ret;
 107
 108     vp8_release_frame(s, dst);
 109
 110     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 111         return ret;
 112     if (src->seg_map &&
 113         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 114         vp8_release_frame(s, dst);
 115         return AVERROR(ENOMEM);
 116     }
 117     if (src->hwaccel_picture_private) {
 118         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 119         if (!dst->hwaccel_priv_buf)
 120             return AVERROR(ENOMEM);
 121         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 122     }
 123
 124     return 0;
 125 }
 126 #endif /* CONFIG_VP8_DECODER */
 127
 128 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 129 {
 130     VP8Context *s = avctx->priv_data;
 131     int i;
 132
 133     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 134         vp8_release_frame(s, &s->frames[i]);
 135     memset(s->framep, 0, sizeof(s->framep));
 136
 137     if (free_mem)
 138         free_buffers(s);
 139 }
 140
/**
 * Release all frames but keep the dimension-dependent buffers
 * (vp8_decode_flush_impl() with free_mem = 0).
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
 145
 146 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 147 {
 148     VP8Frame *frame = NULL;
 149     int i;
 150
 151     // find a free buffer
 152     for (i = 0; i < 5; i++)
 153         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 154             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 155             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 157             frame = &s->frames[i];
 158             break;
 159         }
 160     if (i == 5) {
 161         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 162         abort();
 163     }
 164     if (frame->tf.f->buf[0])
 165         vp8_release_frame(s, frame);
 166
 167     return frame;
 168 }
 169
/**
 * Choose the output pixel format via ff_get_format().
 *
 * The candidate list offers the hardware formats first (VAAPI and/or CUDA,
 * each only when the matching VP8 hwaccel is compiled in), followed by
 * AV_PIX_FMT_YUV420P, and is terminated by AV_PIX_FMT_NONE.
 *
 * @return whatever ff_get_format() returns for that list
 */
static enum AVPixelFormat get_pixel_format(VP8Context *s)
{
    enum AVPixelFormat pix_fmts[] = {
#if CONFIG_VP8_VAAPI_HWACCEL
        AV_PIX_FMT_VAAPI,
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
        AV_PIX_FMT_CUDA,
#endif
        AV_PIX_FMT_YUV420P,
        AV_PIX_FMT_NONE,
    };

    return ff_get_format(s->avctx, pix_fmts);
}
 185
 186 static av_always_inline
 187 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 188 {
 189     AVCodecContext *avctx = s->avctx;
 190     int i, ret, dim_reset = 0;
 191
 192     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 193         height != s->avctx->height) {
 194         vp8_decode_flush_impl(s->avctx, 1);
 195
 196         ret = ff_set_dimensions(s->avctx, width, height);
 197         if (ret < 0)
 198             return ret;
 199
 200         dim_reset = (s->macroblocks_base != NULL);
 201     }
 202
 203     if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) &&
 204          !s->actually_webp && !is_vp7) {
 205         s->pix_fmt = get_pixel_format(s);
 206         if (s->pix_fmt < 0)
 207             return AVERROR(EINVAL);
 208         avctx->pix_fmt = s->pix_fmt;
 209     }
 210
 211     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 212     s->mb_height = (s->avctx->coded_height + 15) / 16;
 213
 214     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 215                    avctx->thread_count > 1;
 216     if (!s->mb_layout) { // Frame threading and one thread
 217         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 218                                                sizeof(*s->macroblocks));
 219         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 220     } else // Sliced threading
 221         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 222                                          sizeof(*s->macroblocks));
 223     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 224     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 225     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 226
 227     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 228         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 229         free_buffers(s);
 230         return AVERROR(ENOMEM);
 231     }
 232
 233     for (i = 0; i < MAX_THREADS; i++) {
 234         s->thread_data[i].filter_strength =
 235             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 236         if (!s->thread_data[i].filter_strength) {
 237             free_buffers(s);
 238             return AVERROR(ENOMEM);
 239         }
 240 #if HAVE_THREADS
 241         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 242         pthread_cond_init(&s->thread_data[i].cond, NULL);
 243 #endif
 244     }
 245
 246     s->macroblocks = s->macroblocks_base + 1;
 247
 248     return 0;
 249 }
 250
/** VP7 entry point for update_dimensions() (is_vp7 = IS_VP7). */
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}
 255
/** VP8 entry point for update_dimensions() (is_vp7 = IS_VP8). */
static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
 260
 261
 262 static void parse_segment_info(VP8Context *s)
 263 {
 264     VP56RangeCoder *c = &s->c;
 265     int i;
 266
 267     s->segmentation.update_map = vp8_rac_get(c);
 268     s->segmentation.update_feature_data = vp8_rac_get(c);
 269
 270     if (s->segmentation.update_feature_data) {
 271         s->segmentation.absolute_vals = vp8_rac_get(c);
 272
 273         for (i = 0; i < 4; i++)
 274             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 275
 276         for (i = 0; i < 4; i++)
 277             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 278     }
 279     if (s->segmentation.update_map)
 280         for (i = 0; i < 3; i++)
 281             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 282 }
 283
 284 static void update_lf_deltas(VP8Context *s)
 285 {
 286     VP56RangeCoder *c = &s->c;
 287     int i;
 288
 289     for (i = 0; i < 4; i++) {
 290         if (vp8_rac_get(c)) {
 291             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 292
 293             if (vp8_rac_get(c))
 294                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 295         }
 296     }
 297
 298     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 299         if (vp8_rac_get(c)) {
 300             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 301
 302             if (vp8_rac_get(c))
 303                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 304         }
 305     }
 306 }
 307
 308 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 309 {
 310     const uint8_t *sizes = buf;
 311     int i;
 312     int ret;
 313
 314     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 315
 316     buf      += 3 * (s->num_coeff_partitions - 1);
 317     buf_size -= 3 * (s->num_coeff_partitions - 1);
 318     if (buf_size < 0)
 319         return -1;
 320
 321     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 322         int size = AV_RL24(sizes + 3 * i);
 323         if (buf_size - size < 0)
 324             return -1;
 325         s->coeff_partition_size[i] = size;
 326
 327         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 328         if (ret < 0)
 329             return ret;
 330         buf      += size;
 331         buf_size -= size;
 332     }
 333
 334     s->coeff_partition_size[i] = buf_size;
 335     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 336
 337     return 0;
 338 }
 339
/**
 * Read the VP7 dequantization indices from s->c and fill s->qmat[0].
 *
 * A 7-bit luma AC index is always present. Each of the five other indices
 * is preceded by a flag: when set a 7-bit index follows, otherwise the luma
 * AC index is reused. The declaration order below is the bitstream order.
 */
static void vp7_get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int yac_qi  = vp8_rac_get_uint(c, 7);
    int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
    int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;

    /* index 0 of each pair is the DC multiplier, index 1 the AC multiplier */
    s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
    s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
    s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
    s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
    /* chroma reuses the luma lookup tables; the chroma DC value is capped at 132 */
    s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
    s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
}
 358
 359 static void vp8_get_quants(VP8Context *s)
 360 {
 361     VP56RangeCoder *c = &s->c;
 362     int i, base_qi;
 363
 364     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 365     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 366     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 367     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 368     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 369     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 370
 371     for (i = 0; i < 4; i++) {
 372         if (s->segmentation.enabled) {
 373             base_qi = s->segmentation.base_quant[i];
 374             if (!s->segmentation.absolute_vals)
 375                 base_qi += s->quant.yac_qi;
 376         } else
 377             base_qi = s->quant.yac_qi;
 378
 379         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 380         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 381         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 382         /* 101581>>16 is equivalent to 155/100 */
 383         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 384         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 385         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 386
 387         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 388         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 389     }
 390 }
 391
 392 /**
 393  * Determine which buffers golden and altref should be updated with after this frame.
 394  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 395  *
 396  * Intra frames update all 3 references
 397  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 398  * If the update (golden|altref) flag is set, it's updated with the current frame
 399  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 400  * If the flag is not set, the number read means:
 401  *      0: no update
 402  *      1: VP56_FRAME_PREVIOUS
 403  *      2: update golden with altref, or update altref with golden
 404  */
 405 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 406 {
 407     VP56RangeCoder *c = &s->c;
 408
 409     if (update)
 410         return VP56_FRAME_CURRENT;
 411
 412     switch (vp8_rac_get_uint(c, 2)) {
 413     case 1:
 414         return VP56_FRAME_PREVIOUS;
 415     case 2:
 416         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 417     }
 418     return VP56_FRAME_NONE;
 419 }
 420
 421 static void vp78_reset_probability_tables(VP8Context *s)
 422 {
 423     int i, j;
 424     for (i = 0; i < 4; i++)
 425         for (j = 0; j < 16; j++)
 426             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 427                    sizeof(s->prob->token[i][j]));
 428 }
 429
 430 static void vp78_update_probability_tables(VP8Context *s)
 431 {
 432     VP56RangeCoder *c = &s->c;
 433     int i, j, k, l, m;
 434
 435     for (i = 0; i < 4; i++)
 436         for (j = 0; j < 8; j++)
 437             for (k = 0; k < 3; k++)
 438                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 439                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 440                         int prob = vp8_rac_get_uint(c, 8);
 441                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 442                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 443                     }
 444 }
 445
/* Number of motion vector probabilities updated per component; passed as
 * mvc_size to vp78_update_pred16x16_pred8x8_mvc_probabilities(). */
#define VP7_MVC_SIZE 17
#define VP8_MVC_SIZE 19
 448
 449 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 450                                                             int mvc_size)
 451 {
 452     VP56RangeCoder *c = &s->c;
 453     int i, j;
 454
 455     if (vp8_rac_get(c))
 456         for (i = 0; i < 4; i++)
 457             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 458     if (vp8_rac_get(c))
 459         for (i = 0; i < 3; i++)
 460             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 461
 462     // 17.2 MV probability update
 463     for (i = 0; i < 2; i++)
 464         for (j = 0; j < mvc_size; j++)
 465             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 466                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 467 }
 468
/**
 * Read the golden/altref refresh information from s->c and store the
 * resulting source frames in s->update_golden and s->update_altref.
 *
 * Both "update with current frame" flags are read first; ref_to_update()
 * then reads an extra 2-bit code for each buffer whose flag is clear.
 */
static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    /* both flags precede any of the optional 2-bit codes in the bitstream */
    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}
 479
 480 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 481 {
 482     int i, j;
 483
 484     for (j = 1; j < 3; j++) {
 485         for (i = 0; i < height / 2; i++)
 486             memcpy(dst->data[j] + i * dst->linesize[j],
 487                    src->data[j] + i * src->linesize[j], width / 2);
 488     }
 489 }
 490
 491 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 492                  const uint8_t *src, ptrdiff_t src_linesize,
 493                  int width, int height,
 494                  int alpha, int beta)
 495 {
 496     int i, j;
 497     for (j = 0; j < height; j++) {
 498         const uint8_t *src2 = src + j * src_linesize;
 499         uint8_t *dst2 = dst + j * dst_linesize;
 500         for (i = 0; i < width; i++) {
 501             uint8_t y = src2[i];
 502             dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 503         }
 504     }
 505 }
 506
 507 static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
 508 {
 509     int ret;
 510
 511     if (!s->keyframe && (alpha || beta)) {
 512         int width  = s->mb_width * 16;
 513         int height = s->mb_height * 16;
 514         AVFrame *src, *dst;
 515
 516         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 517             !s->framep[VP56_FRAME_GOLDEN]) {
 518             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 519             return AVERROR_INVALIDDATA;
 520         }
 521
 522         dst =
 523         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 524
 525         /* preserve the golden frame, write a new previous frame */
 526         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 527             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 528             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 529                 return ret;
 530
 531             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 532
 533             copy_chroma(dst, src, width, height);
 534         }
 535
 536         fade(dst->data[0], dst->linesize[0],
 537              src->data[0], src->linesize[0],
 538              width, height, alpha, beta);
 539     }
 540
 541     return 0;
 542 }
 543
/**
 * Parse a VP7 frame header and prepare the decoder state for the frame.
 *
 * The frame starts with a tag of 4 - profile bytes whose first 24 bits hold
 * the keyframe flag (bit 0, inverted), the profile (bits 1-3) and the size
 * of the first partition (bits 4 and up). The first partition is decoded
 * with s->c; the remainder of the buffer becomes the single coefficient
 * partition. The lettered comments below (A-J) mark the header sections in
 * bitstream order.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, AVERROR_INVALIDDATA for truncated or unsupported
 *         data, or the error returned by a helper (range decoder init,
 *         dimension update, fade)
 */
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    /* on interframes the dimensions stay as currently configured */
    int width  = s->avctx->width;
    int height = s->avctx->height;
    int alpha = 0;
    int beta  = 0;

    if (buf_size < 4) {
        return AVERROR_INVALIDDATA;
    }

    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->invisible = 0;
    part1_size   = AV_RL24(buf) >> 4;

    /* the tag is 4 bytes for profile 0 and 3 bytes for profile 1 */
    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    ret = ff_vp56_init_range_decoder(c, buf, part1_size);
    if (ret < 0)
        return ret;
    /* buf now points at the coefficient data following the first partition */
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp8_rac_get_uint(c, 12);
        height = vp8_rac_get_uint(c, 12);
        hscale = vp8_rac_get_uint(c, 2);
        vscale = vp8_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* a keyframe refreshes both references and resets all persistent
         * probabilities, segmentation and loop-filter delta state */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
    }

    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp8_rac_get(c);
        if (s->feature_enabled[i]) {
             s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);

             /* each index probability is either explicit or defaults to 255 */
             for (j = 0; j < 3; j++)
                 s->feature_index_prob[i][j] =
                     vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;

             /* features with a value size of 0 carry no values */
             if (vp7_feature_value_size[s->profile][i])
                 for (j = 0; j < 4; j++)
                     s->feature_value[i][j] =
                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    /* these VP8 header features are switched off for VP7 */
    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    /* everything after the first partition is one coefficient partition */
    s->num_coeff_partitions = 1;
    ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
    if (ret < 0)
        return ret;

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
        s->sign_bias[VP56_FRAME_GOLDEN] = 0;
    }

    /* defaults; profile > 0 may override the last two from the bitstream */
    s->update_last          = 1;
    s->update_probabilities = 1;
    s->fade_present         = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp8_rac_get(c);
        /* keep a copy of the current probabilities when this frame's
         * updates are not to be saved */
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            s->fade_present = vp8_rac_get(c);
    }

    if (vpX_rac_is_end(c))
        return AVERROR_INVALIDDATA;
    /* E. Fading information for previous frame */
    if (s->fade_present && vp8_rac_get(c)) {
        /* both values are signed 8-bit quantities */
        alpha = (int8_t) vp8_rac_get_uint(c, 8);
        beta  = (int8_t) vp8_rac_get_uint(c, 8);
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp8_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp8_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];

    /* H. Loop filter levels  */
    if (s->profile > 0)
        s->filter.simple = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    if (vpX_rac_is_end(c))
        return AVERROR_INVALIDDATA;

    /* the fade is applied only after the whole header parsed cleanly */
    if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
        return ret;

    return 0;
}
 707
/**
 * Parse a VP8 frame header and prepare the decoder state for the frame.
 *
 * The frame starts with a 3-byte tag holding the keyframe flag (bit 0,
 * inverted), the profile (bits 1-3), the show-frame flag (bit 4) and the
 * size of the header partition (bits 5 and up). Keyframes continue with a
 * 3-byte start code and two 16-bit fields carrying 14-bit dimensions plus
 * 2-bit scaling codes. The header partition is decoded with s->c and is
 * followed by the coefficient partitions.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, AVERROR_INVALIDDATA for truncated or malformed data,
 *         or the error returned by the range decoder init or the dimension
 *         update
 */
static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, ret;
    /* on interframes the dimensions stay as currently configured */
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 3) {
        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    s->header_partition_size = header_size;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
               sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
               sizeof(s->put_pixels_tab));

    /* keyframes carry 7 more bytes (start code + dimensions) before the
     * header partition */
    if (header_size > buf_size - 7 * s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR,
                   "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        /* low 14 bits are the dimension, top 2 bits the scaling code */
        width     = AV_RL16(buf + 3) & 0x3fff;
        height    = AV_RL16(buf + 5) & 0x3fff;
        hscale    = buf[4] >> 6;
        vscale    = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* a keyframe refreshes both references and resets all persistent
         * probabilities, segmentation and loop-filter delta state */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc, vp8_mv_default_prob,
               sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ret = ff_vp56_init_range_decoder(c, buf, header_size);
    if (ret < 0)
        return ret;
    /* buf now points at the partition size table / coefficient data */
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        s->colorspace = vp8_rac_get(c);
        if (s->colorspace)
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        s->fullrange = vp8_rac_get(c);
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c))) {
        s->lf_delta.update = vp8_rac_get(c);
        if (s->lf_delta.update)
            update_lf_deltas(s);
    }

    /* reads the partition count from c and splits the remaining data */
    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
            return ret;

    vp8_get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    /* keyframes always become the new "last" frame; no flag is read */
    s->update_last = s->keyframe || vp8_rac_get(c);

    vp78_update_probability_tables(s);

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
    }

    // Record the entropy coder state here so that hwaccels can use it.
    s->c.code_word = vp56_rac_renorm(&s->c);
    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
    s->coder_state_at_header_end.range     = s->c.high;
    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;

    return 0;
}
 847
 848 static av_always_inline
 849 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 850 {
 851     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 852                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 853     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 854                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 855 }
 856
/**
 * Motion vector coding, 17.1.
 *
 * Read one motion vector component.
 *
 * @param c   range coder to read from
 * @param p   probabilities for this component: p[0] selects the long or the
 *            short form, p[1] codes the sign, p[2] onwards the short-form
 *            tree and p[9] onwards the long-form bits
 * @param vp7 non-zero for VP7, where the long form has bits 0-7 instead of
 *            bits 0-9
 * @return the signed component value
 */
static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        /* long form: bits 0-2 come first, lowest bit first ... */
        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        /* ... then the top bit down to bit 4 ... */
        for (i = (vp7 ? 7 : 9); i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        /* ... and bit 3 last: it is only coded when some bit above it is
         * set, otherwise it is implicitly 1 and nothing is read */
        if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        /* three tree-coded bits (4, 2, 1); each decoded bit selects the
         * probability used for the next one */
        const uint8_t *ps = p + 2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3 * bit;
        x  += 4 * bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2 * bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    /* the sign is only coded for non-zero magnitudes */
    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
 887
 888 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 889 {
 890     return read_mv_component(c, p, 1);
 891 }
 892
 893 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 894 {
 895     return read_mv_component(c, p, 0);
 896 }
 897
 898 static av_always_inline
 899 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 900 {
 901     if (is_vp7)
 902         return vp7_submv_prob;
 903
 904     if (left == top)
 905         return vp8_submv_prob[4 - !!left];
 906     if (!top)
 907         return vp8_submv_prob[2];
 908     return vp8_submv_prob[1 - !!left];
 909 }
 910
 911 /**
 912  * Split motion vector prediction, 16.4.
 913  * @returns the number of motion vectors parsed (2, 4 or 16)
 914  */
 915 static av_always_inline
 916 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 917                     int layout, int is_vp7)
 918 {
 919     int part_idx;
 920     int n, num;
 921     VP8Macroblock *top_mb;
 922     VP8Macroblock *left_mb = &mb[-1];
 923     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 924     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 925     VP56mv *top_mv;
 926     VP56mv *left_mv = left_mb->bmv;
 927     VP56mv *cur_mv  = mb->bmv;
 928
 929     if (!layout) // layout is inlined, s->mb_layout is not
 930         top_mb = &mb[2];
 931     else
 932         top_mb = &mb[-s->mb_width - 1];
 933     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 934     top_mv       = top_mb->bmv;
 935
 936     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 937         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 938             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 939         else
 940             part_idx = VP8_SPLITMVMODE_8x8;
 941     } else {
 942         part_idx = VP8_SPLITMVMODE_4x4;
 943     }
 944
 945     num              = vp8_mbsplit_count[part_idx];
 946     mbsplits_cur     = vp8_mbsplits[part_idx],
 947     firstidx         = vp8_mbfirstidx[part_idx];
 948     mb->partitioning = part_idx;
 949
 950     for (n = 0; n < num; n++) {
 951         int k = firstidx[n];
 952         uint32_t left, above;
 953         const uint8_t *submv_prob;
 954
 955         if (!(k & 3))
 956             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 957         else
 958             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 959         if (k <= 3)
 960             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 961         else
 962             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 963
 964         submv_prob = get_submv_prob(left, above, is_vp7);
 965
 966         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 967             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 968                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 969                     mb->bmv[n].y = mb->mv.y +
 970                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 971                     mb->bmv[n].x = mb->mv.x +
 972                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 973                 } else {
 974                     AV_ZERO32(&mb->bmv[n]);
 975                 }
 976             } else {
 977                 AV_WN32A(&mb->bmv[n], above);
 978             }
 979         } else {
 980             AV_WN32A(&mb->bmv[n], left);
 981         }
 982     }
 983
 984     return num;
 985 }
 986
 987 /**
 988  * The vp7 reference decoder uses a padding macroblock column (added to right
 989  * edge of the frame) to guard against illegal macroblock offsets. The
 990  * algorithm has bugs that permit offsets to straddle the padding column.
 991  * This function replicates those bugs.
 992  *
 993  * @param[out] edge_x macroblock x address
 994  * @param[out] edge_y macroblock y address
 995  *
 996  * @return macroblock offset legal (boolean)
 997  */
 998 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 999                                    int xoffset, int yoffset, int boundary,
1000                                    int *edge_x, int *edge_y)
1001 {
1002     int vwidth = mb_width + 1;
1003     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
1004     if (new < boundary || new % vwidth == vwidth - 1)
1005         return 0;
1006     *edge_y = new / vwidth;
1007     *edge_x = new % vwidth;
1008     return 1;
1009 }
1010
1011 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1012 {
1013     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1014 }
1015
/**
 * Decode the inter prediction mode and motion vector(s) of a VP7 macroblock.
 *
 * First the VP7_MV_PRED_COUNT candidate positions listed in vp7_mv_pred are
 * visited; each candidate's score is accumulated into cnt[] under the
 * "zero", "nearest" or "near" class. The counters then index
 * vp7_mode_contexts to read the mode from the range coder.
 *
 * On return mb->mode, mb->partitioning, mb->mv and mb->bmv[] are set.
 *
 * @param s      decoder context
 * @param mb     macroblock being decoded
 * @param mb_x   macroblock column
 * @param mb_y   macroblock row
 * @param layout forwarded to decode_splitmvs()
 */
static av_always_inline
void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    /* NOTE: mb_edge[] is written below but never read in this function */
    VP8Macroblock *mb_edge[12];
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    VP56mv near_mv[3];
    uint8_t cnt[3] = { 0 };
    VP56RangeCoder *c = &s->c;
    int i;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
        const VP7MVPred * pred = &vp7_mv_pred[i];
        int edge_x, edge_y;

        if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
                                    pred->yoffset, !s->profile, &edge_x, &edge_y)) {
            /* locate the candidate macroblock; the addressing depends on
             * s->mb_layout */
            VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
                                             ? s->macroblocks_base + 1 + edge_x +
                                               (s->mb_width + 1) * (edge_y + 1)
                                             : s->macroblocks + edge_x +
                                               (s->mb_height - edge_y - 1) * 2;
            uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
            if (mv) {
                if (AV_RN32A(&near_mv[CNT_NEAREST])) {
                    if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
                        idx = CNT_NEAREST;
                    } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
                        /* a third distinct vector contributes no score:
                         * the continue skips the cnt[] update below */
                        if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
                            continue;
                        idx = CNT_NEAR;
                    } else {
                        /* second distinct non-zero vector becomes "near" */
                        AV_WN32A(&near_mv[CNT_NEAR], mv);
                        idx = CNT_NEAR;
                    }
                } else {
                    /* first non-zero vector becomes "nearest" */
                    AV_WN32A(&near_mv[CNT_NEAREST], mv);
                    idx = CNT_NEAREST;
                }
            } else {
                idx = CNT_ZERO;
            }
        } else {
            /* illegal candidate positions are scored as zero vectors */
            idx = CNT_ZERO;
        }
        cnt[idx] += vp7_mv_pred[i].score;
    }

    mb->partitioning = VP8_SPLITMVMODE_NONE;

    if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {

            if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* base vector: the higher-scoring of nearest/near, or zero
                 * when the zero class scores higher than it */
                if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
                else
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));

                if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the macroblock vector becomes the last parsed sub vector */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                } else {
                    /* explicitly coded delta on top of the base vector */
                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                mb->mv = near_mv[CNT_NEAR];
                mb->bmv[0] = mb->mv;
            }
        } else {
            mb->mv = near_mv[CNT_NEAREST];
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1106
/**
 * Decode the inter prediction mode and motion vector(s) of a VP8 macroblock.
 *
 * The top, left and top-left neighbours are examined to build a list of
 * candidate vectors (near_mv[]) with weights (cnt[]); the weights index
 * vp8_mode_contexts to read the mode from the range coder.
 *
 * On return mb->mode, mb->partitioning, mb->mv and mb->bmv[] are set.
 *
 * @param s         decoder context
 * @param mv_bounds bounds passed to clamp_mv() for predicted vectors
 * @param mb        macroblock being decoded; mb - 1 is the left neighbour
 * @param mb_x      macroblock column (not used in this function)
 * @param mb_y      macroblock row (not used in this function)
 * @param layout    0: top/top-left neighbours are at mb + 2 / mb + 1,
 *                  otherwise at mb - mb_width - 1 / mb - mb_width - 2
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    /* near_mv[3] is intentionally left unset here: it is only written when
     * idx reaches 3 and only read when cnt[CNT_SPLITMV] is non-zero */
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For neighbour n (skipped when it is intra, i.e. VP56_FRAME_CURRENT):
     * a zero vector adds weight to cnt[CNT_ZERO]; a non-zero vector is
     * negated if the sign biases of the two reference frames differ, is
     * appended to near_mv[] when it differs from the latest entry (always
     * for n == 0), and adds weight to that entry. The weight is 2 for
     * top/left and 1 for top-left. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* cnt[CNT_SPLITMV] is reused from here on as the split-mode
                 * context: 2 per split top/left neighbour, 1 for top-left */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the macroblock vector becomes the last parsed sub vector */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    /* explicitly coded delta on top of the clamped best mv */
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1206
1207 static av_always_inline
1208 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1209                            int mb_x, int keyframe, int layout)
1210 {
1211     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1212
1213     if (layout) {
1214         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1215         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1216     }
1217     if (keyframe) {
1218         int x, y;
1219         uint8_t *top;
1220         uint8_t *const left = s->intra4x4_pred_mode_left;
1221         if (layout)
1222             top = mb->intra4x4_pred_mode_top;
1223         else
1224             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1225         for (y = 0; y < 4; y++) {
1226             for (x = 0; x < 4; x++) {
1227                 const uint8_t *ctx;
1228                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1229                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1230                 left[y]   = top[x] = *intra4x4;
1231                 intra4x4++;
1232             }
1233         }
1234     } else {
1235         int i;
1236         for (i = 0; i < 16; i++)
1237             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1238                                            vp8_pred4x4_prob_inter);
1239     }
1240 }
1241
1242 static av_always_inline
1243 void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
1244                     VP8Macroblock *mb, int mb_x, int mb_y,
1245                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1246 {
1247     VP56RangeCoder *c = &s->c;
1248     static const char * const vp7_feature_name[] = { "q-index",
1249                                                      "lf-delta",
1250                                                      "partial-golden-update",
1251                                                      "blit-pitch" };
1252     if (is_vp7) {
1253         int i;
1254         *segment = 0;
1255         for (i = 0; i < 4; i++) {
1256             if (s->feature_enabled[i]) {
1257                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1258                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1259                                                    s->feature_index_prob[i]);
1260                       av_log(s->avctx, AV_LOG_WARNING,
1261                              "Feature %s present in macroblock (value 0x%x)\n",
1262                              vp7_feature_name[i], s->feature_value[i][index]);
1263                 }
1264            }
1265         }
1266     } else if (s->segmentation.update_map) {
1267         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1268         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1269     } else if (s->segmentation.enabled)
1270         *segment = ref ? *ref : *segment;
1271     mb->segment = *segment;
1272
1273     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1274
1275     if (s->keyframe) {
1276         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1277                                     vp8_pred16x16_prob_intra);
1278
1279         if (mb->mode == MODE_I4x4) {
1280             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1281         } else {
1282             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1283                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1284             if (s->mb_layout)
1285                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1286             else
1287                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1288             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1289         }
1290
1291         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1292                                                 vp8_pred8x8c_prob_intra);
1293         mb->ref_frame        = VP56_FRAME_CURRENT;
1294     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1295         // inter MB, 16.2
1296         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1297             mb->ref_frame =
1298                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1299                                                                    : VP56_FRAME_GOLDEN;
1300         else
1301             mb->ref_frame = VP56_FRAME_PREVIOUS;
1302         s->ref_count[mb->ref_frame - 1]++;
1303
1304         // motion vectors, 16.3
1305         if (is_vp7)
1306             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1307         else
1308             vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
1309     } else {
1310         // intra MB, 16.1
1311         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1312
1313         if (mb->mode == MODE_I4x4)
1314             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1315
1316         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1317                                                 s->prob->pred8x8c);
1318         mb->ref_frame        = VP56_FRAME_CURRENT;
1319         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1320         AV_ZERO32(&mb->bmv[0]);
1321     }
1322 }
1323
/**
 * Decode the coefficient tokens of one 4x4 block.
 *
 * The range coder state is copied to a local, used for all reads and
 * written back to *r on exit. Decoding starts at the skip_eob label, i.e.
 * the end-of-block test for the first token is not performed here.
 *
 * @param r          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token to be read
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 * @param scan       maps the coefficient index to its position in block
 * @param vp7        non-zero: an end-of-block test is performed again after
 *                   a zero token; zero: that test is skipped
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            // after a zero token the next token uses context 0
            token_prob = probs[i][0];
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            // after a one token the next token uses context 1
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    // two bits select the category; its base value is 3 + (8 << cat)
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            // after a token larger than one the next token uses context 2
            token_prob = probs[i + 1][2];
        }
        // read the sign bit, then dequantize: qmul[0] for index 0, qmul[1] otherwise
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1393
/**
 * Apply inter-frame DC prediction to a block.
 *
 * pred[0] holds the DC value left by the previous call and pred[1] counts
 * how many times in a row that value was repeated. Once the counter
 * exceeds 3 the previous DC is added to the block's DC.
 *
 * @param block coefficient block; block[0] is read and rewritten
 * @param pred  predictor state: pred[0] last DC, pred[1] repeat counter
 *
 * @return 1 if the prediction was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    int16_t dc    = block[0];
    int predicted = 0;

    if (pred[1] > 3) {
        dc       += pred[0];
        predicted = 1;
    }

    /* the repeat counter restarts when either value is zero or when the
     * two values differ in sign; it advances only on an exact repeat */
    if (pred[0] == 0 || dc == 0 || (pred[0] < 0) != (dc < 0))
        pred[1] = 0;
    else if (pred[0] == dc)
        pred[1]++;

    block[0] = pred[0] = dc;

    return predicted;
}
1416
1417 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1418                                             int16_t block[16],
1419                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1420                                             int i, uint8_t *token_prob,
1421                                             int16_t qmul[2],
1422                                             const uint8_t scan[16])
1423 {
1424     return decode_block_coeffs_internal(r, block, probs, i,
1425                                         token_prob, qmul, scan, IS_VP7);
1426 }
1427
1428 #ifndef vp8_decode_block_coeffs_internal
1429 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1430                                             int16_t block[16],
1431                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1432                                             int i, uint8_t *token_prob,
1433                                             int16_t qmul[2])
1434 {
1435     return decode_block_coeffs_internal(r, block, probs, i,
1436                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1437 }
1438 #endif
1439
1440 /**
1441  * @param c          arithmetic bitstream reader context
1442  * @param block      destination for block coefficients
1443  * @param probs      probabilities to use when reading trees from the bitstream
1444  * @param i          initial coeff index, 0 unless a separate DC block is coded
1445  * @param zero_nhood the initial prediction context for number of surrounding
1446  *                   all-zero blocks (only left/top, so 0-2)
1447  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1448  * @param scan       scan pattern (VP7 only)
1449  *
1450  * @return 0 if no coeffs were decoded
1451  *         otherwise, the index of the last coeff decoded plus one
1452  */
1453 static av_always_inline
1454 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1455                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1456                         int i, int zero_nhood, int16_t qmul[2],
1457                         const uint8_t scan[16], int vp7)
1458 {
1459     uint8_t *token_prob = probs[i][zero_nhood];
1460     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1461         return 0;
1462     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1463                                                   token_prob, qmul, scan)
1464                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1465                                                   token_prob, qmul);
1466 }
1467
1468 static av_always_inline
1469 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1470                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1471                       int is_vp7)
1472 {
1473     int i, x, y, luma_start = 0, luma_ctx = 3;
1474     int nnz_pred, nnz, nnz_total = 0;
1475     int segment = mb->segment;
1476     int block_dc = 0;
1477
1478     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1479         nnz_pred = t_nnz[8] + l_nnz[8];
1480
1481         // decode DC values and do hadamard
1482         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1483                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1484                                   ff_zigzag_scan, is_vp7);
1485         l_nnz[8] = t_nnz[8] = !!nnz;
1486
1487         if (is_vp7 && mb->mode > MODE_I4x4) {
1488             nnz |=  inter_predict_dc(td->block_dc,
1489                                      s->inter_dc_pred[mb->ref_frame - 1]);
1490         }
1491
1492         if (nnz) {
1493             nnz_total += nnz;
1494             block_dc   = 1;
1495             if (nnz == 1)
1496                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1497             else
1498                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1499         }
1500         luma_start = 1;
1501         luma_ctx   = 0;
1502     }
1503
1504     // luma blocks
1505     for (y = 0; y < 4; y++)
1506         for (x = 0; x < 4; x++) {
1507             nnz_pred = l_nnz[y] + t_nnz[x];
1508             nnz = decode_block_coeffs(c, td->block[y][x],
1509                                       s->prob->token[luma_ctx],
1510                                       luma_start, nnz_pred,
1511                                       s->qmat[segment].luma_qmul,
1512                                       s->prob[0].scan, is_vp7);
1513             /* nnz+block_dc may be one more than the actual last index,
1514              * but we don't care */
1515             td->non_zero_count_cache[y][x] = nnz + block_dc;
1516             t_nnz[x] = l_nnz[y] = !!nnz;
1517             nnz_total += nnz;
1518         }
1519
1520     // chroma blocks
1521     // TODO: what to do about dimensions? 2nd dim for luma is x,
1522     // but for chroma it's (y<<1)|x
1523     for (i = 4; i < 6; i++)
1524         for (y = 0; y < 2; y++)
1525             for (x = 0; x < 2; x++) {
1526                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1527                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1528                                           s->prob->token[2], 0, nnz_pred,
1529                                           s->qmat[segment].chroma_qmul,
1530                                           s->prob[0].scan, is_vp7);
1531                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1532                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1533                 nnz_total += nnz;
1534             }
1535
1536     // if there were no coded coeffs despite the macroblock not being marked skip,
1537     // we MUST not do the inner loop filter and should not do IDCT
1538     // Since skip isn't used for bitstream prediction, just manually set it.
1539     if (!nnz_total)
1540         mb->skip = 1;
1541 }
1542
1543 static av_always_inline
1544 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1545                       uint8_t *src_cb, uint8_t *src_cr,
1546                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1547 {
1548     AV_COPY128(top_border, src_y + 15 * linesize);
1549     if (!simple) {
1550         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1551         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1552     }
1553 }
1554
1555 static av_always_inline
1556 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1557                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1558                     int mb_y, int mb_width, int simple, int xchg)
1559 {
1560     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1561     src_y  -= linesize;
1562     src_cb -= uvlinesize;
1563     src_cr -= uvlinesize;
1564
1565 #define XCHG(a, b, xchg)                                                      \
1566     do {                                                                      \
1567         if (xchg)                                                             \
1568             AV_SWAP64(b, a);                                                  \
1569         else                                                                  \
1570             AV_COPY64(b, a);                                                  \
1571     } while (0)
1572
1573     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1574     XCHG(top_border, src_y, xchg);
1575     XCHG(top_border + 8, src_y + 8, 1);
1576     if (mb_x < mb_width - 1)
1577         XCHG(top_border + 32, src_y + 16, 1);
1578
1579     // only copy chroma for normal loop filter
1580     // or to initialize the top row to 127
1581     if (!simple || !mb_y) {
1582         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1583         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1584         XCHG(top_border + 16, src_cb, 1);
1585         XCHG(top_border + 24, src_cr, 1);
1586     }
1587 }
1588
1589 static av_always_inline
1590 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1591 {
1592     if (!mb_x)
1593         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1594     else
1595         return mb_y ? mode : LEFT_DC_PRED8x8;
1596 }
1597
1598 static av_always_inline
1599 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1600 {
1601     if (!mb_x)
1602         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1603     else
1604         return mb_y ? mode : HOR_PRED8x8;
1605 }
1606
1607 static av_always_inline
1608 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1609 {
1610     switch (mode) {
1611     case DC_PRED8x8:
1612         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1613     case VERT_PRED8x8:
1614         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1615     case HOR_PRED8x8:
1616         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1617     case PLANE_PRED8x8: /* TM */
1618         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1619     }
1620     return mode;
1621 }
1622
1623 static av_always_inline
1624 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1625 {
1626     if (!mb_x) {
1627         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1628     } else {
1629         return mb_y ? mode : HOR_VP8_PRED;
1630     }
1631 }
1632
1633 static av_always_inline
1634 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1635                                      int *copy_buf, int vp7)
1636 {
1637     switch (mode) {
1638     case VERT_PRED:
1639         if (!mb_x && mb_y) {
1640             *copy_buf = 1;
1641             return mode;
1642         }
1643         /* fall-through */
1644     case DIAG_DOWN_LEFT_PRED:
1645     case VERT_LEFT_PRED:
1646         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1647     case HOR_PRED:
1648         if (!mb_y) {
1649             *copy_buf = 1;
1650             return mode;
1651         }
1652         /* fall-through */
1653     case HOR_UP_PRED:
1654         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1655     case TM_VP8_PRED:
1656         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1657     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1658                    * as 16x16/8x8 DC */
1659     case DIAG_DOWN_RIGHT_PRED:
1660     case VERT_RIGHT_PRED:
1661     case HOR_DOWN_PRED:
1662         if (!mb_y || !mb_x)
1663             *copy_buf = 1;
1664         return mode;
1665     }
1666     return mode;
1667 }
1668
1669 static av_always_inline
1670 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1671                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1672 {
1673     int x, y, mode, nnz;
1674     uint32_t tr;
1675
1676     /* for the first row, we need to run xchg_mb_border to init the top edge
1677      * to 127 otherwise, skip it if we aren't going to deblock */
1678     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1679         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1680                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1681                        s->filter.simple, 1);
1682
1683     if (mb->mode < MODE_I4x4) {
1684         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1685         s->hpc.pred16x16[mode](dst[0], s->linesize);
1686     } else {
1687         uint8_t *ptr = dst[0];
1688         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1689         const uint8_t lo = is_vp7 ? 128 : 127;
1690         const uint8_t hi = is_vp7 ? 128 : 129;
1691         uint8_t tr_top[4] = { lo, lo, lo, lo };
1692
1693         // all blocks on the right edge of the macroblock use bottom edge
1694         // the top macroblock for their topright edge
1695         uint8_t *tr_right = ptr - s->linesize + 16;
1696
1697         // if we're on the right edge of the frame, said edge is extended
1698         // from the top macroblock
1699         if (mb_y && mb_x == s->mb_width - 1) {
1700             tr       = tr_right[-1] * 0x01010101u;
1701             tr_right = (uint8_t *) &tr;
1702         }
1703
1704         if (mb->skip)
1705             AV_ZERO128(td->non_zero_count_cache);
1706
1707         for (y = 0; y < 4; y++) {
1708             uint8_t *topright = ptr + 4 - s->linesize;
1709             for (x = 0; x < 4; x++) {
1710                 int copy = 0;
1711                 ptrdiff_t linesize = s->linesize;
1712                 uint8_t *dst = ptr + 4 * x;
1713                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1714
1715                 if ((y == 0 || x == 3) && mb_y == 0) {
1716                     topright = tr_top;
1717                 } else if (x == 3)
1718                     topright = tr_right;
1719
1720                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1721                                                         mb_y + y, &copy, is_vp7);
1722                 if (copy) {
1723                     dst      = copy_dst + 12;
1724                     linesize = 8;
1725                     if (!(mb_y + y)) {
1726                         copy_dst[3] = lo;
1727                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1728                     } else {
1729                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1730                         if (!(mb_x + x)) {
1731                             copy_dst[3] = hi;
1732                         } else {
1733                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1734                         }
1735                     }
1736                     if (!(mb_x + x)) {
1737                         copy_dst[11] =
1738                         copy_dst[19] =
1739                         copy_dst[27] =
1740                         copy_dst[35] = hi;
1741                     } else {
1742                         copy_dst[11] = ptr[4 * x                   - 1];
1743                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1744                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1745                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1746                     }
1747                 }
1748                 s->hpc.pred4x4[mode](dst, topright, linesize);
1749                 if (copy) {
1750                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1751                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1752                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1753                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1754                 }
1755
1756                 nnz = td->non_zero_count_cache[y][x];
1757                 if (nnz) {
1758                     if (nnz == 1)
1759                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1760                                                   td->block[y][x], s->linesize);
1761                     else
1762                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1763                                                td->block[y][x], s->linesize);
1764                 }
1765                 topright += 4;
1766             }
1767
1768             ptr      += 4 * s->linesize;
1769             intra4x4 += 4;
1770         }
1771     }
1772
1773     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1774                                             mb_x, mb_y, is_vp7);
1775     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1776     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1777
1778     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1779         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1780                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1781                        s->filter.simple, 0);
1782 }
1783
/* Extra source pixels needed around a block, indexed by the 3-bit
 * fractional part of a motion vector component. Row 1 is the sum of
 * rows 0 and 2. */
static const uint8_t subpel_idx[3][8] = {
    /* [0]: nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1]: nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2]: nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1790
1791 /**
1792  * luma MC function
1793  *
1794  * @param s        VP8 decoding context
1795  * @param dst      target buffer for block data at block position
1796  * @param ref      reference picture buffer at origin (0, 0)
1797  * @param mv       motion vector (relative to block position) to get pixel data from
1798  * @param x_off    horizontal position of block from origin (0, 0)
1799  * @param y_off    vertical position of block from origin (0, 0)
1800  * @param block_w  width of block (16, 8 or 4)
1801  * @param block_h  height of block (always same as block_w)
1802  * @param width    width of src/dst plane data
1803  * @param height   height of src/dst plane data
1804  * @param linesize size of a single line of plane data, including padding
1805  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1806  */
1807 static av_always_inline
1808 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1809                  ThreadFrame *ref, const VP56mv *mv,
1810                  int x_off, int y_off, int block_w, int block_h,
1811                  int width, int height, ptrdiff_t linesize,
1812                  vp8_mc_func mc_func[3][3])
1813 {
1814     uint8_t *src = ref->f->data[0];
1815
1816     if (AV_RN32A(mv)) {
1817         ptrdiff_t src_linesize = linesize;
1818
1819         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1820         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1821
1822         x_off += mv->x >> 2;
1823         y_off += mv->y >> 2;
1824
1825         // edge emulation
1826         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1827         src += y_off * linesize + x_off;
1828         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1829             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1830             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1831                                      src - my_idx * linesize - mx_idx,
1832                                      EDGE_EMU_LINESIZE, linesize,
1833                                      block_w + subpel_idx[1][mx],
1834                                      block_h + subpel_idx[1][my],
1835                                      x_off - mx_idx, y_off - my_idx,
1836                                      width, height);
1837             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1838             src_linesize = EDGE_EMU_LINESIZE;
1839         }
1840         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1841     } else {
1842         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1843         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1844                       linesize, block_h, 0, 0);
1845     }
1846 }
1847
1848 /**
1849  * chroma MC function
1850  *
1851  * @param s        VP8 decoding context
1852  * @param dst1     target buffer for block data at block position (U plane)
1853  * @param dst2     target buffer for block data at block position (V plane)
1854  * @param ref      reference picture buffer at origin (0, 0)
1855  * @param mv       motion vector (relative to block position) to get pixel data from
1856  * @param x_off    horizontal position of block from origin (0, 0)
1857  * @param y_off    vertical position of block from origin (0, 0)
1858  * @param block_w  width of block (16, 8 or 4)
1859  * @param block_h  height of block (always same as block_w)
1860  * @param width    width of src/dst plane data
1861  * @param height   height of src/dst plane data
1862  * @param linesize size of a single line of plane data, including padding
1863  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1864  */
1865 static av_always_inline
1866 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1867                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1868                    int x_off, int y_off, int block_w, int block_h,
1869                    int width, int height, ptrdiff_t linesize,
1870                    vp8_mc_func mc_func[3][3])
1871 {
1872     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1873
1874     if (AV_RN32A(mv)) {
1875         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1876         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1877
1878         x_off += mv->x >> 3;
1879         y_off += mv->y >> 3;
1880
1881         // edge emulation
1882         src1 += y_off * linesize + x_off;
1883         src2 += y_off * linesize + x_off;
1884         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1885         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1886             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1887             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1888                                      src1 - my_idx * linesize - mx_idx,
1889                                      EDGE_EMU_LINESIZE, linesize,
1890                                      block_w + subpel_idx[1][mx],
1891                                      block_h + subpel_idx[1][my],
1892                                      x_off - mx_idx, y_off - my_idx, width, height);
1893             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1894             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1895
1896             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1897                                      src2 - my_idx * linesize - mx_idx,
1898                                      EDGE_EMU_LINESIZE, linesize,
1899                                      block_w + subpel_idx[1][mx],
1900                                      block_h + subpel_idx[1][my],
1901                                      x_off - mx_idx, y_off - my_idx, width, height);
1902             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1903             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1904         } else {
1905             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1906             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1907         }
1908     } else {
1909         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1910         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1911         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1912     }
1913 }
1914
1915 static av_always_inline
1916 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1917                  ThreadFrame *ref_frame, int x_off, int y_off,
1918                  int bx_off, int by_off, int block_w, int block_h,
1919                  int width, int height, VP56mv *mv)
1920 {
1921     VP56mv uvmv = *mv;
1922
1923     /* Y */
1924     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1925                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1926                 block_w, block_h, width, height, s->linesize,
1927                 s->put_pixels_tab[block_w == 8]);
1928
1929     /* U/V */
1930     if (s->profile == 3) {
1931         /* this block only applies VP8; it is safe to check
1932          * only the profile, as VP7 profile <= 1 */
1933         uvmv.x &= ~7;
1934         uvmv.y &= ~7;
1935     }
1936     x_off   >>= 1;
1937     y_off   >>= 1;
1938     bx_off  >>= 1;
1939     by_off  >>= 1;
1940     width   >>= 1;
1941     height  >>= 1;
1942     block_w >>= 1;
1943     block_h >>= 1;
1944     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1945                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1946                   &uvmv, x_off + bx_off, y_off + by_off,
1947                   block_w, block_h, width, height, s->uvlinesize,
1948                   s->put_pixels_tab[1 + (block_w == 4)]);
1949 }
1950
1951 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1952  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1953 static av_always_inline
1954 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1955                      int mb_xy, int ref)
1956 {
1957     /* Don't prefetch refs that haven't been used very often this frame. */
1958     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1959         int x_off = mb_x << 4, y_off = mb_y << 4;
1960         int mx = (mb->mv.x >> 2) + x_off + 8;
1961         int my = (mb->mv.y >> 2) + y_off;
1962         uint8_t **src = s->framep[ref]->tf.f->data;
1963         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1964         /* For threading, a ff_thread_await_progress here might be useful, but
1965          * it actually slows down the decoder. Since a bad prefetch doesn't
1966          * generate bad decoder output, we don't run it here. */
1967         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1968         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1969         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1970     }
1971 }
1972
1973 /**
1974  * Apply motion vectors to prediction buffer, chapter 18.
1975  */
1976 static av_always_inline
1977 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1978                    VP8Macroblock *mb, int mb_x, int mb_y)
1979 {
1980     int x_off = mb_x << 4, y_off = mb_y << 4;
1981     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1982     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1983     VP56mv *bmv = mb->bmv;
1984
1985     switch (mb->partitioning) {
1986     case VP8_SPLITMVMODE_NONE:
1987         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1988                     0, 0, 16, 16, width, height, &mb->mv);
1989         break;
1990     case VP8_SPLITMVMODE_4x4: {
1991         int x, y;
1992         VP56mv uvmv;
1993
1994         /* Y */
1995         for (y = 0; y < 4; y++) {
1996             for (x = 0; x < 4; x++) {
1997                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1998                             ref, &bmv[4 * y + x],
1999                             4 * x + x_off, 4 * y + y_off, 4, 4,
2000                             width, height, s->linesize,
2001                             s->put_pixels_tab[2]);
2002             }
2003         }
2004
2005         /* U/V */
2006         x_off  >>= 1;
2007         y_off  >>= 1;
2008         width  >>= 1;
2009         height >>= 1;
2010         for (y = 0; y < 2; y++) {
2011             for (x = 0; x < 2; x++) {
2012                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2013                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2014                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2015                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2016                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2017                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2018                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2019                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2020                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2021                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2022                 if (s->profile == 3) {
2023                     uvmv.x &= ~7;
2024                     uvmv.y &= ~7;
2025                 }
2026                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2027                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2028                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2029                               width, height, s->uvlinesize,
2030                               s->put_pixels_tab[2]);
2031             }
2032         }
2033         break;
2034     }
2035     case VP8_SPLITMVMODE_16x8:
2036         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2037                     0, 0, 16, 8, width, height, &bmv[0]);
2038         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2039                     0, 8, 16, 8, width, height, &bmv[1]);
2040         break;
2041     case VP8_SPLITMVMODE_8x16:
2042         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2043                     0, 0, 8, 16, width, height, &bmv[0]);
2044         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2045                     8, 0, 8, 16, width, height, &bmv[1]);
2046         break;
2047     case VP8_SPLITMVMODE_8x8:
2048         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2049                     0, 0, 8, 8, width, height, &bmv[0]);
2050         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2051                     8, 0, 8, 8, width, height, &bmv[1]);
2052         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2053                     0, 8, 8, 8, width, height, &bmv[2]);
2054         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2055                     8, 8, 8, 8, width, height, &bmv[3]);
2056         break;
2057     }
2058 }
2059
2060 static av_always_inline
2061 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2062 {
2063     int x, y, ch;
2064
2065     if (mb->mode != MODE_I4x4) {
2066         uint8_t *y_dst = dst[0];
2067         for (y = 0; y < 4; y++) {
2068             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2069             if (nnz4) {
2070                 if (nnz4 & ~0x01010101) {
2071                     for (x = 0; x < 4; x++) {
2072                         if ((uint8_t) nnz4 == 1)
2073                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2074                                                       td->block[y][x],
2075                                                       s->linesize);
2076                         else if ((uint8_t) nnz4 > 1)
2077                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2078                                                    td->block[y][x],
2079                                                    s->linesize);
2080                         nnz4 >>= 8;
2081                         if (!nnz4)
2082                             break;
2083                     }
2084                 } else {
2085                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2086                 }
2087             }
2088             y_dst += 4 * s->linesize;
2089         }
2090     }
2091
2092     for (ch = 0; ch < 2; ch++) {
2093         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2094         if (nnz4) {
2095             uint8_t *ch_dst = dst[1 + ch];
2096             if (nnz4 & ~0x01010101) {
2097                 for (y = 0; y < 2; y++) {
2098                     for (x = 0; x < 2; x++) {
2099                         if ((uint8_t) nnz4 == 1)
2100                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2101                                                       td->block[4 + ch][(y << 1) + x],
2102                                                       s->uvlinesize);
2103                         else if ((uint8_t) nnz4 > 1)
2104                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2105                                                    td->block[4 + ch][(y << 1) + x],
2106                                                    s->uvlinesize);
2107                         nnz4 >>= 8;
2108                         if (!nnz4)
2109                             goto chroma_idct_end;
2110                     }
2111                     ch_dst += 4 * s->uvlinesize;
2112                 }
2113             } else {
2114                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2115             }
2116         }
2117 chroma_idct_end:
2118         ;
2119     }
2120 }
2121
2122 static av_always_inline
2123 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2124                          VP8FilterStrength *f, int is_vp7)
2125 {
2126     int interior_limit, filter_level;
2127
2128     if (s->segmentation.enabled) {
2129         filter_level = s->segmentation.filter_level[mb->segment];
2130         if (!s->segmentation.absolute_vals)
2131             filter_level += s->filter.level;
2132     } else
2133         filter_level = s->filter.level;
2134
2135     if (s->lf_delta.enabled) {
2136         filter_level += s->lf_delta.ref[mb->ref_frame];
2137         filter_level += s->lf_delta.mode[mb->mode];
2138     }
2139
2140     filter_level = av_clip_uintp2(filter_level, 6);
2141
2142     interior_limit = filter_level;
2143     if (s->filter.sharpness) {
2144         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2145         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2146     }
2147     interior_limit = FFMAX(interior_limit, 1);
2148
2149     f->filter_level = filter_level;
2150     f->inner_limit = interior_limit;
2151     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2152                       mb->mode == VP8_MVMODE_SPLIT;
2153 }
2154
2155 static av_always_inline
2156 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2157                int mb_x, int mb_y, int is_vp7)
2158 {
2159     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2160     int filter_level = f->filter_level;
2161     int inner_limit = f->inner_limit;
2162     int inner_filter = f->inner_filter;
2163     ptrdiff_t linesize   = s->linesize;
2164     ptrdiff_t uvlinesize = s->uvlinesize;
2165     static const uint8_t hev_thresh_lut[2][64] = {
2166         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2167           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2168           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2169           3, 3, 3, 3 },
2170         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2171           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2172           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2173           2, 2, 2, 2 }
2174     };
2175
2176     if (!filter_level)
2177         return;
2178
2179     if (is_vp7) {
2180         bedge_lim_y  = filter_level;
2181         bedge_lim_uv = filter_level * 2;
2182         mbedge_lim   = filter_level + 2;
2183     } else {
2184         bedge_lim_y  =
2185         bedge_lim_uv = filter_level * 2 + inner_limit;
2186         mbedge_lim   = bedge_lim_y + 4;
2187     }
2188
2189     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2190
2191     if (mb_x) {
2192         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2193                                        mbedge_lim, inner_limit, hev_thresh);
2194         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2195                                        mbedge_lim, inner_limit, hev_thresh);
2196     }
2197
2198 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2199     if (cond && inner_filter) {                                               \
2200         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2201                                              bedge_lim_y, inner_limit,        \
2202                                              hev_thresh);                     \
2203         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2204                                              bedge_lim_y, inner_limit,        \
2205                                              hev_thresh);                     \
2206         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2207                                              bedge_lim_y, inner_limit,        \
2208                                              hev_thresh);                     \
2209         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2210                                              uvlinesize,  bedge_lim_uv,       \
2211                                              inner_limit, hev_thresh);        \
2212     }
2213
2214     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2215
2216     if (mb_y) {
2217         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2218                                        mbedge_lim, inner_limit, hev_thresh);
2219         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2220                                        mbedge_lim, inner_limit, hev_thresh);
2221     }
2222
2223     if (inner_filter) {
2224         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2225                                              linesize, bedge_lim_y,
2226                                              inner_limit, hev_thresh);
2227         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2228                                              linesize, bedge_lim_y,
2229                                              inner_limit, hev_thresh);
2230         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2231                                              linesize, bedge_lim_y,
2232                                              inner_limit, hev_thresh);
2233         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2234                                              dst[2] +  4 * uvlinesize,
2235                                              uvlinesize, bedge_lim_uv,
2236                                              inner_limit, hev_thresh);
2237     }
2238
2239     H_LOOP_FILTER_16Y_INNER(is_vp7)
2240 }
2241
2242 static av_always_inline
2243 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2244                       int mb_x, int mb_y)
2245 {
2246     int mbedge_lim, bedge_lim;
2247     int filter_level = f->filter_level;
2248     int inner_limit  = f->inner_limit;
2249     int inner_filter = f->inner_filter;
2250     ptrdiff_t linesize = s->linesize;
2251
2252     if (!filter_level)
2253         return;
2254
2255     bedge_lim  = 2 * filter_level + inner_limit;
2256     mbedge_lim = bedge_lim + 4;
2257
2258     if (mb_x)
2259         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2260     if (inner_filter) {
2261         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2262         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2263         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2264     }
2265
2266     if (mb_y)
2267         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2268     if (inner_filter) {
2269         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2270         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2271         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2272     }
2273 }
2274
2275 #define MARGIN (16 << 2)
2276 static av_always_inline
2277 int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2278                                     VP8Frame *prev_frame, int is_vp7)
2279 {
2280     VP8Context *s = avctx->priv_data;
2281     int mb_x, mb_y;
2282
2283     s->mv_bounds.mv_min.y = -MARGIN;
2284     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2285     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2286         VP8Macroblock *mb = s->macroblocks_base +
2287                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2288         int mb_xy = mb_y * s->mb_width;
2289
2290         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2291
2292         s->mv_bounds.mv_min.x = -MARGIN;
2293         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2294
2295         if (vpX_rac_is_end(&s->c)) {
2296             return AVERROR_INVALIDDATA;
2297         }
2298         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2299             if (mb_y == 0)
2300                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2301                          DC_PRED * 0x01010101);
2302             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2303                            prev_frame && prev_frame->seg_map ?
2304                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2305             s->mv_bounds.mv_min.x -= 64;
2306             s->mv_bounds.mv_max.x -= 64;
2307         }
2308         s->mv_bounds.mv_min.y -= 64;
2309         s->mv_bounds.mv_max.y -= 64;
2310     }
2311     return 0;
2312 }
2313
2314 static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2315                                    VP8Frame *prev_frame)
2316 {
2317     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2318 }
2319
2320 static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2321                                    VP8Frame *prev_frame)
2322 {
2323     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2324 }
2325
#if HAVE_THREADS
/*
 * Row-position synchronisation helpers.
 *
 * A position is packed into one int as (mb_y << 16) | (mb_x & 0xFFFF), so a
 * plain integer comparison orders positions by row first, then by column.
 *
 * All macro parameters are parenthesised in the expansions so that arbitrary
 * expressions can be passed without relying on operator precedence.
 */

/*
 * Block the calling thread (td) until the other thread (otd) has published a
 * thread_mb_pos of at least (mb_y_check, mb_x_check).
 *
 * While waiting, td->wait_mb_pos holds the awaited position so that
 * update_pos() on the other side can tell a broadcast is needed; it is reset
 * to INT_MAX once the wait is over. The wait loop re-checks the position
 * after every wake-up, under otd->lock.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

/*
 * Publish (mb_y, mb_x) as td's current position and wake waiters if needed.
 *
 * Relies on the following names being in scope at the expansion site:
 * avctx, num_jobs, next_td and prev_td.
 *
 * A broadcast on td->cond is issued only when slice threading is active with
 * more than one job, and either a neighbour pointer is NULL (always signal)
 * or a neighbour other than td has a wait_mb_pos that the new position has
 * reached. The neighbour check is evaluated before the new position is
 * stored.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without thread support both helpers expand to an empty loop statement. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
#define update_pos(td, mb_y, mb_x) while(0)
#endif
2363
2364 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2365                                         int jobnr, int threadnr, int is_vp7)
2366 {
2367     VP8Context *s = avctx->priv_data;
2368     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2369     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2370     int mb_x, mb_xy = mb_y * s->mb_width;
2371     int num_jobs = s->num_jobs;
2372     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2373     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2374     VP8Macroblock *mb;
2375     uint8_t *dst[3] = {
2376         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2377         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2378         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2379     };
2380
2381     if (vpX_rac_is_end(c))
2382          return AVERROR_INVALIDDATA;
2383
2384     if (mb_y == 0)
2385         prev_td = td;
2386     else
2387         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2388     if (mb_y == s->mb_height - 1)
2389         next_td = td;
2390     else
2391         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2392     if (s->mb_layout == 1)
2393         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2394     else {
2395         // Make sure the previous frame has read its segmentation map,
2396         // if we re-use the same map.
2397         if (prev_frame && s->segmentation.enabled &&
2398             !s->segmentation.update_map)
2399             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2400         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2401         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2402         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2403     }
2404
2405     if (!is_vp7 || mb_y == 0)
2406         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2407
2408     td->mv_bounds.mv_min.x = -MARGIN;
2409     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2410
2411     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2412         if (vpX_rac_is_end(c))
2413             return AVERROR_INVALIDDATA;
2414         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2415         if (prev_td != td) {
2416             if (threadnr != 0) {
2417                 check_thread_pos(td, prev_td,
2418                                  mb_x + (is_vp7 ? 2 : 1),
2419                                  mb_y - (is_vp7 ? 2 : 1));
2420             } else {
2421                 check_thread_pos(td, prev_td,
2422                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2423                                  mb_y - (is_vp7 ? 2 : 1));
2424             }
2425         }
2426
2427         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2428                          s->linesize, 4);
2429         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2430                          dst[2] - dst[1], 2);
2431
2432         if (!s->mb_layout)
2433             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2434                            prev_frame && prev_frame->seg_map ?
2435                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2436
2437         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2438
2439         if (!mb->skip)
2440             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2441
2442         if (mb->mode <= MODE_I4x4)
2443             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2444         else
2445             inter_predict(s, td, dst, mb, mb_x, mb_y);
2446
2447         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2448
2449         if (!mb->skip) {
2450             idct_mb(s, td, dst, mb);
2451         } else {
2452             AV_ZERO64(td->left_nnz);
2453             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2454
2455             /* Reset DC block predictors if they would exist
2456              * if the mb had coefficients */
2457             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2458                 td->left_nnz[8]     = 0;
2459                 s->top_nnz[mb_x][8] = 0;
2460             }
2461         }
2462
2463         if (s->deblock_filter)
2464             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2465
2466         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2467             if (s->filter.simple)
2468                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2469                                  NULL, NULL, s->linesize, 0, 1);
2470             else
2471                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2472                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2473         }
2474
2475         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2476
2477         dst[0]      += 16;
2478         dst[1]      += 8;
2479         dst[2]      += 8;
2480         td->mv_bounds.mv_min.x -= 64;
2481         td->mv_bounds.mv_max.x -= 64;
2482
2483         if (mb_x == s->mb_width + 1) {
2484             update_pos(td, mb_y, s->mb_width + 3);
2485         } else {
2486             update_pos(td, mb_y, mb_x);
2487         }
2488     }
2489     return 0;
2490 }
2491
2492 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2493                                         int jobnr, int threadnr)
2494 {
2495     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2496 }
2497
2498 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2499                                         int jobnr, int threadnr)
2500 {
2501     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2502 }
2503
2504 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2505                               int jobnr, int threadnr, int is_vp7)
2506 {
2507     VP8Context *s = avctx->priv_data;
2508     VP8ThreadData *td = &s->thread_data[threadnr];
2509     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2510     AVFrame *curframe = s->curframe->tf.f;
2511     VP8Macroblock *mb;
2512     VP8ThreadData *prev_td, *next_td;
2513     uint8_t *dst[3] = {
2514         curframe->data[0] + 16 * mb_y * s->linesize,
2515         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2516         curframe->data[2] +  8 * mb_y * s->uvlinesize
2517     };
2518
2519     if (s->mb_layout == 1)
2520         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2521     else
2522         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2523
2524     if (mb_y == 0)
2525         prev_td = td;
2526     else
2527         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2528     if (mb_y == s->mb_height - 1)
2529         next_td = td;
2530     else
2531         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2532
2533     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2534         VP8FilterStrength *f = &td->filter_strength[mb_x];
2535         if (prev_td != td)
2536             check_thread_pos(td, prev_td,
2537                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2538         if (next_td != td)
2539             if (next_td != &s->thread_data[0])
2540                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2541
2542         if (num_jobs == 1) {
2543             if (s->filter.simple)
2544                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2545                                  NULL, NULL, s->linesize, 0, 1);
2546             else
2547                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2548                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2549         }
2550
2551         if (s->filter.simple)
2552             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2553         else
2554             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2555         dst[0] += 16;
2556         dst[1] += 8;
2557         dst[2] += 8;
2558
2559         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2560     }
2561 }
2562
2563 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2564                               int jobnr, int threadnr)
2565 {
2566     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2567 }
2568
2569 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2570                               int jobnr, int threadnr)
2571 {
2572     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2573 }
2574
2575 static av_always_inline
2576 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2577                               int threadnr, int is_vp7)
2578 {
2579     VP8Context *s = avctx->priv_data;
2580     VP8ThreadData *td = &s->thread_data[jobnr];
2581     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2582     VP8Frame *curframe = s->curframe;
2583     int mb_y, num_jobs = s->num_jobs;
2584     int ret;
2585
2586     td->thread_nr = threadnr;
2587     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2588     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2589     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2590         atomic_store(&td->thread_mb_pos, mb_y << 16);
2591         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2592         if (ret < 0) {
2593             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2594             return ret;
2595         }
2596         if (s->deblock_filter)
2597             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2598         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2599
2600         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2601         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2602
2603         if (avctx->active_thread_type == FF_THREAD_FRAME)
2604             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2605     }
2606
2607     return 0;
2608 }
2609
2610 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2611                                     int jobnr, int threadnr)
2612 {
2613     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2614 }
2615
2616 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2617                                     int jobnr, int threadnr)
2618 {
2619     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2620 }
2621
2622 static av_always_inline
2623 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2624                       AVPacket *avpkt, int is_vp7)
2625 {
2626     VP8Context *s = avctx->priv_data;
2627     int ret, i, referenced, num_jobs;
2628     enum AVDiscard skip_thresh;
2629     VP8Frame *av_uninit(curframe), *prev_frame;
2630
2631     if (is_vp7)
2632         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2633     else
2634         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2635
2636     if (ret < 0)
2637         goto err;
2638
2639     if (s->actually_webp) {
2640         // avctx->pix_fmt already set in caller.
2641     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2642         s->pix_fmt = get_pixel_format(s);
2643         if (s->pix_fmt < 0) {
2644             ret = AVERROR(EINVAL);
2645             goto err;
2646         }
2647         avctx->pix_fmt = s->pix_fmt;
2648     }
2649
2650     prev_frame = s->framep[VP56_FRAME_CURRENT];
2651
2652     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2653                  s->update_altref == VP56_FRAME_CURRENT;
2654
2655     skip_thresh = !referenced ? AVDISCARD_NONREF
2656                               : !s->keyframe ? AVDISCARD_NONKEY
2657                                              : AVDISCARD_ALL;
2658
2659     if (avctx->skip_frame >= skip_thresh) {
2660         s->invisible = 1;
2661         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2662         goto skip_decode;
2663     }
2664     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2665
2666     // release no longer referenced frames
2667     for (i = 0; i < 5; i++)
2668         if (s->frames[i].tf.f->buf[0] &&
2669             &s->frames[i] != prev_frame &&
2670             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2671             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2672             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2673             vp8_release_frame(s, &s->frames[i]);
2674
2675     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2676
2677     if (!s->colorspace)
2678         avctx->colorspace = AVCOL_SPC_BT470BG;
2679     if (s->fullrange)
2680         avctx->color_range = AVCOL_RANGE_JPEG;
2681     else
2682         avctx->color_range = AVCOL_RANGE_MPEG;
2683
2684     /* Given that arithmetic probabilities are updated every frame, it's quite
2685      * likely that the values we have on a random interframe are complete
2686      * junk if we didn't start decode on a keyframe. So just don't display
2687      * anything rather than junk. */
2688     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2689                          !s->framep[VP56_FRAME_GOLDEN]   ||
2690                          !s->framep[VP56_FRAME_GOLDEN2])) {
2691         av_log(avctx, AV_LOG_WARNING,
2692                "Discarding interframe without a prior keyframe!\n");
2693         ret = AVERROR_INVALIDDATA;
2694         goto err;
2695     }
2696
2697     curframe->tf.f->key_frame = s->keyframe;
2698     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2699                                             : AV_PICTURE_TYPE_P;
2700     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2701         goto err;
2702
2703     // check if golden and altref are swapped
2704     if (s->update_altref != VP56_FRAME_NONE)
2705         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2706     else
2707         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2708
2709     if (s->update_golden != VP56_FRAME_NONE)
2710         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2711     else
2712         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2713
2714     if (s->update_last)
2715         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2716     else
2717         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2718
2719     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2720
2721     if (avctx->codec->update_thread_context)
2722         ff_thread_finish_setup(avctx);
2723
2724     if (avctx->hwaccel) {
2725         ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2726         if (ret < 0)
2727             goto err;
2728
2729         ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2730         if (ret < 0)
2731             goto err;
2732
2733         ret = avctx->hwaccel->end_frame(avctx);
2734         if (ret < 0)
2735             goto err;
2736
2737     } else {
2738         s->linesize   = curframe->tf.f->linesize[0];
2739         s->uvlinesize = curframe->tf.f->linesize[1];
2740
2741         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2742         /* Zero macroblock structures for top/top-left prediction
2743          * from outside the frame. */
2744         if (!s->mb_layout)
2745             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2746                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2747         if (!s->mb_layout && s->keyframe)
2748             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2749
2750         memset(s->ref_count, 0, sizeof(s->ref_count));
2751
2752         if (s->mb_layout == 1) {
2753             // Make sure the previous frame has read its segmentation map,
2754             // if we re-use the same map.
2755             if (prev_frame && s->segmentation.enabled &&
2756                 !s->segmentation.update_map)
2757                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2758             if (is_vp7)
2759                 ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2760             else
2761                 ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2762             if (ret < 0)
2763                 goto err;
2764         }
2765
2766         if (avctx->active_thread_type == FF_THREAD_FRAME)
2767             num_jobs = 1;
2768         else
2769             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2770         s->num_jobs   = num_jobs;
2771         s->curframe   = curframe;
2772         s->prev_frame = prev_frame;
2773         s->mv_bounds.mv_min.y   = -MARGIN;
2774         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2775         for (i = 0; i < MAX_THREADS; i++) {
2776             VP8ThreadData *td = &s->thread_data[i];
2777             atomic_init(&td->thread_mb_pos, 0);
2778             atomic_init(&td->wait_mb_pos, INT_MAX);
2779         }
2780         if (is_vp7)
2781             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2782                             num_jobs);
2783         else
2784             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2785                             num_jobs);
2786     }
2787
2788     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2789     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2790
2791 skip_decode:
2792     // if future frames don't use the updated probabilities,
2793     // reset them to the values we saved
2794     if (!s->update_probabilities)
2795         s->prob[0] = s->prob[1];
2796
2797     if (!s->invisible) {
2798         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2799             return ret;
2800         *got_frame = 1;
2801     }
2802
2803     return avpkt->size;
2804 err:
2805     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2806     return ret;
2807 }
2808
2809 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2810                         AVPacket *avpkt)
2811 {
2812     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2813 }
2814
2815 #if CONFIG_VP7_DECODER
2816 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2817                             AVPacket *avpkt)
2818 {
2819     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2820 }
2821 #endif /* CONFIG_VP7_DECODER */
2822
2823 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2824 {
2825     VP8Context *s = avctx->priv_data;
2826     int i;
2827
2828     if (!s)
2829         return 0;
2830
2831     vp8_decode_flush_impl(avctx, 1);
2832     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2833         av_frame_free(&s->frames[i].tf.f);
2834
2835     return 0;
2836 }
2837
2838 static av_cold int vp8_init_frames(VP8Context *s)
2839 {
2840     int i;
2841     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2842         s->frames[i].tf.f = av_frame_alloc();
2843         if (!s->frames[i].tf.f)
2844             return AVERROR(ENOMEM);
2845     }
2846     return 0;
2847 }
2848
2849 static av_always_inline
2850 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2851 {
2852     VP8Context *s = avctx->priv_data;
2853     int ret;
2854
2855     s->avctx = avctx;
2856     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2857     s->pix_fmt = AV_PIX_FMT_NONE;
2858     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2859
2860     ff_videodsp_init(&s->vdsp, 8);
2861
2862     ff_vp78dsp_init(&s->vp8dsp);
2863     if (CONFIG_VP7_DECODER && is_vp7) {
2864         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2865         ff_vp7dsp_init(&s->vp8dsp);
2866         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2867         s->filter_mb_row           = vp7_filter_mb_row;
2868     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2869         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2870         ff_vp8dsp_init(&s->vp8dsp);
2871         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2872         s->filter_mb_row           = vp8_filter_mb_row;
2873     }
2874
2875     /* does not change for VP8 */
2876     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2877
2878     if ((ret = vp8_init_frames(s)) < 0) {
2879         ff_vp8_decode_free(avctx);
2880         return ret;
2881     }
2882
2883     return 0;
2884 }
2885
2886 #if CONFIG_VP7_DECODER
2887 static int vp7_decode_init(AVCodecContext *avctx)
2888 {
2889     return vp78_decode_init(avctx, IS_VP7);
2890 }
2891 #endif /* CONFIG_VP7_DECODER */
2892
2893 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2894 {
2895     return vp78_decode_init(avctx, IS_VP8);
2896 }
2897
2898 #if CONFIG_VP8_DECODER
2899 #if HAVE_THREADS
2900 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2901
2902 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2903                                             const AVCodecContext *src)
2904 {
2905     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2906     int i;
2907
2908     if (s->macroblocks_base &&
2909         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2910         free_buffers(s);
2911         s->mb_width  = s_src->mb_width;
2912         s->mb_height = s_src->mb_height;
2913     }
2914
2915     s->pix_fmt      = s_src->pix_fmt;
2916     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2917     s->segmentation = s_src->segmentation;
2918     s->lf_delta     = s_src->lf_delta;
2919     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2920
2921     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2922         if (s_src->frames[i].tf.f->buf[0]) {
2923             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2924             if (ret < 0)
2925                 return ret;
2926         }
2927     }
2928
2929     s->framep[0] = REBASE(s_src->next_framep[0]);
2930     s->framep[1] = REBASE(s_src->next_framep[1]);
2931     s->framep[2] = REBASE(s_src->next_framep[2]);
2932     s->framep[3] = REBASE(s_src->next_framep[3]);
2933
2934     return 0;
2935 }
2936 #endif /* HAVE_THREADS */
2937 #endif /* CONFIG_VP8_DECODER */
2938
2939 #if CONFIG_VP7_DECODER
2940 AVCodec ff_vp7_decoder = {
2941     .name                  = "vp7",
2942     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2943     .type                  = AVMEDIA_TYPE_VIDEO,
2944     .id                    = AV_CODEC_ID_VP7,
2945     .priv_data_size        = sizeof(VP8Context),
2946     .init                  = vp7_decode_init,
2947     .close                 = ff_vp8_decode_free,
2948     .decode                = vp7_decode_frame,
2949     .capabilities          = AV_CODEC_CAP_DR1,
2950     .flush                 = vp8_decode_flush,
2951 };
2952 #endif /* CONFIG_VP7_DECODER */
2953
2954 #if CONFIG_VP8_DECODER
2955 AVCodec ff_vp8_decoder = {
2956     .name                  = "vp8",
2957     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2958     .type                  = AVMEDIA_TYPE_VIDEO,
2959     .id                    = AV_CODEC_ID_VP8,
2960     .priv_data_size        = sizeof(VP8Context),
2961     .init                  = ff_vp8_decode_init,
2962     .close                 = ff_vp8_decode_free,
2963     .decode                = ff_vp8_decode_frame,
2964     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
2965                              AV_CODEC_CAP_SLICE_THREADS,
2966     .flush                 = vp8_decode_flush,
2967     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2968     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
2969 #if CONFIG_VP8_VAAPI_HWACCEL
2970                                HWACCEL_VAAPI(vp8),
2971 #endif
2972 #if CONFIG_VP8_NVDEC_HWACCEL
2973                                HWACCEL_NVDEC(vp8),
2974 #endif
2975                                NULL
2976                            },
2977     .caps_internal         = FF_CODEC_CAP_ALLOCATE_PROGRESS,
2978 };
#endif /* CONFIG_VP8_DECODER */