git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
/*
 * VPX(vp7, f) names the codec-specific variant of f (vp7_f or vp8_f).
 * When both decoders are compiled in, the variant is selected by the vp7
 * argument; when only one decoder is compiled in, the argument is ignored
 * and the macro expands directly to the only available variant.
 */
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
#elif CONFIG_VP7_DECODER
#define VPX(vp7, f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(vp7, f) vp8_ ## f
#endif
  47
  48 static void free_buffers(VP8Context *s)
  49 {
  50     int i;
  51     if (s->thread_data)
  52         for (i = 0; i < MAX_THREADS; i++) {
  53 #if HAVE_THREADS
  54             pthread_cond_destroy(&s->thread_data[i].cond);
  55             pthread_mutex_destroy(&s->thread_data[i].lock);
  56 #endif
  57             av_freep(&s->thread_data[i].filter_strength);
  58         }
  59     av_freep(&s->thread_data);
  60     av_freep(&s->macroblocks_base);
  61     av_freep(&s->intra4x4_pred_mode_top);
  62     av_freep(&s->top_nnz);
  63     av_freep(&s->top_border);
  64
  65     s->macroblocks = NULL;
  66 }
  67
  68 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  69 {
  70     int ret;
  71     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  72                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  73         return ret;
  74     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  75         ff_thread_release_buffer(s->avctx, &f->tf);
  76         return AVERROR(ENOMEM);
  77     }
  78     return 0;
  79 }
  80
  81 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  82 {
  83     av_buffer_unref(&f->seg_map);
  84     ff_thread_release_buffer(s->avctx, &f->tf);
  85 }
  86
  87 #if CONFIG_VP8_DECODER
  88 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  89 {
  90     int ret;
  91
  92     vp8_release_frame(s, dst);
  93
  94     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  95         return ret;
  96     if (src->seg_map &&
  97         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  98         vp8_release_frame(s, dst);
  99         return AVERROR(ENOMEM);
 100     }
 101
 102     return 0;
 103 }
 104 #endif /* CONFIG_VP8_DECODER */
 105
 106 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 107 {
 108     VP8Context *s = avctx->priv_data;
 109     int i;
 110
 111     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 112         vp8_release_frame(s, &s->frames[i]);
 113     memset(s->framep, 0, sizeof(s->framep));
 114
 115     if (free_mem)
 116         free_buffers(s);
 117 }
 118
 119 static void vp8_decode_flush(AVCodecContext *avctx)
 120 {
 121     vp8_decode_flush_impl(avctx, 0);
 122 }
 123
 124 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 125 {
 126     VP8Frame *frame = NULL;
 127     int i;
 128
 129     // find a free buffer
 130     for (i = 0; i < 5; i++)
 131         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 132             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 133             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 134             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 135             frame = &s->frames[i];
 136             break;
 137         }
 138     if (i == 5) {
 139         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 140         abort();
 141     }
 142     if (frame->tf.f->data[0])
 143         vp8_release_frame(s, frame);
 144
 145     return frame;
 146 }
 147
 148 static av_always_inline
 149 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 150 {
 151     AVCodecContext *avctx = s->avctx;
 152     int i, ret;
 153
 154     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 155         height != s->avctx->height) {
 156         vp8_decode_flush_impl(s->avctx, 1);
 157
 158         ret = ff_set_dimensions(s->avctx, width, height);
 159         if (ret < 0)
 160             return ret;
 161     }
 162
 163     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 164     s->mb_height = (s->avctx->coded_height + 15) / 16;
 165
 166     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 167                    avctx->thread_count > 1;
 168     if (!s->mb_layout) { // Frame threading and one thread
 169         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 170                                                sizeof(*s->macroblocks));
 171         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 172     } else // Sliced threading
 173         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 174                                          sizeof(*s->macroblocks));
 175     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 176     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 177     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 178
 179     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 180         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 181         free_buffers(s);
 182         return AVERROR(ENOMEM);
 183     }
 184
 185     for (i = 0; i < MAX_THREADS; i++) {
 186         s->thread_data[i].filter_strength =
 187             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 188         if (!s->thread_data[i].filter_strength) {
 189             free_buffers(s);
 190             return AVERROR(ENOMEM);
 191         }
 192 #if HAVE_THREADS
 193         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 194         pthread_cond_init(&s->thread_data[i].cond, NULL);
 195 #endif
 196     }
 197
 198     s->macroblocks = s->macroblocks_base + 1;
 199
 200     return 0;
 201 }
 202
 203 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 204 {
 205     return update_dimensions(s, width, height, IS_VP7);
 206 }
 207
 208 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 209 {
 210     return update_dimensions(s, width, height, IS_VP8);
 211 }
 212
 213
 214 static void parse_segment_info(VP8Context *s)
 215 {
 216     VP56RangeCoder *c = &s->c;
 217     int i;
 218
 219     s->segmentation.update_map = vp8_rac_get(c);
 220
 221     if (vp8_rac_get(c)) { // update segment feature data
 222         s->segmentation.absolute_vals = vp8_rac_get(c);
 223
 224         for (i = 0; i < 4; i++)
 225             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 226
 227         for (i = 0; i < 4; i++)
 228             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 229     }
 230     if (s->segmentation.update_map)
 231         for (i = 0; i < 3; i++)
 232             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 233 }
 234
 235 static void update_lf_deltas(VP8Context *s)
 236 {
 237     VP56RangeCoder *c = &s->c;
 238     int i;
 239
 240     for (i = 0; i < 4; i++) {
 241         if (vp8_rac_get(c)) {
 242             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 243
 244             if (vp8_rac_get(c))
 245                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 246         }
 247     }
 248
 249     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 250         if (vp8_rac_get(c)) {
 251             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 252
 253             if (vp8_rac_get(c))
 254                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 255         }
 256     }
 257 }
 258
 259 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 260 {
 261     const uint8_t *sizes = buf;
 262     int i;
 263
 264     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 265
 266     buf      += 3 * (s->num_coeff_partitions - 1);
 267     buf_size -= 3 * (s->num_coeff_partitions - 1);
 268     if (buf_size < 0)
 269         return -1;
 270
 271     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 272         int size = AV_RL24(sizes + 3 * i);
 273         if (buf_size - size < 0)
 274             return -1;
 275
 276         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 277         buf      += size;
 278         buf_size -= size;
 279     }
 280     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 281
 282     return 0;
 283 }
 284
 285 static void vp7_get_quants(VP8Context *s)
 286 {
 287     VP56RangeCoder *c = &s->c;
 288
 289     int yac_qi  = vp8_rac_get_uint(c, 7);
 290     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 291     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 292     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 293     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 294     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 295
 296     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 297     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 298     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 299     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 300     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 301     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 302 }
 303
 304 static void vp8_get_quants(VP8Context *s)
 305 {
 306     VP56RangeCoder *c = &s->c;
 307     int i, base_qi;
 308
 309     int yac_qi     = vp8_rac_get_uint(c, 7);
 310     int ydc_delta  = vp8_rac_get_sint(c, 4);
 311     int y2dc_delta = vp8_rac_get_sint(c, 4);
 312     int y2ac_delta = vp8_rac_get_sint(c, 4);
 313     int uvdc_delta = vp8_rac_get_sint(c, 4);
 314     int uvac_delta = vp8_rac_get_sint(c, 4);
 315
 316     for (i = 0; i < 4; i++) {
 317         if (s->segmentation.enabled) {
 318             base_qi = s->segmentation.base_quant[i];
 319             if (!s->segmentation.absolute_vals)
 320                 base_qi += yac_qi;
 321         } else
 322             base_qi = yac_qi;
 323
 324         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 325         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 326         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 327         /* 101581>>16 is equivalent to 155/100 */
 328         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 329         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 330         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 331
 332         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 333         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 334     }
 335 }
 336
 337 /**
 338  * Determine which buffers golden and altref should be updated with after this frame.
 339  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 340  *
 341  * Intra frames update all 3 references
 342  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 343  * If the update (golden|altref) flag is set, it's updated with the current frame
 344  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 345  * If the flag is not set, the number read means:
 346  *      0: no update
 347  *      1: VP56_FRAME_PREVIOUS
 348  *      2: update golden with altref, or update altref with golden
 349  */
 350 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 351 {
 352     VP56RangeCoder *c = &s->c;
 353
 354     if (update)
 355         return VP56_FRAME_CURRENT;
 356
 357     switch (vp8_rac_get_uint(c, 2)) {
 358     case 1:
 359         return VP56_FRAME_PREVIOUS;
 360     case 2:
 361         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 362     }
 363     return VP56_FRAME_NONE;
 364 }
 365
 366 static void vp78_reset_probability_tables(VP8Context *s)
 367 {
 368     int i, j;
 369     for (i = 0; i < 4; i++)
 370         for (j = 0; j < 16; j++)
 371             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 372                    sizeof(s->prob->token[i][j]));
 373 }
 374
 375 static void vp78_update_probability_tables(VP8Context *s)
 376 {
 377     VP56RangeCoder *c = &s->c;
 378     int i, j, k, l, m;
 379
 380     for (i = 0; i < 4; i++)
 381         for (j = 0; j < 8; j++)
 382             for (k = 0; k < 3; k++)
 383                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 384                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 385                         int prob = vp8_rac_get_uint(c, 8);
 386                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 387                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 388                     }
 389 }
 390
/* Number of motion vector probabilities per component that may be updated
 * in the frame header, for VP7 and VP8 respectively (passed as mvc_size to
 * vp78_update_pred16x16_pred8x8_mvc_probabilities()). */
#define VP7_MVC_SIZE 17
#define VP8_MVC_SIZE 19
 393
 394 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 395                                                             int mvc_size)
 396 {
 397     VP56RangeCoder *c = &s->c;
 398     int i, j;
 399
 400     if (vp8_rac_get(c))
 401         for (i = 0; i < 4; i++)
 402             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 403     if (vp8_rac_get(c))
 404         for (i = 0; i < 3; i++)
 405             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 406
 407     // 17.2 MV probability update
 408     for (i = 0; i < 2; i++)
 409         for (j = 0; j < mvc_size; j++)
 410             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 411                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 412 }
 413
 414 static void update_refs(VP8Context *s)
 415 {
 416     VP56RangeCoder *c = &s->c;
 417
 418     int update_golden = vp8_rac_get(c);
 419     int update_altref = vp8_rac_get(c);
 420
 421     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 422     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 423 }
 424
 425 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 426 {
 427     int i, j;
 428
 429     for (j = 1; j < 3; j++) {
 430         for (i = 0; i < height / 2; i++)
 431             memcpy(dst->data[j] + i * dst->linesize[j],
 432                    src->data[j] + i * src->linesize[j], width / 2);
 433     }
 434 }
 435
 436 static void fade(uint8_t *dst, int dst_linesize,
 437                  const uint8_t *src, int src_linesize,
 438                  int width, int height,
 439                  int alpha, int beta)
 440 {
 441     int i, j;
 442     for (j = 0; j < height; j++) {
 443         for (i = 0; i < width; i++) {
 444             uint8_t y = src[j * src_linesize + i];
 445             dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 446         }
 447     }
 448 }
 449
 450 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 451 {
 452     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 453     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 454     int ret;
 455
 456     if (!s->keyframe && (alpha || beta)) {
 457         int width  = s->mb_width * 16;
 458         int height = s->mb_height * 16;
 459         AVFrame *src, *dst;
 460
 461         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 462             !s->framep[VP56_FRAME_GOLDEN]) {
 463             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 464             return AVERROR_INVALIDDATA;
 465         }
 466
 467         dst =
 468         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 469
 470         /* preserve the golden frame, write a new previous frame */
 471         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 472             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 473             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 474                 return ret;
 475
 476             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 477
 478             copy_chroma(dst, src, width, height);
 479         }
 480
 481         fade(dst->data[0], dst->linesize[0],
 482              src->data[0], src->linesize[0],
 483              width, height, alpha, beta);
 484     }
 485
 486     return 0;
 487 }
 488
/**
 * Parse a VP7 frame header and update the decoder state accordingly.
 *
 * The fixed-size byte header is decoded first, then the first partition is
 * read through the header range coder s->c in the order of the lettered
 * sections below. The data following the first partition is handed to
 * s->coeff_partition[0]. Buffers are (re)allocated through
 * vp7_update_dimensions() on the first frame or when the size changes.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, a negative error code on invalid or truncated data
 *         or on allocation failure
 */
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 4) {
        return AVERROR_INVALIDDATA;
    }

    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    /* bit 0 clear means keyframe; the first partition size sits above the
     * low 4 bits of the 24-bit little-endian word */
    s->keyframe  = !(buf[0] & 1);
    s->invisible = 0;
    part1_size   = AV_RL24(buf) >> 4;

    /* the byte header is 4 bytes for profile 0 and 3 bytes for profile 1 */
    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    ff_vp56_init_range_decoder(c, buf, part1_size);
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp8_rac_get_uint(c, 12);
        height = vp8_rac_get_uint(c, 12);
        hscale = vp8_rac_get_uint(c, 2);
        vscale = vp8_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* a keyframe refreshes both references and resets all adaptive
         * state (probabilities, segmentation, loop filter deltas, scan) */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
    }

    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp8_rac_get(c);
        if (s->feature_enabled[i]) {
             s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);

             /* an index probability that is not transmitted defaults to 255 */
             for (j = 0; j < 3; j++)
                 s->feature_index_prob[i][j] =
                     vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;

             /* features with a zero value size carry no values */
             if (vp7_feature_value_size[s->profile][i])
                 for (j = 0; j < 4; j++)
                     s->feature_value[i][j] =
                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    /* everything after the first partition is one coefficient partition */
    s->num_coeff_partitions = 1;
    ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
        s->sign_bias[VP56_FRAME_GOLDEN] = 0;
    }

    /* defaults; only profile > 0 transmits these flags */
    s->update_last          = 1;
    s->update_probabilities = 1;
    s->fade_present         = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp8_rac_get(c);
        /* keep a copy of the current probabilities in prob[1] when this
         * frame's updates are not to be kept */
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            s->fade_present = vp8_rac_get(c);
    }

    /* E. Fading information for previous frame */
    if (s->fade_present && vp8_rac_get(c)) {
        if ((ret = vp7_fade_frame(s ,c)) < 0)
            return ret;
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp8_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp8_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];

    /* H. Loop filter levels  */
    if (s->profile > 0)
        s->filter.simple = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    return 0;
}
 638
/**
 * Parse a VP8 frame header and update the decoder state accordingly.
 *
 * The 3-byte frame tag (plus the 7-byte start code and dimension block on
 * keyframes) is decoded directly from buf; the rest of the header is read
 * through the header range coder s->c. The data following the header
 * partition is split into coefficient partitions by setup_partitions().
 * Buffers are (re)allocated through vp8_update_dimensions() on the first
 * frame or when the size changes.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, a negative error code on invalid or truncated data
 *         or on allocation failure
 */
static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 3) {
        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
        return AVERROR_INVALIDDATA;
    }

    /* frame tag: bit 0 clear = keyframe, bits 1-3 = profile, bit 4 clear =
     * invisible frame, remaining bits = size of the header partition */
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
               sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
               sizeof(s->put_pixels_tab));

    /* keyframes carry 7 extra bytes (start code and dimensions) before the
     * header partition */
    if (header_size > buf_size - 7 * s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR,
                   "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        /* 14-bit dimensions; the top 2 bits of each 16-bit word hold the
         * scaling fields */
        width     = AV_RL16(buf + 3) & 0x3fff;
        height    = AV_RL16(buf + 5) & 0x3fff;
        hscale    = buf[4] >> 6;
        vscale    = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* a keyframe refreshes both references and resets all adaptive
         * state (probabilities, segmentation, loop filter deltas) */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc, vp8_mv_default_prob,
               sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        s->colorspace = vp8_rac_get(c);
        if (s->colorspace)
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        s->fullrange = vp8_rac_get(c);
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    /* the delta values are only read when both the enable flag and the
     * following update flag are set */
    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    /* buf now points just past the header partition */
    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
            return ret;

    vp8_get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    /* keyframes always update the last frame; the flag is only read for
     * inter frames */
    s->update_last = s->keyframe || vp8_rac_get(c);

    vp78_update_probability_tables(s);

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
    }

    return 0;
}
 765
 766 static av_always_inline
 767 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 768 {
 769     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 770                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 771     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 772                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 773 }
 774
 775 /**
 776  * Motion vector coding, 17.1.
 777  */
 778 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 779 {
 780     int bit, x = 0;
 781
 782     if (vp56_rac_get_prob_branchy(c, p[0])) {
 783         int i;
 784
 785         for (i = 0; i < 3; i++)
 786             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 787         for (i = (vp7 ? 7 : 9); i > 3; i--)
 788             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 789         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 790             x += 8;
 791     } else {
 792         // small_mvtree
 793         const uint8_t *ps = p + 2;
 794         bit = vp56_rac_get_prob(c, *ps);
 795         ps += 1 + 3 * bit;
 796         x  += 4 * bit;
 797         bit = vp56_rac_get_prob(c, *ps);
 798         ps += 1 + bit;
 799         x  += 2 * bit;
 800         x  += vp56_rac_get_prob(c, *ps);
 801     }
 802
 803     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 804 }
 805
 806 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 807 {
 808     return read_mv_component(c, p, 1);
 809 }
 810
 811 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 812 {
 813     return read_mv_component(c, p, 0);
 814 }
 815
 816 static av_always_inline
 817 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 818 {
 819     if (is_vp7)
 820         return vp7_submv_prob;
 821
 822     if (left == top)
 823         return vp8_submv_prob[4 - !!left];
 824     if (!top)
 825         return vp8_submv_prob[2];
 826     return vp8_submv_prob[1 - !!left];
 827 }
 828
 829 /**
 830  * Split motion vector prediction, 16.4.
 831  * @returns the number of motion vectors parsed (2, 4 or 16)
 832  */
 833 static av_always_inline
 834 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 835                     int layout, int is_vp7)
 836 {
 837     int part_idx;
 838     int n, num;
 839     VP8Macroblock *top_mb;
 840     VP8Macroblock *left_mb = &mb[-1];
 841     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 842     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 843     VP56mv *top_mv;
 844     VP56mv *left_mv = left_mb->bmv;
 845     VP56mv *cur_mv  = mb->bmv;
 846
 847     if (!layout) // layout is inlined, s->mb_layout is not
 848         top_mb = &mb[2];
 849     else
 850         top_mb = &mb[-s->mb_width - 1];
 851     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 852     top_mv       = top_mb->bmv;
 853
 854     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 855         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 856             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 857         else
 858             part_idx = VP8_SPLITMVMODE_8x8;
 859     } else {
 860         part_idx = VP8_SPLITMVMODE_4x4;
 861     }
 862
 863     num              = vp8_mbsplit_count[part_idx];
 864     mbsplits_cur     = vp8_mbsplits[part_idx],
 865     firstidx         = vp8_mbfirstidx[part_idx];
 866     mb->partitioning = part_idx;
 867
 868     for (n = 0; n < num; n++) {
 869         int k = firstidx[n];
 870         uint32_t left, above;
 871         const uint8_t *submv_prob;
 872
 873         if (!(k & 3))
 874             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 875         else
 876             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 877         if (k <= 3)
 878             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 879         else
 880             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 881
 882         submv_prob = get_submv_prob(left, above, is_vp7);
 883
 884         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 885             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 886                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 887                     mb->bmv[n].y = mb->mv.y +
 888                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 889                     mb->bmv[n].x = mb->mv.x +
 890                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 891                 } else {
 892                     AV_ZERO32(&mb->bmv[n]);
 893                 }
 894             } else {
 895                 AV_WN32A(&mb->bmv[n], above);
 896             }
 897         } else {
 898             AV_WN32A(&mb->bmv[n], left);
 899         }
 900     }
 901
 902     return num;
 903 }
 904
 905 /**
 906  * The vp7 reference decoder uses a padding macroblock column (added to right
 907  * edge of the frame) to guard against illegal macroblock offsets. The
 908  * algorithm has bugs that permit offsets to straddle the padding column.
 909  * This function replicates those bugs.
 910  *
 911  * @param[out] edge_x macroblock x address
 912  * @param[out] edge_y macroblock y address
 913  *
 914  * @return macroblock offset legal (boolean)
 915  */
 916 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 917                                    int xoffset, int yoffset, int boundary,
 918                                    int *edge_x, int *edge_y)
 919 {
 920     int vwidth = mb_width + 1;
 921     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 922     if (new < boundary || new % vwidth == vwidth - 1)
 923         return 0;
 924     *edge_y = new / vwidth;
 925     *edge_x = new % vwidth;
 926     return 1;
 927 }
 928
 929 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 930 {
 931     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 932 }
 933
 934 static av_always_inline
 935 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 936                     int mb_x, int mb_y, int layout)
 937 {
 938     VP8Macroblock *mb_edge[12];
 939     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 940     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 941     int idx = CNT_ZERO;
 942     VP56mv near_mv[3];
 943     uint8_t cnt[3] = { 0 };
 944     VP56RangeCoder *c = &s->c;
 945     int i;
 946
 947     AV_ZERO32(&near_mv[0]);
 948     AV_ZERO32(&near_mv[1]);
 949     AV_ZERO32(&near_mv[2]);
 950
 951     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 952         const VP7MVPred * pred = &vp7_mv_pred[i];
 953         int edge_x, edge_y;
 954
 955         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 956                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 957             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 958                                              ? s->macroblocks_base + 1 + edge_x +
 959                                                (s->mb_width + 1) * (edge_y + 1)
 960                                              : s->macroblocks + edge_x +
 961                                                (s->mb_height - edge_y - 1) * 2;
 962             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 963             if (mv) {
 964                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 965                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 966                         idx = CNT_NEAREST;
 967                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 968                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 969                             continue;
 970                         idx = CNT_NEAR;
 971                     } else {
 972                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 973                         idx = CNT_NEAR;
 974                     }
 975                 } else {
 976                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 977                     idx = CNT_NEAREST;
 978                 }
 979             } else {
 980                 idx = CNT_ZERO;
 981             }
 982         } else {
 983             idx = CNT_ZERO;
 984         }
 985         cnt[idx] += vp7_mv_pred[i].score;
 986     }
 987
 988     mb->partitioning = VP8_SPLITMVMODE_NONE;
 989
 990     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 991         mb->mode = VP8_MVMODE_MV;
 992
 993         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 994
 995             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 996
 997                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 998                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 999                 else
1000                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1001
1002                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1003                     mb->mode = VP8_MVMODE_SPLIT;
1004                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1005                 } else {
1006                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1007                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1008                     mb->bmv[0] = mb->mv;
1009                 }
1010             } else {
1011                 mb->mv = near_mv[CNT_NEAR];
1012                 mb->bmv[0] = mb->mv;
1013             }
1014         } else {
1015             mb->mv = near_mv[CNT_NEAREST];
1016             mb->bmv[0] = mb->mv;
1017         }
1018     } else {
1019         mb->mode = VP8_MVMODE_ZERO;
1020         AV_ZERO32(&mb->mv);
1021         mb->bmv[0] = mb->mv;
1022     }
1023 }
1024
/**
 * Decode the motion vector mode and vector(s) of a VP8 inter macroblock.
 *
 * Inspects the top, left and top-left neighbours, collects their distinct
 * non-zero motion vectors in near_mv[] and weights them in cnt[] (top and
 * left count 2, top-left counts 1). The weights select the contexts for the
 * mode tree read from the range coder. Writes mb->mode, mb->partitioning,
 * mb->mv and mb->bmv[].
 *
 * @param s      decoder context
 * @param mb     macroblock being decoded; mb->ref_frame must already be set
 * @param mb_x   x address of the macroblock (not referenced in this body)
 * @param mb_y   y address of the macroblock (not referenced in this body)
 * @param layout macroblock array layout: 0 places top at mb + 2 and top-left
 *               at mb + 1, otherwise they are located via s->mb_width
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    /* near_mv[3] is left uninitialized here; it is only compared further
     * down when cnt[CNT_SPLITMV] is non-zero, i.e. after it was written. */
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For neighbour n: skip intra neighbours, negate the vector when the
     * reference sign biases differ, append it to near_mv[] if it is new
     * (always for n == 0), and add the neighbour's weight to its bucket. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* cnt[CNT_SPLITMV] is reused as the split-mode context:
                 * left and top split neighbours weigh 2, top-left weighs 1. */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the macroblock vector becomes the last split vector */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1124
1125 static av_always_inline
1126 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1127                            int mb_x, int keyframe, int layout)
1128 {
1129     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1130
1131     if (layout) {
1132         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1133         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1134     }
1135     if (keyframe) {
1136         int x, y;
1137         uint8_t *top;
1138         uint8_t *const left = s->intra4x4_pred_mode_left;
1139         if (layout)
1140             top = mb->intra4x4_pred_mode_top;
1141         else
1142             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1143         for (y = 0; y < 4; y++) {
1144             for (x = 0; x < 4; x++) {
1145                 const uint8_t *ctx;
1146                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1147                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1148                 left[y]   = top[x] = *intra4x4;
1149                 intra4x4++;
1150             }
1151         }
1152     } else {
1153         int i;
1154         for (i = 0; i < 16; i++)
1155             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1156                                            vp8_pred4x4_prob_inter);
1157     }
1158 }
1159
1160 static av_always_inline
1161 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1162                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1163 {
1164     VP56RangeCoder *c = &s->c;
1165     const char *vp7_feature_name[] = { "q-index",
1166                                        "lf-delta",
1167                                        "partial-golden-update",
1168                                        "blit-pitch" };
1169     if (is_vp7) {
1170         int i;
1171         *segment = 0;
1172         for (i = 0; i < 4; i++) {
1173             if (s->feature_enabled[i]) {
1174                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1175                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1176                                                    s->feature_index_prob[i]);
1177                       av_log(s->avctx, AV_LOG_WARNING,
1178                              "Feature %s present in macroblock (value 0x%x)\n",
1179                              vp7_feature_name[i], s->feature_value[i][index]);
1180                 }
1181            }
1182         }
1183     } else if (s->segmentation.update_map) {
1184         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1185         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1186     } else if (s->segmentation.enabled)
1187         *segment = ref ? *ref : *segment;
1188     mb->segment = *segment;
1189
1190     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1191
1192     if (s->keyframe) {
1193         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1194                                     vp8_pred16x16_prob_intra);
1195
1196         if (mb->mode == MODE_I4x4) {
1197             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1198         } else {
1199             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1200                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1201             if (s->mb_layout)
1202                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1203             else
1204                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1205             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1206         }
1207
1208         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1209                                                 vp8_pred8x8c_prob_intra);
1210         mb->ref_frame        = VP56_FRAME_CURRENT;
1211     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1212         // inter MB, 16.2
1213         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1214             mb->ref_frame =
1215                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1216                                                                    : VP56_FRAME_GOLDEN;
1217         else
1218             mb->ref_frame = VP56_FRAME_PREVIOUS;
1219         s->ref_count[mb->ref_frame - 1]++;
1220
1221         // motion vectors, 16.3
1222         if (is_vp7)
1223             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1224         else
1225             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1226     } else {
1227         // intra MB, 16.1
1228         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1229
1230         if (mb->mode == MODE_I4x4)
1231             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1232
1233         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1234                                                 s->prob->pred8x8c);
1235         mb->ref_frame        = VP56_FRAME_CURRENT;
1236         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1237         AV_ZERO32(&mb->bmv[0]);
1238     }
1239 }
1240
/**
 * Decode the DCT tokens of one 4x4 block after the caller has already
 * consumed the first "not end-of-block" decision.
 *
 * The range coder state is copied into a local, used for the whole block and
 * written back on exit.
 *
 * @param r     arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i     initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token, i.e. the row of probs
 *                   already selected by the caller
 * @param qmul  array holding the dc/ac dequant factor at position 0/1
 * @param scan  maps the coefficient index to its position in block
 * @param vp7   non-zero for VP7, where an end-of-block check is also made
 *              after a zero token
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    VP56RangeCoder c = *r;
    /* the caller already tested token_prob[0] (EOB) for the first token */
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            /* context 0: the previous token was a zero */
            token_prob = probs[i][0];
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            /* context 1: the previous token was a one */
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    /* category base value: 11, 19, 35 or 67 for cat 0..3 */
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            /* context 2: the previous token was larger than one */
            token_prob = probs[i + 1][2];
        }
        /* one sign bit, then dequantize: qmul[0] for index 0, qmul[1] otherwise */
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1310
/**
 * Inter-frame DC prediction.
 *
 * pred[0] holds the previous DC value and pred[1] a run counter. Once the
 * counter exceeds 3 the previous DC is added to the decoded one. The counter
 * is reset when either value is zero or their signs differ, and incremented
 * when the new DC equals the previous one.
 *
 * @param block coefficient block; block[0] is read and replaced by the
 *              predicted DC
 * @param pred  predictor state { last DC, repeat counter }, updated in place
 *
 * @return 1 if the prediction was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int16_t prev = pred[0];
    const int applied  = pred[1] > 3;
    int16_t dc         = block[0];

    if (applied)
        dc += prev;

    if (!prev || !dc || (prev ^ dc) < 0)
        pred[1] = 0;
    else if (prev == dc)
        pred[1]++;

    block[0] = pred[0] = dc;

    return applied;
}
1333
1334 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1335                                             int16_t block[16],
1336                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1337                                             int i, uint8_t *token_prob,
1338                                             int16_t qmul[2],
1339                                             const uint8_t scan[16])
1340 {
1341     return decode_block_coeffs_internal(r, block, probs, i,
1342                                         token_prob, qmul, scan, IS_VP7);
1343 }
1344
1345 #ifndef vp8_decode_block_coeffs_internal
1346 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1347                                             int16_t block[16],
1348                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1349                                             int i, uint8_t *token_prob,
1350                                             int16_t qmul[2])
1351 {
1352     return decode_block_coeffs_internal(r, block, probs, i,
1353                                         token_prob, qmul, zigzag_scan, IS_VP8);
1354 }
1355 #endif
1356
1357 /**
1358  * @param c          arithmetic bitstream reader context
1359  * @param block      destination for block coefficients
1360  * @param probs      probabilities to use when reading trees from the bitstream
1361  * @param i          initial coeff index, 0 unless a separate DC block is coded
1362  * @param zero_nhood the initial prediction context for number of surrounding
1363  *                   all-zero blocks (only left/top, so 0-2)
1364  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1365  * @param scan       scan pattern (VP7 only)
1366  *
1367  * @return 0 if no coeffs were decoded
1368  *         otherwise, the index of the last coeff decoded plus one
1369  */
1370 static av_always_inline
1371 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1372                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1373                         int i, int zero_nhood, int16_t qmul[2],
1374                         const uint8_t scan[16], int vp7)
1375 {
1376     uint8_t *token_prob = probs[i][zero_nhood];
1377     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1378         return 0;
1379     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1380                                                   token_prob, qmul, scan)
1381                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1382                                                   token_prob, qmul);
1383 }
1384
/**
 * Decode all residual coefficients of one macroblock: the optional separate
 * luma DC block, the sixteen luma blocks and the eight chroma blocks.
 *
 * @param s      decoder context (token probabilities, dequant tables, DSP)
 * @param td     per-thread data receiving the coefficients and the
 *               non-zero-count cache
 * @param c      range coder to read the tokens from
 * @param mb     macroblock being decoded; mb->skip is set when nothing was
 *               coded
 * @param t_nnz  non-zero flags of the blocks above (index 8: DC block),
 *               updated in place
 * @param l_nnz  non-zero flags of the blocks to the left (index 8: DC block),
 *               updated in place
 * @param is_vp7 non-zero when decoding VP7
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    /* defaults for macroblocks without a separate DC block: luma blocks
     * start at coefficient 0 and use token probability set 3 */
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    /* A separate DC block is coded for every mode except I4x4 and, for VP8
     * only, split-MV. */
    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        /* VP7 inter macroblocks additionally predict the DC from the
         * previous one for the same reference frame; a non-zero return
         * forces the transform below even if nothing was coded. */
        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |=  inter_predict_dc(td->block_dc,
                                     s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            /* nnz == 1: only the first coefficient can be non-zero, so the
             * DC-only variant of the transform is used */
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        /* luma blocks now skip coefficient 0 and use probability set 0 */
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    /* i = 4 and i = 5 select the two chroma planes; their non-zero flags
     * live at indices 4..7 of t_nnz / l_nnz */
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
1459
1460 static av_always_inline
1461 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1462                       uint8_t *src_cb, uint8_t *src_cr,
1463                       int linesize, int uvlinesize, int simple)
1464 {
1465     AV_COPY128(top_border, src_y + 15 * linesize);
1466     if (!simple) {
1467         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1468         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1469     }
1470 }
1471
/**
 * Exchange (or copy) the pixel row directly above a macroblock with the
 * saved top-border buffer.
 *
 * Each top-border entry is 32 bytes, as implied by the -32 / +32 neighbour
 * offsets: 16 luma bytes followed by 8 Cb and 8 Cr bytes.
 *
 * @param top_border border entry belonging to this macroblock column
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left Cb sample of the macroblock
 * @param src_cr     top-left Cr sample of the macroblock
 * @param linesize   luma stride in bytes
 * @param uvlinesize chroma stride in bytes
 * @param mb_x       x address of the macroblock
 * @param mb_y       y address of the macroblock
 * @param mb_width   frame width in macroblocks
 * @param simple     non-zero skips chroma, except in the first row
 * @param xchg       non-zero swaps the left-hand parts; zero only copies them
 */
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
                    uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
                    int mb_y, int mb_width, int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
    /* step up to the row directly above the macroblock */
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    /* Swap 8 bytes between a and b, or copy only when xchg is 0.
     * NOTE(review): AV_COPY64(b, a) presumably takes (dst, src), i.e. the
     * copy goes from the border buffer into the picture -- confirm. */
#define XCHG(a, b, xchg)                                                      \
    do {                                                                      \
        if (xchg)                                                             \
            AV_SWAP64(b, a);                                                  \
        else                                                                  \
            AV_COPY64(b, a);                                                  \
    } while (0)

    /* 8 luma bytes left of the macroblock, taken from the previous entry */
    XCHG(top_border_m1 + 8, src_y - 8, xchg);
    XCHG(top_border, src_y, xchg);
    XCHG(top_border + 8, src_y + 8, 1);
    /* 8 luma bytes right of the macroblock, unless it is the last column */
    if (mb_x < mb_width - 1)
        XCHG(top_border + 32, src_y + 16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1 + 16, src_cb - 8, xchg);
        XCHG(top_border_m1 + 24, src_cr - 8, xchg);
        XCHG(top_border + 16, src_cb, 1);
        XCHG(top_border + 24, src_cr, 1);
    }
}
1505
1506 static av_always_inline
1507 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1508 {
1509     if (!mb_x)
1510         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1511     else
1512         return mb_y ? mode : LEFT_DC_PRED8x8;
1513 }
1514
1515 static av_always_inline
1516 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1517 {
1518     if (!mb_x)
1519         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1520     else
1521         return mb_y ? mode : HOR_PRED8x8;
1522 }
1523
1524 static av_always_inline
1525 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1526 {
1527     switch (mode) {
1528     case DC_PRED8x8:
1529         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1530     case VERT_PRED8x8:
1531         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1532     case HOR_PRED8x8:
1533         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1534     case PLANE_PRED8x8: /* TM */
1535         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1536     }
1537     return mode;
1538 }
1539
1540 static av_always_inline
1541 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1542 {
1543     if (!mb_x) {
1544         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1545     } else {
1546         return mb_y ? mode : HOR_VP8_PRED;
1547     }
1548 }
1549
1550 static av_always_inline
1551 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1552                                      int *copy_buf, int vp7)
1553 {
1554     switch (mode) {
1555     case VERT_PRED:
1556         if (!mb_x && mb_y) {
1557             *copy_buf = 1;
1558             return mode;
1559         }
1560         /* fall-through */
1561     case DIAG_DOWN_LEFT_PRED:
1562     case VERT_LEFT_PRED:
1563         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1564     case HOR_PRED:
1565         if (!mb_y) {
1566             *copy_buf = 1;
1567             return mode;
1568         }
1569         /* fall-through */
1570     case HOR_UP_PRED:
1571         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1572     case TM_VP8_PRED:
1573         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1574     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1575                    * as 16x16/8x8 DC */
1576     case DIAG_DOWN_RIGHT_PRED:
1577     case VERT_RIGHT_PRED:
1578     case HOR_DOWN_PRED:
1579         if (!mb_y || !mb_x)
1580             *copy_buf = 1;
1581         return mode;
1582     }
1583     return mode;
1584 }
1585
1586 static av_always_inline
1587 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1588                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1589 {
1590     int x, y, mode, nnz;
1591     uint32_t tr;
1592
1593     /* for the first row, we need to run xchg_mb_border to init the top edge
1594      * to 127 otherwise, skip it if we aren't going to deblock */
1595     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1596         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1597                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1598                        s->filter.simple, 1);
1599
1600     if (mb->mode < MODE_I4x4) {
1601         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1602         s->hpc.pred16x16[mode](dst[0], s->linesize);
1603     } else {
1604         uint8_t *ptr = dst[0];
1605         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1606         const uint8_t lo = is_vp7 ? 128 : 127;
1607         const uint8_t hi = is_vp7 ? 128 : 129;
1608         uint8_t tr_top[4] = { lo, lo, lo, lo };
1609
1610         // all blocks on the right edge of the macroblock use bottom edge
1611         // the top macroblock for their topright edge
1612         uint8_t *tr_right = ptr - s->linesize + 16;
1613
1614         // if we're on the right edge of the frame, said edge is extended
1615         // from the top macroblock
1616         if (mb_y && mb_x == s->mb_width - 1) {
1617             tr       = tr_right[-1] * 0x01010101u;
1618             tr_right = (uint8_t *) &tr;
1619         }
1620
1621         if (mb->skip)
1622             AV_ZERO128(td->non_zero_count_cache);
1623
1624         for (y = 0; y < 4; y++) {
1625             uint8_t *topright = ptr + 4 - s->linesize;
1626             for (x = 0; x < 4; x++) {
1627                 int copy = 0, linesize = s->linesize;
1628                 uint8_t *dst = ptr + 4 * x;
1629                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1630
1631                 if ((y == 0 || x == 3) && mb_y == 0) {
1632                     topright = tr_top;
1633                 } else if (x == 3)
1634                     topright = tr_right;
1635
1636                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1637                                                         mb_y + y, &copy, is_vp7);
1638                 if (copy) {
1639                     dst      = copy_dst + 12;
1640                     linesize = 8;
1641                     if (!(mb_y + y)) {
1642                         copy_dst[3] = lo;
1643                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1644                     } else {
1645                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1646                         if (!(mb_x + x)) {
1647                             copy_dst[3] = hi;
1648                         } else {
1649                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1650                         }
1651                     }
1652                     if (!(mb_x + x)) {
1653                         copy_dst[11] =
1654                         copy_dst[19] =
1655                         copy_dst[27] =
1656                         copy_dst[35] = hi;
1657                     } else {
1658                         copy_dst[11] = ptr[4 * x                   - 1];
1659                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1660                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1661                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1662                     }
1663                 }
1664                 s->hpc.pred4x4[mode](dst, topright, linesize);
1665                 if (copy) {
1666                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1667                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1668                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1669                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1670                 }
1671
1672                 nnz = td->non_zero_count_cache[y][x];
1673                 if (nnz) {
1674                     if (nnz == 1)
1675                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1676                                                   td->block[y][x], s->linesize);
1677                     else
1678                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1679                                                td->block[y][x], s->linesize);
1680                 }
1681                 topright += 4;
1682             }
1683
1684             ptr      += 4 * s->linesize;
1685             intra4x4 += 4;
1686         }
1687     }
1688
1689     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1690                                             mb_x, mb_y, is_vp7);
1691     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1692     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1693
1694     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1695         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1696                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1697                        s->filter.simple, 0);
1698 }
1699
/* Per sub-pixel phase (0..7) lookup used by the MC functions:
 *   [0] nr. of left extra pixels, also function pointer index
 *   [1] nr. of extra pixels required
 *   [2] nr. of right extra pixels
 * The same rows are used for the vertical direction (top / total / bottom). */
static const uint8_t subpel_idx[3][8] = {
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1706
1707 /**
1708  * luma MC function
1709  *
1710  * @param s        VP8 decoding context
1711  * @param dst      target buffer for block data at block position
1712  * @param ref      reference picture buffer at origin (0, 0)
1713  * @param mv       motion vector (relative to block position) to get pixel data from
1714  * @param x_off    horizontal position of block from origin (0, 0)
1715  * @param y_off    vertical position of block from origin (0, 0)
1716  * @param block_w  width of block (16, 8 or 4)
1717  * @param block_h  height of block (always same as block_w)
1718  * @param width    width of src/dst plane data
1719  * @param height   height of src/dst plane data
1720  * @param linesize size of a single line of plane data, including padding
1721  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1722  */
1723 static av_always_inline
1724 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1725                  ThreadFrame *ref, const VP56mv *mv,
1726                  int x_off, int y_off, int block_w, int block_h,
1727                  int width, int height, ptrdiff_t linesize,
1728                  vp8_mc_func mc_func[3][3])
1729 {
1730     uint8_t *src = ref->f->data[0];
1731
1732     if (AV_RN32A(mv)) {
1733         int src_linesize = linesize;
1734
1735         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1736         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1737
1738         x_off += mv->x >> 2;
1739         y_off += mv->y >> 2;
1740
1741         // edge emulation
1742         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1743         src += y_off * linesize + x_off;
1744         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1745             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1746             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1747                                      src - my_idx * linesize - mx_idx,
1748                                      EDGE_EMU_LINESIZE, linesize,
1749                                      block_w + subpel_idx[1][mx],
1750                                      block_h + subpel_idx[1][my],
1751                                      x_off - mx_idx, y_off - my_idx,
1752                                      width, height);
1753             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1754             src_linesize = EDGE_EMU_LINESIZE;
1755         }
1756         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1757     } else {
1758         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1759         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1760                       linesize, block_h, 0, 0);
1761     }
1762 }
1763
1764 /**
1765  * chroma MC function
1766  *
1767  * @param s        VP8 decoding context
1768  * @param dst1     target buffer for block data at block position (U plane)
1769  * @param dst2     target buffer for block data at block position (V plane)
1770  * @param ref      reference picture buffer at origin (0, 0)
1771  * @param mv       motion vector (relative to block position) to get pixel data from
1772  * @param x_off    horizontal position of block from origin (0, 0)
1773  * @param y_off    vertical position of block from origin (0, 0)
1774  * @param block_w  width of block (16, 8 or 4)
1775  * @param block_h  height of block (always same as block_w)
1776  * @param width    width of src/dst plane data
1777  * @param height   height of src/dst plane data
1778  * @param linesize size of a single line of plane data, including padding
1779  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1780  */
1781 static av_always_inline
1782 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1783                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1784                    int x_off, int y_off, int block_w, int block_h,
1785                    int width, int height, ptrdiff_t linesize,
1786                    vp8_mc_func mc_func[3][3])
1787 {
1788     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1789
1790     if (AV_RN32A(mv)) {
1791         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1792         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1793
1794         x_off += mv->x >> 3;
1795         y_off += mv->y >> 3;
1796
1797         // edge emulation
1798         src1 += y_off * linesize + x_off;
1799         src2 += y_off * linesize + x_off;
1800         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1801         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1802             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1803             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1804                                      src1 - my_idx * linesize - mx_idx,
1805                                      EDGE_EMU_LINESIZE, linesize,
1806                                      block_w + subpel_idx[1][mx],
1807                                      block_h + subpel_idx[1][my],
1808                                      x_off - mx_idx, y_off - my_idx, width, height);
1809             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1810             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1811
1812             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1813                                      src2 - my_idx * linesize - mx_idx,
1814                                      EDGE_EMU_LINESIZE, linesize,
1815                                      block_w + subpel_idx[1][mx],
1816                                      block_h + subpel_idx[1][my],
1817                                      x_off - mx_idx, y_off - my_idx, width, height);
1818             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1819             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1820         } else {
1821             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1822             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1823         }
1824     } else {
1825         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1826         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1827         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1828     }
1829 }
1830
1831 static av_always_inline
1832 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1833                  ThreadFrame *ref_frame, int x_off, int y_off,
1834                  int bx_off, int by_off, int block_w, int block_h,
1835                  int width, int height, VP56mv *mv)
1836 {
1837     VP56mv uvmv = *mv;
1838
1839     /* Y */
1840     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1841                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1842                 block_w, block_h, width, height, s->linesize,
1843                 s->put_pixels_tab[block_w == 8]);
1844
1845     /* U/V */
1846     if (s->profile == 3) {
1847         /* this block only applies VP8; it is safe to check
1848          * only the profile, as VP7 profile <= 1 */
1849         uvmv.x &= ~7;
1850         uvmv.y &= ~7;
1851     }
1852     x_off   >>= 1;
1853     y_off   >>= 1;
1854     bx_off  >>= 1;
1855     by_off  >>= 1;
1856     width   >>= 1;
1857     height  >>= 1;
1858     block_w >>= 1;
1859     block_h >>= 1;
1860     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1861                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1862                   &uvmv, x_off + bx_off, y_off + by_off,
1863                   block_w, block_h, width, height, s->uvlinesize,
1864                   s->put_pixels_tab[1 + (block_w == 4)]);
1865 }
1866
1867 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1868  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1869 static av_always_inline
1870 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1871                      int mb_xy, int ref)
1872 {
1873     /* Don't prefetch refs that haven't been used very often this frame. */
1874     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1875         int x_off = mb_x << 4, y_off = mb_y << 4;
1876         int mx = (mb->mv.x >> 2) + x_off + 8;
1877         int my = (mb->mv.y >> 2) + y_off;
1878         uint8_t **src = s->framep[ref]->tf.f->data;
1879         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1880         /* For threading, a ff_thread_await_progress here might be useful, but
1881          * it actually slows down the decoder. Since a bad prefetch doesn't
1882          * generate bad decoder output, we don't run it here. */
1883         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1884         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1885         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1886     }
1887 }
1888
1889 /**
1890  * Apply motion vectors to prediction buffer, chapter 18.
1891  */
1892 static av_always_inline
1893 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1894                    VP8Macroblock *mb, int mb_x, int mb_y)
1895 {
1896     int x_off = mb_x << 4, y_off = mb_y << 4;
1897     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1898     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1899     VP56mv *bmv = mb->bmv;
1900
1901     switch (mb->partitioning) {
1902     case VP8_SPLITMVMODE_NONE:
1903         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1904                     0, 0, 16, 16, width, height, &mb->mv);
1905         break;
1906     case VP8_SPLITMVMODE_4x4: {
1907         int x, y;
1908         VP56mv uvmv;
1909
1910         /* Y */
1911         for (y = 0; y < 4; y++) {
1912             for (x = 0; x < 4; x++) {
1913                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1914                             ref, &bmv[4 * y + x],
1915                             4 * x + x_off, 4 * y + y_off, 4, 4,
1916                             width, height, s->linesize,
1917                             s->put_pixels_tab[2]);
1918             }
1919         }
1920
1921         /* U/V */
1922         x_off  >>= 1;
1923         y_off  >>= 1;
1924         width  >>= 1;
1925         height >>= 1;
1926         for (y = 0; y < 2; y++) {
1927             for (x = 0; x < 2; x++) {
1928                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1929                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1930                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1931                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1932                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1933                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1934                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1935                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1936                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1937                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1938                 if (s->profile == 3) {
1939                     uvmv.x &= ~7;
1940                     uvmv.y &= ~7;
1941                 }
1942                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1943                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1944                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1945                               width, height, s->uvlinesize,
1946                               s->put_pixels_tab[2]);
1947             }
1948         }
1949         break;
1950     }
1951     case VP8_SPLITMVMODE_16x8:
1952         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1953                     0, 0, 16, 8, width, height, &bmv[0]);
1954         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1955                     0, 8, 16, 8, width, height, &bmv[1]);
1956         break;
1957     case VP8_SPLITMVMODE_8x16:
1958         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1959                     0, 0, 8, 16, width, height, &bmv[0]);
1960         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1961                     8, 0, 8, 16, width, height, &bmv[1]);
1962         break;
1963     case VP8_SPLITMVMODE_8x8:
1964         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1965                     0, 0, 8, 8, width, height, &bmv[0]);
1966         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1967                     8, 0, 8, 8, width, height, &bmv[1]);
1968         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1969                     0, 8, 8, 8, width, height, &bmv[2]);
1970         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1971                     8, 8, 8, 8, width, height, &bmv[3]);
1972         break;
1973     }
1974 }
1975
1976 static av_always_inline
1977 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1978 {
1979     int x, y, ch;
1980
1981     if (mb->mode != MODE_I4x4) {
1982         uint8_t *y_dst = dst[0];
1983         for (y = 0; y < 4; y++) {
1984             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1985             if (nnz4) {
1986                 if (nnz4 & ~0x01010101) {
1987                     for (x = 0; x < 4; x++) {
1988                         if ((uint8_t) nnz4 == 1)
1989                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1990                                                       td->block[y][x],
1991                                                       s->linesize);
1992                         else if ((uint8_t) nnz4 > 1)
1993                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1994                                                    td->block[y][x],
1995                                                    s->linesize);
1996                         nnz4 >>= 8;
1997                         if (!nnz4)
1998                             break;
1999                     }
2000                 } else {
2001                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2002                 }
2003             }
2004             y_dst += 4 * s->linesize;
2005         }
2006     }
2007
2008     for (ch = 0; ch < 2; ch++) {
2009         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2010         if (nnz4) {
2011             uint8_t *ch_dst = dst[1 + ch];
2012             if (nnz4 & ~0x01010101) {
2013                 for (y = 0; y < 2; y++) {
2014                     for (x = 0; x < 2; x++) {
2015                         if ((uint8_t) nnz4 == 1)
2016                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2017                                                       td->block[4 + ch][(y << 1) + x],
2018                                                       s->uvlinesize);
2019                         else if ((uint8_t) nnz4 > 1)
2020                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2021                                                    td->block[4 + ch][(y << 1) + x],
2022                                                    s->uvlinesize);
2023                         nnz4 >>= 8;
2024                         if (!nnz4)
2025                             goto chroma_idct_end;
2026                     }
2027                     ch_dst += 4 * s->uvlinesize;
2028                 }
2029             } else {
2030                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2031             }
2032         }
2033 chroma_idct_end:
2034         ;
2035     }
2036 }
2037
2038 static av_always_inline
2039 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2040                          VP8FilterStrength *f, int is_vp7)
2041 {
2042     int interior_limit, filter_level;
2043
2044     if (s->segmentation.enabled) {
2045         filter_level = s->segmentation.filter_level[mb->segment];
2046         if (!s->segmentation.absolute_vals)
2047             filter_level += s->filter.level;
2048     } else
2049         filter_level = s->filter.level;
2050
2051     if (s->lf_delta.enabled) {
2052         filter_level += s->lf_delta.ref[mb->ref_frame];
2053         filter_level += s->lf_delta.mode[mb->mode];
2054     }
2055
2056     filter_level = av_clip_uintp2(filter_level, 6);
2057
2058     interior_limit = filter_level;
2059     if (s->filter.sharpness) {
2060         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2061         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2062     }
2063     interior_limit = FFMAX(interior_limit, 1);
2064
2065     f->filter_level = filter_level;
2066     f->inner_limit = interior_limit;
2067     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2068                       mb->mode == VP8_MVMODE_SPLIT;
2069 }
2070
2071 static av_always_inline
2072 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2073                int mb_x, int mb_y, int is_vp7)
2074 {
2075     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2076     int filter_level = f->filter_level;
2077     int inner_limit = f->inner_limit;
2078     int inner_filter = f->inner_filter;
2079     int linesize = s->linesize;
2080     int uvlinesize = s->uvlinesize;
2081     static const uint8_t hev_thresh_lut[2][64] = {
2082         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2083           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2084           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2085           3, 3, 3, 3 },
2086         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2087           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2088           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2089           2, 2, 2, 2 }
2090     };
2091
2092     if (!filter_level)
2093         return;
2094
2095     if (is_vp7) {
2096         bedge_lim_y  = filter_level;
2097         bedge_lim_uv = filter_level * 2;
2098         mbedge_lim   = filter_level + 2;
2099     } else {
2100         bedge_lim_y  =
2101         bedge_lim_uv = filter_level * 2 + inner_limit;
2102         mbedge_lim   = bedge_lim_y + 4;
2103     }
2104
2105     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2106
2107     if (mb_x) {
2108         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2109                                        mbedge_lim, inner_limit, hev_thresh);
2110         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2111                                        mbedge_lim, inner_limit, hev_thresh);
2112     }
2113
2114 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2115     if (cond && inner_filter) {                                               \
2116         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2117                                              bedge_lim_y, inner_limit,        \
2118                                              hev_thresh);                     \
2119         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2120                                              bedge_lim_y, inner_limit,        \
2121                                              hev_thresh);                     \
2122         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2123                                              bedge_lim_y, inner_limit,        \
2124                                              hev_thresh);                     \
2125         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2126                                              uvlinesize,  bedge_lim_uv,       \
2127                                              inner_limit, hev_thresh);        \
2128     }
2129
2130     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2131
2132     if (mb_y) {
2133         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2134                                        mbedge_lim, inner_limit, hev_thresh);
2135         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2136                                        mbedge_lim, inner_limit, hev_thresh);
2137     }
2138
2139     if (inner_filter) {
2140         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2141                                              linesize, bedge_lim_y,
2142                                              inner_limit, hev_thresh);
2143         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2144                                              linesize, bedge_lim_y,
2145                                              inner_limit, hev_thresh);
2146         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2147                                              linesize, bedge_lim_y,
2148                                              inner_limit, hev_thresh);
2149         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2150                                              dst[2] +  4 * uvlinesize,
2151                                              uvlinesize, bedge_lim_uv,
2152                                              inner_limit, hev_thresh);
2153     }
2154
2155     H_LOOP_FILTER_16Y_INNER(is_vp7)
2156 }
2157
2158 static av_always_inline
2159 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2160                       int mb_x, int mb_y)
2161 {
2162     int mbedge_lim, bedge_lim;
2163     int filter_level = f->filter_level;
2164     int inner_limit  = f->inner_limit;
2165     int inner_filter = f->inner_filter;
2166     int linesize     = s->linesize;
2167
2168     if (!filter_level)
2169         return;
2170
2171     bedge_lim  = 2 * filter_level + inner_limit;
2172     mbedge_lim = bedge_lim + 4;
2173
2174     if (mb_x)
2175         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2176     if (inner_filter) {
2177         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2178         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2179         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2180     }
2181
2182     if (mb_y)
2183         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2184     if (inner_filter) {
2185         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2186         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2187         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2188     }
2189 }
2190
2191 #define MARGIN (16 << 2)
2192 static av_always_inline
2193 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2194                                     VP8Frame *prev_frame, int is_vp7)
2195 {
2196     VP8Context *s = avctx->priv_data;
2197     int mb_x, mb_y;
2198
2199     s->mv_min.y = -MARGIN;
2200     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2201     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2202         VP8Macroblock *mb = s->macroblocks_base +
2203                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2204         int mb_xy = mb_y * s->mb_width;
2205
2206         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2207
2208         s->mv_min.x = -MARGIN;
2209         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2210         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2211             if (mb_y == 0)
2212                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2213                          DC_PRED * 0x01010101);
2214             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2215                            prev_frame && prev_frame->seg_map ?
2216                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2217             s->mv_min.x -= 64;
2218             s->mv_max.x -= 64;
2219         }
2220         s->mv_min.y -= 64;
2221         s->mv_max.y -= 64;
2222     }
2223 }
2224
2225 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2226                                    VP8Frame *prev_frame)
2227 {
2228     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2229 }
2230
2231 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2232                                    VP8Frame *prev_frame)
2233 {
2234     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2235 }
2236
2237 #if HAVE_THREADS
2238 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
2239     do {                                                                      \
2240         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
2241         if (otd->thread_mb_pos < tmp) {                                       \
2242             pthread_mutex_lock(&otd->lock);                                   \
2243             td->wait_mb_pos = tmp;                                            \
2244             do {                                                              \
2245                 if (otd->thread_mb_pos >= tmp)                                \
2246                     break;                                                    \
2247                 pthread_cond_wait(&otd->cond, &otd->lock);                    \
2248             } while (1);                                                      \
2249             td->wait_mb_pos = INT_MAX;                                        \
2250             pthread_mutex_unlock(&otd->lock);                                 \
2251         }                                                                     \
2252     } while (0)
2253
/*
 * Publish the calling worker's progress and wake waiters when needed.
 *
 * The position is packed as (mb_y << 16) | (mb_x & 0xFFFF) and stored in
 * td->thread_mb_pos. The condition variable is broadcast only when slice
 * threading is active with more than one job, and either a neighbour is
 * waiting for a position we have now reached, or next_td/prev_td is NULL.
 *
 * The macro relies on `avctx`, `num_jobs`, `next_td` and `prev_td` being
 * visible in the expanding scope. Every macro argument is parenthesized so
 * that expressions containing low-precedence operators expand correctly.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
2272 #else
/* Fallback variants for the other branch of the enclosing conditional:
 * both macros expand to an empty statement and evaluate none of their
 * arguments. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x)                        do { } while (0)
2275 #endif
2276
2277 static av_always_inline void decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2278                                         int jobnr, int threadnr, int is_vp7)
2279 {
2280     VP8Context *s = avctx->priv_data;
2281     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2282     int mb_y = td->thread_mb_pos >> 16;
2283     int mb_x, mb_xy = mb_y * s->mb_width;
2284     int num_jobs = s->num_jobs;
2285     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2286     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2287     VP8Macroblock *mb;
2288     uint8_t *dst[3] = {
2289         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2290         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2291         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2292     };
2293     if (mb_y == 0)
2294         prev_td = td;
2295     else
2296         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2297     if (mb_y == s->mb_height - 1)
2298         next_td = td;
2299     else
2300         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2301     if (s->mb_layout == 1)
2302         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2303     else {
2304         // Make sure the previous frame has read its segmentation map,
2305         // if we re-use the same map.
2306         if (prev_frame && s->segmentation.enabled &&
2307             !s->segmentation.update_map)
2308             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2309         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2310         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2311         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2312     }
2313
2314     if (!is_vp7 || mb_y == 0)
2315         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2316
2317     s->mv_min.x = -MARGIN;
2318     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2319
2320     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2321         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2322         if (prev_td != td) {
2323             if (threadnr != 0) {
2324                 check_thread_pos(td, prev_td,
2325                                  mb_x + (is_vp7 ? 2 : 1),
2326                                  mb_y - (is_vp7 ? 2 : 1));
2327             } else {
2328                 check_thread_pos(td, prev_td,
2329                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2330                                  mb_y - (is_vp7 ? 2 : 1));
2331             }
2332         }
2333
2334         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2335                          s->linesize, 4);
2336         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2337                          dst[2] - dst[1], 2);
2338
2339         if (!s->mb_layout)
2340             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2341                            prev_frame && prev_frame->seg_map ?
2342                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2343
2344         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2345
2346         if (!mb->skip)
2347             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2348
2349         if (mb->mode <= MODE_I4x4)
2350             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2351         else
2352             inter_predict(s, td, dst, mb, mb_x, mb_y);
2353
2354         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2355
2356         if (!mb->skip) {
2357             idct_mb(s, td, dst, mb);
2358         } else {
2359             AV_ZERO64(td->left_nnz);
2360             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2361
2362             /* Reset DC block predictors if they would exist
2363              * if the mb had coefficients */
2364             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2365                 td->left_nnz[8]     = 0;
2366                 s->top_nnz[mb_x][8] = 0;
2367             }
2368         }
2369
2370         if (s->deblock_filter)
2371             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2372
2373         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2374             if (s->filter.simple)
2375                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2376                                  NULL, NULL, s->linesize, 0, 1);
2377             else
2378                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2379                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2380         }
2381
2382         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2383
2384         dst[0]      += 16;
2385         dst[1]      += 8;
2386         dst[2]      += 8;
2387         s->mv_min.x -= 64;
2388         s->mv_max.x -= 64;
2389
2390         if (mb_x == s->mb_width + 1) {
2391             update_pos(td, mb_y, s->mb_width + 3);
2392         } else {
2393             update_pos(td, mb_y, mb_x);
2394         }
2395     }
2396 }
2397
2398 static void vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2399                                         int jobnr, int threadnr)
2400 {
2401     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2402 }
2403
2404 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2405                                         int jobnr, int threadnr)
2406 {
2407     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2408 }
2409
2410 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2411                               int jobnr, int threadnr, int is_vp7)
2412 {
2413     VP8Context *s = avctx->priv_data;
2414     VP8ThreadData *td = &s->thread_data[threadnr];
2415     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2416     AVFrame *curframe = s->curframe->tf.f;
2417     VP8Macroblock *mb;
2418     VP8ThreadData *prev_td, *next_td;
2419     uint8_t *dst[3] = {
2420         curframe->data[0] + 16 * mb_y * s->linesize,
2421         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2422         curframe->data[2] +  8 * mb_y * s->uvlinesize
2423     };
2424
2425     if (s->mb_layout == 1)
2426         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2427     else
2428         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2429
2430     if (mb_y == 0)
2431         prev_td = td;
2432     else
2433         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2434     if (mb_y == s->mb_height - 1)
2435         next_td = td;
2436     else
2437         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2438
2439     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2440         VP8FilterStrength *f = &td->filter_strength[mb_x];
2441         if (prev_td != td)
2442             check_thread_pos(td, prev_td,
2443                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2444         if (next_td != td)
2445             if (next_td != &s->thread_data[0])
2446                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2447
2448         if (num_jobs == 1) {
2449             if (s->filter.simple)
2450                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2451                                  NULL, NULL, s->linesize, 0, 1);
2452             else
2453                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2454                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2455         }
2456
2457         if (s->filter.simple)
2458             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2459         else
2460             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2461         dst[0] += 16;
2462         dst[1] += 8;
2463         dst[2] += 8;
2464
2465         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2466     }
2467 }
2468
2469 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2470                               int jobnr, int threadnr)
2471 {
2472     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2473 }
2474
2475 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2476                               int jobnr, int threadnr)
2477 {
2478     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2479 }
2480
2481 static av_always_inline
2482 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2483                               int threadnr, int is_vp7)
2484 {
2485     VP8Context *s = avctx->priv_data;
2486     VP8ThreadData *td = &s->thread_data[jobnr];
2487     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2488     VP8Frame *curframe = s->curframe;
2489     int mb_y, num_jobs = s->num_jobs;
2490
2491     td->thread_nr = threadnr;
2492     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2493         if (mb_y >= s->mb_height)
2494             break;
2495         td->thread_mb_pos = mb_y << 16;
2496         s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2497         if (s->deblock_filter)
2498             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2499         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2500
2501         s->mv_min.y -= 64;
2502         s->mv_max.y -= 64;
2503
2504         if (avctx->active_thread_type == FF_THREAD_FRAME)
2505             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2506     }
2507
2508     return 0;
2509 }
2510
2511 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2512                                     int jobnr, int threadnr)
2513 {
2514     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2515 }
2516
2517 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2518                                     int jobnr, int threadnr)
2519 {
2520     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2521 }
2522
2523
2524 static av_always_inline
2525 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2526                       AVPacket *avpkt, int is_vp7)
2527 {
2528     VP8Context *s = avctx->priv_data;
2529     int ret, i, referenced, num_jobs;
2530     enum AVDiscard skip_thresh;
2531     VP8Frame *av_uninit(curframe), *prev_frame;
2532
2533     if (is_vp7)
2534         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2535     else
2536         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2537
2538     if (ret < 0)
2539         goto err;
2540
2541     prev_frame = s->framep[VP56_FRAME_CURRENT];
2542
2543     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2544                  s->update_altref == VP56_FRAME_CURRENT;
2545
2546     skip_thresh = !referenced ? AVDISCARD_NONREF
2547                               : !s->keyframe ? AVDISCARD_NONKEY
2548                                              : AVDISCARD_ALL;
2549
2550     if (avctx->skip_frame >= skip_thresh) {
2551         s->invisible = 1;
2552         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2553         goto skip_decode;
2554     }
2555     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2556
2557     // release no longer referenced frames
2558     for (i = 0; i < 5; i++)
2559         if (s->frames[i].tf.f->data[0] &&
2560             &s->frames[i] != prev_frame &&
2561             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2562             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2563             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2564             vp8_release_frame(s, &s->frames[i]);
2565
2566     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2567
2568     if (!s->colorspace)
2569         avctx->colorspace = AVCOL_SPC_BT470BG;
2570     if (s->fullrange)
2571         avctx->color_range = AVCOL_RANGE_JPEG;
2572     else
2573         avctx->color_range = AVCOL_RANGE_MPEG;
2574
2575     /* Given that arithmetic probabilities are updated every frame, it's quite
2576      * likely that the values we have on a random interframe are complete
2577      * junk if we didn't start decode on a keyframe. So just don't display
2578      * anything rather than junk. */
2579     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2580                          !s->framep[VP56_FRAME_GOLDEN]   ||
2581                          !s->framep[VP56_FRAME_GOLDEN2])) {
2582         av_log(avctx, AV_LOG_WARNING,
2583                "Discarding interframe without a prior keyframe!\n");
2584         ret = AVERROR_INVALIDDATA;
2585         goto err;
2586     }
2587
2588     curframe->tf.f->key_frame = s->keyframe;
2589     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2590                                             : AV_PICTURE_TYPE_P;
2591     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2592         goto err;
2593
2594     // check if golden and altref are swapped
2595     if (s->update_altref != VP56_FRAME_NONE)
2596         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2597     else
2598         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2599
2600     if (s->update_golden != VP56_FRAME_NONE)
2601         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2602     else
2603         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2604
2605     if (s->update_last)
2606         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2607     else
2608         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2609
2610     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2611
2612     if (avctx->codec->update_thread_context)
2613         ff_thread_finish_setup(avctx);
2614
2615     s->linesize   = curframe->tf.f->linesize[0];
2616     s->uvlinesize = curframe->tf.f->linesize[1];
2617
2618     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2619     /* Zero macroblock structures for top/top-left prediction
2620      * from outside the frame. */
2621     if (!s->mb_layout)
2622         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2623                (s->mb_width + 1) * sizeof(*s->macroblocks));
2624     if (!s->mb_layout && s->keyframe)
2625         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2626
2627     memset(s->ref_count, 0, sizeof(s->ref_count));
2628
2629     if (s->mb_layout == 1) {
2630         // Make sure the previous frame has read its segmentation map,
2631         // if we re-use the same map.
2632         if (prev_frame && s->segmentation.enabled &&
2633             !s->segmentation.update_map)
2634             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2635         if (is_vp7)
2636             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2637         else
2638             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2639     }
2640
2641     if (avctx->active_thread_type == FF_THREAD_FRAME)
2642         num_jobs = 1;
2643     else
2644         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2645     s->num_jobs   = num_jobs;
2646     s->curframe   = curframe;
2647     s->prev_frame = prev_frame;
2648     s->mv_min.y   = -MARGIN;
2649     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2650     for (i = 0; i < MAX_THREADS; i++) {
2651         s->thread_data[i].thread_mb_pos = 0;
2652         s->thread_data[i].wait_mb_pos   = INT_MAX;
2653     }
2654     if (is_vp7)
2655         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2656                         num_jobs);
2657     else
2658         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2659                         num_jobs);
2660
2661     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2662     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2663
2664 skip_decode:
2665     // if future frames don't use the updated probabilities,
2666     // reset them to the values we saved
2667     if (!s->update_probabilities)
2668         s->prob[0] = s->prob[1];
2669
2670     if (!s->invisible) {
2671         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2672             return ret;
2673         *got_frame = 1;
2674     }
2675
2676     return avpkt->size;
2677 err:
2678     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2679     return ret;
2680 }
2681
2682 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2683                         AVPacket *avpkt)
2684 {
2685     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2686 }
2687
#if CONFIG_VP7_DECODER
/* VP7 decode callback: forwards to vp78_decode_frame() with IS_VP7 and
 * returns its result. */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    int result = vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);

    return result;
}
#endif /* CONFIG_VP7_DECODER */
2695
2696 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2697 {
2698     VP8Context *s = avctx->priv_data;
2699     int i;
2700
2701     if (!s)
2702         return 0;
2703
2704     vp8_decode_flush_impl(avctx, 1);
2705     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2706         av_frame_free(&s->frames[i].tf.f);
2707
2708     return 0;
2709 }
2710
2711 static av_cold int vp8_init_frames(VP8Context *s)
2712 {
2713     int i;
2714     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2715         s->frames[i].tf.f = av_frame_alloc();
2716         if (!s->frames[i].tf.f)
2717             return AVERROR(ENOMEM);
2718     }
2719     return 0;
2720 }
2721
2722 static av_always_inline
2723 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2724 {
2725     VP8Context *s = avctx->priv_data;
2726     int ret;
2727
2728     s->avctx = avctx;
2729     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2730     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2731     avctx->internal->allocate_progress = 1;
2732
2733     ff_videodsp_init(&s->vdsp, 8);
2734
2735     ff_vp78dsp_init(&s->vp8dsp);
2736     if (CONFIG_VP7_DECODER && is_vp7) {
2737         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2738         ff_vp7dsp_init(&s->vp8dsp);
2739         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2740         s->filter_mb_row           = vp7_filter_mb_row;
2741     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2742         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2743         ff_vp8dsp_init(&s->vp8dsp);
2744         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2745         s->filter_mb_row           = vp8_filter_mb_row;
2746     }
2747
2748     /* does not change for VP8 */
2749     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2750
2751     if ((ret = vp8_init_frames(s)) < 0) {
2752         ff_vp8_decode_free(avctx);
2753         return ret;
2754     }
2755
2756     return 0;
2757 }
2758
#if CONFIG_VP7_DECODER
/* VP7 init callback: forwards to vp78_decode_init() with IS_VP7 and
 * returns its result. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    int result = vp78_decode_init(avctx, IS_VP7);

    return result;
}
#endif /* CONFIG_VP7_DECODER */
2765
2766 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2767 {
2768     return vp78_decode_init(avctx, IS_VP8);
2769 }
2770
2771 #if CONFIG_VP8_DECODER
2772 #if HAVE_THREADS
2773 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2774 {
2775     VP8Context *s = avctx->priv_data;
2776     int ret;
2777
2778     s->avctx = avctx;
2779
2780     if ((ret = vp8_init_frames(s)) < 0) {
2781         ff_vp8_decode_free(avctx);
2782         return ret;
2783     }
2784
2785     return 0;
2786 }
2787
2788 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2789
2790 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2791                                             const AVCodecContext *src)
2792 {
2793     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2794     int i;
2795
2796     if (s->macroblocks_base &&
2797         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2798         free_buffers(s);
2799         s->mb_width  = s_src->mb_width;
2800         s->mb_height = s_src->mb_height;
2801     }
2802
2803     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2804     s->segmentation = s_src->segmentation;
2805     s->lf_delta     = s_src->lf_delta;
2806     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2807
2808     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2809         if (s_src->frames[i].tf.f->data[0]) {
2810             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2811             if (ret < 0)
2812                 return ret;
2813         }
2814     }
2815
2816     s->framep[0] = REBASE(s_src->next_framep[0]);
2817     s->framep[1] = REBASE(s_src->next_framep[1]);
2818     s->framep[2] = REBASE(s_src->next_framep[2]);
2819     s->framep[3] = REBASE(s_src->next_framep[3]);
2820
2821     return 0;
2822 }
2823 #endif /* HAVE_THREADS */
2824 #endif /* CONFIG_VP8_DECODER */
2825
#if CONFIG_VP7_DECODER
/* Codec registration entry for VP7. Shares close and flush with VP8 and
 * declares only direct rendering (DR1) as a capability. */
AVCodec ff_vp7_decoder = {
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    .priv_data_size = sizeof(VP8Context),
    .init           = vp7_decode_init,
    .close          = ff_vp8_decode_free,
    .decode         = vp7_decode_frame,
    .capabilities   = AV_CODEC_CAP_DR1,
    .flush          = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */
2840
#if CONFIG_VP8_DECODER
/* Codec registration entry for VP8. In addition to direct rendering it
 * declares frame and slice threading, and installs the thread-copy and
 * thread-context callbacks when threads are enabled. */
AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};
#endif /* CONFIG_VP8_DECODER */