git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
/*
 * VPX(vp7, f) picks between the vp7_- and vp8_-prefixed variant of symbol f.
 * When both decoders are compiled in, the choice depends on the value of the
 * vp7 argument; when only one decoder is enabled the argument is ignored and
 * the macro expands directly to that decoder's symbol.
 */
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
#elif CONFIG_VP7_DECODER
#define VPX(vp7, f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(vp7, f) vp8_ ## f
#endif
  47
  48 static void free_buffers(VP8Context *s)
  49 {
  50     int i;
  51     if (s->thread_data)
  52         for (i = 0; i < MAX_THREADS; i++) {
  53 #if HAVE_THREADS
  54             pthread_cond_destroy(&s->thread_data[i].cond);
  55             pthread_mutex_destroy(&s->thread_data[i].lock);
  56 #endif
  57             av_freep(&s->thread_data[i].filter_strength);
  58         }
  59     av_freep(&s->thread_data);
  60     av_freep(&s->macroblocks_base);
  61     av_freep(&s->intra4x4_pred_mode_top);
  62     av_freep(&s->top_nnz);
  63     av_freep(&s->top_border);
  64
  65     s->macroblocks = NULL;
  66 }
  67
  68 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  69 {
  70     int ret;
  71     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  72                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  73         return ret;
  74     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  75         ff_thread_release_buffer(s->avctx, &f->tf);
  76         return AVERROR(ENOMEM);
  77     }
  78     return 0;
  79 }
  80
/**
 * Drop the resources held by a frame: unreference its segmentation map and
 * release its picture buffer through the threading layer.
 */
static void vp8_release_frame(VP8Context *s, VP8Frame *f)
{
    av_buffer_unref(&f->seg_map);
    ff_thread_release_buffer(s->avctx, &f->tf);
}
  86
  87 #if CONFIG_VP8_DECODER
  88 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  89 {
  90     int ret;
  91
  92     vp8_release_frame(s, dst);
  93
  94     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  95         return ret;
  96     if (src->seg_map &&
  97         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  98         vp8_release_frame(s, dst);
  99         return AVERROR(ENOMEM);
 100     }
 101
 102     return 0;
 103 }
 104 #endif /* CONFIG_VP8_DECODER */
 105
 106 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 107 {
 108     VP8Context *s = avctx->priv_data;
 109     int i;
 110
 111     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 112         vp8_release_frame(s, &s->frames[i]);
 113     memset(s->framep, 0, sizeof(s->framep));
 114
 115     if (free_mem)
 116         free_buffers(s);
 117 }
 118
/**
 * Flush the decoder: release all frames but keep the work buffers allocated
 * (calls vp8_decode_flush_impl() with free_mem = 0).
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
 123
 124 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 125 {
 126     VP8Frame *frame = NULL;
 127     int i;
 128
 129     // find a free buffer
 130     for (i = 0; i < 5; i++)
 131         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 132             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 133             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 134             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 135             frame = &s->frames[i];
 136             break;
 137         }
 138     if (i == 5) {
 139         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 140         abort();
 141     }
 142     if (frame->tf.f->data[0])
 143         vp8_release_frame(s, frame);
 144
 145     return frame;
 146 }
 147
 148 static av_always_inline
 149 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 150 {
 151     AVCodecContext *avctx = s->avctx;
 152     int i, ret;
 153
 154     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 155         height != s->avctx->height) {
 156         vp8_decode_flush_impl(s->avctx, 1);
 157
 158         ret = ff_set_dimensions(s->avctx, width, height);
 159         if (ret < 0)
 160             return ret;
 161     }
 162
 163     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 164     s->mb_height = (s->avctx->coded_height + 15) / 16;
 165
 166     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 167                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 168     if (!s->mb_layout) { // Frame threading and one thread
 169         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 170                                                sizeof(*s->macroblocks));
 171         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 172     } else // Sliced threading
 173         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 174                                          sizeof(*s->macroblocks));
 175     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 176     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 177     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 178
 179     for (i = 0; i < MAX_THREADS; i++) {
 180         s->thread_data[i].filter_strength =
 181             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 182 #if HAVE_THREADS
 183         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 184         pthread_cond_init(&s->thread_data[i].cond, NULL);
 185 #endif
 186     }
 187
 188     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 189         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 190         return AVERROR(ENOMEM);
 191
 192     s->macroblocks = s->macroblocks_base + 1;
 193
 194     return 0;
 195 }
 196
/** VP7 entry point for update_dimensions(); passes IS_VP7 as the is_vp7 flag. */
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}
 201
/** VP8 entry point for update_dimensions(); passes IS_VP8 as the is_vp7 flag. */
static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
 206
 207
 208 static void parse_segment_info(VP8Context *s)
 209 {
 210     VP56RangeCoder *c = &s->c;
 211     int i;
 212
 213     s->segmentation.update_map = vp8_rac_get(c);
 214
 215     if (vp8_rac_get(c)) { // update segment feature data
 216         s->segmentation.absolute_vals = vp8_rac_get(c);
 217
 218         for (i = 0; i < 4; i++)
 219             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 220
 221         for (i = 0; i < 4; i++)
 222             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 223     }
 224     if (s->segmentation.update_map)
 225         for (i = 0; i < 3; i++)
 226             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 227 }
 228
 229 static void update_lf_deltas(VP8Context *s)
 230 {
 231     VP56RangeCoder *c = &s->c;
 232     int i;
 233
 234     for (i = 0; i < 4; i++) {
 235         if (vp8_rac_get(c)) {
 236             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 237
 238             if (vp8_rac_get(c))
 239                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 240         }
 241     }
 242
 243     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 244         if (vp8_rac_get(c)) {
 245             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 246
 247             if (vp8_rac_get(c))
 248                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 249         }
 250     }
 251 }
 252
 253 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 254 {
 255     const uint8_t *sizes = buf;
 256     int i;
 257
 258     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 259
 260     buf      += 3 * (s->num_coeff_partitions - 1);
 261     buf_size -= 3 * (s->num_coeff_partitions - 1);
 262     if (buf_size < 0)
 263         return -1;
 264
 265     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 266         int size = AV_RL24(sizes + 3 * i);
 267         if (buf_size - size < 0)
 268             return -1;
 269
 270         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 271         buf      += size;
 272         buf_size -= size;
 273     }
 274     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 275
 276     return 0;
 277 }
 278
 279 static void vp7_get_quants(VP8Context *s)
 280 {
 281     VP56RangeCoder *c = &s->c;
 282
 283     int yac_qi  = vp8_rac_get_uint(c, 7);
 284     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 285     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 286     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 287     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 288     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 289
 290     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 291     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 292     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 293     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 294     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 295     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 296 }
 297
 298 static void vp8_get_quants(VP8Context *s)
 299 {
 300     VP56RangeCoder *c = &s->c;
 301     int i, base_qi;
 302
 303     int yac_qi     = vp8_rac_get_uint(c, 7);
 304     int ydc_delta  = vp8_rac_get_sint(c, 4);
 305     int y2dc_delta = vp8_rac_get_sint(c, 4);
 306     int y2ac_delta = vp8_rac_get_sint(c, 4);
 307     int uvdc_delta = vp8_rac_get_sint(c, 4);
 308     int uvac_delta = vp8_rac_get_sint(c, 4);
 309
 310     for (i = 0; i < 4; i++) {
 311         if (s->segmentation.enabled) {
 312             base_qi = s->segmentation.base_quant[i];
 313             if (!s->segmentation.absolute_vals)
 314                 base_qi += yac_qi;
 315         } else
 316             base_qi = yac_qi;
 317
 318         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 319         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 320         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 321         /* 101581>>16 is equivalent to 155/100 */
 322         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 323         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 324         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 325
 326         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 327         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 328     }
 329 }
 330
 331 /**
 332  * Determine which buffers golden and altref should be updated with after this frame.
 333  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 334  *
 335  * Intra frames update all 3 references
 336  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 337  * If the update (golden|altref) flag is set, it's updated with the current frame
 338  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 339  * If the flag is not set, the number read means:
 340  *      0: no update
 341  *      1: VP56_FRAME_PREVIOUS
 342  *      2: update golden with altref, or update altref with golden
 343  */
 344 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 345 {
 346     VP56RangeCoder *c = &s->c;
 347
 348     if (update)
 349         return VP56_FRAME_CURRENT;
 350
 351     switch (vp8_rac_get_uint(c, 2)) {
 352     case 1:
 353         return VP56_FRAME_PREVIOUS;
 354     case 2:
 355         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 356     }
 357     return VP56_FRAME_NONE;
 358 }
 359
 360 static void vp78_reset_probability_tables(VP8Context *s)
 361 {
 362     int i, j;
 363     for (i = 0; i < 4; i++)
 364         for (j = 0; j < 16; j++)
 365             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 366                    sizeof(s->prob->token[i][j]));
 367 }
 368
 369 static void vp78_update_probability_tables(VP8Context *s)
 370 {
 371     VP56RangeCoder *c = &s->c;
 372     int i, j, k, l, m;
 373
 374     for (i = 0; i < 4; i++)
 375         for (j = 0; j < 8; j++)
 376             for (k = 0; k < 3; k++)
 377                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 378                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 379                         int prob = vp8_rac_get_uint(c, 8);
 380                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 381                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 382                     }
 383 }
 384
 385 #define VP7_MVC_SIZE 17
 386 #define VP8_MVC_SIZE 19
 387
 388 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 389                                                             int mvc_size)
 390 {
 391     VP56RangeCoder *c = &s->c;
 392     int i, j;
 393
 394     if (vp8_rac_get(c))
 395         for (i = 0; i < 4; i++)
 396             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 397     if (vp8_rac_get(c))
 398         for (i = 0; i < 3; i++)
 399             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 400
 401     // 17.2 MV probability update
 402     for (i = 0; i < 2; i++)
 403         for (j = 0; j < mvc_size; j++)
 404             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 405                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 406 }
 407
 408 static void update_refs(VP8Context *s)
 409 {
 410     VP56RangeCoder *c = &s->c;
 411
 412     int update_golden = vp8_rac_get(c);
 413     int update_altref = vp8_rac_get(c);
 414
 415     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 416     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 417 }
 418
 419 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 420 {
 421     int i, j;
 422
 423     for (j = 1; j < 3; j++) {
 424         for (i = 0; i < height / 2; i++)
 425             memcpy(dst->data[j] + i * dst->linesize[j],
 426                    src->data[j] + i * src->linesize[j], width / 2);
 427     }
 428 }
 429
 430 static void fade(uint8_t *dst, int dst_linesize,
 431                  const uint8_t *src, int src_linesize,
 432                  int width, int height,
 433                  int alpha, int beta)
 434 {
 435     int i, j;
 436     for (j = 0; j < height; j++) {
 437         for (i = 0; i < width; i++) {
 438             uint8_t y = src[j * src_linesize + i];
 439             dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 440         }
 441     }
 442 }
 443
 444 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 445 {
 446     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 447     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 448     int ret;
 449
 450     if (!s->keyframe && (alpha || beta)) {
 451         int width  = s->mb_width * 16;
 452         int height = s->mb_height * 16;
 453         AVFrame *src, *dst;
 454
 455         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 456             !s->framep[VP56_FRAME_GOLDEN]) {
 457             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 458             return AVERROR_INVALIDDATA;
 459         }
 460
 461         dst =
 462         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 463
 464         /* preserve the golden frame, write a new previous frame */
 465         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 466             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 467             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 468                 return ret;
 469
 470             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 471
 472             copy_chroma(dst, src, width, height);
 473         }
 474
 475         fade(dst->data[0], dst->linesize[0],
 476              src->data[0], src->linesize[0],
 477              width, height, alpha, beta);
 478     }
 479
 480     return 0;
 481 }
 482
 483 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 484 {
 485     VP56RangeCoder *c = &s->c;
 486     int part1_size, hscale, vscale, i, j, ret;
 487     int width  = s->avctx->width;
 488     int height = s->avctx->height;
 489
 490     s->profile = (buf[0] >> 1) & 7;
 491     if (s->profile > 1) {
 492         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 493         return AVERROR_INVALIDDATA;
 494     }
 495
 496     s->keyframe  = !(buf[0] & 1);
 497     s->invisible = 0;
 498     part1_size   = AV_RL24(buf) >> 4;
 499
 500     if (buf_size < 4 - s->profile + part1_size) {
 501         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 502         return AVERROR_INVALIDDATA;
 503     }
 504
 505     buf      += 4 - s->profile;
 506     buf_size -= 4 - s->profile;
 507
 508     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 509
 510     ff_vp56_init_range_decoder(c, buf, part1_size);
 511     buf      += part1_size;
 512     buf_size -= part1_size;
 513
 514     /* A. Dimension information (keyframes only) */
 515     if (s->keyframe) {
 516         width  = vp8_rac_get_uint(c, 12);
 517         height = vp8_rac_get_uint(c, 12);
 518         hscale = vp8_rac_get_uint(c, 2);
 519         vscale = vp8_rac_get_uint(c, 2);
 520         if (hscale || vscale)
 521             avpriv_request_sample(s->avctx, "Upscaling");
 522
 523         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 524         vp78_reset_probability_tables(s);
 525         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 526                sizeof(s->prob->pred16x16));
 527         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 528                sizeof(s->prob->pred8x8c));
 529         for (i = 0; i < 2; i++)
 530             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 531                    sizeof(vp7_mv_default_prob[i]));
 532         memset(&s->segmentation, 0, sizeof(s->segmentation));
 533         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 534         memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
 535     }
 536
 537     if (s->keyframe || s->profile > 0)
 538         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 539
 540     /* B. Decoding information for all four macroblock-level features */
 541     for (i = 0; i < 4; i++) {
 542         s->feature_enabled[i] = vp8_rac_get(c);
 543         if (s->feature_enabled[i]) {
 544              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 545
 546              for (j = 0; j < 3; j++)
 547                  s->feature_index_prob[i][j] =
 548                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 549
 550              if (vp7_feature_value_size[s->profile][i])
 551                  for (j = 0; j < 4; j++)
 552                      s->feature_value[i][j] =
 553                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 554         }
 555     }
 556
 557     s->segmentation.enabled    = 0;
 558     s->segmentation.update_map = 0;
 559     s->lf_delta.enabled        = 0;
 560
 561     s->num_coeff_partitions = 1;
 562     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 563
 564     if (!s->macroblocks_base || /* first frame */
 565         width != s->avctx->width || height != s->avctx->height ||
 566         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 567         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 568             return ret;
 569     }
 570
 571     /* C. Dequantization indices */
 572     vp7_get_quants(s);
 573
 574     /* D. Golden frame update flag (a Flag) for interframes only */
 575     if (!s->keyframe) {
 576         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 577         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 578     }
 579
 580     s->update_last          = 1;
 581     s->update_probabilities = 1;
 582     s->fade_present         = 1;
 583
 584     if (s->profile > 0) {
 585         s->update_probabilities = vp8_rac_get(c);
 586         if (!s->update_probabilities)
 587             s->prob[1] = s->prob[0];
 588
 589         if (!s->keyframe)
 590             s->fade_present = vp8_rac_get(c);
 591     }
 592
 593     /* E. Fading information for previous frame */
 594     if (s->fade_present && vp8_rac_get(c)) {
 595         if ((ret = vp7_fade_frame(s ,c)) < 0)
 596             return ret;
 597     }
 598
 599     /* F. Loop filter type */
 600     if (!s->profile)
 601         s->filter.simple = vp8_rac_get(c);
 602
 603     /* G. DCT coefficient ordering specification */
 604     if (vp8_rac_get(c))
 605         for (i = 1; i < 16; i++)
 606             s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];
 607
 608     /* H. Loop filter levels  */
 609     if (s->profile > 0)
 610         s->filter.simple = vp8_rac_get(c);
 611     s->filter.level     = vp8_rac_get_uint(c, 6);
 612     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 613
 614     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 615     vp78_update_probability_tables(s);
 616
 617     s->mbskip_enabled = 0;
 618
 619     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 620     if (!s->keyframe) {
 621         s->prob->intra  = vp8_rac_get_uint(c, 8);
 622         s->prob->last   = vp8_rac_get_uint(c, 8);
 623         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 624     }
 625
 626     return 0;
 627 }
 628
 629 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 630 {
 631     VP56RangeCoder *c = &s->c;
 632     int header_size, hscale, vscale, ret;
 633     int width  = s->avctx->width;
 634     int height = s->avctx->height;
 635
 636     s->keyframe  = !(buf[0] & 1);
 637     s->profile   =  (buf[0]>>1) & 7;
 638     s->invisible = !(buf[0] & 0x10);
 639     header_size  = AV_RL24(buf) >> 5;
 640     buf      += 3;
 641     buf_size -= 3;
 642
 643     if (s->profile > 3)
 644         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 645
 646     if (!s->profile)
 647         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 648                sizeof(s->put_pixels_tab));
 649     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 650         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 651                sizeof(s->put_pixels_tab));
 652
 653     if (header_size > buf_size - 7 * s->keyframe) {
 654         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 655         return AVERROR_INVALIDDATA;
 656     }
 657
 658     if (s->keyframe) {
 659         if (AV_RL24(buf) != 0x2a019d) {
 660             av_log(s->avctx, AV_LOG_ERROR,
 661                    "Invalid start code 0x%x\n", AV_RL24(buf));
 662             return AVERROR_INVALIDDATA;
 663         }
 664         width     = AV_RL16(buf + 3) & 0x3fff;
 665         height    = AV_RL16(buf + 5) & 0x3fff;
 666         hscale    = buf[4] >> 6;
 667         vscale    = buf[6] >> 6;
 668         buf      += 7;
 669         buf_size -= 7;
 670
 671         if (hscale || vscale)
 672             avpriv_request_sample(s->avctx, "Upscaling");
 673
 674         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 675         vp78_reset_probability_tables(s);
 676         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 677                sizeof(s->prob->pred16x16));
 678         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 679                sizeof(s->prob->pred8x8c));
 680         memcpy(s->prob->mvc, vp8_mv_default_prob,
 681                sizeof(s->prob->mvc));
 682         memset(&s->segmentation, 0, sizeof(s->segmentation));
 683         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 684     }
 685
 686     ff_vp56_init_range_decoder(c, buf, header_size);
 687     buf      += header_size;
 688     buf_size -= header_size;
 689
 690     if (s->keyframe) {
 691         s->colorspace = vp8_rac_get(c);
 692         if (s->colorspace)
 693             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 694         s->fullrange = vp8_rac_get(c);
 695     }
 696
 697     if ((s->segmentation.enabled = vp8_rac_get(c)))
 698         parse_segment_info(s);
 699     else
 700         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 701
 702     s->filter.simple    = vp8_rac_get(c);
 703     s->filter.level     = vp8_rac_get_uint(c, 6);
 704     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 705
 706     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 707         if (vp8_rac_get(c))
 708             update_lf_deltas(s);
 709
 710     if (setup_partitions(s, buf, buf_size)) {
 711         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 712         return AVERROR_INVALIDDATA;
 713     }
 714
 715     if (!s->macroblocks_base || /* first frame */
 716         width != s->avctx->width || height != s->avctx->height ||
 717         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 718         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 719             return ret;
 720
 721     vp8_get_quants(s);
 722
 723     if (!s->keyframe) {
 724         update_refs(s);
 725         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 726         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 727     }
 728
 729     // if we aren't saving this frame's probabilities for future frames,
 730     // make a copy of the current probabilities
 731     if (!(s->update_probabilities = vp8_rac_get(c)))
 732         s->prob[1] = s->prob[0];
 733
 734     s->update_last = s->keyframe || vp8_rac_get(c);
 735
 736     vp78_update_probability_tables(s);
 737
 738     if ((s->mbskip_enabled = vp8_rac_get(c)))
 739         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 740
 741     if (!s->keyframe) {
 742         s->prob->intra  = vp8_rac_get_uint(c, 8);
 743         s->prob->last   = vp8_rac_get_uint(c, 8);
 744         s->prob->golden = vp8_rac_get_uint(c, 8);
 745         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 746     }
 747
 748     return 0;
 749 }
 750
/**
 * Clamp a motion vector to the current limits.
 *
 * Each component of src is clipped to the [s->mv_min, s->mv_max] range and
 * the result is written to dst.
 */
static av_always_inline
void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
}
 757
 758 /**
 759  * Motion vector coding, 17.1.
 760  */
 761 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 762 {
 763     int bit, x = 0;
 764
 765     if (vp56_rac_get_prob_branchy(c, p[0])) {
 766         int i;
 767
 768         for (i = 0; i < 3; i++)
 769             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 770         for (i = (vp7 ? 7 : 9); i > 3; i--)
 771             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 772         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 773             x += 8;
 774     } else {
 775         // small_mvtree
 776         const uint8_t *ps = p + 2;
 777         bit = vp56_rac_get_prob(c, *ps);
 778         ps += 1 + 3 * bit;
 779         x  += 4 * bit;
 780         bit = vp56_rac_get_prob(c, *ps);
 781         ps += 1 + bit;
 782         x  += 2 * bit;
 783         x  += vp56_rac_get_prob(c, *ps);
 784     }
 785
 786     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 787 }
 788
/** VP7 variant of read_mv_component() (vp7 flag set to 1). */
static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 1);
}
 793
/** VP8 variant of read_mv_component() (vp7 flag set to 0). */
static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 0);
}
 798
 799 static av_always_inline
 800 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 801 {
 802     if (is_vp7)
 803         return vp7_submv_prob;
 804
 805     if (left == top)
 806         return vp8_submv_prob[4 - !!left];
 807     if (!top)
 808         return vp8_submv_prob[2];
 809     return vp8_submv_prob[1 - !!left];
 810 }
 811
/**
 * Split motion vector prediction, 16.4.
 *
 * Decodes the partitioning of a split-MV macroblock and one motion vector
 * per partition, storing them in mb->bmv[0..num-1] and the partitioning in
 * mb->partitioning. Each vector is either copied from the left or above
 * sub-block, set to zero, or coded as a delta added to mb->mv.
 *
 * @param s      decoder context (supplies mb_width and the MV probabilities)
 * @param c      range coder to read from
 * @param mb     current macroblock; its left and top neighbours are located
 *               relative to this pointer
 * @param layout macroblock array layout: 0 places the top neighbour at
 *               mb[2], otherwise at mb[-s->mb_width - 1]
 * @param is_vp7 non-zero when decoding VP7
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                    int layout, int is_vp7)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb;
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
    const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
    VP56mv *top_mv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    if (!layout) // layout is inlined, s->mb_layout is not
        top_mb = &mb[2];
    else
        top_mb = &mb[-s->mb_width - 1];
    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    top_mv       = top_mb->bmv;

    /* partitioning tree: 4x4, 8x8, or one of the two modes starting at 16x8 */
    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        else
            part_idx = VP8_SPLITMVMODE_8x8;
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num              = vp8_mbsplit_count[part_idx];
    /* NOTE: the next statement ends in a comma operator, which chains it with
     * the following assignment; the effect is the same as two statements */
    mbsplits_cur     = vp8_mbsplits[part_idx],
    firstidx         = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        /* k: index of the first 4x4 sub-block of partition n (4 per row) */
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        /* motion vectors are compared and copied as packed 32-bit values */
        if (!(k & 3)) // leftmost column: use the left macroblock's rightmost column
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3) // top row: use the top macroblock's bottom row
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above, is_vp7);

        /* sub-mv tree: left / above / zero / new (delta relative to mb->mv) */
        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y +
                                   read_mv_component(c, s->prob->mvc[0], is_vp7);
                    mb->bmv[n].x = mb->mv.x +
                                   read_mv_component(c, s->prob->mvc[1], is_vp7);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}
 887
 888 /**
 889  * The vp7 reference decoder uses a padding macroblock column (added to right
 890  * edge of the frame) to guard against illegal macroblock offsets. The
 891  * algorithm has bugs that permit offsets to straddle the padding column.
 892  * This function replicates those bugs.
 893  *
 894  * @param[out] edge_x macroblock x address
 895  * @param[out] edge_y macroblock y address
 896  *
 897  * @return macroblock offset legal (boolean)
 898  */
 899 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 900                                    int xoffset, int yoffset, int boundary,
 901                                    int *edge_x, int *edge_y)
 902 {
 903     int vwidth = mb_width + 1;
 904     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 905     if (new < boundary || new % vwidth == vwidth - 1)
 906         return 0;
 907     *edge_y = new / vwidth;
 908     *edge_x = new % vwidth;
 909     return 1;
 910 }
 911
 912 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 913 {
 914     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 915 }
 916
 917 static av_always_inline
 918 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 919                     int mb_x, int mb_y, int layout)
 920 {
 921     VP8Macroblock *mb_edge[12];
 922     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 923     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 924     int idx = CNT_ZERO;
 925     VP56mv near_mv[3];
 926     uint8_t cnt[3] = { 0 };
 927     VP56RangeCoder *c = &s->c;
 928     int i;
 929
 930     AV_ZERO32(&near_mv[0]);
 931     AV_ZERO32(&near_mv[1]);
 932     AV_ZERO32(&near_mv[2]);
 933
 934     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 935         const VP7MVPred * pred = &vp7_mv_pred[i];
 936         int edge_x, edge_y;
 937
 938         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 939                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 940             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 941                                              ? s->macroblocks_base + 1 + edge_x +
 942                                                (s->mb_width + 1) * (edge_y + 1)
 943                                              : s->macroblocks + edge_x +
 944                                                (s->mb_height - edge_y - 1) * 2;
 945             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 946             if (mv) {
 947                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 948                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 949                         idx = CNT_NEAREST;
 950                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 951                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 952                             continue;
 953                         idx = CNT_NEAR;
 954                     } else {
 955                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 956                         idx = CNT_NEAR;
 957                     }
 958                 } else {
 959                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 960                     idx = CNT_NEAREST;
 961                 }
 962             } else {
 963                 idx = CNT_ZERO;
 964             }
 965         } else {
 966             idx = CNT_ZERO;
 967         }
 968         cnt[idx] += vp7_mv_pred[i].score;
 969     }
 970
 971     mb->partitioning = VP8_SPLITMVMODE_NONE;
 972
 973     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 974         mb->mode = VP8_MVMODE_MV;
 975
 976         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 977
 978             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 979
 980                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 981                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 982                 else
 983                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 984
 985                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 986                     mb->mode = VP8_MVMODE_SPLIT;
 987                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
 988                 } else {
 989                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
 990                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
 991                     mb->bmv[0] = mb->mv;
 992                 }
 993             } else {
 994                 mb->mv = near_mv[CNT_NEAR];
 995                 mb->bmv[0] = mb->mv;
 996             }
 997         } else {
 998             mb->mv = near_mv[CNT_NEAREST];
 999             mb->bmv[0] = mb->mv;
1000         }
1001     } else {
1002         mb->mode = VP8_MVMODE_ZERO;
1003         AV_ZERO32(&mb->mv);
1004         mb->bmv[0] = mb->mv;
1005     }
1006 }
1007
/**
 * Decode the inter prediction mode and motion vector(s) of one VP8
 * macroblock.
 *
 * The vectors of the top, left and top-left neighbours are collected into
 * near_mv[] with weights in cnt[]; those weights select the probabilities
 * used to read the mode from the bitstream. On return mb->mode,
 * mb->partitioning, mb->mv and mb->bmv[] are set.
 *
 * @param s      decoder context (range coder, probabilities, sign biases)
 * @param mb     macroblock being decoded; its neighbours are addressed
 *               relative to this pointer
 * @param mb_x   x address of mb (not referenced in this body)
 * @param mb_y   y address of mb (not referenced in this body)
 * @param layout macroblock array layout; selects where the top and top-left
 *               neighbours are found and is forwarded to decode_splitmvs()
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    /* near_mv[3] is deliberately left unset: it is only read below when
     * cnt[CNT_SPLITMV] is non-zero, which requires MV_EDGE_CHECK to have
     * stored a third vector there first. */
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For neighbour n (skipped when it is intra, i.e. refers to the current
     * frame): a zero vector adds its weight to cnt[CNT_ZERO]; a non-zero
     * vector is negated when the neighbour's reference has a different sign
     * bias, appended to near_mv[] if it differs from the most recent entry
     * (always appended for n == 0), and its weight added to that entry.
     * Top and left weigh 2, top-left weighs 1. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        VP8Macroblock *edge = mb_edge[n];                                     \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP56_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* From here on cnt[CNT_SPLITMV] is reused as the context for
                 * the split-mode decision: left and top count 2 each,
                 * top-left counts 1. */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* The macroblock-level vector becomes the last split MV. */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1107
1108 static av_always_inline
1109 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1110                            int mb_x, int keyframe, int layout)
1111 {
1112     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1113
1114     if (layout) {
1115         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1116         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1117     }
1118     if (keyframe) {
1119         int x, y;
1120         uint8_t *top;
1121         uint8_t *const left = s->intra4x4_pred_mode_left;
1122         if (layout)
1123             top = mb->intra4x4_pred_mode_top;
1124         else
1125             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1126         for (y = 0; y < 4; y++) {
1127             for (x = 0; x < 4; x++) {
1128                 const uint8_t *ctx;
1129                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1130                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1131                 left[y]   = top[x] = *intra4x4;
1132                 intra4x4++;
1133             }
1134         }
1135     } else {
1136         int i;
1137         for (i = 0; i < 16; i++)
1138             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1139                                            vp8_pred4x4_prob_inter);
1140     }
1141 }
1142
1143 static av_always_inline
1144 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1145                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1146 {
1147     VP56RangeCoder *c = &s->c;
1148     const char *vp7_feature_name[] = { "q-index",
1149                                        "lf-delta",
1150                                        "partial-golden-update",
1151                                        "blit-pitch" };
1152     if (is_vp7) {
1153         int i;
1154         *segment = 0;
1155         for (i = 0; i < 4; i++) {
1156             if (s->feature_enabled[i]) {
1157                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1158                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1159                                                    s->feature_index_prob[i]);
1160                       av_log(s->avctx, AV_LOG_WARNING,
1161                              "Feature %s present in macroblock (value 0x%x)\n",
1162                              vp7_feature_name[i], s->feature_value[i][index]);
1163                 }
1164            }
1165         }
1166     } else if (s->segmentation.update_map) {
1167         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1168         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1169     } else if (s->segmentation.enabled)
1170         *segment = ref ? *ref : *segment;
1171     mb->segment = *segment;
1172
1173     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1174
1175     if (s->keyframe) {
1176         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1177                                     vp8_pred16x16_prob_intra);
1178
1179         if (mb->mode == MODE_I4x4) {
1180             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1181         } else {
1182             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1183                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1184             if (s->mb_layout)
1185                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1186             else
1187                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1188             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1189         }
1190
1191         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1192                                                 vp8_pred8x8c_prob_intra);
1193         mb->ref_frame        = VP56_FRAME_CURRENT;
1194     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1195         // inter MB, 16.2
1196         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1197             mb->ref_frame =
1198                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1199                                                                    : VP56_FRAME_GOLDEN;
1200         else
1201             mb->ref_frame = VP56_FRAME_PREVIOUS;
1202         s->ref_count[mb->ref_frame - 1]++;
1203
1204         // motion vectors, 16.3
1205         if (is_vp7)
1206             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1207         else
1208             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1209     } else {
1210         // intra MB, 16.1
1211         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1212
1213         if (mb->mode == MODE_I4x4)
1214             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1215
1216         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1217                                                 s->prob->pred8x8c);
1218         mb->ref_frame        = VP56_FRAME_CURRENT;
1219         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1220         AV_ZERO32(&mb->bmv[0]);
1221     }
1222 }
1223
/**
 * Decode the DCT tokens of one block, starting after the first EOB check.
 *
 * Decoding begins at the skip_eob label, so no EOB test is made for the
 * first token here; decode_block_coeffs() performs that test before calling.
 * The range coder is copied into a local, used for the whole block and
 * written back to *r at the end.
 *
 * @param r          arithmetic bitstream reader context
 * @param block      destination for block coefficients
 * @param probs      probabilities to use when reading trees from the bitstream
 * @param i          initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token to read
 * @param qmul       array holding the dc/ac dequant factor at position 0/1
 * @param scan       maps the coefficient index to its position in block
 * @param vp7        non-zero for VP7, where an EOB check also follows a
 *                   zero token
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, uint8_t *token_prob, int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            // after a zero token the next token uses context 0
            token_prob = probs[i][0];
            // VP7 may signal EOB right after a zero, VP8 may not
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            // after a one, the next token uses context 1
            token_prob = probs[i + 1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a   = vp56_rac_get_prob(&c, token_prob[8]);
                    int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    // category base value: 11, 19, 35 or 67 for cat 0..3
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            // after a value above one, the next token uses context 2
            token_prob = probs[i + 1][2];
        }
        // one more bit gives the sign; index 0 uses qmul[0], all others qmul[1]
        block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1293
/**
 * Apply inter-frame DC prediction to a block and update the predictor.
 *
 * pred[0] holds the last DC value and pred[1] counts how many times in a row
 * that value repeated. Once the count exceeds 3 the stored DC is added to the
 * decoded one. The counter is reset whenever the old or new DC is zero or
 * their signs differ, and incremented when the new DC equals the old one.
 *
 * @param block coefficient block; block[0] is read and replaced
 * @param pred  predictor state { last dc, repeat count }, updated in place
 *
 * @return 1 if the prediction was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int predicted = pred[1] > 3;
    int16_t dc = block[0];

    if (predicted)
        dc += pred[0];

    if (!pred[0] || !dc || (pred[0] < 0) != (dc < 0))
        pred[1] = 0;
    else if (pred[0] == dc)
        pred[1]++;

    pred[0]  = dc;
    block[0] = dc;

    return predicted;
}
1316
1317 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1318                                             int16_t block[16],
1319                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1320                                             int i, uint8_t *token_prob,
1321                                             int16_t qmul[2],
1322                                             const uint8_t scan[16])
1323 {
1324     return decode_block_coeffs_internal(r, block, probs, i,
1325                                         token_prob, qmul, scan, IS_VP7);
1326 }
1327
1328 #ifndef vp8_decode_block_coeffs_internal
1329 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1330                                             int16_t block[16],
1331                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1332                                             int i, uint8_t *token_prob,
1333                                             int16_t qmul[2])
1334 {
1335     return decode_block_coeffs_internal(r, block, probs, i,
1336                                         token_prob, qmul, zigzag_scan, IS_VP8);
1337 }
1338 #endif
1339
1340 /**
1341  * @param c          arithmetic bitstream reader context
1342  * @param block      destination for block coefficients
1343  * @param probs      probabilities to use when reading trees from the bitstream
1344  * @param i          initial coeff index, 0 unless a separate DC block is coded
1345  * @param zero_nhood the initial prediction context for number of surrounding
1346  *                   all-zero blocks (only left/top, so 0-2)
1347  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1348  * @param scan       scan pattern (VP7 only)
1349  *
1350  * @return 0 if no coeffs were decoded
1351  *         otherwise, the index of the last coeff decoded plus one
1352  */
1353 static av_always_inline
1354 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1355                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1356                         int i, int zero_nhood, int16_t qmul[2],
1357                         const uint8_t scan[16], int vp7)
1358 {
1359     uint8_t *token_prob = probs[i][zero_nhood];
1360     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1361         return 0;
1362     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1363                                                   token_prob, qmul, scan)
1364                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1365                                                   token_prob, qmul);
1366 }
1367
1368 static av_always_inline
1369 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1370                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1371                       int is_vp7)
1372 {
1373     int i, x, y, luma_start = 0, luma_ctx = 3;
1374     int nnz_pred, nnz, nnz_total = 0;
1375     int segment = mb->segment;
1376     int block_dc = 0;
1377
1378     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1379         nnz_pred = t_nnz[8] + l_nnz[8];
1380
1381         // decode DC values and do hadamard
1382         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1383                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1384                                   zigzag_scan, is_vp7);
1385         l_nnz[8] = t_nnz[8] = !!nnz;
1386
1387         if (is_vp7 && mb->mode > MODE_I4x4) {
1388             nnz |=  inter_predict_dc(td->block_dc,
1389                                      s->inter_dc_pred[mb->ref_frame - 1]);
1390         }
1391
1392         if (nnz) {
1393             nnz_total += nnz;
1394             block_dc   = 1;
1395             if (nnz == 1)
1396                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1397             else
1398                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1399         }
1400         luma_start = 1;
1401         luma_ctx   = 0;
1402     }
1403
1404     // luma blocks
1405     for (y = 0; y < 4; y++)
1406         for (x = 0; x < 4; x++) {
1407             nnz_pred = l_nnz[y] + t_nnz[x];
1408             nnz = decode_block_coeffs(c, td->block[y][x],
1409                                       s->prob->token[luma_ctx],
1410                                       luma_start, nnz_pred,
1411                                       s->qmat[segment].luma_qmul,
1412                                       s->prob[0].scan, is_vp7);
1413             /* nnz+block_dc may be one more than the actual last index,
1414              * but we don't care */
1415             td->non_zero_count_cache[y][x] = nnz + block_dc;
1416             t_nnz[x] = l_nnz[y] = !!nnz;
1417             nnz_total += nnz;
1418         }
1419
1420     // chroma blocks
1421     // TODO: what to do about dimensions? 2nd dim for luma is x,
1422     // but for chroma it's (y<<1)|x
1423     for (i = 4; i < 6; i++)
1424         for (y = 0; y < 2; y++)
1425             for (x = 0; x < 2; x++) {
1426                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1427                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1428                                           s->prob->token[2], 0, nnz_pred,
1429                                           s->qmat[segment].chroma_qmul,
1430                                           s->prob[0].scan, is_vp7);
1431                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1432                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1433                 nnz_total += nnz;
1434             }
1435
1436     // if there were no coded coeffs despite the macroblock not being marked skip,
1437     // we MUST not do the inner loop filter and should not do IDCT
1438     // Since skip isn't used for bitstream prediction, just manually set it.
1439     if (!nnz_total)
1440         mb->skip = 1;
1441 }
1442
1443 static av_always_inline
1444 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1445                       uint8_t *src_cb, uint8_t *src_cr,
1446                       int linesize, int uvlinesize, int simple)
1447 {
1448     AV_COPY128(top_border, src_y + 15 * linesize);
1449     if (!simple) {
1450         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1451         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1452     }
1453 }
1454
1455 static av_always_inline
1456 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1457                     uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
1458                     int mb_y, int mb_width, int simple, int xchg)
1459 {
1460     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1461     src_y  -= linesize;
1462     src_cb -= uvlinesize;
1463     src_cr -= uvlinesize;
1464
1465 #define XCHG(a, b, xchg)                                                      \
1466     do {                                                                      \
1467         if (xchg)                                                             \
1468             AV_SWAP64(b, a);                                                  \
1469         else                                                                  \
1470             AV_COPY64(b, a);                                                  \
1471     } while (0)
1472
1473     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1474     XCHG(top_border, src_y, xchg);
1475     XCHG(top_border + 8, src_y + 8, 1);
1476     if (mb_x < mb_width - 1)
1477         XCHG(top_border + 32, src_y + 16, 1);
1478
1479     // only copy chroma for normal loop filter
1480     // or to initialize the top row to 127
1481     if (!simple || !mb_y) {
1482         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1483         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1484         XCHG(top_border + 16, src_cb, 1);
1485         XCHG(top_border + 24, src_cr, 1);
1486     }
1487 }
1488
1489 static av_always_inline
1490 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1491 {
1492     if (!mb_x)
1493         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1494     else
1495         return mb_y ? mode : LEFT_DC_PRED8x8;
1496 }
1497
1498 static av_always_inline
1499 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1500 {
1501     if (!mb_x)
1502         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1503     else
1504         return mb_y ? mode : HOR_PRED8x8;
1505 }
1506
1507 static av_always_inline
1508 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1509 {
1510     switch (mode) {
1511     case DC_PRED8x8:
1512         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1513     case VERT_PRED8x8:
1514         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1515     case HOR_PRED8x8:
1516         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1517     case PLANE_PRED8x8: /* TM */
1518         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1519     }
1520     return mode;
1521 }
1522
1523 static av_always_inline
1524 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1525 {
1526     if (!mb_x) {
1527         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1528     } else {
1529         return mb_y ? mode : HOR_VP8_PRED;
1530     }
1531 }
1532
1533 static av_always_inline
1534 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1535                                      int *copy_buf, int vp7)
1536 {
1537     switch (mode) {
1538     case VERT_PRED:
1539         if (!mb_x && mb_y) {
1540             *copy_buf = 1;
1541             return mode;
1542         }
1543         /* fall-through */
1544     case DIAG_DOWN_LEFT_PRED:
1545     case VERT_LEFT_PRED:
1546         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1547     case HOR_PRED:
1548         if (!mb_y) {
1549             *copy_buf = 1;
1550             return mode;
1551         }
1552         /* fall-through */
1553     case HOR_UP_PRED:
1554         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1555     case TM_VP8_PRED:
1556         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1557     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1558                    * as 16x16/8x8 DC */
1559     case DIAG_DOWN_RIGHT_PRED:
1560     case VERT_RIGHT_PRED:
1561     case HOR_DOWN_PRED:
1562         if (!mb_y || !mb_x)
1563             *copy_buf = 1;
1564         return mode;
1565     }
1566     return mode;
1567 }
1568
1569 static av_always_inline
1570 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1571                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1572 {
1573     int x, y, mode, nnz;
1574     uint32_t tr;
1575
1576     /* for the first row, we need to run xchg_mb_border to init the top edge
1577      * to 127 otherwise, skip it if we aren't going to deblock */
1578     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1579         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1580                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1581                        s->filter.simple, 1);
1582
1583     if (mb->mode < MODE_I4x4) {
1584         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1585         s->hpc.pred16x16[mode](dst[0], s->linesize);
1586     } else {
1587         uint8_t *ptr = dst[0];
1588         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1589         const uint8_t lo = is_vp7 ? 128 : 127;
1590         const uint8_t hi = is_vp7 ? 128 : 129;
1591         uint8_t tr_top[4] = { lo, lo, lo, lo };
1592
1593         // all blocks on the right edge of the macroblock use bottom edge
1594         // the top macroblock for their topright edge
1595         uint8_t *tr_right = ptr - s->linesize + 16;
1596
1597         // if we're on the right edge of the frame, said edge is extended
1598         // from the top macroblock
1599         if (mb_y && mb_x == s->mb_width - 1) {
1600             tr       = tr_right[-1] * 0x01010101u;
1601             tr_right = (uint8_t *) &tr;
1602         }
1603
1604         if (mb->skip)
1605             AV_ZERO128(td->non_zero_count_cache);
1606
1607         for (y = 0; y < 4; y++) {
1608             uint8_t *topright = ptr + 4 - s->linesize;
1609             for (x = 0; x < 4; x++) {
1610                 int copy = 0, linesize = s->linesize;
1611                 uint8_t *dst = ptr + 4 * x;
1612                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1613
1614                 if ((y == 0 || x == 3) && mb_y == 0) {
1615                     topright = tr_top;
1616                 } else if (x == 3)
1617                     topright = tr_right;
1618
1619                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1620                                                         mb_y + y, &copy, is_vp7);
1621                 if (copy) {
1622                     dst      = copy_dst + 12;
1623                     linesize = 8;
1624                     if (!(mb_y + y)) {
1625                         copy_dst[3] = lo;
1626                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1627                     } else {
1628                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1629                         if (!(mb_x + x)) {
1630                             copy_dst[3] = hi;
1631                         } else {
1632                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1633                         }
1634                     }
1635                     if (!(mb_x + x)) {
1636                         copy_dst[11] =
1637                         copy_dst[19] =
1638                         copy_dst[27] =
1639                         copy_dst[35] = hi;
1640                     } else {
1641                         copy_dst[11] = ptr[4 * x                   - 1];
1642                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1643                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1644                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1645                     }
1646                 }
1647                 s->hpc.pred4x4[mode](dst, topright, linesize);
1648                 if (copy) {
1649                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1650                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1651                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1652                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1653                 }
1654
1655                 nnz = td->non_zero_count_cache[y][x];
1656                 if (nnz) {
1657                     if (nnz == 1)
1658                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1659                                                   td->block[y][x], s->linesize);
1660                     else
1661                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1662                                                td->block[y][x], s->linesize);
1663                 }
1664                 topright += 4;
1665             }
1666
1667             ptr      += 4 * s->linesize;
1668             intra4x4 += 4;
1669         }
1670     }
1671
1672     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1673                                             mb_x, mb_y, is_vp7);
1674     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1675     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1676
1677     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1678         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1679                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1680                        s->filter.simple, 0);
1681 }
1682
/* Per-subpel-position lookup, indexed by the fractional mv position (0..7). */
static const uint8_t subpel_idx[3][8] = {
    /* [0]: extra pixels needed to the left of / above the block;
     *      doubles as the index into the mc_func[][] tables */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1]: total number of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2]: extra pixels needed to the right of / below the block */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1689
1690 /**
1691  * luma MC function
1692  *
1693  * @param s        VP8 decoding context
1694  * @param dst      target buffer for block data at block position
1695  * @param ref      reference picture buffer at origin (0, 0)
1696  * @param mv       motion vector (relative to block position) to get pixel data from
1697  * @param x_off    horizontal position of block from origin (0, 0)
1698  * @param y_off    vertical position of block from origin (0, 0)
1699  * @param block_w  width of block (16, 8 or 4)
1700  * @param block_h  height of block (always same as block_w)
1701  * @param width    width of src/dst plane data
1702  * @param height   height of src/dst plane data
1703  * @param linesize size of a single line of plane data, including padding
1704  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1705  */
1706 static av_always_inline
1707 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1708                  ThreadFrame *ref, const VP56mv *mv,
1709                  int x_off, int y_off, int block_w, int block_h,
1710                  int width, int height, ptrdiff_t linesize,
1711                  vp8_mc_func mc_func[3][3])
1712 {
1713     uint8_t *src = ref->f->data[0];
1714
1715     if (AV_RN32A(mv)) {
1716         int src_linesize = linesize;
1717
1718         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1719         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1720
1721         x_off += mv->x >> 2;
1722         y_off += mv->y >> 2;
1723
1724         // edge emulation
1725         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1726         src += y_off * linesize + x_off;
1727         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1728             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1729             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1730                                      src - my_idx * linesize - mx_idx,
1731                                      EDGE_EMU_LINESIZE, linesize,
1732                                      block_w + subpel_idx[1][mx],
1733                                      block_h + subpel_idx[1][my],
1734                                      x_off - mx_idx, y_off - my_idx,
1735                                      width, height);
1736             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1737             src_linesize = EDGE_EMU_LINESIZE;
1738         }
1739         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1740     } else {
1741         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1742         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1743                       linesize, block_h, 0, 0);
1744     }
1745 }
1746
1747 /**
1748  * chroma MC function
1749  *
1750  * @param s        VP8 decoding context
1751  * @param dst1     target buffer for block data at block position (U plane)
1752  * @param dst2     target buffer for block data at block position (V plane)
1753  * @param ref      reference picture buffer at origin (0, 0)
1754  * @param mv       motion vector (relative to block position) to get pixel data from
1755  * @param x_off    horizontal position of block from origin (0, 0)
1756  * @param y_off    vertical position of block from origin (0, 0)
1757  * @param block_w  width of block (16, 8 or 4)
1758  * @param block_h  height of block (always same as block_w)
1759  * @param width    width of src/dst plane data
1760  * @param height   height of src/dst plane data
1761  * @param linesize size of a single line of plane data, including padding
1762  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1763  */
1764 static av_always_inline
1765 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1766                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1767                    int x_off, int y_off, int block_w, int block_h,
1768                    int width, int height, ptrdiff_t linesize,
1769                    vp8_mc_func mc_func[3][3])
1770 {
1771     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1772
1773     if (AV_RN32A(mv)) {
1774         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1775         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1776
1777         x_off += mv->x >> 3;
1778         y_off += mv->y >> 3;
1779
1780         // edge emulation
1781         src1 += y_off * linesize + x_off;
1782         src2 += y_off * linesize + x_off;
1783         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1784         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1785             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1786             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1787                                      src1 - my_idx * linesize - mx_idx,
1788                                      EDGE_EMU_LINESIZE, linesize,
1789                                      block_w + subpel_idx[1][mx],
1790                                      block_h + subpel_idx[1][my],
1791                                      x_off - mx_idx, y_off - my_idx, width, height);
1792             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1793             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1794
1795             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1796                                      src2 - my_idx * linesize - mx_idx,
1797                                      EDGE_EMU_LINESIZE, linesize,
1798                                      block_w + subpel_idx[1][mx],
1799                                      block_h + subpel_idx[1][my],
1800                                      x_off - mx_idx, y_off - my_idx, width, height);
1801             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1802             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1803         } else {
1804             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1805             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1806         }
1807     } else {
1808         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1809         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1810         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1811     }
1812 }
1813
1814 static av_always_inline
1815 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1816                  ThreadFrame *ref_frame, int x_off, int y_off,
1817                  int bx_off, int by_off, int block_w, int block_h,
1818                  int width, int height, VP56mv *mv)
1819 {
1820     VP56mv uvmv = *mv;
1821
1822     /* Y */
1823     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1824                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1825                 block_w, block_h, width, height, s->linesize,
1826                 s->put_pixels_tab[block_w == 8]);
1827
1828     /* U/V */
1829     if (s->profile == 3) {
1830         /* this block only applies VP8; it is safe to check
1831          * only the profile, as VP7 profile <= 1 */
1832         uvmv.x &= ~7;
1833         uvmv.y &= ~7;
1834     }
1835     x_off   >>= 1;
1836     y_off   >>= 1;
1837     bx_off  >>= 1;
1838     by_off  >>= 1;
1839     width   >>= 1;
1840     height  >>= 1;
1841     block_w >>= 1;
1842     block_h >>= 1;
1843     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1844                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1845                   &uvmv, x_off + bx_off, y_off + by_off,
1846                   block_w, block_h, width, height, s->uvlinesize,
1847                   s->put_pixels_tab[1 + (block_w == 4)]);
1848 }
1849
1850 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1851  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1852 static av_always_inline
1853 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1854                      int mb_xy, int ref)
1855 {
1856     /* Don't prefetch refs that haven't been used very often this frame. */
1857     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1858         int x_off = mb_x << 4, y_off = mb_y << 4;
1859         int mx = (mb->mv.x >> 2) + x_off + 8;
1860         int my = (mb->mv.y >> 2) + y_off;
1861         uint8_t **src = s->framep[ref]->tf.f->data;
1862         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1863         /* For threading, a ff_thread_await_progress here might be useful, but
1864          * it actually slows down the decoder. Since a bad prefetch doesn't
1865          * generate bad decoder output, we don't run it here. */
1866         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1867         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1868         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1869     }
1870 }
1871
1872 /**
1873  * Apply motion vectors to prediction buffer, chapter 18.
1874  */
1875 static av_always_inline
1876 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1877                    VP8Macroblock *mb, int mb_x, int mb_y)
1878 {
1879     int x_off = mb_x << 4, y_off = mb_y << 4;
1880     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1881     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1882     VP56mv *bmv = mb->bmv;
1883
1884     switch (mb->partitioning) {
1885     case VP8_SPLITMVMODE_NONE:
1886         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1887                     0, 0, 16, 16, width, height, &mb->mv);
1888         break;
1889     case VP8_SPLITMVMODE_4x4: {
1890         int x, y;
1891         VP56mv uvmv;
1892
1893         /* Y */
1894         for (y = 0; y < 4; y++) {
1895             for (x = 0; x < 4; x++) {
1896                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1897                             ref, &bmv[4 * y + x],
1898                             4 * x + x_off, 4 * y + y_off, 4, 4,
1899                             width, height, s->linesize,
1900                             s->put_pixels_tab[2]);
1901             }
1902         }
1903
1904         /* U/V */
1905         x_off  >>= 1;
1906         y_off  >>= 1;
1907         width  >>= 1;
1908         height >>= 1;
1909         for (y = 0; y < 2; y++) {
1910             for (x = 0; x < 2; x++) {
1911                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1912                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1913                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1914                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1915                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1916                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1917                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1918                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1919                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1920                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1921                 if (s->profile == 3) {
1922                     uvmv.x &= ~7;
1923                     uvmv.y &= ~7;
1924                 }
1925                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1926                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1927                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1928                               width, height, s->uvlinesize,
1929                               s->put_pixels_tab[2]);
1930             }
1931         }
1932         break;
1933     }
1934     case VP8_SPLITMVMODE_16x8:
1935         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1936                     0, 0, 16, 8, width, height, &bmv[0]);
1937         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1938                     0, 8, 16, 8, width, height, &bmv[1]);
1939         break;
1940     case VP8_SPLITMVMODE_8x16:
1941         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1942                     0, 0, 8, 16, width, height, &bmv[0]);
1943         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1944                     8, 0, 8, 16, width, height, &bmv[1]);
1945         break;
1946     case VP8_SPLITMVMODE_8x8:
1947         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1948                     0, 0, 8, 8, width, height, &bmv[0]);
1949         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1950                     8, 0, 8, 8, width, height, &bmv[1]);
1951         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1952                     0, 8, 8, 8, width, height, &bmv[2]);
1953         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1954                     8, 8, 8, 8, width, height, &bmv[3]);
1955         break;
1956     }
1957 }
1958
1959 static av_always_inline
1960 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1961 {
1962     int x, y, ch;
1963
1964     if (mb->mode != MODE_I4x4) {
1965         uint8_t *y_dst = dst[0];
1966         for (y = 0; y < 4; y++) {
1967             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1968             if (nnz4) {
1969                 if (nnz4 & ~0x01010101) {
1970                     for (x = 0; x < 4; x++) {
1971                         if ((uint8_t) nnz4 == 1)
1972                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1973                                                       td->block[y][x],
1974                                                       s->linesize);
1975                         else if ((uint8_t) nnz4 > 1)
1976                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1977                                                    td->block[y][x],
1978                                                    s->linesize);
1979                         nnz4 >>= 8;
1980                         if (!nnz4)
1981                             break;
1982                     }
1983                 } else {
1984                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1985                 }
1986             }
1987             y_dst += 4 * s->linesize;
1988         }
1989     }
1990
1991     for (ch = 0; ch < 2; ch++) {
1992         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1993         if (nnz4) {
1994             uint8_t *ch_dst = dst[1 + ch];
1995             if (nnz4 & ~0x01010101) {
1996                 for (y = 0; y < 2; y++) {
1997                     for (x = 0; x < 2; x++) {
1998                         if ((uint8_t) nnz4 == 1)
1999                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2000                                                       td->block[4 + ch][(y << 1) + x],
2001                                                       s->uvlinesize);
2002                         else if ((uint8_t) nnz4 > 1)
2003                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2004                                                    td->block[4 + ch][(y << 1) + x],
2005                                                    s->uvlinesize);
2006                         nnz4 >>= 8;
2007                         if (!nnz4)
2008                             goto chroma_idct_end;
2009                     }
2010                     ch_dst += 4 * s->uvlinesize;
2011                 }
2012             } else {
2013                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2014             }
2015         }
2016 chroma_idct_end:
2017         ;
2018     }
2019 }
2020
2021 static av_always_inline
2022 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2023                          VP8FilterStrength *f, int is_vp7)
2024 {
2025     int interior_limit, filter_level;
2026
2027     if (s->segmentation.enabled) {
2028         filter_level = s->segmentation.filter_level[mb->segment];
2029         if (!s->segmentation.absolute_vals)
2030             filter_level += s->filter.level;
2031     } else
2032         filter_level = s->filter.level;
2033
2034     if (s->lf_delta.enabled) {
2035         filter_level += s->lf_delta.ref[mb->ref_frame];
2036         filter_level += s->lf_delta.mode[mb->mode];
2037     }
2038
2039     filter_level = av_clip_uintp2(filter_level, 6);
2040
2041     interior_limit = filter_level;
2042     if (s->filter.sharpness) {
2043         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2044         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2045     }
2046     interior_limit = FFMAX(interior_limit, 1);
2047
2048     f->filter_level = filter_level;
2049     f->inner_limit = interior_limit;
2050     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2051                       mb->mode == VP8_MVMODE_SPLIT;
2052 }
2053
2054 static av_always_inline
2055 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2056                int mb_x, int mb_y, int is_vp7)
2057 {
2058     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2059     int filter_level = f->filter_level;
2060     int inner_limit = f->inner_limit;
2061     int inner_filter = f->inner_filter;
2062     int linesize = s->linesize;
2063     int uvlinesize = s->uvlinesize;
2064     static const uint8_t hev_thresh_lut[2][64] = {
2065         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2066           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2067           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2068           3, 3, 3, 3 },
2069         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2070           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2071           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2072           2, 2, 2, 2 }
2073     };
2074
2075     if (!filter_level)
2076         return;
2077
2078     if (is_vp7) {
2079         bedge_lim_y  = filter_level;
2080         bedge_lim_uv = filter_level * 2;
2081         mbedge_lim   = filter_level + 2;
2082     } else {
2083         bedge_lim_y  =
2084         bedge_lim_uv = filter_level * 2 + inner_limit;
2085         mbedge_lim   = bedge_lim_y + 4;
2086     }
2087
2088     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2089
2090     if (mb_x) {
2091         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2092                                        mbedge_lim, inner_limit, hev_thresh);
2093         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2094                                        mbedge_lim, inner_limit, hev_thresh);
2095     }
2096
2097 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2098     if (cond && inner_filter) {                                               \
2099         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2100                                              bedge_lim_y, inner_limit,        \
2101                                              hev_thresh);                     \
2102         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2103                                              bedge_lim_y, inner_limit,        \
2104                                              hev_thresh);                     \
2105         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2106                                              bedge_lim_y, inner_limit,        \
2107                                              hev_thresh);                     \
2108         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2109                                              uvlinesize,  bedge_lim_uv,       \
2110                                              inner_limit, hev_thresh);        \
2111     }
2112
2113     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2114
2115     if (mb_y) {
2116         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2117                                        mbedge_lim, inner_limit, hev_thresh);
2118         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2119                                        mbedge_lim, inner_limit, hev_thresh);
2120     }
2121
2122     if (inner_filter) {
2123         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2124                                              linesize, bedge_lim_y,
2125                                              inner_limit, hev_thresh);
2126         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2127                                              linesize, bedge_lim_y,
2128                                              inner_limit, hev_thresh);
2129         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2130                                              linesize, bedge_lim_y,
2131                                              inner_limit, hev_thresh);
2132         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2133                                              dst[2] +  4 * uvlinesize,
2134                                              uvlinesize, bedge_lim_uv,
2135                                              inner_limit, hev_thresh);
2136     }
2137
2138     H_LOOP_FILTER_16Y_INNER(is_vp7)
2139 }
2140
2141 static av_always_inline
2142 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2143                       int mb_x, int mb_y)
2144 {
2145     int mbedge_lim, bedge_lim;
2146     int filter_level = f->filter_level;
2147     int inner_limit  = f->inner_limit;
2148     int inner_filter = f->inner_filter;
2149     int linesize     = s->linesize;
2150
2151     if (!filter_level)
2152         return;
2153
2154     bedge_lim  = 2 * filter_level + inner_limit;
2155     mbedge_lim = bedge_lim + 4;
2156
2157     if (mb_x)
2158         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2159     if (inner_filter) {
2160         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2161         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2162         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2163     }
2164
2165     if (mb_y)
2166         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2167     if (inner_filter) {
2168         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2169         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2170         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2171     }
2172 }
2173
2174 #define MARGIN (16 << 2)
2175 static av_always_inline
2176 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2177                                     VP8Frame *prev_frame, int is_vp7)
2178 {
2179     VP8Context *s = avctx->priv_data;
2180     int mb_x, mb_y;
2181
2182     s->mv_min.y = -MARGIN;
2183     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2184     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2185         VP8Macroblock *mb = s->macroblocks_base +
2186                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2187         int mb_xy = mb_y * s->mb_width;
2188
2189         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2190
2191         s->mv_min.x = -MARGIN;
2192         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2193         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2194             if (mb_y == 0)
2195                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2196                          DC_PRED * 0x01010101);
2197             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2198                            prev_frame && prev_frame->seg_map ?
2199                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2200             s->mv_min.x -= 64;
2201             s->mv_max.x -= 64;
2202         }
2203         s->mv_min.y -= 64;
2204         s->mv_max.y -= 64;
2205     }
2206 }
2207
2208 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2209                                    VP8Frame *prev_frame)
2210 {
2211     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2212 }
2213
2214 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2215                                    VP8Frame *prev_frame)
2216 {
2217     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2218 }
2219
#if HAVE_THREADS
/*
 * Positions are packed as (mb_y * 65536) | (mb_x & 0xFFFF) so that a plain
 * integer comparison orders them by row first, then by column.
 *
 * All macro arguments are parenthesized so that expressions such as
 * "mb_x + 1" or "a ? b : c" can be passed safely. The row is scaled by
 * multiplication rather than "<< 16" because callers may pass a negative
 * row (e.g. "mb_y - 2") and left-shifting a negative value is undefined.
 *
 * The trailing ';' after "while (0)" is kept as in the original definition
 * because not all call sites are visible here; as a consequence these macros
 * must not be used as the unbraced body of an if that has an else.
 */

/*
 * Block until thread data otd has reached at least (mb_x_check, mb_y_check).
 * While waiting, td->wait_mb_pos holds the awaited position; it is reset to
 * INT_MAX afterwards.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) * (1 << 16)) | ((mb_x_check) & 0xFFFF);       \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0);

/*
 * Publish (mb_x, mb_y) as td's current position. With slice threading and
 * more than one job, waiters on td's condition variable are woken if a
 * neighbouring thread is waiting for a position at or before the new one
 * (or unconditionally if prev_td or next_td is NULL).
 *
 * Uses avctx, num_jobs, prev_td and next_td from the expanding scope.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) * (1 << 16)) | ((mb_x) & 0xFFFF);      \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0);
#else
/* without thread support both macros expand to nothing */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
2259
2260 static av_always_inline void decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2261                                         int jobnr, int threadnr, int is_vp7)
2262 {
2263     VP8Context *s = avctx->priv_data;
2264     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2265     int mb_y = td->thread_mb_pos >> 16;
2266     int mb_x, mb_xy = mb_y * s->mb_width;
2267     int num_jobs = s->num_jobs;
2268     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2269     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2270     VP8Macroblock *mb;
2271     uint8_t *dst[3] = {
2272         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2273         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2274         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2275     };
2276     if (mb_y == 0)
2277         prev_td = td;
2278     else
2279         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2280     if (mb_y == s->mb_height - 1)
2281         next_td = td;
2282     else
2283         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2284     if (s->mb_layout == 1)
2285         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2286     else {
2287         // Make sure the previous frame has read its segmentation map,
2288         // if we re-use the same map.
2289         if (prev_frame && s->segmentation.enabled &&
2290             !s->segmentation.update_map)
2291             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2292         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2293         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2294         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2295     }
2296
2297     if (!is_vp7 || mb_y == 0)
2298         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2299
2300     s->mv_min.x = -MARGIN;
2301     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2302
2303     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2304         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2305         if (prev_td != td) {
2306             if (threadnr != 0) {
2307                 check_thread_pos(td, prev_td,
2308                                  mb_x + (is_vp7 ? 2 : 1),
2309                                  mb_y - (is_vp7 ? 2 : 1));
2310             } else {
2311                 check_thread_pos(td, prev_td,
2312                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2313                                  mb_y - (is_vp7 ? 2 : 1));
2314             }
2315         }
2316
2317         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2318                          s->linesize, 4);
2319         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2320                          dst[2] - dst[1], 2);
2321
2322         if (!s->mb_layout)
2323             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2324                            prev_frame && prev_frame->seg_map ?
2325                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2326
2327         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2328
2329         if (!mb->skip)
2330             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2331
2332         if (mb->mode <= MODE_I4x4)
2333             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2334         else
2335             inter_predict(s, td, dst, mb, mb_x, mb_y);
2336
2337         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2338
2339         if (!mb->skip) {
2340             idct_mb(s, td, dst, mb);
2341         } else {
2342             AV_ZERO64(td->left_nnz);
2343             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2344
2345             /* Reset DC block predictors if they would exist
2346              * if the mb had coefficients */
2347             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2348                 td->left_nnz[8]     = 0;
2349                 s->top_nnz[mb_x][8] = 0;
2350             }
2351         }
2352
2353         if (s->deblock_filter)
2354             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2355
2356         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2357             if (s->filter.simple)
2358                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2359                                  NULL, NULL, s->linesize, 0, 1);
2360             else
2361                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2362                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2363         }
2364
2365         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2366
2367         dst[0]      += 16;
2368         dst[1]      += 8;
2369         dst[2]      += 8;
2370         s->mv_min.x -= 64;
2371         s->mv_max.x -= 64;
2372
2373         if (mb_x == s->mb_width + 1) {
2374             update_pos(td, mb_y, s->mb_width + 3);
2375         } else {
2376             update_pos(td, mb_y, mb_x);
2377         }
2378     }
2379 }
2380
2381 static void vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2382                                         int jobnr, int threadnr)
2383 {
2384     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2385 }
2386
2387 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2388                                         int jobnr, int threadnr)
2389 {
2390     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2391 }
2392
2393 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2394                               int jobnr, int threadnr, int is_vp7)
2395 {
2396     VP8Context *s = avctx->priv_data;
2397     VP8ThreadData *td = &s->thread_data[threadnr];
2398     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2399     AVFrame *curframe = s->curframe->tf.f;
2400     VP8Macroblock *mb;
2401     VP8ThreadData *prev_td, *next_td;
2402     uint8_t *dst[3] = {
2403         curframe->data[0] + 16 * mb_y * s->linesize,
2404         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2405         curframe->data[2] +  8 * mb_y * s->uvlinesize
2406     };
2407
2408     if (s->mb_layout == 1)
2409         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2410     else
2411         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2412
2413     if (mb_y == 0)
2414         prev_td = td;
2415     else
2416         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2417     if (mb_y == s->mb_height - 1)
2418         next_td = td;
2419     else
2420         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2421
2422     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2423         VP8FilterStrength *f = &td->filter_strength[mb_x];
2424         if (prev_td != td)
2425             check_thread_pos(td, prev_td,
2426                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2427         if (next_td != td)
2428             if (next_td != &s->thread_data[0])
2429                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2430
2431         if (num_jobs == 1) {
2432             if (s->filter.simple)
2433                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2434                                  NULL, NULL, s->linesize, 0, 1);
2435             else
2436                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2437                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2438         }
2439
2440         if (s->filter.simple)
2441             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2442         else
2443             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2444         dst[0] += 16;
2445         dst[1] += 8;
2446         dst[2] += 8;
2447
2448         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2449     }
2450 }
2451
2452 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2453                               int jobnr, int threadnr)
2454 {
2455     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2456 }
2457
2458 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2459                               int jobnr, int threadnr)
2460 {
2461     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2462 }
2463
2464 static av_always_inline
2465 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2466                               int threadnr, int is_vp7)
2467 {
2468     VP8Context *s = avctx->priv_data;
2469     VP8ThreadData *td = &s->thread_data[jobnr];
2470     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2471     VP8Frame *curframe = s->curframe;
2472     int mb_y, num_jobs = s->num_jobs;
2473
2474     td->thread_nr = threadnr;
2475     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2476         if (mb_y >= s->mb_height)
2477             break;
2478         td->thread_mb_pos = mb_y << 16;
2479         s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2480         if (s->deblock_filter)
2481             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2482         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2483
2484         s->mv_min.y -= 64;
2485         s->mv_max.y -= 64;
2486
2487         if (avctx->active_thread_type == FF_THREAD_FRAME)
2488             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2489     }
2490
2491     return 0;
2492 }
2493
2494 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2495                                     int jobnr, int threadnr)
2496 {
2497     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2498 }
2499
2500 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2501                                     int jobnr, int threadnr)
2502 {
2503     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2504 }
2505
2506
2507 static av_always_inline
2508 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2509                       AVPacket *avpkt, int is_vp7)
2510 {
2511     VP8Context *s = avctx->priv_data;
2512     int ret, i, referenced, num_jobs;
2513     enum AVDiscard skip_thresh;
2514     VP8Frame *av_uninit(curframe), *prev_frame;
2515
2516     if (is_vp7)
2517         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2518     else
2519         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2520
2521     if (ret < 0)
2522         goto err;
2523
2524     prev_frame = s->framep[VP56_FRAME_CURRENT];
2525
2526     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2527                  s->update_altref == VP56_FRAME_CURRENT;
2528
2529     skip_thresh = !referenced ? AVDISCARD_NONREF
2530                               : !s->keyframe ? AVDISCARD_NONKEY
2531                                              : AVDISCARD_ALL;
2532
2533     if (avctx->skip_frame >= skip_thresh) {
2534         s->invisible = 1;
2535         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2536         goto skip_decode;
2537     }
2538     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2539
2540     // release no longer referenced frames
2541     for (i = 0; i < 5; i++)
2542         if (s->frames[i].tf.f->data[0] &&
2543             &s->frames[i] != prev_frame &&
2544             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2545             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2546             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2547             vp8_release_frame(s, &s->frames[i]);
2548
2549     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2550
2551     if (!s->colorspace)
2552         avctx->colorspace = AVCOL_SPC_BT470BG;
2553     if (s->fullrange)
2554         avctx->color_range = AVCOL_RANGE_JPEG;
2555     else
2556         avctx->color_range = AVCOL_RANGE_MPEG;
2557
2558     /* Given that arithmetic probabilities are updated every frame, it's quite
2559      * likely that the values we have on a random interframe are complete
2560      * junk if we didn't start decode on a keyframe. So just don't display
2561      * anything rather than junk. */
2562     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2563                          !s->framep[VP56_FRAME_GOLDEN]   ||
2564                          !s->framep[VP56_FRAME_GOLDEN2])) {
2565         av_log(avctx, AV_LOG_WARNING,
2566                "Discarding interframe without a prior keyframe!\n");
2567         ret = AVERROR_INVALIDDATA;
2568         goto err;
2569     }
2570
2571     curframe->tf.f->key_frame = s->keyframe;
2572     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2573                                             : AV_PICTURE_TYPE_P;
2574     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2575         goto err;
2576
2577     // check if golden and altref are swapped
2578     if (s->update_altref != VP56_FRAME_NONE)
2579         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2580     else
2581         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2582
2583     if (s->update_golden != VP56_FRAME_NONE)
2584         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2585     else
2586         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2587
2588     if (s->update_last)
2589         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2590     else
2591         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2592
2593     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2594
2595     if (avctx->codec->update_thread_context)
2596         ff_thread_finish_setup(avctx);
2597
2598     s->linesize   = curframe->tf.f->linesize[0];
2599     s->uvlinesize = curframe->tf.f->linesize[1];
2600
2601     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2602     /* Zero macroblock structures for top/top-left prediction
2603      * from outside the frame. */
2604     if (!s->mb_layout)
2605         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2606                (s->mb_width + 1) * sizeof(*s->macroblocks));
2607     if (!s->mb_layout && s->keyframe)
2608         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2609
2610     memset(s->ref_count, 0, sizeof(s->ref_count));
2611
2612     if (s->mb_layout == 1) {
2613         // Make sure the previous frame has read its segmentation map,
2614         // if we re-use the same map.
2615         if (prev_frame && s->segmentation.enabled &&
2616             !s->segmentation.update_map)
2617             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2618         if (is_vp7)
2619             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2620         else
2621             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2622     }
2623
2624     if (avctx->active_thread_type == FF_THREAD_FRAME)
2625         num_jobs = 1;
2626     else
2627         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2628     s->num_jobs   = num_jobs;
2629     s->curframe   = curframe;
2630     s->prev_frame = prev_frame;
2631     s->mv_min.y   = -MARGIN;
2632     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2633     for (i = 0; i < MAX_THREADS; i++) {
2634         s->thread_data[i].thread_mb_pos = 0;
2635         s->thread_data[i].wait_mb_pos   = INT_MAX;
2636     }
2637     if (is_vp7)
2638         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2639                         num_jobs);
2640     else
2641         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2642                         num_jobs);
2643
2644     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2645     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2646
2647 skip_decode:
2648     // if future frames don't use the updated probabilities,
2649     // reset them to the values we saved
2650     if (!s->update_probabilities)
2651         s->prob[0] = s->prob[1];
2652
2653     if (!s->invisible) {
2654         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2655             return ret;
2656         *got_frame = 1;
2657     }
2658
2659     return avpkt->size;
2660 err:
2661     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2662     return ret;
2663 }
2664
2665 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2666                         AVPacket *avpkt)
2667 {
2668     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2669 }
2670
2671 #if CONFIG_VP7_DECODER
2672 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2673                             AVPacket *avpkt)
2674 {
2675     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2676 }
2677 #endif /* CONFIG_VP7_DECODER */
2678
2679 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2680 {
2681     VP8Context *s = avctx->priv_data;
2682     int i;
2683
2684     vp8_decode_flush_impl(avctx, 1);
2685     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2686         av_frame_free(&s->frames[i].tf.f);
2687
2688     return 0;
2689 }
2690
2691 static av_cold int vp8_init_frames(VP8Context *s)
2692 {
2693     int i;
2694     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2695         s->frames[i].tf.f = av_frame_alloc();
2696         if (!s->frames[i].tf.f)
2697             return AVERROR(ENOMEM);
2698     }
2699     return 0;
2700 }
2701
2702 static av_always_inline
2703 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2704 {
2705     VP8Context *s = avctx->priv_data;
2706     int ret;
2707
2708     s->avctx = avctx;
2709     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2710     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2711     avctx->internal->allocate_progress = 1;
2712
2713     ff_videodsp_init(&s->vdsp, 8);
2714
2715     ff_vp78dsp_init(&s->vp8dsp);
2716     if (CONFIG_VP7_DECODER && is_vp7) {
2717         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2718         ff_vp7dsp_init(&s->vp8dsp);
2719         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2720         s->filter_mb_row           = vp7_filter_mb_row;
2721     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2722         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2723         ff_vp8dsp_init(&s->vp8dsp);
2724         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2725         s->filter_mb_row           = vp8_filter_mb_row;
2726     }
2727
2728     /* does not change for VP8 */
2729     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2730
2731     if ((ret = vp8_init_frames(s)) < 0) {
2732         ff_vp8_decode_free(avctx);
2733         return ret;
2734     }
2735
2736     return 0;
2737 }
2738
2739 #if CONFIG_VP7_DECODER
2740 static int vp7_decode_init(AVCodecContext *avctx)
2741 {
2742     return vp78_decode_init(avctx, IS_VP7);
2743 }
2744 #endif /* CONFIG_VP7_DECODER */
2745
2746 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2747 {
2748     return vp78_decode_init(avctx, IS_VP8);
2749 }
2750
2751 #if CONFIG_VP8_DECODER
2752 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2753 {
2754     VP8Context *s = avctx->priv_data;
2755     int ret;
2756
2757     s->avctx = avctx;
2758
2759     if ((ret = vp8_init_frames(s)) < 0) {
2760         ff_vp8_decode_free(avctx);
2761         return ret;
2762     }
2763
2764     return 0;
2765 }
2766
/*
 * Map a frame pointer into s_src->frames[] onto the entry with the same
 * index in s->frames[]; NULL stays NULL. Uses `s` and `s_src` from the
 * enclosing scope.
 */
#define REBASE(pic) ((pic) ? &s->frames[(pic) - &s_src->frames[0]] : NULL)
2768
2769 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2770                                             const AVCodecContext *src)
2771 {
2772     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2773     int i;
2774
2775     if (s->macroblocks_base &&
2776         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2777         free_buffers(s);
2778         s->mb_width  = s_src->mb_width;
2779         s->mb_height = s_src->mb_height;
2780     }
2781
2782     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2783     s->segmentation = s_src->segmentation;
2784     s->lf_delta     = s_src->lf_delta;
2785     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2786
2787     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2788         if (s_src->frames[i].tf.f->data[0]) {
2789             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2790             if (ret < 0)
2791                 return ret;
2792         }
2793     }
2794
2795     s->framep[0] = REBASE(s_src->next_framep[0]);
2796     s->framep[1] = REBASE(s_src->next_framep[1]);
2797     s->framep[2] = REBASE(s_src->next_framep[2]);
2798     s->framep[3] = REBASE(s_src->next_framep[3]);
2799
2800     return 0;
2801 }
2802 #endif /* CONFIG_VP8_DECODER */
2803
2804 #if CONFIG_VP7_DECODER
2805 AVCodec ff_vp7_decoder = {
2806     .name                  = "vp7",
2807     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2808     .type                  = AVMEDIA_TYPE_VIDEO,
2809     .id                    = AV_CODEC_ID_VP7,
2810     .priv_data_size        = sizeof(VP8Context),
2811     .init                  = vp7_decode_init,
2812     .close                 = ff_vp8_decode_free,
2813     .decode                = vp7_decode_frame,
2814     .capabilities          = CODEC_CAP_DR1,
2815     .flush                 = vp8_decode_flush,
2816 };
2817 #endif /* CONFIG_VP7_DECODER */
2818
2819 #if CONFIG_VP8_DECODER
2820 AVCodec ff_vp8_decoder = {
2821     .name                  = "vp8",
2822     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2823     .type                  = AVMEDIA_TYPE_VIDEO,
2824     .id                    = AV_CODEC_ID_VP8,
2825     .priv_data_size        = sizeof(VP8Context),
2826     .init                  = ff_vp8_decode_init,
2827     .close                 = ff_vp8_decode_free,
2828     .decode                = ff_vp8_decode_frame,
2829     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2830     .flush                 = vp8_decode_flush,
2831     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2832     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2833 };
#endif /* CONFIG_VP8_DECODER */