git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of Libav.
  11  *
  12  * Libav is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * Libav is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with Libav; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
  40 static void free_buffers(VP8Context *s)
  41 {
  42     int i;
  43     if (s->thread_data)
  44         for (i = 0; i < MAX_THREADS; i++) {
  45 #if HAVE_THREADS
  46             pthread_cond_destroy(&s->thread_data[i].cond);
  47             pthread_mutex_destroy(&s->thread_data[i].lock);
  48 #endif
  49             av_freep(&s->thread_data[i].filter_strength);
  50         }
  51     av_freep(&s->thread_data);
  52     av_freep(&s->macroblocks_base);
  53     av_freep(&s->intra4x4_pred_mode_top);
  54     av_freep(&s->top_nnz);
  55     av_freep(&s->top_border);
  56
  57     s->macroblocks = NULL;
  58 }
  59
  60 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  61 {
  62     int ret;
  63     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  64                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  65         return ret;
  66     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  67         ff_thread_release_buffer(s->avctx, &f->tf);
  68         return AVERROR(ENOMEM);
  69     }
  70     return 0;
  71 }
  72
  73 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  74 {
  75     av_buffer_unref(&f->seg_map);
  76     ff_thread_release_buffer(s->avctx, &f->tf);
  77 }
  78
  79 #if CONFIG_VP8_DECODER
  80 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  81 {
  82     int ret;
  83
  84     vp8_release_frame(s, dst);
  85
  86     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  87         return ret;
  88     if (src->seg_map &&
  89         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  90         vp8_release_frame(s, dst);
  91         return AVERROR(ENOMEM);
  92     }
  93
  94     return 0;
  95 }
  96 #endif /* CONFIG_VP8_DECODER */
  97
  98 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  99 {
 100     VP8Context *s = avctx->priv_data;
 101     int i;
 102
 103     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 104         vp8_release_frame(s, &s->frames[i]);
 105     memset(s->framep, 0, sizeof(s->framep));
 106
 107     if (free_mem)
 108         free_buffers(s);
 109 }
 110
 111 static void vp8_decode_flush(AVCodecContext *avctx)
 112 {
 113     vp8_decode_flush_impl(avctx, 0);
 114 }
 115
 116 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 117 {
 118     VP8Frame *frame = NULL;
 119     int i;
 120
 121     // find a free buffer
 122     for (i = 0; i < 5; i++)
 123         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 124             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 125             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 126             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 127             frame = &s->frames[i];
 128             break;
 129         }
 130     if (i == 5) {
 131         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 132         abort();
 133     }
 134     if (frame->tf.f->data[0])
 135         vp8_release_frame(s, frame);
 136
 137     return frame;
 138 }
 139
 140 static av_always_inline
 141 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 142 {
 143     AVCodecContext *avctx = s->avctx;
 144     int i, ret;
 145
 146     if (width  != s->avctx->width ||
 147         height != s->avctx->height) {
 148         vp8_decode_flush_impl(s->avctx, 1);
 149
 150         ret = ff_set_dimensions(s->avctx, width, height);
 151         if (ret < 0)
 152             return ret;
 153     }
 154
 155     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 156     s->mb_height = (s->avctx->coded_height + 15) / 16;
 157
 158     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 159                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 160     if (!s->mb_layout) { // Frame threading and one thread
 161         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 162                                                sizeof(*s->macroblocks));
 163         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 164     } else // Sliced threading
 165         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 166                                          sizeof(*s->macroblocks));
 167     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 168     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 169     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 170
 171     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 172         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 173         free_buffers(s);
 174         return AVERROR(ENOMEM);
 175     }
 176
 177     for (i = 0; i < MAX_THREADS; i++) {
 178         s->thread_data[i].filter_strength =
 179             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 180         if (!s->thread_data[i].filter_strength) {
 181             free_buffers(s);
 182             return AVERROR(ENOMEM);
 183         }
 184 #if HAVE_THREADS
 185         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 186         pthread_cond_init(&s->thread_data[i].cond, NULL);
 187 #endif
 188     }
 189
 190     s->macroblocks = s->macroblocks_base + 1;
 191
 192     return 0;
 193 }
 194
 195 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 196 {
 197     return update_dimensions(s, width, height, IS_VP7);
 198 }
 199
 200 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 201 {
 202     return update_dimensions(s, width, height, IS_VP8);
 203 }
 204
 205 static void parse_segment_info(VP8Context *s)
 206 {
 207     VP56RangeCoder *c = &s->c;
 208     int i;
 209
 210     s->segmentation.update_map = vp8_rac_get(c);
 211
 212     if (vp8_rac_get(c)) { // update segment feature data
 213         s->segmentation.absolute_vals = vp8_rac_get(c);
 214
 215         for (i = 0; i < 4; i++)
 216             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 217
 218         for (i = 0; i < 4; i++)
 219             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 220     }
 221     if (s->segmentation.update_map)
 222         for (i = 0; i < 3; i++)
 223             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 224 }
 225
 226 static void update_lf_deltas(VP8Context *s)
 227 {
 228     VP56RangeCoder *c = &s->c;
 229     int i;
 230
 231     for (i = 0; i < 4; i++) {
 232         if (vp8_rac_get(c)) {
 233             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 234
 235             if (vp8_rac_get(c))
 236                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 237         }
 238     }
 239
 240     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 241         if (vp8_rac_get(c)) {
 242             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 243
 244             if (vp8_rac_get(c))
 245                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 246         }
 247     }
 248 }
 249
 250 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 251 {
 252     const uint8_t *sizes = buf;
 253     int i;
 254
 255     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 256
 257     buf      += 3 * (s->num_coeff_partitions - 1);
 258     buf_size -= 3 * (s->num_coeff_partitions - 1);
 259     if (buf_size < 0)
 260         return -1;
 261
 262     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 263         int size = AV_RL24(sizes + 3 * i);
 264         if (buf_size - size < 0)
 265             return -1;
 266
 267         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 268         buf      += size;
 269         buf_size -= size;
 270     }
 271     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 272
 273     return 0;
 274 }
 275
 276 static void vp7_get_quants(VP8Context *s)
 277 {
 278     VP56RangeCoder *c = &s->c;
 279
 280     int yac_qi  = vp8_rac_get_uint(c, 7);
 281     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 282     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 283     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 284     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 285     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 286
 287     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 288     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 289     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 290     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 291     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 292     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 293 }
 294
 295 static void get_quants(VP8Context *s)
 296 {
 297     VP56RangeCoder *c = &s->c;
 298     int i, base_qi;
 299
 300     int yac_qi     = vp8_rac_get_uint(c, 7);
 301     int ydc_delta  = vp8_rac_get_sint(c, 4);
 302     int y2dc_delta = vp8_rac_get_sint(c, 4);
 303     int y2ac_delta = vp8_rac_get_sint(c, 4);
 304     int uvdc_delta = vp8_rac_get_sint(c, 4);
 305     int uvac_delta = vp8_rac_get_sint(c, 4);
 306
 307     for (i = 0; i < 4; i++) {
 308         if (s->segmentation.enabled) {
 309             base_qi = s->segmentation.base_quant[i];
 310             if (!s->segmentation.absolute_vals)
 311                 base_qi += yac_qi;
 312         } else
 313             base_qi = yac_qi;
 314
 315         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 316         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 317         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 318         /* 101581>>16 is equivalent to 155/100 */
 319         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 320         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 321         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 322
 323         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 324         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 325     }
 326 }
 327
 328 /**
 329  * Determine which buffers golden and altref should be updated with after this frame.
 330  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 331  *
 332  * Intra frames update all 3 references
 333  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 334  * If the update (golden|altref) flag is set, it's updated with the current frame
 335  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 336  * If the flag is not set, the number read means:
 337  *      0: no update
 338  *      1: VP56_FRAME_PREVIOUS
 339  *      2: update golden with altref, or update altref with golden
 340  */
 341 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 342 {
 343     VP56RangeCoder *c = &s->c;
 344
 345     if (update)
 346         return VP56_FRAME_CURRENT;
 347
 348     switch (vp8_rac_get_uint(c, 2)) {
 349     case 1:
 350         return VP56_FRAME_PREVIOUS;
 351     case 2:
 352         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 353     }
 354     return VP56_FRAME_NONE;
 355 }
 356
 357 static void vp78_reset_probability_tables(VP8Context *s)
 358 {
 359     int i, j;
 360     for (i = 0; i < 4; i++)
 361         for (j = 0; j < 16; j++)
 362             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 363                    sizeof(s->prob->token[i][j]));
 364 }
 365
 366 static void vp78_update_probability_tables(VP8Context *s)
 367 {
 368     VP56RangeCoder *c = &s->c;
 369     int i, j, k, l, m;
 370
 371     for (i = 0; i < 4; i++)
 372         for (j = 0; j < 8; j++)
 373             for (k = 0; k < 3; k++)
 374                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 375                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 376                         int prob = vp8_rac_get_uint(c, 8);
 377                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 378                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 379                     }
 380 }
 381
 382 #define VP7_MVC_SIZE 17
 383 #define VP8_MVC_SIZE 19
 384
 385 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 386                                                             int mvc_size)
 387 {
 388     VP56RangeCoder *c = &s->c;
 389     int i, j;
 390
 391     if (vp8_rac_get(c))
 392         for (i = 0; i < 4; i++)
 393             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 394     if (vp8_rac_get(c))
 395         for (i = 0; i < 3; i++)
 396             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 397
 398     // 17.2 MV probability update
 399     for (i = 0; i < 2; i++)
 400         for (j = 0; j < mvc_size; j++)
 401             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 402                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 403 }
 404
 405 static void update_refs(VP8Context *s)
 406 {
 407     VP56RangeCoder *c = &s->c;
 408
 409     int update_golden = vp8_rac_get(c);
 410     int update_altref = vp8_rac_get(c);
 411
 412     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 413     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 414 }
 415
 416 static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
 417 {
 418     int i, j;
 419
 420     for (j = 1; j < 3; j++) {
 421         for (i = 0; i < height / 2; i++)
 422             memcpy(dst->data[j] + i * dst->linesize[j],
 423                    src->data[j] + i * src->linesize[j], width / 2);
 424     }
 425 }
 426
 427 static void fade(uint8_t *dst, uint8_t *src,
 428                  int width, int height, int linesize,
 429                  int alpha, int beta)
 430 {
 431     int i, j;
 432
 433     for (j = 0; j < height; j++) {
 434         for (i = 0; i < width; i++) {
 435             uint8_t y = src[j * linesize + i];
 436             dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 437         }
 438     }
 439 }
 440
 441 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 442 {
 443     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 444     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 445     int ret;
 446
 447     if (!s->keyframe && (alpha || beta)) {
 448         int width  = s->mb_width * 16;
 449         int height = s->mb_height * 16;
 450         AVFrame *src, *dst;
 451
 452         if (!s->framep[VP56_FRAME_PREVIOUS])
 453             return AVERROR_INVALIDDATA;
 454
 455         dst =
 456         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 457
 458         /* preserve the golden frame, write a new previous frame */
 459         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 460             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 461             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 462                return ret;
 463
 464             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 465
 466             copy_luma(dst, src, width, height);
 467         }
 468
 469         fade(dst->data[0], src->data[0],
 470              width, height, dst->linesize[0], alpha, beta);
 471     }
 472
 473     return 0;
 474 }
 475
 476 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 477 {
 478     VP56RangeCoder *c = &s->c;
 479     int part1_size, hscale, vscale, i, j, ret;
 480     int width  = s->avctx->width;
 481     int height = s->avctx->height;
 482
 483     s->profile = (buf[0] >> 1) & 7;
 484     if (s->profile > 1) {
 485         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 486         return AVERROR_INVALIDDATA;
 487     }
 488
 489     s->keyframe  = !(buf[0] & 1);
 490     s->invisible = 0;
 491     part1_size   = AV_RL24(buf) >> 4;
 492
 493     buf      += 4 - s->profile;
 494     buf_size -= 4 - s->profile;
 495
 496     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 497
 498     ff_vp56_init_range_decoder(c, buf, part1_size);
 499     buf      += part1_size;
 500     buf_size -= part1_size;
 501
 502     /* A. Dimension information (keyframes only) */
 503     if (s->keyframe) {
 504         width  = vp8_rac_get_uint(c, 12);
 505         height = vp8_rac_get_uint(c, 12);
 506         hscale = vp8_rac_get_uint(c, 2);
 507         vscale = vp8_rac_get_uint(c, 2);
 508         if (hscale || vscale)
 509             avpriv_request_sample(s->avctx, "Upscaling");
 510
 511         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 512         vp78_reset_probability_tables(s);
 513         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 514                sizeof(s->prob->pred16x16));
 515         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 516                sizeof(s->prob->pred8x8c));
 517         for (i = 0; i < 2; i++)
 518             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 519                    sizeof(vp7_mv_default_prob[i]));
 520         memset(&s->segmentation, 0, sizeof(s->segmentation));
 521         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 522         memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
 523     }
 524
 525     if (s->keyframe || s->profile > 0)
 526         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 527
 528     /* B. Decoding information for all four macroblock-level features */
 529     for (i = 0; i < 4; i++) {
 530         s->feature_enabled[i] = vp8_rac_get(c);
 531         if (s->feature_enabled[i]) {
 532              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 533
 534              for (j = 0; j < 3; j++)
 535                  s->feature_index_prob[i][j] =
 536                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 537
 538              if (vp7_feature_value_size[s->profile][i])
 539                  for (j = 0; j < 4; j++)
 540                      s->feature_value[i][j] =
 541                          vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 542         }
 543     }
 544
 545     s->segmentation.enabled    = 0;
 546     s->segmentation.update_map = 0;
 547     s->lf_delta.enabled        = 0;
 548
 549     s->num_coeff_partitions = 1;
 550     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 551
 552     if (!s->macroblocks_base || /* first frame */
 553         width != s->avctx->width || height != s->avctx->height ||
 554         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 555         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 556             return ret;
 557     }
 558
 559     /* C. Dequantization indices */
 560     vp7_get_quants(s);
 561
 562     /* D. Golden frame update flag (a Flag) for interframes only */
 563     if (!s->keyframe) {
 564         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 565         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 566     }
 567
 568     s->update_last          = 1;
 569     s->update_probabilities = 1;
 570     s->fade_present         = 1;
 571
 572     if (s->profile > 0) {
 573         s->update_probabilities = vp8_rac_get(c);
 574         if (!s->update_probabilities)
 575             s->prob[1] = s->prob[0];
 576
 577         if (!s->keyframe)
 578             s->fade_present = vp8_rac_get(c);
 579     }
 580
 581     /* E. Fading information for previous frame */
 582     if (s->fade_present && vp8_rac_get(c)) {
 583         if ((ret = vp7_fade_frame(s ,c)) < 0)
 584             return ret;
 585     }
 586
 587     /* F. Loop filter type */
 588     if (!s->profile)
 589         s->filter.simple = vp8_rac_get(c);
 590
 591     /* G. DCT coefficient ordering specification */
 592     if (vp8_rac_get(c))
 593         for (i = 1; i < 16; i++)
 594             s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];
 595
 596     /* H. Loop filter levels  */
 597     if (s->profile > 0)
 598         s->filter.simple = vp8_rac_get(c);
 599     s->filter.level     = vp8_rac_get_uint(c, 6);
 600     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 601
 602     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 603     vp78_update_probability_tables(s);
 604
 605     s->mbskip_enabled = 0;
 606
 607     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 608     if (!s->keyframe) {
 609         s->prob->intra  = vp8_rac_get_uint(c, 8);
 610         s->prob->last   = vp8_rac_get_uint(c, 8);
 611         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 612     }
 613
 614     return 0;
 615 }
 616
 617 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 618 {
 619     VP56RangeCoder *c = &s->c;
 620     int header_size, hscale, vscale, ret;
 621     int width  = s->avctx->width;
 622     int height = s->avctx->height;
 623
 624     s->keyframe  = !(buf[0] & 1);
 625     s->profile   =  (buf[0]>>1) & 7;
 626     s->invisible = !(buf[0] & 0x10);
 627     header_size  = AV_RL24(buf) >> 5;
 628     buf      += 3;
 629     buf_size -= 3;
 630
 631     if (s->profile > 3)
 632         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 633
 634     if (!s->profile)
 635         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 636                sizeof(s->put_pixels_tab));
 637     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 638         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 639                sizeof(s->put_pixels_tab));
 640
 641     if (header_size > buf_size - 7 * s->keyframe) {
 642         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 643         return AVERROR_INVALIDDATA;
 644     }
 645
 646     if (s->keyframe) {
 647         if (AV_RL24(buf) != 0x2a019d) {
 648             av_log(s->avctx, AV_LOG_ERROR,
 649                    "Invalid start code 0x%x\n", AV_RL24(buf));
 650             return AVERROR_INVALIDDATA;
 651         }
 652         width     = AV_RL16(buf + 3) & 0x3fff;
 653         height    = AV_RL16(buf + 5) & 0x3fff;
 654         hscale    = buf[4] >> 6;
 655         vscale    = buf[6] >> 6;
 656         buf      += 7;
 657         buf_size -= 7;
 658
 659         if (hscale || vscale)
 660             avpriv_request_sample(s->avctx, "Upscaling");
 661
 662         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 663         vp78_reset_probability_tables(s);
 664         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 665                sizeof(s->prob->pred16x16));
 666         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 667                sizeof(s->prob->pred8x8c));
 668         memcpy(s->prob->mvc, vp8_mv_default_prob,
 669                sizeof(s->prob->mvc));
 670         memset(&s->segmentation, 0, sizeof(s->segmentation));
 671         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 672     }
 673
 674     ff_vp56_init_range_decoder(c, buf, header_size);
 675     buf      += header_size;
 676     buf_size -= header_size;
 677
 678     if (s->keyframe) {
 679         s->colorspace = vp8_rac_get(c);
 680         if (s->colorspace)
 681             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 682         s->fullrange = vp8_rac_get(c);
 683     }
 684
 685     if ((s->segmentation.enabled = vp8_rac_get(c)))
 686         parse_segment_info(s);
 687     else
 688         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 689
 690     s->filter.simple    = vp8_rac_get(c);
 691     s->filter.level     = vp8_rac_get_uint(c, 6);
 692     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 693
 694     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 695         if (vp8_rac_get(c))
 696             update_lf_deltas(s);
 697
 698     if (setup_partitions(s, buf, buf_size)) {
 699         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 700         return AVERROR_INVALIDDATA;
 701     }
 702
 703     if (!s->macroblocks_base || /* first frame */
 704         width != s->avctx->width || height != s->avctx->height)
 705         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 706             return ret;
 707
 708     get_quants(s);
 709
 710     if (!s->keyframe) {
 711         update_refs(s);
 712         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 713         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 714     }
 715
 716     // if we aren't saving this frame's probabilities for future frames,
 717     // make a copy of the current probabilities
 718     if (!(s->update_probabilities = vp8_rac_get(c)))
 719         s->prob[1] = s->prob[0];
 720
 721     s->update_last = s->keyframe || vp8_rac_get(c);
 722
 723     vp78_update_probability_tables(s);
 724
 725     if ((s->mbskip_enabled = vp8_rac_get(c)))
 726         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 727
 728     if (!s->keyframe) {
 729         s->prob->intra  = vp8_rac_get_uint(c, 8);
 730         s->prob->last   = vp8_rac_get_uint(c, 8);
 731         s->prob->golden = vp8_rac_get_uint(c, 8);
 732         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 733     }
 734
 735     return 0;
 736 }
 737
 738 static av_always_inline
 739 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 740 {
 741     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 742     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 743 }
 744
 745 /**
 746  * Motion vector coding, 17.1.
 747  */
 748 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 749 {
 750     int bit, x = 0;
 751
 752     if (vp56_rac_get_prob_branchy(c, p[0])) {
 753         int i;
 754
 755         for (i = 0; i < 3; i++)
 756             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 757         for (i = (vp7 ? 7 : 9); i > 3; i--)
 758             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 759         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 760             x += 8;
 761     } else {
 762         // small_mvtree
 763         const uint8_t *ps = p + 2;
 764         bit = vp56_rac_get_prob(c, *ps);
 765         ps += 1 + 3 * bit;
 766         x  += 4 * bit;
 767         bit = vp56_rac_get_prob(c, *ps);
 768         ps += 1 + bit;
 769         x  += 2 * bit;
 770         x  += vp56_rac_get_prob(c, *ps);
 771     }
 772
 773     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 774 }
 775
 776 static av_always_inline
 777 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 778 {
 779     if (is_vp7)
 780         return vp7_submv_prob;
 781
 782     if (left == top)
 783         return vp8_submv_prob[4 - !!left];
 784     if (!top)
 785         return vp8_submv_prob[2];
 786     return vp8_submv_prob[1 - !!left];
 787 }
 788
 789 /**
 790  * Split motion vector prediction, 16.4.
 791  * @returns the number of motion vectors parsed (2, 4 or 16)
 792  */
 793 static av_always_inline
 794 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 795                     int layout, int is_vp7)
 796 {
 797     int part_idx;
 798     int n, num;
 799     VP8Macroblock *top_mb;
 800     VP8Macroblock *left_mb = &mb[-1];
 801     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 802     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 803     VP56mv *top_mv;
 804     VP56mv *left_mv = left_mb->bmv;
 805     VP56mv *cur_mv  = mb->bmv;
 806
 807     if (!layout) // layout is inlined, s->mb_layout is not
 808         top_mb = &mb[2];
 809     else
 810         top_mb = &mb[-s->mb_width - 1];
 811     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 812     top_mv       = top_mb->bmv;
 813
 814     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 815         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 816             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 817         else
 818             part_idx = VP8_SPLITMVMODE_8x8;
 819     } else {
 820         part_idx = VP8_SPLITMVMODE_4x4;
 821     }
 822
 823     num              = vp8_mbsplit_count[part_idx];
 824     mbsplits_cur     = vp8_mbsplits[part_idx],
 825     firstidx         = vp8_mbfirstidx[part_idx];
 826     mb->partitioning = part_idx;
 827
 828     for (n = 0; n < num; n++) {
 829         int k = firstidx[n];
 830         uint32_t left, above;
 831         const uint8_t *submv_prob;
 832
 833         if (!(k & 3))
 834             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 835         else
 836             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 837         if (k <= 3)
 838             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 839         else
 840             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 841
 842         submv_prob = get_submv_prob(left, above, is_vp7);
 843
 844         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 845             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 846                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 847                     mb->bmv[n].y = mb->mv.y +
 848                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 849                     mb->bmv[n].x = mb->mv.x +
 850                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 851                 } else {
 852                     AV_ZERO32(&mb->bmv[n]);
 853                 }
 854             } else {
 855                 AV_WN32A(&mb->bmv[n], above);
 856             }
 857         } else {
 858             AV_WN32A(&mb->bmv[n], left);
 859         }
 860     }
 861
 862     return num;
 863 }
 864
 865 /**
 866  * The vp7 reference decoder uses a padding macroblock column (added to right
 867  * edge of the frame) to guard against illegal macroblock offsets. The
 868  * algorithm has bugs that permit offsets to straddle the padding column.
 869  * This function replicates those bugs.
 870  *
 871  * @param[out] edge_x macroblock x address
 872  * @param[out] edge_y macroblock y address
 873  *
 874  * @return macroblock offset legal (boolean)
 875  */
 876 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 877                                    int xoffset, int yoffset, int boundary,
 878                                    int *edge_x, int *edge_y)
 879 {
 880     int vwidth = mb_width + 1;
 881     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 882     if (new < boundary || new % vwidth == vwidth - 1)
 883         return 0;
 884     *edge_y = new / vwidth;
 885     *edge_x = new % vwidth;
 886     return 1;
 887 }
 888
 889 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 890 {
 891     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 892 }
 893
 894 static av_always_inline
 895 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 896                     int mb_x, int mb_y, int layout)
 897 {
 898     VP8Macroblock *mb_edge[12];
 899     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 900     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 901     int idx = CNT_ZERO;
 902     VP56mv near_mv[3];
 903     uint8_t cnt[3] = { 0 };
 904     VP56RangeCoder *c = &s->c;
 905     int i;
 906
 907     AV_ZERO32(&near_mv[0]);
 908     AV_ZERO32(&near_mv[1]);
 909     AV_ZERO32(&near_mv[2]);
 910
 911     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 912         const VP7MVPred * pred = &vp7_mv_pred[i];
 913         int edge_x, edge_y;
 914
 915         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 916                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 917             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 918                                              ? s->macroblocks_base + 1 + edge_x +
 919                                                (s->mb_width + 1) * (edge_y + 1)
 920                                              : s->macroblocks + edge_x +
 921                                                (s->mb_height - edge_y - 1) * 2;
 922             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 923             if (mv) {
 924                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 925                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 926                         idx = CNT_NEAREST;
 927                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 928                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 929                             continue;
 930                         idx = CNT_NEAR;
 931                     } else {
 932                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 933                         idx = CNT_NEAR;
 934                     }
 935                 } else {
 936                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 937                     idx = CNT_NEAREST;
 938                 }
 939             } else {
 940                 idx = CNT_ZERO;
 941             }
 942         } else {
 943             idx = CNT_ZERO;
 944         }
 945         cnt[idx] += vp7_mv_pred[i].score;
 946     }
 947
 948     mb->partitioning = VP8_SPLITMVMODE_NONE;
 949
 950     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 951         mb->mode = VP8_MVMODE_MV;
 952
 953         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 954
 955             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 956
 957                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 958                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 959                 else
 960                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 961
 962                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 963                     mb->mode = VP8_MVMODE_SPLIT;
 964                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
 965                 } else {
 966                     mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
 967                     mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
 968                     mb->bmv[0] = mb->mv;
 969                 }
 970             } else {
 971                 mb->mv = near_mv[CNT_NEAR];
 972                 mb->bmv[0] = mb->mv;
 973             }
 974         } else {
 975             mb->mv = near_mv[CNT_NEAREST];
 976             mb->bmv[0] = mb->mv;
 977         }
 978     } else {
 979         mb->mode = VP8_MVMODE_ZERO;
 980         AV_ZERO32(&mb->mv);
 981         mb->bmv[0] = mb->mv;
 982     }
 983 }
 984
 985 static av_always_inline
 986 void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 987                     int mb_x, int mb_y, int layout)
 988 {
 989     VP8Macroblock *mb_edge[3] = { 0      /* top */,
 990                                   mb - 1 /* left */,
 991                                   0      /* top-left */ };
 992     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 993     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 994     int idx = CNT_ZERO;
 995     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 996     int8_t *sign_bias = s->sign_bias;
 997     VP56mv near_mv[4];
 998     uint8_t cnt[4] = { 0 };
 999     VP56RangeCoder *c = &s->c;
1000
1001     if (!layout) { // layout is inlined (s->mb_layout is not)
1002         mb_edge[0] = mb + 2;
1003         mb_edge[2] = mb + 1;
1004     } else {
1005         mb_edge[0] = mb - s->mb_width - 1;
1006         mb_edge[2] = mb - s->mb_width - 2;
1007     }
1008
1009     AV_ZERO32(&near_mv[0]);
1010     AV_ZERO32(&near_mv[1]);
1011     AV_ZERO32(&near_mv[2]);
1012
1013     /* Process MB on top, left and top-left */
1014 #define MV_EDGE_CHECK(n)                                                      \
1015     {                                                                         \
1016         VP8Macroblock *edge = mb_edge[n];                                     \
1017         int edge_ref = edge->ref_frame;                                       \
1018         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1019             uint32_t mv = AV_RN32A(&edge->mv);                                \
1020             if (mv) {                                                         \
1021                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1022                     /* SWAR negate of the values in mv. */                    \
1023                     mv = ~mv;                                                 \
1024                     mv = ((mv & 0x7fff7fff) +                                 \
1025                           0x00010001) ^ (mv & 0x80008000);                    \
1026                 }                                                             \
1027                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1028                     AV_WN32A(&near_mv[++idx], mv);                            \
1029                 cnt[idx] += 1 + (n != 2);                                     \
1030             } else                                                            \
1031                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1032         }                                                                     \
1033     }
1034
1035     MV_EDGE_CHECK(0)
1036     MV_EDGE_CHECK(1)
1037     MV_EDGE_CHECK(2)
1038
1039     mb->partitioning = VP8_SPLITMVMODE_NONE;
1040     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1041         mb->mode = VP8_MVMODE_MV;
1042
1043         /* If we have three distinct MVs, merge first and last if they're the same */
1044         if (cnt[CNT_SPLITMV] &&
1045             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1046             cnt[CNT_NEAREST] += 1;
1047
1048         /* Swap near and nearest if necessary */
1049         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1050             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1051             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1052         }
1053
1054         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1055             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1056                 /* Choose the best mv out of 0,0 and the nearest mv */
1057                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1058                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1059                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1060                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1061
1062                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1063                     mb->mode = VP8_MVMODE_SPLIT;
1064                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1065                 } else {
1066                     mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
1067                     mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
1068                     mb->bmv[0] = mb->mv;
1069                 }
1070             } else {
1071                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
1072                 mb->bmv[0] = mb->mv;
1073             }
1074         } else {
1075             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
1076             mb->bmv[0] = mb->mv;
1077         }
1078     } else {
1079         mb->mode = VP8_MVMODE_ZERO;
1080         AV_ZERO32(&mb->mv);
1081         mb->bmv[0] = mb->mv;
1082     }
1083 }
1084
1085 static av_always_inline
1086 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1087                            int mb_x, int keyframe, int layout)
1088 {
1089     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1090
1091     if (layout == 1) {
1092         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1093         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1094     }
1095     if (keyframe) {
1096         int x, y;
1097         uint8_t *top;
1098         uint8_t *const left = s->intra4x4_pred_mode_left;
1099         if (layout == 1)
1100             top = mb->intra4x4_pred_mode_top;
1101         else
1102             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1103         for (y = 0; y < 4; y++) {
1104             for (x = 0; x < 4; x++) {
1105                 const uint8_t *ctx;
1106                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1107                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1108                 left[y]   = top[x] = *intra4x4;
1109                 intra4x4++;
1110             }
1111         }
1112     } else {
1113         int i;
1114         for (i = 0; i < 16; i++)
1115             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1116                                            vp8_pred4x4_prob_inter);
1117     }
1118 }
1119
1120 static av_always_inline
1121 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1122                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1123 {
1124     VP56RangeCoder *c = &s->c;
1125     const char *vp7_feature_name[] = { "q-index",
1126                                        "lf-delta",
1127                                        "partial-golden-update",
1128                                        "blit-pitch" };
1129     if (is_vp7) {
1130         int i;
1131         *segment = 0;
1132         for (i = 0; i < 4; i++) {
1133             if (s->feature_enabled[i]) {
1134                 if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
1135                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1136                                                    s->feature_index_prob[i]);
1137                       av_log(s->avctx, AV_LOG_WARNING,
1138                              "Feature %s present in macroblock (value 0x%x)\n",
1139                              vp7_feature_name[i], s->feature_value[i][index]);
1140                 }
1141            }
1142         }
1143     } else if (s->segmentation.update_map)
1144         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
1145     else if (s->segmentation.enabled)
1146         *segment = ref ? *ref : *segment;
1147     mb->segment = *segment;
1148
1149     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1150
1151     if (s->keyframe) {
1152         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1153                                     vp8_pred16x16_prob_intra);
1154
1155         if (mb->mode == MODE_I4x4) {
1156             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1157         } else {
1158             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1159                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1160             if (s->mb_layout == 1)
1161                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1162             else
1163                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1164             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1165         }
1166
1167         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1168                                                 vp8_pred8x8c_prob_intra);
1169         mb->ref_frame        = VP56_FRAME_CURRENT;
1170     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1171         // inter MB, 16.2
1172         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1173             mb->ref_frame =
1174                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1175                                                                    : VP56_FRAME_GOLDEN;
1176         else
1177             mb->ref_frame = VP56_FRAME_PREVIOUS;
1178         s->ref_count[mb->ref_frame - 1]++;
1179
1180         // motion vectors, 16.3
1181         if (is_vp7)
1182             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1183         else
1184             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1185     } else {
1186         // intra MB, 16.1
1187         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1188
1189         if (mb->mode == MODE_I4x4)
1190             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1191
1192         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1193                                                 s->prob->pred8x8c);
1194         mb->ref_frame        = VP56_FRAME_CURRENT;
1195         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1196         AV_ZERO32(&mb->bmv[0]);
1197     }
1198 }
1199
1200 /**
1201  * @param r     arithmetic bitstream reader context
1202  * @param block destination for block coefficients
1203  * @param probs probabilities to use when reading trees from the bitstream
1204  * @param i     initial coeff index, 0 unless a separate DC block is coded
1205  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1206  *
1207  * @return 0 if no coeffs were decoded
1208  *         otherwise, the index of the last coeff decoded plus one
1209  */
1210 static av_always_inline
1211 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1212                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1213                                  int i, uint8_t *token_prob, int16_t qmul[2],
1214                                  const uint8_t scan[16], int vp7)
1215 {
1216     VP56RangeCoder c = *r;
1217     goto skip_eob;
1218     do {
1219         int coeff;
1220 restart:
1221         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1222             break;
1223
1224 skip_eob:
1225         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1226             if (++i == 16)
1227                 break; // invalid input; blocks should end with EOB
1228             token_prob = probs[i][0];
1229             if (vp7)
1230                 goto restart;
1231             goto skip_eob;
1232         }
1233
1234         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1235             coeff = 1;
1236             token_prob = probs[i + 1][1];
1237         } else {
1238             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1239                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1240                 if (coeff)
1241                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1242                 coeff += 2;
1243             } else {
1244                 // DCT_CAT*
1245                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1246                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1247                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1248                     } else {                                    // DCT_CAT2
1249                         coeff  = 7;
1250                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1251                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1252                     }
1253                 } else {    // DCT_CAT3 and up
1254                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1255                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1256                     int cat = (a << 1) + b;
1257                     coeff  = 3 + (8 << cat);
1258                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1259                 }
1260             }
1261             token_prob = probs[i + 1][2];
1262         }
1263         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1264     } while (++i < 16);
1265
1266     *r = c;
1267     return i;
1268 }
1269
1270 static av_always_inline
1271 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1272 {
1273     int16_t dc = block[0];
1274     int ret = 0;
1275
1276     if (pred[1] > 3) {
1277         dc += pred[0];
1278         ret = 1;
1279     }
1280
1281     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1282         block[0] = pred[0] = dc;
1283         pred[1] = 0;
1284     } else {
1285         if (pred[0] == dc)
1286             pred[1]++;
1287         block[0] = pred[0] = dc;
1288     }
1289
1290     return ret;
1291 }
1292
1293 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1294                                             int16_t block[16],
1295                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1296                                             int i, uint8_t *token_prob,
1297                                             int16_t qmul[2],
1298                                             const uint8_t scan[16])
1299 {
1300     return decode_block_coeffs_internal(r, block, probs, i,
1301                                         token_prob, qmul, scan, IS_VP7);
1302 }
1303
1304 #ifndef vp8_decode_block_coeffs_internal
1305 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1306                                             int16_t block[16],
1307                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1308                                             int i, uint8_t *token_prob,
1309                                             int16_t qmul[2])
1310 {
1311     return decode_block_coeffs_internal(r, block, probs, i,
1312                                         token_prob, qmul, zigzag_scan, IS_VP8);
1313 }
1314 #endif
1315
1316 /**
1317  * @param c          arithmetic bitstream reader context
1318  * @param block      destination for block coefficients
1319  * @param probs      probabilities to use when reading trees from the bitstream
1320  * @param i          initial coeff index, 0 unless a separate DC block is coded
1321  * @param zero_nhood the initial prediction context for number of surrounding
1322  *                   all-zero blocks (only left/top, so 0-2)
1323  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1324  *
1325  * @return 0 if no coeffs were decoded
1326  *         otherwise, the index of the last coeff decoded plus one
1327  */
1328 static av_always_inline
1329 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1330                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1331                         int i, int zero_nhood, int16_t qmul[2],
1332                         const uint8_t scan[16], int vp7)
1333 {
1334     uint8_t *token_prob = probs[i][zero_nhood];
1335     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1336         return 0;
1337     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1338                                                   token_prob, qmul, scan)
1339                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1340                                                   token_prob, qmul);
1341 }
1342
1343 static av_always_inline
1344 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1345                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1346                       int is_vp7)
1347 {
1348     int i, x, y, luma_start = 0, luma_ctx = 3;
1349     int nnz_pred, nnz, nnz_total = 0;
1350     int segment = mb->segment;
1351     int block_dc = 0;
1352
1353     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1354         nnz_pred = t_nnz[8] + l_nnz[8];
1355
1356         // decode DC values and do hadamard
1357         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1358                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1359                                   zigzag_scan, is_vp7);
1360         l_nnz[8] = t_nnz[8] = !!nnz;
1361
1362         if (is_vp7 && mb->mode > MODE_I4x4) {
1363             nnz |=  inter_predict_dc(td->block_dc,
1364                                      s->inter_dc_pred[mb->ref_frame - 1]);
1365         }
1366
1367         if (nnz) {
1368             nnz_total += nnz;
1369             block_dc   = 1;
1370             if (nnz == 1)
1371                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1372             else
1373                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1374         }
1375         luma_start = 1;
1376         luma_ctx   = 0;
1377     }
1378
1379     // luma blocks
1380     for (y = 0; y < 4; y++)
1381         for (x = 0; x < 4; x++) {
1382             nnz_pred = l_nnz[y] + t_nnz[x];
1383             nnz = decode_block_coeffs(c, td->block[y][x],
1384                                       s->prob->token[luma_ctx],
1385                                       luma_start, nnz_pred,
1386                                       s->qmat[segment].luma_qmul,
1387                                       s->prob[0].scan, is_vp7);
1388             /* nnz+block_dc may be one more than the actual last index,
1389              * but we don't care */
1390             td->non_zero_count_cache[y][x] = nnz + block_dc;
1391             t_nnz[x] = l_nnz[y] = !!nnz;
1392             nnz_total += nnz;
1393         }
1394
1395     // chroma blocks
1396     // TODO: what to do about dimensions? 2nd dim for luma is x,
1397     // but for chroma it's (y<<1)|x
1398     for (i = 4; i < 6; i++)
1399         for (y = 0; y < 2; y++)
1400             for (x = 0; x < 2; x++) {
1401                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1402                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1403                                           s->prob->token[2], 0, nnz_pred,
1404                                           s->qmat[segment].chroma_qmul,
1405                                           s->prob[0].scan, is_vp7);
1406                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1407                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1408                 nnz_total += nnz;
1409             }
1410
1411     // if there were no coded coeffs despite the macroblock not being marked skip,
1412     // we MUST not do the inner loop filter and should not do IDCT
1413     // Since skip isn't used for bitstream prediction, just manually set it.
1414     if (!nnz_total)
1415         mb->skip = 1;
1416 }
1417
1418 static av_always_inline
1419 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1420                       uint8_t *src_cb, uint8_t *src_cr,
1421                       int linesize, int uvlinesize, int simple)
1422 {
1423     AV_COPY128(top_border, src_y + 15 * linesize);
1424     if (!simple) {
1425         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1426         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1427     }
1428 }
1429
1430 static av_always_inline
1431 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1432                     uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
1433                     int mb_y, int mb_width, int simple, int xchg)
1434 {
1435     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1436     src_y  -= linesize;
1437     src_cb -= uvlinesize;
1438     src_cr -= uvlinesize;
1439
1440 #define XCHG(a, b, xchg)                                                      \
1441     do {                                                                      \
1442         if (xchg)                                                             \
1443             AV_SWAP64(b, a);                                                  \
1444         else                                                                  \
1445             AV_COPY64(b, a);                                                  \
1446     } while (0)
1447
1448     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1449     XCHG(top_border, src_y, xchg);
1450     XCHG(top_border + 8, src_y + 8, 1);
1451     if (mb_x < mb_width - 1)
1452         XCHG(top_border + 32, src_y + 16, 1);
1453
1454     // only copy chroma for normal loop filter
1455     // or to initialize the top row to 127
1456     if (!simple || !mb_y) {
1457         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1458         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1459         XCHG(top_border + 16, src_cb, 1);
1460         XCHG(top_border + 24, src_cr, 1);
1461     }
1462 }
1463
1464 static av_always_inline
1465 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1466 {
1467     if (!mb_x)
1468         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1469     else
1470         return mb_y ? mode : LEFT_DC_PRED8x8;
1471 }
1472
1473 static av_always_inline
1474 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1475 {
1476     if (!mb_x)
1477         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1478     else
1479         return mb_y ? mode : HOR_PRED8x8;
1480 }
1481
1482 static av_always_inline
1483 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1484 {
1485     switch (mode) {
1486     case DC_PRED8x8:
1487         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1488     case VERT_PRED8x8:
1489         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1490     case HOR_PRED8x8:
1491         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1492     case PLANE_PRED8x8: /* TM */
1493         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1494     }
1495     return mode;
1496 }
1497
1498 static av_always_inline
1499 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1500 {
1501     if (!mb_x) {
1502         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1503     } else {
1504         return mb_y ? mode : HOR_VP8_PRED;
1505     }
1506 }
1507
1508 static av_always_inline
1509 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1510                                      int *copy_buf, int vp7)
1511 {
1512     switch (mode) {
1513     case VERT_PRED:
1514         if (!mb_x && mb_y) {
1515             *copy_buf = 1;
1516             return mode;
1517         }
1518         /* fall-through */
1519     case DIAG_DOWN_LEFT_PRED:
1520     case VERT_LEFT_PRED:
1521         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1522     case HOR_PRED:
1523         if (!mb_y) {
1524             *copy_buf = 1;
1525             return mode;
1526         }
1527         /* fall-through */
1528     case HOR_UP_PRED:
1529         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1530     case TM_VP8_PRED:
1531         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1532     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1533                    * as 16x16/8x8 DC */
1534     case DIAG_DOWN_RIGHT_PRED:
1535     case VERT_RIGHT_PRED:
1536     case HOR_DOWN_PRED:
1537         if (!mb_y || !mb_x)
1538             *copy_buf = 1;
1539         return mode;
1540     }
1541     return mode;
1542 }
1543
1544 static av_always_inline
1545 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1546                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1547 {
1548     int x, y, mode, nnz;
1549     uint32_t tr;
1550
1551     /* for the first row, we need to run xchg_mb_border to init the top edge
1552      * to 127 otherwise, skip it if we aren't going to deblock */
1553     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1554         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1555                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1556                        s->filter.simple, 1);
1557
1558     if (mb->mode < MODE_I4x4) {
1559         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1560         s->hpc.pred16x16[mode](dst[0], s->linesize);
1561     } else {
1562         uint8_t *ptr = dst[0];
1563         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1564         const uint8_t lo = is_vp7 ? 128 : 127;
1565         const uint8_t hi = is_vp7 ? 128 : 129;
1566         uint8_t tr_top[4] = { lo, lo, lo, lo };
1567
1568         // all blocks on the right edge of the macroblock use bottom edge
1569         // the top macroblock for their topright edge
1570         uint8_t *tr_right = ptr - s->linesize + 16;
1571
1572         // if we're on the right edge of the frame, said edge is extended
1573         // from the top macroblock
1574         if (mb_y && mb_x == s->mb_width - 1) {
1575             tr       = tr_right[-1] * 0x01010101u;
1576             tr_right = (uint8_t *) &tr;
1577         }
1578
1579         if (mb->skip)
1580             AV_ZERO128(td->non_zero_count_cache);
1581
1582         for (y = 0; y < 4; y++) {
1583             uint8_t *topright = ptr + 4 - s->linesize;
1584             for (x = 0; x < 4; x++) {
1585                 int copy = 0, linesize = s->linesize;
1586                 uint8_t *dst = ptr + 4 * x;
1587                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1588
1589                 if ((y == 0 || x == 3) && mb_y == 0) {
1590                     topright = tr_top;
1591                 } else if (x == 3)
1592                     topright = tr_right;
1593
1594                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1595                                                         mb_y + y, &copy, is_vp7);
1596                 if (copy) {
1597                     dst      = copy_dst + 12;
1598                     linesize = 8;
1599                     if (!(mb_y + y)) {
1600                         copy_dst[3] = lo;
1601                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1602                     } else {
1603                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1604                         if (!(mb_x + x)) {
1605                             copy_dst[3] = hi;
1606                         } else {
1607                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1608                         }
1609                     }
1610                     if (!(mb_x + x)) {
1611                         copy_dst[11] =
1612                         copy_dst[19] =
1613                         copy_dst[27] =
1614                         copy_dst[35] = hi;
1615                     } else {
1616                         copy_dst[11] = ptr[4 * x                   - 1];
1617                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1618                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1619                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1620                     }
1621                 }
1622                 s->hpc.pred4x4[mode](dst, topright, linesize);
1623                 if (copy) {
1624                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1625                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1626                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1627                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1628                 }
1629
1630                 nnz = td->non_zero_count_cache[y][x];
1631                 if (nnz) {
1632                     if (nnz == 1)
1633                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1634                                                   td->block[y][x], s->linesize);
1635                     else
1636                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1637                                                td->block[y][x], s->linesize);
1638                 }
1639                 topright += 4;
1640             }
1641
1642             ptr      += 4 * s->linesize;
1643             intra4x4 += 4;
1644         }
1645     }
1646
1647     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1648                                             mb_x, mb_y, is_vp7);
1649     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1650     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1651
1652     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1653         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1654                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1655                        s->filter.simple, 0);
1656 }
1657
/*
 * Edge requirements of the MC filters, indexed [kind][subpel phase].
 * The phase is the 3-bit fractional part of a motion vector component.
 */
static const uint8_t subpel_idx[3][8] = {
    /* [0] nr. of extra pixels before the block (left / above);
     *     also used as the function pointer index into mc_func */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1] total nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2] nr. of extra pixels after the block (right / below) */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1664
1665 /**
1666  * luma MC function
1667  *
1668  * @param s        VP8 decoding context
1669  * @param dst      target buffer for block data at block position
1670  * @param ref      reference picture buffer at origin (0, 0)
1671  * @param mv       motion vector (relative to block position) to get pixel data from
1672  * @param x_off    horizontal position of block from origin (0, 0)
1673  * @param y_off    vertical position of block from origin (0, 0)
1674  * @param block_w  width of block (16, 8 or 4)
1675  * @param block_h  height of block (always same as block_w)
1676  * @param width    width of src/dst plane data
1677  * @param height   height of src/dst plane data
1678  * @param linesize size of a single line of plane data, including padding
1679  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1680  */
1681 static av_always_inline
1682 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1683                  ThreadFrame *ref, const VP56mv *mv,
1684                  int x_off, int y_off, int block_w, int block_h,
1685                  int width, int height, ptrdiff_t linesize,
1686                  vp8_mc_func mc_func[3][3])
1687 {
1688     uint8_t *src = ref->f->data[0];
1689
1690     if (AV_RN32A(mv)) {
1691         int src_linesize = linesize;
1692
1693         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1694         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1695
1696         x_off += mv->x >> 2;
1697         y_off += mv->y >> 2;
1698
1699         // edge emulation
1700         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1701         src += y_off * linesize + x_off;
1702         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1703             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1704             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1705                                      src - my_idx * linesize - mx_idx,
1706                                      EDGE_EMU_LINESIZE, linesize,
1707                                      block_w + subpel_idx[1][mx],
1708                                      block_h + subpel_idx[1][my],
1709                                      x_off - mx_idx, y_off - my_idx,
1710                                      width, height);
1711             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1712             src_linesize = EDGE_EMU_LINESIZE;
1713         }
1714         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1715     } else {
1716         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1717         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1718                       linesize, block_h, 0, 0);
1719     }
1720 }
1721
1722 /**
1723  * chroma MC function
1724  *
1725  * @param s        VP8 decoding context
1726  * @param dst1     target buffer for block data at block position (U plane)
1727  * @param dst2     target buffer for block data at block position (V plane)
1728  * @param ref      reference picture buffer at origin (0, 0)
1729  * @param mv       motion vector (relative to block position) to get pixel data from
1730  * @param x_off    horizontal position of block from origin (0, 0)
1731  * @param y_off    vertical position of block from origin (0, 0)
1732  * @param block_w  width of block (16, 8 or 4)
1733  * @param block_h  height of block (always same as block_w)
1734  * @param width    width of src/dst plane data
1735  * @param height   height of src/dst plane data
1736  * @param linesize size of a single line of plane data, including padding
1737  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1738  */
1739 static av_always_inline
1740 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1741                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1742                    int x_off, int y_off, int block_w, int block_h,
1743                    int width, int height, ptrdiff_t linesize,
1744                    vp8_mc_func mc_func[3][3])
1745 {
1746     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1747
1748     if (AV_RN32A(mv)) {
1749         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1750         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1751
1752         x_off += mv->x >> 3;
1753         y_off += mv->y >> 3;
1754
1755         // edge emulation
1756         src1 += y_off * linesize + x_off;
1757         src2 += y_off * linesize + x_off;
1758         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1759         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1760             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1761             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1762                                      src1 - my_idx * linesize - mx_idx,
1763                                      EDGE_EMU_LINESIZE, linesize,
1764                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1765                                      x_off - mx_idx, y_off - my_idx, width, height);
1766             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1767             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1768
1769             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1770                                      src2 - my_idx * linesize - mx_idx,
1771                                      EDGE_EMU_LINESIZE, linesize,
1772                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1773                                      x_off - mx_idx, y_off - my_idx, width, height);
1774             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1775             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1776         } else {
1777             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1778             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1779         }
1780     } else {
1781         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1782         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1783         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1784     }
1785 }
1786
1787 static av_always_inline
1788 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1789                  ThreadFrame *ref_frame, int x_off, int y_off,
1790                  int bx_off, int by_off, int block_w, int block_h,
1791                  int width, int height, VP56mv *mv)
1792 {
1793     VP56mv uvmv = *mv;
1794
1795     /* Y */
1796     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1797                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1798                 block_w, block_h, width, height, s->linesize,
1799                 s->put_pixels_tab[block_w == 8]);
1800
1801     /* U/V */
1802     if (s->profile == 3) {
1803         /* this block only applies VP8; it is safe to check
1804          * only the profile, as VP7 profile <= 1 */
1805         uvmv.x &= ~7;
1806         uvmv.y &= ~7;
1807     }
1808     x_off   >>= 1;
1809     y_off   >>= 1;
1810     bx_off  >>= 1;
1811     by_off  >>= 1;
1812     width   >>= 1;
1813     height  >>= 1;
1814     block_w >>= 1;
1815     block_h >>= 1;
1816     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1817                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1818                   &uvmv, x_off + bx_off, y_off + by_off,
1819                   block_w, block_h, width, height, s->uvlinesize,
1820                   s->put_pixels_tab[1 + (block_w == 4)]);
1821 }
1822
1823 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1824  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1825 static av_always_inline
1826 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1827                      int mb_xy, int ref)
1828 {
1829     /* Don't prefetch refs that haven't been used very often this frame. */
1830     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1831         int x_off = mb_x << 4, y_off = mb_y << 4;
1832         int mx = (mb->mv.x >> 2) + x_off + 8;
1833         int my = (mb->mv.y >> 2) + y_off;
1834         uint8_t **src = s->framep[ref]->tf.f->data;
1835         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1836         /* For threading, a ff_thread_await_progress here might be useful, but
1837          * it actually slows down the decoder. Since a bad prefetch doesn't
1838          * generate bad decoder output, we don't run it here. */
1839         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1840         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1841         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1842     }
1843 }
1844
1845 /**
1846  * Apply motion vectors to prediction buffer, chapter 18.
1847  */
1848 static av_always_inline
1849 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1850                    VP8Macroblock *mb, int mb_x, int mb_y)
1851 {
1852     int x_off = mb_x << 4, y_off = mb_y << 4;
1853     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1854     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1855     VP56mv *bmv = mb->bmv;
1856
1857     switch (mb->partitioning) {
1858     case VP8_SPLITMVMODE_NONE:
1859         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1860                     0, 0, 16, 16, width, height, &mb->mv);
1861         break;
1862     case VP8_SPLITMVMODE_4x4: {
1863         int x, y;
1864         VP56mv uvmv;
1865
1866         /* Y */
1867         for (y = 0; y < 4; y++) {
1868             for (x = 0; x < 4; x++) {
1869                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1870                             ref, &bmv[4 * y + x],
1871                             4 * x + x_off, 4 * y + y_off, 4, 4,
1872                             width, height, s->linesize,
1873                             s->put_pixels_tab[2]);
1874             }
1875         }
1876
1877         /* U/V */
1878         x_off  >>= 1;
1879         y_off  >>= 1;
1880         width  >>= 1;
1881         height >>= 1;
1882         for (y = 0; y < 2; y++) {
1883             for (x = 0; x < 2; x++) {
1884                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1885                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1886                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1887                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1888                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1889                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1890                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1891                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1892                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1893                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1894                 if (s->profile == 3) {
1895                     uvmv.x &= ~7;
1896                     uvmv.y &= ~7;
1897                 }
1898                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1899                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1900                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1901                               width, height, s->uvlinesize,
1902                               s->put_pixels_tab[2]);
1903             }
1904         }
1905         break;
1906     }
1907     case VP8_SPLITMVMODE_16x8:
1908         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1909                     0, 0, 16, 8, width, height, &bmv[0]);
1910         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1911                     0, 8, 16, 8, width, height, &bmv[1]);
1912         break;
1913     case VP8_SPLITMVMODE_8x16:
1914         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1915                     0, 0, 8, 16, width, height, &bmv[0]);
1916         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1917                     8, 0, 8, 16, width, height, &bmv[1]);
1918         break;
1919     case VP8_SPLITMVMODE_8x8:
1920         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1921                     0, 0, 8, 8, width, height, &bmv[0]);
1922         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1923                     8, 0, 8, 8, width, height, &bmv[1]);
1924         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1925                     0, 8, 8, 8, width, height, &bmv[2]);
1926         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1927                     8, 8, 8, 8, width, height, &bmv[3]);
1928         break;
1929     }
1930 }
1931
1932 static av_always_inline
1933 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1934 {
1935     int x, y, ch;
1936
1937     if (mb->mode != MODE_I4x4) {
1938         uint8_t *y_dst = dst[0];
1939         for (y = 0; y < 4; y++) {
1940             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1941             if (nnz4) {
1942                 if (nnz4 & ~0x01010101) {
1943                     for (x = 0; x < 4; x++) {
1944                         if ((uint8_t) nnz4 == 1)
1945                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1946                                                       td->block[y][x],
1947                                                       s->linesize);
1948                         else if ((uint8_t) nnz4 > 1)
1949                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1950                                                    td->block[y][x],
1951                                                    s->linesize);
1952                         nnz4 >>= 8;
1953                         if (!nnz4)
1954                             break;
1955                     }
1956                 } else {
1957                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1958                 }
1959             }
1960             y_dst += 4 * s->linesize;
1961         }
1962     }
1963
1964     for (ch = 0; ch < 2; ch++) {
1965         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1966         if (nnz4) {
1967             uint8_t *ch_dst = dst[1 + ch];
1968             if (nnz4 & ~0x01010101) {
1969                 for (y = 0; y < 2; y++) {
1970                     for (x = 0; x < 2; x++) {
1971                         if ((uint8_t) nnz4 == 1)
1972                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
1973                                                       td->block[4 + ch][(y << 1) + x],
1974                                                       s->uvlinesize);
1975                         else if ((uint8_t) nnz4 > 1)
1976                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
1977                                                    td->block[4 + ch][(y << 1) + x],
1978                                                    s->uvlinesize);
1979                         nnz4 >>= 8;
1980                         if (!nnz4)
1981                             goto chroma_idct_end;
1982                     }
1983                     ch_dst += 4 * s->uvlinesize;
1984                 }
1985             } else {
1986                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
1987             }
1988         }
1989 chroma_idct_end:
1990         ;
1991     }
1992 }
1993
1994 static av_always_inline
1995 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
1996                          VP8FilterStrength *f, int is_vp7)
1997 {
1998     int interior_limit, filter_level;
1999
2000     if (s->segmentation.enabled) {
2001         filter_level = s->segmentation.filter_level[mb->segment];
2002         if (!s->segmentation.absolute_vals)
2003             filter_level += s->filter.level;
2004     } else
2005         filter_level = s->filter.level;
2006
2007     if (s->lf_delta.enabled) {
2008         filter_level += s->lf_delta.ref[mb->ref_frame];
2009         filter_level += s->lf_delta.mode[mb->mode];
2010     }
2011
2012     filter_level = av_clip_uintp2(filter_level, 6);
2013
2014     interior_limit = filter_level;
2015     if (s->filter.sharpness) {
2016         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2017         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2018     }
2019     interior_limit = FFMAX(interior_limit, 1);
2020
2021     f->filter_level = filter_level;
2022     f->inner_limit = interior_limit;
2023     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2024                       mb->mode == VP8_MVMODE_SPLIT;
2025 }
2026
2027 static av_always_inline
2028 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2029                int mb_x, int mb_y, int is_vp7)
2030 {
2031     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2032     int filter_level = f->filter_level;
2033     int inner_limit = f->inner_limit;
2034     int inner_filter = f->inner_filter;
2035     int linesize = s->linesize;
2036     int uvlinesize = s->uvlinesize;
2037     static const uint8_t hev_thresh_lut[2][64] = {
2038         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2039           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2040           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2041           3, 3, 3, 3 },
2042         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2043           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2044           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2045           2, 2, 2, 2 }
2046     };
2047
2048     if (!filter_level)
2049         return;
2050
2051     if (is_vp7) {
2052         bedge_lim_y  = filter_level;
2053         bedge_lim_uv = filter_level * 2;
2054         mbedge_lim   = filter_level + 2;
2055     } else {
2056         bedge_lim_y  =
2057         bedge_lim_uv = filter_level * 2 + inner_limit;
2058         mbedge_lim   = bedge_lim_y + 4;
2059     }
2060
2061     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2062
2063     if (mb_x) {
2064         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2065                                        mbedge_lim, inner_limit, hev_thresh);
2066         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2067                                        mbedge_lim, inner_limit, hev_thresh);
2068     }
2069
2070 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2071     if (cond && inner_filter) {                                               \
2072         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2073                                              bedge_lim_y, inner_limit,        \
2074                                              hev_thresh);                     \
2075         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2076                                              bedge_lim_y, inner_limit,        \
2077                                              hev_thresh);                     \
2078         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2079                                              bedge_lim_y, inner_limit,        \
2080                                              hev_thresh);                     \
2081         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2082                                              uvlinesize,  bedge_lim_uv,       \
2083                                              inner_limit, hev_thresh);        \
2084     }
2085
2086     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2087
2088     if (mb_y) {
2089         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2090                                        mbedge_lim, inner_limit, hev_thresh);
2091         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2092                                        mbedge_lim, inner_limit, hev_thresh);
2093     }
2094
2095     if (inner_filter) {
2096         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2097                                              linesize, bedge_lim_y,
2098                                              inner_limit, hev_thresh);
2099         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2100                                              linesize, bedge_lim_y,
2101                                              inner_limit, hev_thresh);
2102         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2103                                              linesize, bedge_lim_y,
2104                                              inner_limit, hev_thresh);
2105         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2106                                              dst[2] +  4 * uvlinesize,
2107                                              uvlinesize, bedge_lim_uv,
2108                                              inner_limit, hev_thresh);
2109     }
2110
2111     H_LOOP_FILTER_16Y_INNER(is_vp7)
2112 }
2113
2114 static av_always_inline
2115 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2116                       int mb_x, int mb_y)
2117 {
2118     int mbedge_lim, bedge_lim;
2119     int filter_level = f->filter_level;
2120     int inner_limit  = f->inner_limit;
2121     int inner_filter = f->inner_filter;
2122     int linesize     = s->linesize;
2123
2124     if (!filter_level)
2125         return;
2126
2127     bedge_lim  = 2 * filter_level + inner_limit;
2128     mbedge_lim = bedge_lim + 4;
2129
2130     if (mb_x)
2131         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2132     if (inner_filter) {
2133         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2134         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2135         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2136     }
2137
2138     if (mb_y)
2139         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2140     if (inner_filter) {
2141         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2142         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2143         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2144     }
2145 }
2146
2147 #define MARGIN (16 << 2)
2148 static av_always_inline
2149 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2150                                     VP8Frame *prev_frame, int is_vp7)
2151 {
2152     VP8Context *s = avctx->priv_data;
2153     int mb_x, mb_y;
2154
2155     s->mv_min.y = -MARGIN;
2156     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2157     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2158         VP8Macroblock *mb = s->macroblocks_base +
2159                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2160         int mb_xy = mb_y * s->mb_width;
2161
2162         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2163
2164         s->mv_min.x = -MARGIN;
2165         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2166         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2167             if (mb_y == 0)
2168                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2169                          DC_PRED * 0x01010101);
2170             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2171                            prev_frame && prev_frame->seg_map ?
2172                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2173             s->mv_min.x -= 64;
2174             s->mv_max.x -= 64;
2175         }
2176         s->mv_min.y -= 64;
2177         s->mv_max.y -= 64;
2178     }
2179 }
2180
2181 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2182                                    VP8Frame *prev_frame)
2183 {
2184     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2185 }
2186
2187 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2188                                    VP8Frame *prev_frame)
2189 {
2190     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2191 }
2192
#if HAVE_THREADS
/*
 * Macroblock positions are packed into one int: the row in the upper bits
 * and the column in the low 16 bits. The row is scaled by multiplication
 * rather than a left shift because callers may pass a negative row
 * (e.g. mb_y - 2), and left-shifting a negative value is undefined.
 *
 * Both macros keep their historical trailing ';' after "while (0)", as
 * call sites elsewhere in the file may depend on it; for that reason they
 * must not be used as the unbraced body of an if that has an else.
 */

/*
 * Block until the thread described by otd has reached at least position
 * (mb_x_check, mb_y_check). While waiting, the target position is
 * published in td->wait_mb_pos; it is reset to INT_MAX afterwards.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) * (1 << 16)) | ((mb_x_check) & 0xFFFF);       \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0);

/*
 * Record (mb_x, mb_y) as the current position of td. With slice threading
 * and more than one job, waiters on td->cond are woken when a neighbouring
 * thread is waiting for a position that has now been reached (or when a
 * neighbour pointer is NULL).
 *
 * Uses avctx, num_jobs, next_td and prev_td from the calling scope.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) * (1 << 16)) | ((mb_x) & 0xFFFF);      \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0);
#else
/* Without thread support there is nothing to wait for or to signal. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
2232
2233 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2234                                         int jobnr, int threadnr, int is_vp7)
2235 {
2236     VP8Context *s = avctx->priv_data;
2237     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2238     int mb_y = td->thread_mb_pos >> 16;
2239     int mb_x, mb_xy = mb_y * s->mb_width;
2240     int num_jobs = s->num_jobs;
2241     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2242     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2243     VP8Macroblock *mb;
2244     uint8_t *dst[3] = {
2245         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2246         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2247         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2248     };
2249     if (mb_y == 0)
2250         prev_td = td;
2251     else
2252         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2253     if (mb_y == s->mb_height - 1)
2254         next_td = td;
2255     else
2256         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2257     if (s->mb_layout == 1)
2258         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2259     else {
2260         // Make sure the previous frame has read its segmentation map,
2261         // if we re-use the same map.
2262         if (prev_frame && s->segmentation.enabled &&
2263             !s->segmentation.update_map)
2264             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2265         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2266         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2267         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2268     }
2269
2270     if (!is_vp7 || mb_y == 0)
2271         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2272
2273     s->mv_min.x = -MARGIN;
2274     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2275
2276     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2277         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2278         if (prev_td != td) {
2279             if (threadnr != 0) {
2280                 check_thread_pos(td, prev_td,
2281                                  mb_x + (is_vp7 ? 2 : 1),
2282                                  mb_y - (is_vp7 ? 2 : 1));
2283             } else {
2284                 check_thread_pos(td, prev_td,
2285                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2286                                  mb_y - (is_vp7 ? 2 : 1));
2287             }
2288         }
2289
2290         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2291                          s->linesize, 4);
2292         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2293                          dst[2] - dst[1], 2);
2294
2295         if (!s->mb_layout)
2296             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2297                            prev_frame && prev_frame->seg_map ?
2298                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2299
2300         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2301
2302         if (!mb->skip)
2303             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2304
2305         if (mb->mode <= MODE_I4x4)
2306             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2307         else
2308             inter_predict(s, td, dst, mb, mb_x, mb_y);
2309
2310         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2311
2312         if (!mb->skip) {
2313             idct_mb(s, td, dst, mb);
2314         } else {
2315             AV_ZERO64(td->left_nnz);
2316             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2317
2318             /* Reset DC block predictors if they would exist
2319              * if the mb had coefficients */
2320             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2321                 td->left_nnz[8]     = 0;
2322                 s->top_nnz[mb_x][8] = 0;
2323             }
2324         }
2325
2326         if (s->deblock_filter)
2327             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2328
2329         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2330             if (s->filter.simple)
2331                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2332                                  NULL, NULL, s->linesize, 0, 1);
2333             else
2334                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2335                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2336         }
2337
2338         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2339
2340         dst[0]      += 16;
2341         dst[1]      += 8;
2342         dst[2]      += 8;
2343         s->mv_min.x -= 64;
2344         s->mv_max.x -= 64;
2345
2346         if (mb_x == s->mb_width + 1) {
2347             update_pos(td, mb_y, s->mb_width + 3);
2348         } else {
2349             update_pos(td, mb_y, mb_x);
2350         }
2351     }
2352 }
2353
2354 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2355                               int jobnr, int threadnr, int is_vp7)
2356 {
2357     VP8Context *s = avctx->priv_data;
2358     VP8ThreadData *td = &s->thread_data[threadnr];
2359     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2360     AVFrame *curframe = s->curframe->tf.f;
2361     VP8Macroblock *mb;
2362     VP8ThreadData *prev_td, *next_td;
2363     uint8_t *dst[3] = {
2364         curframe->data[0] + 16 * mb_y * s->linesize,
2365         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2366         curframe->data[2] +  8 * mb_y * s->uvlinesize
2367     };
2368
2369     if (s->mb_layout == 1)
2370         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2371     else
2372         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2373
2374     if (mb_y == 0)
2375         prev_td = td;
2376     else
2377         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2378     if (mb_y == s->mb_height - 1)
2379         next_td = td;
2380     else
2381         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2382
2383     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2384         VP8FilterStrength *f = &td->filter_strength[mb_x];
2385         if (prev_td != td)
2386             check_thread_pos(td, prev_td,
2387                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2388         if (next_td != td)
2389             if (next_td != &s->thread_data[0])
2390                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2391
2392         if (num_jobs == 1) {
2393             if (s->filter.simple)
2394                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2395                                  NULL, NULL, s->linesize, 0, 1);
2396             else
2397                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2398                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2399         }
2400
2401         if (s->filter.simple)
2402             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2403         else
2404             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2405         dst[0] += 16;
2406         dst[1] += 8;
2407         dst[2] += 8;
2408
2409         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2410     }
2411 }
2412
2413 static av_always_inline
2414 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2415                               int threadnr, int is_vp7)
2416 {
2417     VP8Context *s = avctx->priv_data;
2418     VP8ThreadData *td = &s->thread_data[jobnr];
2419     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2420     VP8Frame *curframe = s->curframe;
2421     int mb_y, num_jobs = s->num_jobs;
2422
2423     td->thread_nr = threadnr;
2424     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2425         if (mb_y >= s->mb_height)
2426             break;
2427         td->thread_mb_pos = mb_y << 16;
2428         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
2429         if (s->deblock_filter)
2430             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
2431         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2432
2433         s->mv_min.y -= 64;
2434         s->mv_max.y -= 64;
2435
2436         if (avctx->active_thread_type == FF_THREAD_FRAME)
2437             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2438     }
2439
2440     return 0;
2441 }
2442
2443 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2444                                     int jobnr, int threadnr)
2445 {
2446     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2447 }
2448
2449 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2450                                     int jobnr, int threadnr)
2451 {
2452     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2453 }
2454
2455
2456 static av_always_inline
2457 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2458                       AVPacket *avpkt, int is_vp7)
2459 {
2460     VP8Context *s = avctx->priv_data;
2461     int ret, i, referenced, num_jobs;
2462     enum AVDiscard skip_thresh;
2463     VP8Frame *av_uninit(curframe), *prev_frame;
2464
2465     if (is_vp7)
2466         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2467     else
2468         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2469
2470     if (ret < 0)
2471         goto err;
2472
2473     prev_frame = s->framep[VP56_FRAME_CURRENT];
2474
2475     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2476                  s->update_altref == VP56_FRAME_CURRENT;
2477
2478     skip_thresh = !referenced ? AVDISCARD_NONREF
2479                               : !s->keyframe ? AVDISCARD_NONKEY
2480                                              : AVDISCARD_ALL;
2481
2482     if (avctx->skip_frame >= skip_thresh) {
2483         s->invisible = 1;
2484         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2485         goto skip_decode;
2486     }
2487     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2488
2489     // release no longer referenced frames
2490     for (i = 0; i < 5; i++)
2491         if (s->frames[i].tf.f->data[0] &&
2492             &s->frames[i] != prev_frame &&
2493             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2494             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2495             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2496             vp8_release_frame(s, &s->frames[i]);
2497
2498     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2499
2500     if (!s->colorspace)
2501         avctx->colorspace = AVCOL_SPC_BT470BG;
2502     if (s->fullrange)
2503         avctx->color_range = AVCOL_RANGE_JPEG;
2504     else
2505         avctx->color_range = AVCOL_RANGE_MPEG;
2506
2507     /* Given that arithmetic probabilities are updated every frame, it's quite
2508      * likely that the values we have on a random interframe are complete
2509      * junk if we didn't start decode on a keyframe. So just don't display
2510      * anything rather than junk. */
2511     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2512                          !s->framep[VP56_FRAME_GOLDEN]   ||
2513                          !s->framep[VP56_FRAME_GOLDEN2])) {
2514         av_log(avctx, AV_LOG_WARNING,
2515                "Discarding interframe without a prior keyframe!\n");
2516         ret = AVERROR_INVALIDDATA;
2517         goto err;
2518     }
2519
2520     curframe->tf.f->key_frame = s->keyframe;
2521     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2522                                             : AV_PICTURE_TYPE_P;
2523     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
2524         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
2525         goto err;
2526     }
2527
2528     // check if golden and altref are swapped
2529     if (s->update_altref != VP56_FRAME_NONE)
2530         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2531     else
2532         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2533
2534     if (s->update_golden != VP56_FRAME_NONE)
2535         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2536     else
2537         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2538
2539     if (s->update_last)
2540         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2541     else
2542         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2543
2544     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2545
2546     ff_thread_finish_setup(avctx);
2547
2548     s->linesize   = curframe->tf.f->linesize[0];
2549     s->uvlinesize = curframe->tf.f->linesize[1];
2550
2551     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2552     /* Zero macroblock structures for top/top-left prediction
2553      * from outside the frame. */
2554     if (!s->mb_layout)
2555         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2556                (s->mb_width + 1) * sizeof(*s->macroblocks));
2557     if (!s->mb_layout && s->keyframe)
2558         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2559
2560     memset(s->ref_count, 0, sizeof(s->ref_count));
2561
2562     if (s->mb_layout == 1) {
2563         // Make sure the previous frame has read its segmentation map,
2564         // if we re-use the same map.
2565         if (prev_frame && s->segmentation.enabled &&
2566             !s->segmentation.update_map)
2567             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2568         if (is_vp7)
2569             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2570         else
2571             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2572     }
2573
2574     if (avctx->active_thread_type == FF_THREAD_FRAME)
2575         num_jobs = 1;
2576     else
2577         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2578     s->num_jobs   = num_jobs;
2579     s->curframe   = curframe;
2580     s->prev_frame = prev_frame;
2581     s->mv_min.y   = -MARGIN;
2582     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2583     for (i = 0; i < MAX_THREADS; i++) {
2584         s->thread_data[i].thread_mb_pos = 0;
2585         s->thread_data[i].wait_mb_pos   = INT_MAX;
2586     }
2587     if (is_vp7)
2588         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2589                         num_jobs);
2590     else
2591         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2592                         num_jobs);
2593
2594     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2595     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2596
2597 skip_decode:
2598     // if future frames don't use the updated probabilities,
2599     // reset them to the values we saved
2600     if (!s->update_probabilities)
2601         s->prob[0] = s->prob[1];
2602
2603     if (!s->invisible) {
2604         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2605             return ret;
2606         *got_frame = 1;
2607     }
2608
2609     return avpkt->size;
2610 err:
2611     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2612     return ret;
2613 }
2614
2615 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2616                         AVPacket *avpkt)
2617 {
2618     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2619 }
2620
2621 #if CONFIG_VP7_DECODER
2622 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2623                             AVPacket *avpkt)
2624 {
2625     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2626 }
2627 #endif /* CONFIG_VP7_DECODER */
2628
2629 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2630 {
2631     VP8Context *s = avctx->priv_data;
2632     int i;
2633
2634     vp8_decode_flush_impl(avctx, 1);
2635     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2636         av_frame_free(&s->frames[i].tf.f);
2637
2638     return 0;
2639 }
2640
2641 static av_cold int vp8_init_frames(VP8Context *s)
2642 {
2643     int i;
2644     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2645         s->frames[i].tf.f = av_frame_alloc();
2646         if (!s->frames[i].tf.f)
2647             return AVERROR(ENOMEM);
2648     }
2649     return 0;
2650 }
2651
2652 static av_always_inline
2653 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2654 {
2655     VP8Context *s = avctx->priv_data;
2656     int ret;
2657
2658     s->avctx = avctx;
2659     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2660     avctx->internal->allocate_progress = 1;
2661
2662     ff_videodsp_init(&s->vdsp, 8);
2663
2664     ff_vp78dsp_init(&s->vp8dsp);
2665     if (CONFIG_VP7_DECODER && is_vp7) {
2666         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2667         ff_vp7dsp_init(&s->vp8dsp);
2668     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2669         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2670         ff_vp8dsp_init(&s->vp8dsp);
2671     }
2672
2673     /* does not change for VP8 */
2674     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2675
2676     if ((ret = vp8_init_frames(s)) < 0) {
2677         ff_vp8_decode_free(avctx);
2678         return ret;
2679     }
2680
2681     return 0;
2682 }
2683
2684 #if CONFIG_VP7_DECODER
2685 static int vp7_decode_init(AVCodecContext *avctx)
2686 {
2687     return vp78_decode_init(avctx, IS_VP7);
2688 }
2689 #endif /* CONFIG_VP7_DECODER */
2690
2691 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2692 {
2693     return vp78_decode_init(avctx, IS_VP8);
2694 }
2695
2696 #if CONFIG_VP8_DECODER
2697 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2698 {
2699     VP8Context *s = avctx->priv_data;
2700     int ret;
2701
2702     s->avctx = avctx;
2703
2704     if ((ret = vp8_init_frames(s)) < 0) {
2705         ff_vp8_decode_free(avctx);
2706         return ret;
2707     }
2708
2709     return 0;
2710 }
2711
/* Translate a frame pointer from s_src->frames[] into the matching slot of
 * s->frames[]; NULL stays NULL. Expects `s` and `s_src` in the calling
 * scope. Argument and expansion are parenthesised so the macro can be used
 * inside larger expressions. */
#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2713
2714 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2715                                             const AVCodecContext *src)
2716 {
2717     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2718     int i;
2719
2720     if (s->macroblocks_base &&
2721         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2722         free_buffers(s);
2723         s->mb_width  = s_src->mb_width;
2724         s->mb_height = s_src->mb_height;
2725     }
2726
2727     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2728     s->segmentation = s_src->segmentation;
2729     s->lf_delta     = s_src->lf_delta;
2730     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2731
2732     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2733         if (s_src->frames[i].tf.f->data[0]) {
2734             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2735             if (ret < 0)
2736                 return ret;
2737         }
2738     }
2739
2740     s->framep[0] = REBASE(s_src->next_framep[0]);
2741     s->framep[1] = REBASE(s_src->next_framep[1]);
2742     s->framep[2] = REBASE(s_src->next_framep[2]);
2743     s->framep[3] = REBASE(s_src->next_framep[3]);
2744
2745     return 0;
2746 }
2747 #endif /* CONFIG_VP8_DECODER */
2748
2749 #if CONFIG_VP7_DECODER
2750 AVCodec ff_vp7_decoder = {
2751     .name                  = "vp7",
2752     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2753     .type                  = AVMEDIA_TYPE_VIDEO,
2754     .id                    = AV_CODEC_ID_VP7,
2755     .priv_data_size        = sizeof(VP8Context),
2756     .init                  = vp7_decode_init,
2757     .close                 = ff_vp8_decode_free,
2758     .decode                = vp7_decode_frame,
2759     .capabilities          = CODEC_CAP_DR1,
2760     .flush                 = vp8_decode_flush,
2761 };
2762 #endif /* CONFIG_VP7_DECODER */
2763
2764 #if CONFIG_VP8_DECODER
2765 AVCodec ff_vp8_decoder = {
2766     .name                  = "vp8",
2767     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2768     .type                  = AVMEDIA_TYPE_VIDEO,
2769     .id                    = AV_CODEC_ID_VP8,
2770     .priv_data_size        = sizeof(VP8Context),
2771     .init                  = ff_vp8_decode_init,
2772     .close                 = ff_vp8_decode_free,
2773     .decode                = ff_vp8_decode_frame,
2774     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2775     .flush                 = vp8_decode_flush,
2776     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2777     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2778 };
#endif /* CONFIG_VP8_DECODER */