git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of FFmpeg.
  10  *
  11  * FFmpeg is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * FFmpeg is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with FFmpeg; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43             av_freep(&s->thread_data[i].filter_strength);
  44             av_freep(&s->thread_data[i].edge_emu_buffer);
  45         }
  46     av_freep(&s->thread_data);
  47     av_freep(&s->macroblocks_base);
  48     av_freep(&s->intra4x4_pred_mode_top);
  49     av_freep(&s->top_nnz);
  50     av_freep(&s->top_border);
  51
  52     s->macroblocks = NULL;
  53 }
  54
  55 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  56 {
  57     int ret;
  58     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  59         return ret;
  60     if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
  61         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  62     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  63         ff_thread_release_buffer(s->avctx, f);
  64         return AVERROR(ENOMEM);
  65     }
  66     return 0;
  67 }
  68
  69 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  70 {
  71     if (f->ref_index[0]) {
  72         if (prefer_delayed_free) {
  73             /* Upon a size change, we want to free the maps but other threads may still
  74              * be using them, so queue them. Upon a seek, all threads are inactive so
  75              * we want to cache one to prevent re-allocation in the next decoding
  76              * iteration, but the rest we can free directly. */
  77             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  78             if (s->num_maps_to_be_freed < max_queued_maps) {
  79                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  80             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  81                 av_free(f->ref_index[0]);
  82             } /* else: MEMLEAK (should never happen, but better that than crash) */
  83             f->ref_index[0] = NULL;
  84         } else /* vp8_decode_free() */ {
  85             av_free(f->ref_index[0]);
  86         }
  87     }
  88     ff_thread_release_buffer(s->avctx, f);
  89 }
  90
  91 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  92                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  93 {
  94     VP8Context *s = avctx->priv_data;
  95     int i;
  96
  97     if (!avctx->internal->is_copy) {
  98         for (i = 0; i < 5; i++)
  99             if (s->frames[i].data[0])
 100                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
 101     }
 102     memset(s->framep, 0, sizeof(s->framep));
 103
 104     if (free_mem) {
 105         free_buffers(s);
 106         s->maps_are_invalid = 1;
 107     }
 108 }
 109
 110 static void vp8_decode_flush(AVCodecContext *avctx)
 111 {
 112     vp8_decode_flush_impl(avctx, 1, 1, 0);
 113 }
 114
 115 static int update_dimensions(VP8Context *s, int width, int height)
 116 {
 117     AVCodecContext *avctx = s->avctx;
 118     int i;
 119
 120     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 121         height != s->avctx->height) {
 122         if (av_image_check_size(width, height, 0, s->avctx))
 123             return AVERROR_INVALIDDATA;
 124
 125         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 126
 127         avcodec_set_dimensions(s->avctx, width, height);
 128     }
 129
 130     s->mb_width  = (s->avctx->coded_width +15) / 16;
 131     s->mb_height = (s->avctx->coded_height+15) / 16;
 132
 133     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 134     if (!s->mb_layout) { // Frame threading and one thread
 135         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 136         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 137     }
 138     else // Sliced threading
 139         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 140     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 141     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 142     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 143
 144     for (i = 0; i < MAX_THREADS; i++) {
 145         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 146 #if HAVE_THREADS
 147         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 148         pthread_cond_init(&s->thread_data[i].cond, NULL);
 149 #endif
 150     }
 151
 152     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 153         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 154         return AVERROR(ENOMEM);
 155
 156     s->macroblocks        = s->macroblocks_base + 1;
 157
 158     return 0;
 159 }
 160
 161 static void parse_segment_info(VP8Context *s)
 162 {
 163     VP56RangeCoder *c = &s->c;
 164     int i;
 165
 166     s->segmentation.update_map = vp8_rac_get(c);
 167
 168     if (vp8_rac_get(c)) { // update segment feature data
 169         s->segmentation.absolute_vals = vp8_rac_get(c);
 170
 171         for (i = 0; i < 4; i++)
 172             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 173
 174         for (i = 0; i < 4; i++)
 175             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 176     }
 177     if (s->segmentation.update_map)
 178         for (i = 0; i < 3; i++)
 179             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 180 }
 181
 182 static void update_lf_deltas(VP8Context *s)
 183 {
 184     VP56RangeCoder *c = &s->c;
 185     int i;
 186
 187     for (i = 0; i < 4; i++) {
 188         if (vp8_rac_get(c)) {
 189             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 190
 191             if (vp8_rac_get(c))
 192                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 193         }
 194     }
 195
 196     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 197         if (vp8_rac_get(c)) {
 198             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 199
 200             if (vp8_rac_get(c))
 201                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 202         }
 203     }
 204 }
 205
 206 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 207 {
 208     const uint8_t *sizes = buf;
 209     int i;
 210
 211     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 212
 213     buf      += 3*(s->num_coeff_partitions-1);
 214     buf_size -= 3*(s->num_coeff_partitions-1);
 215     if (buf_size < 0)
 216         return -1;
 217
 218     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 219         int size = AV_RL24(sizes + 3*i);
 220         if (buf_size - size < 0)
 221             return -1;
 222
 223         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 224         buf      += size;
 225         buf_size -= size;
 226     }
 227     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 228
 229     return 0;
 230 }
 231
 232 static void get_quants(VP8Context *s)
 233 {
 234     VP56RangeCoder *c = &s->c;
 235     int i, base_qi;
 236
 237     int yac_qi     = vp8_rac_get_uint(c, 7);
 238     int ydc_delta  = vp8_rac_get_sint(c, 4);
 239     int y2dc_delta = vp8_rac_get_sint(c, 4);
 240     int y2ac_delta = vp8_rac_get_sint(c, 4);
 241     int uvdc_delta = vp8_rac_get_sint(c, 4);
 242     int uvac_delta = vp8_rac_get_sint(c, 4);
 243
 244     for (i = 0; i < 4; i++) {
 245         if (s->segmentation.enabled) {
 246             base_qi = s->segmentation.base_quant[i];
 247             if (!s->segmentation.absolute_vals)
 248                 base_qi += yac_qi;
 249         } else
 250             base_qi = yac_qi;
 251
 252         s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 253         s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 254         s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 255         /* 101581>>16 is equivalent to 155/100 */
 256         s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
 257         s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 258         s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 259
 260         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 261         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 262     }
 263 }
 264
 265 /**
 266  * Determine which buffers golden and altref should be updated with after this frame.
 267  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 268  *
 269  * Intra frames update all 3 references
 270  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 271  * If the update (golden|altref) flag is set, it's updated with the current frame
 272  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 273  * If the flag is not set, the number read means:
 274  *      0: no update
 275  *      1: VP56_FRAME_PREVIOUS
 276  *      2: update golden with altref, or update altref with golden
 277  */
 278 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 279 {
 280     VP56RangeCoder *c = &s->c;
 281
 282     if (update)
 283         return VP56_FRAME_CURRENT;
 284
 285     switch (vp8_rac_get_uint(c, 2)) {
 286     case 1:
 287         return VP56_FRAME_PREVIOUS;
 288     case 2:
 289         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 290     }
 291     return VP56_FRAME_NONE;
 292 }
 293
 294 static void update_refs(VP8Context *s)
 295 {
 296     VP56RangeCoder *c = &s->c;
 297
 298     int update_golden = vp8_rac_get(c);
 299     int update_altref = vp8_rac_get(c);
 300
 301     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 302     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 303 }
 304
 /**
  * Parse the frame header that precedes the macroblock data.
  *
  * Reads the 3-byte frame tag (keyframe flag, profile, visibility, header
  * size), the extra 7 bytes present on keyframes (start code, dimensions,
  * scaling bits) and then the range-coded header fields in bitstream order:
  * segmentation, loop filter, partition layout, quantizers, reference
  * updates and probability updates. Buffers are (re)allocated through
  * update_dimensions() on the first frame or when the size changes.
  *
  * @param s        decoder context; updated in place
  * @param buf      start of the frame data
  * @param buf_size number of bytes available at buf
  * @return 0 on success, AVERROR_INVALIDDATA on a malformed header, or the
  *         negative error code returned by update_dimensions()
  */
 305 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 306 {
 307     VP56RangeCoder *c = &s->c;
 308     int header_size, hscale, vscale, i, j, k, l, m, ret;
 309     int width  = s->avctx->width;
 310     int height = s->avctx->height;
 311
         /* 3-byte frame tag: bit 0 clear marks a keyframe, bits 1-3 hold the
          * profile, bit 4 clear marks an invisible frame, and the bits from
          * bit 5 upwards hold the size of the range-coded header. */
 312     s->keyframe  = !(buf[0] & 1);
 313     s->profile   =  (buf[0]>>1) & 7;
 314     s->invisible = !(buf[0] & 0x10);
 315     header_size  = AV_RL24(buf) >> 5;
 316     buf      += 3;
 317     buf_size -= 3;
 318
 319     if (s->profile > 3)
 320         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 321
         /* the profile selects the motion compensation filter set */
 322     if (!s->profile)
 323         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 324     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 325         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 326
         /* keyframes carry 7 more bytes ahead of the range-coded header */
 327     if (header_size > buf_size - 7*s->keyframe) {
 328         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 329         return AVERROR_INVALIDDATA;
 330     }
 331
 332     if (s->keyframe) {
 333         if (AV_RL24(buf) != 0x2a019d) {
 334             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 335             return AVERROR_INVALIDDATA;
 336         }
             /* 14-bit dimensions; the top two bits of each 16-bit field are
              * the scaling bits */
 337         width  = AV_RL16(buf+3) & 0x3fff;
 338         height = AV_RL16(buf+5) & 0x3fff;
 339         hscale = buf[4] >> 6;
 340         vscale = buf[6] >> 6;
 341         buf      += 7;
 342         buf_size -= 7;
 343
 344         if (hscale || vscale)
 345             av_log_missing_feature(s->avctx, "Upscaling", 1);
 346
             /* a keyframe refreshes both golden and altref and restarts from
              * the default probabilities with segmentation and loop-filter
              * deltas cleared */
 347         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 348         for (i = 0; i < 4; i++)
 349             for (j = 0; j < 16; j++)
 350                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 351                        sizeof(s->prob->token[i][j]));
 352         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 353         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 354         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 355         memset(&s->segmentation, 0, sizeof(s->segmentation));
 356         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 357     }
 358
         /* everything below is read through the range decoder; buf is moved
          * past the header so that it points at the partition data */
 359     ff_vp56_init_range_decoder(c, buf, header_size);
 360     buf      += header_size;
 361     buf_size -= header_size;
 362
 363     if (s->keyframe) {
 364         if (vp8_rac_get(c))
 365             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 366         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 367     }
 368
 369     if ((s->segmentation.enabled = vp8_rac_get(c)))
 370         parse_segment_info(s);
 371     else
 372         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 373
 374     s->filter.simple    = vp8_rac_get(c);
 375     s->filter.level     = vp8_rac_get_uint(c, 6);
 376     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 377
         /* loop-filter deltas: an enable flag, then an "update present" flag */
 378     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 379         if (vp8_rac_get(c))
 380             update_lf_deltas(s);
 381
 382     if (setup_partitions(s, buf, buf_size)) {
 383         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 384         return AVERROR_INVALIDDATA;
 385     }
 386
 387     if (!s->macroblocks_base || /* first frame */
 388         width != s->avctx->width || height != s->avctx->height || (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) {
 389         if ((ret = update_dimensions(s, width, height)) < 0)
 390             return ret;
 391     }
 392
 393     get_quants(s);
 394
 395     if (!s->keyframe) {
 396         update_refs(s);
 397         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 398         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 399     }
 400
 401     // if we aren't saving this frame's probabilities for future frames,
 402     // make a copy of the current probabilities
 403     if (!(s->update_probabilities = vp8_rac_get(c)))
 404         s->prob[1] = s->prob[0];
 405
 406     s->update_last = s->keyframe || vp8_rac_get(c);
 407
         /* token probability updates: each transmitted value is written to
          * every coefficient position listed for band j in
          * vp8_coeff_band_indexes */
 408     for (i = 0; i < 4; i++)
 409         for (j = 0; j < 8; j++)
 410             for (k = 0; k < 3; k++)
 411                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 412                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 413                         int prob = vp8_rac_get_uint(c, 8);
 414                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 415                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 416                     }
 417
 418     if ((s->mbskip_enabled = vp8_rac_get(c)))
 419         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 420
         /* the remaining fields exist on inter frames only */
 421     if (!s->keyframe) {
 422         s->prob->intra  = vp8_rac_get_uint(c, 8);
 423         s->prob->last   = vp8_rac_get_uint(c, 8);
 424         s->prob->golden = vp8_rac_get_uint(c, 8);
 425
 426         if (vp8_rac_get(c))
 427             for (i = 0; i < 4; i++)
 428                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 429         if (vp8_rac_get(c))
 430             for (i = 0; i < 3; i++)
 431                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 432
 433         // 17.2 MV probability update
 434         for (i = 0; i < 2; i++)
 435             for (j = 0; j < 19; j++)
 436                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 437                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 438     }
 439
 440     return 0;
 441 }
 442
 443 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 444 {
 445     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 446     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 447 }
 448
 449 /**
 450  * Motion vector coding, 17.1.
 451  */
 452 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 453 {
 454     int bit, x = 0;
 455
 456     if (vp56_rac_get_prob_branchy(c, p[0])) {
 457         int i;
 458
 459         for (i = 0; i < 3; i++)
 460             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 461         for (i = 9; i > 3; i--)
 462             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 463         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 464             x += 8;
 465     } else {
 466         // small_mvtree
 467         const uint8_t *ps = p+2;
 468         bit = vp56_rac_get_prob(c, *ps);
 469         ps += 1 + 3*bit;
 470         x  += 4*bit;
 471         bit = vp56_rac_get_prob(c, *ps);
 472         ps += 1 + bit;
 473         x  += 2*bit;
 474         x  += vp56_rac_get_prob(c, *ps);
 475     }
 476
 477     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 478 }
 479
 480 static av_always_inline
 481 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 482 {
 483     if (left == top)
 484         return vp8_submv_prob[4-!!left];
 485     if (!top)
 486         return vp8_submv_prob[2];
 487     return vp8_submv_prob[1-!!left];
 488 }
 489
 490 /**
 491  * Split motion vector prediction, 16.4.
 492  * @returns the number of motion vectors parsed (2, 4 or 16)
 493  */
 494 static av_always_inline
 495 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 496 {
 497     int part_idx;
 498     int n, num;
 499     VP8Macroblock *top_mb;
 500     VP8Macroblock *left_mb = &mb[-1];
 501     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 502                   *mbsplits_top,
 503                   *mbsplits_cur, *firstidx;
 504     VP56mv *top_mv;
 505     VP56mv *left_mv = left_mb->bmv;
 506     VP56mv *cur_mv  = mb->bmv;
 507
 508     if (!layout) // layout is inlined, s->mb_layout is not
 509         top_mb = &mb[2];
 510     else
 511         top_mb = &mb[-s->mb_width-1];
 512     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 513     top_mv = top_mb->bmv;
 514
 515     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 516         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 517             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 518         } else {
 519             part_idx = VP8_SPLITMVMODE_8x8;
 520         }
 521     } else {
 522         part_idx = VP8_SPLITMVMODE_4x4;
 523     }
 524
 525     num = vp8_mbsplit_count[part_idx];
 526     mbsplits_cur = vp8_mbsplits[part_idx],
 527     firstidx = vp8_mbfirstidx[part_idx];
 528     mb->partitioning = part_idx;
 529
 530     for (n = 0; n < num; n++) {
 531         int k = firstidx[n];
 532         uint32_t left, above;
 533         const uint8_t *submv_prob;
 534
 535         if (!(k & 3))
 536             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 537         else
 538             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 539         if (k <= 3)
 540             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 541         else
 542             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 543
 544         submv_prob = get_submv_prob(left, above);
 545
 546         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 547             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 548                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 549                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 550                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 551                 } else {
 552                     AV_ZERO32(&mb->bmv[n]);
 553                 }
 554             } else {
 555                 AV_WN32A(&mb->bmv[n], above);
 556             }
 557         } else {
 558             AV_WN32A(&mb->bmv[n], left);
 559         }
 560     }
 561
 562     return num;
 563 }
 564
 /**
  * Decode the inter prediction mode and motion vector of one macroblock.
  *
  * Collects candidate vectors from the top, left and top-left neighbours,
  * weights them, and uses the resulting counts as contexts for reading the
  * mode from s->c. Writes mb->mode, mb->mv, mb->bmv[] and mb->partitioning.
  *
  * @param s      decoder context
  * @param mb     macroblock being decoded; mb->ref_frame must already be set
  * @param mb_x   not referenced in this function
  * @param mb_y   not referenced in this function
  * @param layout macroblock array layout; selects where the top and top-left
  *               neighbours are found relative to mb
  */
 565 static av_always_inline
 566 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
 567 {
 568     VP8Macroblock *mb_edge[3] = { 0 /* top */,
 569                                   mb - 1 /* left */,
 570                                   0 /* top-left */ };
 571     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 572     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
         /* idx is the slot of the most recently stored candidate in near_mv[];
          * slot 0 stays the zero vector */
 573     int idx = CNT_ZERO;
 574     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 575     int8_t *sign_bias = s->sign_bias;
 576     VP56mv near_mv[4];
 577     uint8_t cnt[4] = { 0 };
 578     VP56RangeCoder *c = &s->c;
 579
 580     if (!layout) { // layout is inlined (s->mb_layout is not)
 581         mb_edge[0] = mb + 2;
 582         mb_edge[2] = mb + 1;
 583     }
 584     else {
 585         mb_edge[0] = mb - s->mb_width-1;
 586         mb_edge[2] = mb - s->mb_width-2;
 587     }
 588
 589     AV_ZERO32(&near_mv[0]);
 590     AV_ZERO32(&near_mv[1]);
 591     AV_ZERO32(&near_mv[2]);
 592
 593     /* Process MB on top, left and top-left */
         /* Neighbours whose ref_frame is VP56_FRAME_CURRENT are skipped. A
          * non-zero vector is negated when the sign biases differ, then either
          * merged into the current candidate or stored as a new one. Top and
          * left weigh 2, top-left weighs 1. */
 594     #define MV_EDGE_CHECK(n)\
 595     {\
 596         VP8Macroblock *edge = mb_edge[n];\
 597         int edge_ref = edge->ref_frame;\
 598         if (edge_ref != VP56_FRAME_CURRENT) {\
 599             uint32_t mv = AV_RN32A(&edge->mv);\
 600             if (mv) {\
 601                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 602                     /* SWAR negate of the values in mv. */\
 603                     mv = ~mv;\
 604                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 605                 }\
 606                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 607                     AV_WN32A(&near_mv[++idx], mv);\
 608                 cnt[idx]      += 1 + (n != 2);\
 609             } else\
 610                 cnt[CNT_ZERO] += 1 + (n != 2);\
 611         }\
 612     }
 613
 614     MV_EDGE_CHECK(0)
 615     MV_EDGE_CHECK(1)
 616     MV_EDGE_CHECK(2)
 617
 618     mb->partitioning = VP8_SPLITMVMODE_NONE;
         /* mode tree: each count indexes vp8_mode_contexts for one branch;
          * the "not taken" outcomes are zero, nearest and near, in that order */
 619     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 620         mb->mode = VP8_MVMODE_MV;
 621
 622         /* If we have three distinct MVs, merge first and last if they're the same */
 623         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 624             cnt[CNT_NEAREST] += 1;
 625
 626         /* Swap near and nearest if necessary */
 627         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 628             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 629             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 630         }
 631
 632         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 633             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 634
 635                 /* Choose the best mv out of 0,0 and the nearest mv */
 636                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                     /* cnt[CNT_SPLITMV] is reused here as the split-mode
                      * context: left and top count 2 each, top-left counts 1 */
 637                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 638                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 639                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 640
 641                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 642                     mb->mode = VP8_MVMODE_SPLIT;
                         /* the macroblock vector becomes the last split vector */
 643                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
 644                 } else {
                         /* new vector, coded as an offset from the clamped best mv */
 645                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 646                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 647                     mb->bmv[0] = mb->mv;
 648                 }
 649             } else {
 650                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 651                 mb->bmv[0] = mb->mv;
 652             }
 653         } else {
 654             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 655             mb->bmv[0] = mb->mv;
 656         }
 657     } else {
 658         mb->mode = VP8_MVMODE_ZERO;
 659         AV_ZERO32(&mb->mv);
 660         mb->bmv[0] = mb->mv;
 661     }
 662 }
 663
 664 static av_always_inline
 665 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 666                            int mb_x, int keyframe, int layout)
 667 {
 668     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 669
 670     if (layout == 1) {
 671         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 672         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 673     }
 674     if (keyframe) {
 675         int x, y;
 676         uint8_t* top;
 677         uint8_t* const left = s->intra4x4_pred_mode_left;
 678         if (layout == 1)
 679             top = mb->intra4x4_pred_mode_top;
 680         else
 681             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 682         for (y = 0; y < 4; y++) {
 683             for (x = 0; x < 4; x++) {
 684                 const uint8_t *ctx;
 685                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 686                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 687                 left[y] = top[x] = *intra4x4;
 688                 intra4x4++;
 689             }
 690         }
 691     } else {
 692         int i;
 693         for (i = 0; i < 16; i++)
 694             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 695     }
 696 }
 697
 698 static av_always_inline
 699 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 700                     uint8_t *segment, uint8_t *ref, int layout)
 701 {
 702     VP56RangeCoder *c = &s->c;
 703
 704     if (s->segmentation.update_map) {
 705         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
 706         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
 707     } else if (s->segmentation.enabled)
 708         *segment = ref ? *ref : *segment;
 709     mb->segment = *segment;
 710
 711     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 712
 713     if (s->keyframe) {
 714         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 715
 716         if (mb->mode == MODE_I4x4) {
 717             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 718         } else {
 719             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 720             if (s->mb_layout == 1)
 721                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 722             else
 723                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 724             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 725         }
 726
 727         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 728         mb->ref_frame = VP56_FRAME_CURRENT;
 729     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 730         // inter MB, 16.2
 731         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 732             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 733                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 734         else
 735             mb->ref_frame = VP56_FRAME_PREVIOUS;
 736         s->ref_count[mb->ref_frame-1]++;
 737
 738         // motion vectors, 16.3
 739         decode_mvs(s, mb, mb_x, mb_y, layout);
 740     } else {
 741         // intra MB, 16.1
 742         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 743
 744         if (mb->mode == MODE_I4x4)
 745             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 746
 747         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 748         mb->ref_frame = VP56_FRAME_CURRENT;
 749         mb->partitioning = VP8_SPLITMVMODE_NONE;
 750         AV_ZERO32(&mb->bmv[0]);
 751     }
 752 }
 753
 754 #ifndef decode_block_coeffs_internal
 755 /**
      * Decode the coefficient tokens of one 4x4 block, starting after an
      * end-of-block check that has already been passed.
      *
 756  * @param r arithmetic bitstream reader context
 757  * @param block destination for block coefficients
 758  * @param probs probabilities to use when reading trees from the bitstream
 759  * @param i initial coeff index, 0 unless a separate DC block is coded
      * @param token_prob probabilities for the token at index i; element 0
      *                   (the end-of-block check) is not read for the first
      *                   token because decoding starts at skip_eob
 760  * @param qmul array holding the dc/ac dequant factor at position 0/1
 761  * @return 0 if no coeffs were decoded
 762  *         otherwise, the index of the last coeff decoded plus one
 763  */
 764 static int decode_block_coeffs_internal(VP56RangeCoder *r, DCTELEM block[16],
 765                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 766                                         int i, uint8_t *token_prob, int16_t qmul[2])
 767 {
         /* work on a local copy of the coder state; it is written back to *r
          * on exit */
 768     VP56RangeCoder c = *r;
         /* enter the loop past the end-of-block check for the first token */
 769     goto skip_eob;
 770     do {
 771         int coeff;
 772         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
 773             break;
 774
 775 skip_eob:
             /* after a zero token the next token has no end-of-block check,
              * hence the jump back to skip_eob */
 776         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
 777             if (++i == 16)
 778                 break; // invalid input; blocks should end with EOB
 779             token_prob = probs[i][0];
 780             goto skip_eob;
 781         }
 782
             /* the context for the next position is 1 after a coefficient of
              * magnitude 1 and 2 after a larger one */
 783         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
 784             coeff = 1;
 785             token_prob = probs[i+1][1];
 786         } else {
 787             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
 788                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
 789                 if (coeff)
 790                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
 791                 coeff += 2;
 792             } else {
 793                 // DCT_CAT*
 794                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
 795                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
 796                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
 797                     } else {                                    // DCT_CAT2
 798                         coeff  = 7;
 799                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
 800                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
 801                     }
 802                 } else {    // DCT_CAT3 and up
                         /* two bits select the category (0..3); the base value
                          * is 3 + (8 << cat) and the extra bits are read with
                          * that category's probability table */
 803                     int a = vp56_rac_get_prob(&c, token_prob[8]);
 804                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
 805                     int cat = (a<<1) + b;
 806                     coeff  = 3 + (8<<cat);
 807                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
 808                 }
 809             }
 810             token_prob = probs[i+1][2];
 811         }
             /* read the sign, then dequantize: qmul[0] for index 0, qmul[1]
              * for all other indices */
 812         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
 813     } while (++i < 16);
 814
 815     *r = c;
 816     return i;
 817 }
 818 #endif
 819
 820 /**
 821  * @param c arithmetic bitstream reader context
 822  * @param block destination for block coefficients
 823  * @param probs probabilities to use when reading trees from the bitstream
 824  * @param i initial coeff index, 0 unless a separate DC block is coded
 825  * @param zero_nhood the initial prediction context for number of surrounding
 826  *                   all-zero blocks (only left/top, so 0-2)
 827  * @param qmul array holding the dc/ac dequant factor at position 0/1
 828  * @return 0 if no coeffs were decoded
 829  *         otherwise, the index of the last coeff decoded plus one
 830  */
 831 static av_always_inline
 832 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 833                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 834                         int i, int zero_nhood, int16_t qmul[2])
 835 {
 836     uint8_t *token_prob = probs[i][zero_nhood];
 837     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 838         return 0;
 839     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 840 }
 841
 842 static av_always_inline
 843 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 844                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 845 {
 846     int i, x, y, luma_start = 0, luma_ctx = 3;
 847     int nnz_pred, nnz, nnz_total = 0;
 848     int segment = mb->segment;
 849     int block_dc = 0;
 850
 851     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 852         nnz_pred = t_nnz[8] + l_nnz[8];
 853
 854         // decode DC values and do hadamard
 855         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 856                                   s->qmat[segment].luma_dc_qmul);
 857         l_nnz[8] = t_nnz[8] = !!nnz;
 858         if (nnz) {
 859             nnz_total += nnz;
 860             block_dc = 1;
 861             if (nnz == 1)
 862                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 863             else
 864                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 865         }
 866         luma_start = 1;
 867         luma_ctx = 0;
 868     }
 869
 870     // luma blocks
 871     for (y = 0; y < 4; y++)
 872         for (x = 0; x < 4; x++) {
 873             nnz_pred = l_nnz[y] + t_nnz[x];
 874             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 875                                       nnz_pred, s->qmat[segment].luma_qmul);
 876             // nnz+block_dc may be one more than the actual last index, but we don't care
 877             td->non_zero_count_cache[y][x] = nnz + block_dc;
 878             t_nnz[x] = l_nnz[y] = !!nnz;
 879             nnz_total += nnz;
 880         }
 881
 882     // chroma blocks
 883     // TODO: what to do about dimensions? 2nd dim for luma is x,
 884     // but for chroma it's (y<<1)|x
 885     for (i = 4; i < 6; i++)
 886         for (y = 0; y < 2; y++)
 887             for (x = 0; x < 2; x++) {
 888                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 889                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 890                                           nnz_pred, s->qmat[segment].chroma_qmul);
 891                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 892                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 893                 nnz_total += nnz;
 894             }
 895
 896     // if there were no coded coeffs despite the macroblock not being marked skip,
 897     // we MUST not do the inner loop filter and should not do IDCT
 898     // Since skip isn't used for bitstream prediction, just manually set it.
 899     if (!nnz_total)
 900         mb->skip = 1;
 901 }
 902
 903 static av_always_inline
 904 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 905                       int linesize, int uvlinesize, int simple)
 906 {
 907     AV_COPY128(top_border, src_y + 15*linesize);
 908     if (!simple) {
 909         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 910         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 911     }
 912 }
 913
 914 static av_always_inline
 915 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 916                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 917                     int simple, int xchg)
 918 {
 919     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 920     src_y  -=   linesize;
 921     src_cb -= uvlinesize;
 922     src_cr -= uvlinesize;
 923
 924 #define XCHG(a,b,xchg) do {                     \
 925         if (xchg) AV_SWAP64(b,a);               \
 926         else      AV_COPY64(b,a);               \
 927     } while (0)
 928
 929     XCHG(top_border_m1+8, src_y-8, xchg);
 930     XCHG(top_border,      src_y,   xchg);
 931     XCHG(top_border+8,    src_y+8, 1);
 932     if (mb_x < mb_width-1)
 933         XCHG(top_border+32, src_y+16, 1);
 934
 935     // only copy chroma for normal loop filter
 936     // or to initialize the top row to 127
 937     if (!simple || !mb_y) {
 938         XCHG(top_border_m1+16, src_cb-8, xchg);
 939         XCHG(top_border_m1+24, src_cr-8, xchg);
 940         XCHG(top_border+16,    src_cb, 1);
 941         XCHG(top_border+24,    src_cr, 1);
 942     }
 943 }
 944
 945 static av_always_inline
 946 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 947 {
 948     if (!mb_x) {
 949         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 950     } else {
 951         return mb_y ? mode : LEFT_DC_PRED8x8;
 952     }
 953 }
 954
 955 static av_always_inline
 956 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 957 {
 958     if (!mb_x) {
 959         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 960     } else {
 961         return mb_y ? mode : HOR_PRED8x8;
 962     }
 963 }
 964
 965 static av_always_inline
 966 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 967 {
 968     if (mode == DC_PRED8x8) {
 969         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 970     } else {
 971         return mode;
 972     }
 973 }
 974
 975 static av_always_inline
 976 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 977 {
 978     switch (mode) {
 979     case DC_PRED8x8:
 980         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 981     case VERT_PRED8x8:
 982         return !mb_y ? DC_127_PRED8x8 : mode;
 983     case HOR_PRED8x8:
 984         return !mb_x ? DC_129_PRED8x8 : mode;
 985     case PLANE_PRED8x8 /*TM*/:
 986         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 987     }
 988     return mode;
 989 }
 990
 991 static av_always_inline
 992 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 993 {
 994     if (!mb_x) {
 995         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 996     } else {
 997         return mb_y ? mode : HOR_VP8_PRED;
 998     }
 999 }
1000
1001 static av_always_inline
1002 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1003 {
1004     switch (mode) {
1005     case VERT_PRED:
1006         if (!mb_x && mb_y) {
1007             *copy_buf = 1;
1008             return mode;
1009         }
1010         /* fall-through */
1011     case DIAG_DOWN_LEFT_PRED:
1012     case VERT_LEFT_PRED:
1013         return !mb_y ? DC_127_PRED : mode;
1014     case HOR_PRED:
1015         if (!mb_y) {
1016             *copy_buf = 1;
1017             return mode;
1018         }
1019         /* fall-through */
1020     case HOR_UP_PRED:
1021         return !mb_x ? DC_129_PRED : mode;
1022     case TM_VP8_PRED:
1023         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1024     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1025     case DIAG_DOWN_RIGHT_PRED:
1026     case VERT_RIGHT_PRED:
1027     case HOR_DOWN_PRED:
1028         if (!mb_y || !mb_x)
1029             *copy_buf = 1;
1030         return mode;
1031     }
1032     return mode;
1033 }
1034
1035 static av_always_inline
1036 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1037                    VP8Macroblock *mb, int mb_x, int mb_y)
1038 {
1039     AVCodecContext *avctx = s->avctx;
1040     int x, y, mode, nnz;
1041     uint32_t tr;
1042
1043     // for the first row, we need to run xchg_mb_border to init the top edge to 127
1044     // otherwise, skip it if we aren't going to deblock
1045     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1046         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1047                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1048                        s->filter.simple, 1);
1049
1050     if (mb->mode < MODE_I4x4) {
1051         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
1052             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
1053         } else {
1054             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1055         }
1056         s->hpc.pred16x16[mode](dst[0], s->linesize);
1057     } else {
1058         uint8_t *ptr = dst[0];
1059         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1060         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1061
1062         // all blocks on the right edge of the macroblock use bottom edge
1063         // the top macroblock for their topright edge
1064         uint8_t *tr_right = ptr - s->linesize + 16;
1065
1066         // if we're on the right edge of the frame, said edge is extended
1067         // from the top macroblock
1068         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1069             mb_x == s->mb_width-1) {
1070             tr = tr_right[-1]*0x01010101u;
1071             tr_right = (uint8_t *)&tr;
1072         }
1073
1074         if (mb->skip)
1075             AV_ZERO128(td->non_zero_count_cache);
1076
1077         for (y = 0; y < 4; y++) {
1078             uint8_t *topright = ptr + 4 - s->linesize;
1079             for (x = 0; x < 4; x++) {
1080                 int copy = 0, linesize = s->linesize;
1081                 uint8_t *dst = ptr+4*x;
1082                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1083
1084                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1085                     topright = tr_top;
1086                 } else if (x == 3)
1087                     topright = tr_right;
1088
1089                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1090                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1091                     if (copy) {
1092                         dst = copy_dst + 12;
1093                         linesize = 8;
1094                         if (!(mb_y + y)) {
1095                             copy_dst[3] = 127U;
1096                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1097                         } else {
1098                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1099                             if (!(mb_x + x)) {
1100                                 copy_dst[3] = 129U;
1101                             } else {
1102                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1103                             }
1104                         }
1105                         if (!(mb_x + x)) {
1106                             copy_dst[11] =
1107                             copy_dst[19] =
1108                             copy_dst[27] =
1109                             copy_dst[35] = 129U;
1110                         } else {
1111                             copy_dst[11] = ptr[4*x              -1];
1112                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1113                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1114                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1115                         }
1116                     }
1117                 } else {
1118                     mode = intra4x4[x];
1119                 }
1120                 s->hpc.pred4x4[mode](dst, topright, linesize);
1121                 if (copy) {
1122                     AV_COPY32(ptr+4*x              , copy_dst+12);
1123                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1124                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1125                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1126                 }
1127
1128                 nnz = td->non_zero_count_cache[y][x];
1129                 if (nnz) {
1130                     if (nnz == 1)
1131                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
1132                     else
1133                         s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
1134                 }
1135                 topright += 4;
1136             }
1137
1138             ptr   += 4*s->linesize;
1139             intra4x4 += 4;
1140         }
1141     }
1142
1143     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1144         mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
1145     } else {
1146         mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
1147     }
1148     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1149     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1150
1151     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1152         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1153                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1154                        s->filter.simple, 0);
1155 }
1156
/* Per sub-pel position (fractional part of a motion vector, 0-7):
 *   [0] nr. of left extra pixels, also function pointer index
 *   [1] nr. of extra pixels required
 *   [2] nr. of right extra pixels
 */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1163
1164 /**
1165  * luma MC function
1166  *
1167  * @param s VP8 decoding context
1168  * @param dst target buffer for block data at block position
1169  * @param ref reference picture buffer at origin (0, 0)
1170  * @param mv motion vector (relative to block position) to get pixel data from
1171  * @param x_off horizontal position of block from origin (0, 0)
1172  * @param y_off vertical position of block from origin (0, 0)
1173  * @param block_w width of block (16, 8 or 4)
1174  * @param block_h height of block (always same as block_w)
1175  * @param width width of src/dst plane data
1176  * @param height height of src/dst plane data
1177  * @param linesize size of a single line of plane data, including padding
1178  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1179  */
1180 static av_always_inline
1181 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1182                  AVFrame *ref, const VP56mv *mv,
1183                  int x_off, int y_off, int block_w, int block_h,
1184                  int width, int height, int linesize,
1185                  vp8_mc_func mc_func[3][3])
1186 {
1187     uint8_t *src = ref->data[0];
1188
1189     if (AV_RN32A(mv)) {
1190
1191         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1192         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1193
1194         x_off += mv->x >> 2;
1195         y_off += mv->y >> 2;
1196
1197         // edge emulation
1198         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1199         src += y_off * linesize + x_off;
1200         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1201             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1202             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1203                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1204                                      x_off - mx_idx, y_off - my_idx, width, height);
1205             src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1206         }
1207         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1208     } else {
1209         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1210         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1211     }
1212 }
1213
1214 /**
1215  * chroma MC function
1216  *
1217  * @param s VP8 decoding context
1218  * @param dst1 target buffer for block data at block position (U plane)
1219  * @param dst2 target buffer for block data at block position (V plane)
1220  * @param ref reference picture buffer at origin (0, 0)
1221  * @param mv motion vector (relative to block position) to get pixel data from
1222  * @param x_off horizontal position of block from origin (0, 0)
1223  * @param y_off vertical position of block from origin (0, 0)
1224  * @param block_w width of block (16, 8 or 4)
1225  * @param block_h height of block (always same as block_w)
1226  * @param width width of src/dst plane data
1227  * @param height height of src/dst plane data
1228  * @param linesize size of a single line of plane data, including padding
1229  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1230  */
1231 static av_always_inline
1232 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1233                    AVFrame *ref, const VP56mv *mv, int x_off, int y_off,
1234                    int block_w, int block_h, int width, int height, int linesize,
1235                    vp8_mc_func mc_func[3][3])
1236 {
1237     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1238
1239     if (AV_RN32A(mv)) {
1240         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1241         int my = mv->y&7, my_idx = subpel_idx[0][my];
1242
1243         x_off += mv->x >> 3;
1244         y_off += mv->y >> 3;
1245
1246         // edge emulation
1247         src1 += y_off * linesize + x_off;
1248         src2 += y_off * linesize + x_off;
1249         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1250         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1251             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1252             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1253                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1254                                      x_off - mx_idx, y_off - my_idx, width, height);
1255             src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1256             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1257
1258             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1259                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1260                                      x_off - mx_idx, y_off - my_idx, width, height);
1261             src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1262             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1263         } else {
1264             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1265             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1266         }
1267     } else {
1268         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1269         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1270         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1271     }
1272 }
1273
1274 static av_always_inline
1275 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1276                  AVFrame *ref_frame, int x_off, int y_off,
1277                  int bx_off, int by_off,
1278                  int block_w, int block_h,
1279                  int width, int height, VP56mv *mv)
1280 {
1281     VP56mv uvmv = *mv;
1282
1283     /* Y */
1284     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1285                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1286                 block_w, block_h, width, height, s->linesize,
1287                 s->put_pixels_tab[block_w == 8]);
1288
1289     /* U/V */
1290     if (s->profile == 3) {
1291         uvmv.x &= ~7;
1292         uvmv.y &= ~7;
1293     }
1294     x_off   >>= 1; y_off   >>= 1;
1295     bx_off  >>= 1; by_off  >>= 1;
1296     width   >>= 1; height  >>= 1;
1297     block_w >>= 1; block_h >>= 1;
1298     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1299                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1300                   &uvmv, x_off + bx_off, y_off + by_off,
1301                   block_w, block_h, width, height, s->uvlinesize,
1302                   s->put_pixels_tab[1 + (block_w == 4)]);
1303 }
1304
1305 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1306  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1307 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1308 {
1309     /* Don't prefetch refs that haven't been used very often this frame. */
1310     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1311         int x_off = mb_x << 4, y_off = mb_y << 4;
1312         int mx = (mb->mv.x>>2) + x_off + 8;
1313         int my = (mb->mv.y>>2) + y_off;
1314         uint8_t **src= s->framep[ref]->data;
1315         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1316         /* For threading, a ff_thread_await_progress here might be useful, but
1317          * it actually slows down the decoder. Since a bad prefetch doesn't
1318          * generate bad decoder output, we don't run it here. */
1319         s->vdsp.prefetch(src[0]+off, s->linesize, 4);
1320         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1321         s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
1322     }
1323 }
1324
1325 /**
1326  * Apply motion vectors to prediction buffer, chapter 18.
1327  */
1328 static av_always_inline
1329 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1330                    VP8Macroblock *mb, int mb_x, int mb_y)
1331 {
1332     int x_off = mb_x << 4, y_off = mb_y << 4;
1333     int width = 16*s->mb_width, height = 16*s->mb_height;
1334     AVFrame *ref = s->framep[mb->ref_frame];
1335     VP56mv *bmv = mb->bmv;
1336
1337     switch (mb->partitioning) {
1338     case VP8_SPLITMVMODE_NONE:
1339         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1340                     0, 0, 16, 16, width, height, &mb->mv);
1341         break;
1342     case VP8_SPLITMVMODE_4x4: {
1343         int x, y;
1344         VP56mv uvmv;
1345
1346         /* Y */
1347         for (y = 0; y < 4; y++) {
1348             for (x = 0; x < 4; x++) {
1349                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1350                             ref, &bmv[4*y + x],
1351                             4*x + x_off, 4*y + y_off, 4, 4,
1352                             width, height, s->linesize,
1353                             s->put_pixels_tab[2]);
1354             }
1355         }
1356
1357         /* U/V */
1358         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1359         for (y = 0; y < 2; y++) {
1360             for (x = 0; x < 2; x++) {
1361                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1362                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1363                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1364                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1365                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1366                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1367                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1368                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1369                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1370                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1371                 if (s->profile == 3) {
1372                     uvmv.x &= ~7;
1373                     uvmv.y &= ~7;
1374                 }
1375                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1376                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1377                               4*x + x_off, 4*y + y_off, 4, 4,
1378                               width, height, s->uvlinesize,
1379                               s->put_pixels_tab[2]);
1380             }
1381         }
1382         break;
1383     }
1384     case VP8_SPLITMVMODE_16x8:
1385         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1386                     0, 0, 16, 8, width, height, &bmv[0]);
1387         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1388                     0, 8, 16, 8, width, height, &bmv[1]);
1389         break;
1390     case VP8_SPLITMVMODE_8x16:
1391         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1392                     0, 0, 8, 16, width, height, &bmv[0]);
1393         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1394                     8, 0, 8, 16, width, height, &bmv[1]);
1395         break;
1396     case VP8_SPLITMVMODE_8x8:
1397         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1398                     0, 0, 8, 8, width, height, &bmv[0]);
1399         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1400                     8, 0, 8, 8, width, height, &bmv[1]);
1401         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1402                     0, 8, 8, 8, width, height, &bmv[2]);
1403         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1404                     8, 8, 8, 8, width, height, &bmv[3]);
1405         break;
1406     }
1407 }
1408
1409 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1410                                      uint8_t *dst[3], VP8Macroblock *mb)
1411 {
1412     int x, y, ch;
1413
1414     if (mb->mode != MODE_I4x4) {
1415         uint8_t *y_dst = dst[0];
1416         for (y = 0; y < 4; y++) {
1417             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1418             if (nnz4) {
1419                 if (nnz4&~0x01010101) {
1420                     for (x = 0; x < 4; x++) {
1421                         if ((uint8_t)nnz4 == 1)
1422                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1423                         else if((uint8_t)nnz4 > 1)
1424                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1425                         nnz4 >>= 8;
1426                         if (!nnz4)
1427                             break;
1428                     }
1429                 } else {
1430                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1431                 }
1432             }
1433             y_dst += 4*s->linesize;
1434         }
1435     }
1436
1437     for (ch = 0; ch < 2; ch++) {
1438         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1439         if (nnz4) {
1440             uint8_t *ch_dst = dst[1+ch];
1441             if (nnz4&~0x01010101) {
1442                 for (y = 0; y < 2; y++) {
1443                     for (x = 0; x < 2; x++) {
1444                         if ((uint8_t)nnz4 == 1)
1445                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1446                         else if((uint8_t)nnz4 > 1)
1447                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1448                         nnz4 >>= 8;
1449                         if (!nnz4)
1450                             goto chroma_idct_end;
1451                     }
1452                     ch_dst += 4*s->uvlinesize;
1453                 }
1454             } else {
1455                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1456             }
1457         }
1458 chroma_idct_end: ;
1459     }
1460 }
1461
1462 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1463 {
1464     int interior_limit, filter_level;
1465
1466     if (s->segmentation.enabled) {
1467         filter_level = s->segmentation.filter_level[mb->segment];
1468         if (!s->segmentation.absolute_vals)
1469             filter_level += s->filter.level;
1470     } else
1471         filter_level = s->filter.level;
1472
1473     if (s->lf_delta.enabled) {
1474         filter_level += s->lf_delta.ref[mb->ref_frame];
1475         filter_level += s->lf_delta.mode[mb->mode];
1476     }
1477
1478     filter_level = av_clip_uintp2(filter_level, 6);
1479
1480     interior_limit = filter_level;
1481     if (s->filter.sharpness) {
1482         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1483         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1484     }
1485     interior_limit = FFMAX(interior_limit, 1);
1486
1487     f->filter_level = filter_level;
1488     f->inner_limit = interior_limit;
1489     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1490 }
1491
/**
 * Run the normal (non-simple) loop filter over one macroblock.
 *
 * The left and top macroblock edges are filtered only when mb_x / mb_y are
 * non-zero. When f->inner_filter is set, the inner edges at offsets 4, 8 and
 * 12 of the 16-wide plane (dst[0]) and at offset 4 of the two 8-wide planes
 * (dst[1], dst[2]) are filtered as well.
 *
 * @param s     decoder context; supplies line sizes, the keyframe flag and
 *              the vp8dsp function pointers
 * @param dst   pointers to the top-left sample of this macroblock in the
 *              three planes
 * @param f     filter strength entry for this macroblock
 * @param mb_x  macroblock column; 0 skips the left macroblock edge
 * @param mb_y  macroblock row; 0 skips the top macroblock edge
 */
1492 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1493 {
1494     int mbedge_lim, bedge_lim, hev_thresh;
1495     int filter_level = f->filter_level;
1496     int inner_limit = f->inner_limit;
1497     int inner_filter = f->inner_filter;
1498     int linesize = s->linesize;
1499     int uvlinesize = s->uvlinesize;
         // High-edge-variance threshold, looked up as [s->keyframe][filter_level].
         // NOTE(review): only 64 entries per row, so filter_level is presumably
         // limited to 0..63 by the caller -- confirm.
1500     static const uint8_t hev_thresh_lut[2][64] = {
1501         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1502           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1503           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1504           3, 3, 3, 3 },
1505         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1506           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1507           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1508           2, 2, 2, 2 }
1509     };
1510
         // A zero filter level means this macroblock is not filtered at all.
1511     if (!filter_level)
1512         return;
1513
         // Inner-edge limit; macroblock edges use a limit that is 4 higher.
1514      bedge_lim = 2*filter_level + inner_limit;
1515     mbedge_lim = bedge_lim + 4;
1516
1517     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1518
         // Left macroblock edge (needs a left neighbour).
1519     if (mb_x) {
1520         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1521                                        mbedge_lim, inner_limit, hev_thresh);
1522         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1523                                        mbedge_lim, inner_limit, hev_thresh);
1524     }
1525
         // Inner edges at column offsets 4, 8, 12 (dst[0]) and 4 (dst[1], dst[2]).
1526     if (inner_filter) {
1527         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1528                                              inner_limit, hev_thresh);
1529         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1530                                              inner_limit, hev_thresh);
1531         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1532                                              inner_limit, hev_thresh);
1533         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1534                                              uvlinesize,  bedge_lim,
1535                                              inner_limit, hev_thresh);
1536     }
1537
         // Top macroblock edge (needs a neighbour above).
1538     if (mb_y) {
1539         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1540                                        mbedge_lim, inner_limit, hev_thresh);
1541         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1542                                        mbedge_lim, inner_limit, hev_thresh);
1543     }
1544
         // Inner edges at row offsets 4, 8, 12 (dst[0]) and 4 (dst[1], dst[2]).
1545     if (inner_filter) {
1546         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1547                                              linesize,    bedge_lim,
1548                                              inner_limit, hev_thresh);
1549         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1550                                              linesize,    bedge_lim,
1551                                              inner_limit, hev_thresh);
1552         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1553                                              linesize,    bedge_lim,
1554                                              inner_limit, hev_thresh);
1555         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1556                                              dst[2] + 4 * uvlinesize,
1557                                              uvlinesize,  bedge_lim,
1558                                              inner_limit, hev_thresh);
1559     }
1560 }
1561
/**
 * Run the simple loop filter over one macroblock.
 *
 * Only the single plane pointed to by dst is filtered. The edge selection
 * mirrors filter_mb(): left/top macroblock edges when mb_x / mb_y are
 * non-zero, plus the inner edges at offsets 4, 8 and 12 when
 * f->inner_filter is set.
 *
 * @param s     decoder context; supplies the line size and vp8dsp functions
 * @param dst   pointer to the top-left sample of this macroblock
 * @param f     filter strength entry for this macroblock
 * @param mb_x  macroblock column; 0 skips the left macroblock edge
 * @param mb_y  macroblock row; 0 skips the top macroblock edge
 */
1562 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1563 {
1564     int mbedge_lim, bedge_lim;
1565     int filter_level = f->filter_level;
1566     int inner_limit = f->inner_limit;
1567     int inner_filter = f->inner_filter;
1568     int linesize = s->linesize;
1569
         // A zero filter level means this macroblock is not filtered at all.
1570     if (!filter_level)
1571         return;
1572
         // Inner-edge limit; macroblock edges use a limit that is 4 higher.
1573      bedge_lim = 2*filter_level + inner_limit;
1574     mbedge_lim = bedge_lim + 4;
1575
         // Left macroblock edge, then inner edges at column offsets 4, 8, 12.
1576     if (mb_x)
1577         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1578     if (inner_filter) {
1579         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1580         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1581         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1582     }
1583
         // Top macroblock edge, then inner edges at row offsets 4, 8, 12.
1584     if (mb_y)
1585         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1586     if (inner_filter) {
1587         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1588         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1589         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1590     }
1591 }
1592
/**
 * Free segmentation maps that were queued for release.
 *
 * Maps are freed from the end of s->segmentation_maps until at most
 * `leave_behind` entries remain, then s->maps_are_invalid is cleared.
 *
 * @param s         decoder context owning the queue
 * @param is_close  non-zero frees every queued map; zero keeps one map
 *                  queued unless s->maps_are_invalid is set
 */
1593 static void release_queued_segmaps(VP8Context *s, int is_close)
1594 {
         // 0 when closing or when the maps are invalid, otherwise 1.
1595     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1596     while (s->num_maps_to_be_freed > leave_behind)
1597         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1598     s->maps_are_invalid = 0;
1599 }
1600
/* Slack (value 64) added on every side of the motion-vector clamping range.
 * NOTE(review): presumably 16 pixels in quarter-pel units -- confirm. */
1601 #define MARGIN (16 << 2)
/**
 * Decode the modes/motion vectors of every macroblock of the frame in one
 * pass, by calling decode_mb_mode() for each macroblock in raster order.
 *
 * The caller in this file invokes it only when s->mb_layout == 1.
 * s->mv_min / s->mv_max are updated as the scan advances: each step to the
 * next macroblock column or row lowers the corresponding bounds by 64.
 *
 * @param avctx       codec context; priv_data is the VP8Context
 * @param curframe    frame being decoded; its ref_index[0] is passed per macroblock
 * @param prev_frame  previous frame or NULL; its ref_index[0], when present,
 *                    is passed per macroblock, otherwise NULL is passed
 */
1602 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, AVFrame *curframe,
1603                                    AVFrame *prev_frame)
1604 {
1605     VP8Context *s = avctx->priv_data;
1606     int mb_x, mb_y;
1607
1608     s->mv_min.y = -MARGIN;
1609     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1610     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
             // Rows in macroblocks_base are mb_width+1 entries apart; skip one
             // extra row and one extra column to reach the first macroblock of this row.
1611         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1612         int mb_xy = mb_y*s->mb_width;
1613
             // Reset the four left intra 4x4 prediction modes to DC_PRED for each row.
1614         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1615
1616         s->mv_min.x = -MARGIN;
1617         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1618         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
                 // On the first row, initialise the "top" modes stored in the
                 // entry one row above (mb - (mb_width+1)) to DC_PRED.
1619             if (mb_y == 0)
1620                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1621             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1622                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 1);
1623             s->mv_min.x -= 64;
1624             s->mv_max.x -= 64;
1625         }
1626         s->mv_min.y -= 64;
1627         s->mv_max.y -= 64;
1628     }
1629 }
1630
1631 #if HAVE_THREADS
1632 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
1633     do {\
1634         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);\
1635         if (otd->thread_mb_pos < tmp) {\
1636             pthread_mutex_lock(&otd->lock);\
1637             td->wait_mb_pos = tmp;\
1638             do {\
1639                 if (otd->thread_mb_pos >= tmp)\
1640                     break;\
1641                 pthread_cond_wait(&otd->cond, &otd->lock);\
1642             } while (1);\
1643             td->wait_mb_pos = INT_MAX;\
1644             pthread_mutex_unlock(&otd->lock);\
1645         }\
1646     } while(0);
1647
1648 #define update_pos(td, mb_y, mb_x)\
1649     do {\
1650     int pos              = (mb_y << 16) | (mb_x & 0xFFFF);\
1651     int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
1652     int is_null          = (next_td == NULL) || (prev_td == NULL);\
1653     int pos_check        = (is_null) ? 1 :\
1654                             (next_td != td && pos >= next_td->wait_mb_pos) ||\
1655                             (prev_td != td && pos >= prev_td->wait_mb_pos);\
1656     td->thread_mb_pos = pos;\
1657     if (sliced_threading && pos_check) {\
1658         pthread_mutex_lock(&td->lock);\
1659         pthread_cond_broadcast(&td->cond);\
1660         pthread_mutex_unlock(&td->lock);\
1661     }\
1662     } while(0);
1663 #else
1664 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)
1665 #define update_pos(td, mb_y, mb_x)
1666 #endif
1667
/**
 * Decode one macroblock row (modes if needed, coefficients, prediction and
 * inverse transform) without applying the loop filter.
 *
 * The row to decode is taken from the upper 16 bits of td->thread_mb_pos.
 * When deblocking is enabled the per-macroblock filter strength is stored in
 * td->filter_strength[] for a later vp8_filter_mb_row() call.
 *
 * @param avctx     codec context; priv_data is the VP8Context
 * @param tdata     unused in this function
 * @param jobnr     job index, used to locate the neighbouring rows' thread data
 * @param threadnr  thread index, used to select this row's thread data
 */
1668 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1669                                         int jobnr, int threadnr)
1670 {
1671     VP8Context *s = avctx->priv_data;
         // NOTE(review): td is indexed by threadnr here, while
         // vp8_decode_mb_row_sliced() stores the row position in
         // thread_data[jobnr]; this appears to rely on jobnr == threadnr -- confirm.
1672     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1673     int mb_y = td->thread_mb_pos>>16;
1674     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1675     int num_jobs = s->num_jobs;
1676     AVFrame *curframe = s->curframe, *prev_frame = s->prev_frame;
         // Partition selection by masking.
         // NOTE(review): assumes num_coeff_partitions is a power of two -- confirm.
1677     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1678     VP8Macroblock *mb;
         // Start of this macroblock row in each plane (16 rows in plane 0, 8 in planes 1/2).
1679     uint8_t *dst[3] = {
1680         curframe->data[0] + 16*mb_y*s->linesize,
1681         curframe->data[1] +  8*mb_y*s->uvlinesize,
1682         curframe->data[2] +  8*mb_y*s->uvlinesize
1683     };
         // Neighbouring rows belong to the previous/next job in a ring of
         // num_jobs; the first and last rows use td itself as a "no neighbour" marker.
1684     if (mb_y == 0) prev_td = td;
1685     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1686     if (mb_y == s->mb_height-1) next_td = td;
1687     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1688     if (s->mb_layout == 1)
1689         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1690     else {
1691         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1692         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1693         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1694     }
1695
1696     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1697     // left edge of 129 for intra prediction
1698     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
             // Write 129 into the sample just left of each row: 16 rows for
             // plane 0, 8 rows for planes 1 and 2.
1699         for (i = 0; i < 3; i++)
1700             for (y = 0; y < 16>>!!i; y++)
1701                 dst[i][y*curframe->linesize[i]-1] = 129;
1702         if (mb_y == 1) {
1703             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1704         }
1705     }
1706
1707     s->mv_min.x = -MARGIN;
1708     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1709
1710     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1711         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1712         if (prev_td != td) {
1713             if (threadnr != 0) {
1714                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1715             } else {
                     // Thread 0 waits for a position offset by mb_width+3, the
                     // same offset vp8_filter_mb_row() adds when publishing.
1716                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1717             }
1718         }
1719
1720         s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1721         s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1722
             // With mb_layout == 0 the modes are decoded here, per macroblock.
1723         if (!s->mb_layout)
1724             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1725                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 0);
1726
1727         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1728
1729         if (!mb->skip)
1730             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1731
1732         if (mb->mode <= MODE_I4x4)
1733             intra_predict(s, td, dst, mb, mb_x, mb_y);
1734         else
1735             inter_predict(s, td, dst, mb, mb_x, mb_y);
1736
1737         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1738
1739         if (!mb->skip) {
1740             idct_mb(s, td, dst, mb);
1741         } else {
                 // Skipped macroblock: clear the first 8 non-zero flags on both sides.
1742             AV_ZERO64(td->left_nnz);
1743             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1744
1745             // Reset DC block predictors if they would exist if the mb had coefficients
1746             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1747                 td->left_nnz[8]     = 0;
1748                 s->top_nnz[mb_x][8] = 0;
1749             }
1750         }
1751
1752         if (s->deblock_filter)
1753             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1754
             // With several jobs, only the last thread saves the unfiltered
             // border into top_border; the single-job case does this in
             // vp8_filter_mb_row() instead.
1755         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1756             if (s->filter.simple)
1757                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1758             else
1759                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1760         }
1761
1762         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1763
1764         dst[0] += 16;
1765         dst[1] += 8;
1766         dst[2] += 8;
1767         s->mv_min.x -= 64;
1768         s->mv_max.x -= 64;
1769
             // NOTE(review): mb_x is always < s->mb_width inside this loop, so
             // the first branch below can never be taken -- confirm intent.
1770         if (mb_x == s->mb_width+1) {
1771             update_pos(td, mb_y, s->mb_width+3);
1772         } else {
1773             update_pos(td, mb_y, mb_x);
1774         }
1775     }
1776 }
1777
/**
 * Apply the loop filter to one macroblock row, using the filter strengths
 * stored in td->filter_strength[].
 *
 * The row is taken from the upper 16 bits of td->thread_mb_pos. Progress is
 * published with an x offset of mb_width+3, so filter positions compare
 * greater than every decode position of the same row.
 *
 * @param avctx     codec context; priv_data is the VP8Context
 * @param tdata     unused in this function
 * @param jobnr     job index, used to locate the neighbouring rows' thread data
 * @param threadnr  thread index, used to select this row's thread data
 */
1778 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1779                               int jobnr, int threadnr)
1780 {
1781     VP8Context *s = avctx->priv_data;
1782     VP8ThreadData *td = &s->thread_data[threadnr];
1783     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1784     AVFrame *curframe = s->curframe;
         // mb is advanced in the loop below but not otherwise used in this function.
1785     VP8Macroblock *mb;
1786     VP8ThreadData *prev_td, *next_td;
1787     uint8_t *dst[3] = {
1788         curframe->data[0] + 16*mb_y*s->linesize,
1789         curframe->data[1] +  8*mb_y*s->uvlinesize,
1790         curframe->data[2] +  8*mb_y*s->uvlinesize
1791     };
1792
1793     if (s->mb_layout == 1)
1794         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1795     else
1796         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1797
         // Same neighbour selection as in vp8_decode_mb_row_no_filter().
1798     if (mb_y == 0) prev_td = td;
1799     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1800     if (mb_y == s->mb_height-1) next_td = td;
1801     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1802
1803     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1804         VP8FilterStrength *f = &td->filter_strength[mb_x];
             // Wait until the row above has been filtered past this column.
1805         if (prev_td != td) {
1806             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1807         }
             // Wait until the row below has been decoded past this column; the
             // wait is skipped when that row belongs to thread_data[0].
1808         if (next_td != td)
1809             if (next_td != &s->thread_data[0]) {
1810                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1811             }
1812
             // Single-job case: save the unfiltered border before filtering.
1813         if (num_jobs == 1) {
1814             if (s->filter.simple)
1815                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1816             else
1817                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1818         }
1819
1820         if (s->filter.simple)
1821             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1822         else
1823             filter_mb(s, dst, f, mb_x, mb_y);
1824         dst[0] += 16;
1825         dst[1] += 8;
1826         dst[2] += 8;
1827
1828         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1829     }
1830 }
1831
/**
 * Job entry point passed to avctx->execute2(): decodes (and, if enabled,
 * loop-filters) every num_jobs-th macroblock row starting at row jobnr.
 *
 * @param avctx     codec context; priv_data is the VP8Context
 * @param tdata     forwarded unchanged to the per-row functions
 * @param jobnr     job index; selects the thread data and the first row
 * @param threadnr  thread index; stored in td->thread_nr and forwarded
 * @return always 0
 */
1832 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1833                                     int jobnr, int threadnr)
1834 {
1835     VP8Context *s = avctx->priv_data;
1836     VP8ThreadData *td = &s->thread_data[jobnr];
         // Both are NULL here, so update_pos() below takes its unconditional
         // broadcast path when slice threading is active.
1837     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1838     AVFrame *curframe = s->curframe;
1839     int mb_y, num_jobs = s->num_jobs;
1840     td->thread_nr = threadnr;
1841     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
             // Redundant with the loop condition above.
1842         if (mb_y >= s->mb_height) break;
             // Publish the row (x = 0); the per-row functions read it back from td.
1843         td->thread_mb_pos = mb_y<<16;
1844         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1845         if (s->deblock_filter)
1846             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
             // Mark the whole row as finished: the x field becomes 0xFFFF.
1847         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1848
1849         s->mv_min.y -= 64;
1850         s->mv_max.y -= 64;
1851
1852         if (avctx->active_thread_type == FF_THREAD_FRAME)
1853             ff_thread_report_progress(curframe, mb_y, 0);
1854     }
1855
1856     return 0;
1857 }
1858
/**
 * Decode one VP8 packet.
 *
 * Parses the frame header, picks and allocates an output frame, sets up the
 * reference pointers for the next frame, decodes all macroblock rows through
 * avctx->execute2() and, unless the frame is invisible, copies the decoded
 * frame into *data.
 *
 * @param avctx      codec context; priv_data is the VP8Context
 * @param data       output; written as an AVFrame when a frame is returned
 * @param got_frame  set to 1 when *data was written; not written otherwise
 * @param avpkt      input packet
 * @return avpkt->size when the packet was decoded or skipped; otherwise the
 *         error value from decode_frame_header() or vp8_alloc_frame(), or
 *         AVERROR_INVALIDDATA for an interframe without complete references
 */
1859 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
1860                             AVPacket *avpkt)
1861 {
1862     VP8Context *s = avctx->priv_data;
1863     int ret, i, referenced, num_jobs;
1864     enum AVDiscard skip_thresh;
1865     AVFrame *av_uninit(curframe), *prev_frame;
1866
1867     release_queued_segmaps(s, 0);
1868
1869     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1870         goto err;
1871
1872     prev_frame = s->framep[VP56_FRAME_CURRENT];
1873
         // The frame is "referenced" if it updates any of the reference slots.
1874     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1875                                 || s->update_altref == VP56_FRAME_CURRENT;
1876
1877     skip_thresh = !referenced ? AVDISCARD_NONREF :
1878                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1879
         // Frame skipping requested: keep the references unchanged and output
         // nothing (curframe stays uninitialised and is not used on this path).
1880     if (avctx->skip_frame >= skip_thresh) {
1881         s->invisible = 1;
1882         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1883         goto skip_decode;
1884     }
1885     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1886
1887     // release no longer referenced frames
1888     for (i = 0; i < 5; i++)
1889         if (s->frames[i].data[0] &&
1890             &s->frames[i] != prev_frame &&
1891             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1892             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1893             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1894             vp8_release_frame(s, &s->frames[i], 1, 0);
1895
1896     // find a free buffer
1897     for (i = 0; i < 5; i++)
1898         if (&s->frames[i] != prev_frame &&
1899             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1900             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1901             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1902             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1903             break;
1904         }
         // NOTE(review): abort() terminates the calling process; with five
         // slots and at most four distinct pointers excluded above this looks
         // unreachable, but confirm before relying on it.
1905     if (i == 5) {
1906         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1907         abort();
1908     }
1909     if (curframe->data[0])
1910         vp8_release_frame(s, curframe, 1, 0);
1911
1912     // Given that arithmetic probabilities are updated every frame, it's quite likely
1913     // that the values we have on a random interframe are complete junk if we didn't
1914     // start decode on a keyframe. So just don't display anything rather than junk.
1915     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1916                          !s->framep[VP56_FRAME_GOLDEN] ||
1917                          !s->framep[VP56_FRAME_GOLDEN2])) {
1918         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1919         ret = AVERROR_INVALIDDATA;
1920         goto err;
1921     }
1922
1923     curframe->key_frame = s->keyframe;
1924     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1925     curframe->reference = referenced ? 3 : 0;
1926     if ((ret = vp8_alloc_frame(s, curframe))) {
1927         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1928         goto err;
1929     }
1930
1931     // check if golden and altref are swapped
1932     if (s->update_altref != VP56_FRAME_NONE) {
1933         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1934     } else {
1935         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1936     }
1937     if (s->update_golden != VP56_FRAME_NONE) {
1938         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1939     } else {
1940         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1941     }
1942     if (s->update_last) {
1943         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1944     } else {
1945         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1946     }
1947     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1948
1949     ff_thread_finish_setup(avctx);
1950
1951     s->linesize   = curframe->linesize[0];
1952     s->uvlinesize = curframe->linesize[1];
1953
         // NOTE(review): the av_malloc() results are not checked, and only
         // thread_data[0] is tested to decide whether to allocate at all --
         // confirm that an allocation failure is handled elsewhere.
1954     if (!s->thread_data[0].edge_emu_buffer)
1955         for (i = 0; i < MAX_THREADS; i++)
1956             s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);
1957
1958     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1959     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1960     if (!s->mb_layout)
1961         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1962     if (!s->mb_layout && s->keyframe)
1963         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1964
1965     // top edge of 127 for intra prediction
1966     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1967         s->top_border[0][15] = s->top_border[0][23] = 127;
1968         s->top_border[0][31] = 127;
1969         memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1970     }
1971     memset(s->ref_count, 0, sizeof(s->ref_count));
1972
1973
1974     // Make sure the previous frame has read its segmentation map,
1975     // if we re-use the same map.
1976     if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1977         ff_thread_await_progress(prev_frame, 1, 0);
1978
         // With mb_layout == 1 all modes are decoded up front; otherwise they
         // are decoded per macroblock inside the row decoder.
1979     if (s->mb_layout == 1)
1980         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1981
         // Frame threading uses a single job; otherwise the job count is the
         // smaller of the coefficient partition count and the thread count.
1982     if (avctx->active_thread_type == FF_THREAD_FRAME)
1983         num_jobs = 1;
1984     else
1985         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1986     s->num_jobs   = num_jobs;
1987     s->curframe   = curframe;
1988     s->prev_frame = prev_frame;
1989     s->mv_min.y   = -MARGIN;
1990     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
         // Reset the per-thread positions: nothing decoded, nobody waiting.
1991     for (i = 0; i < MAX_THREADS; i++) {
1992         s->thread_data[i].thread_mb_pos = 0;
1993         s->thread_data[i].wait_mb_pos = INT_MAX;
1994     }
1995     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1996
1997     ff_thread_report_progress(curframe, INT_MAX, 0);
         // Commit the reference pointers prepared above.
1998     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1999
2000 skip_decode:
2001     // if future frames don't use the updated probabilities,
2002     // reset them to the values we saved
2003     if (!s->update_probabilities)
2004         s->prob[0] = s->prob[1];
2005
2006     if (!s->invisible) {
2007         *(AVFrame*)data = *curframe;
2008         *got_frame      = 1;
2009     }
2010
2011     return avpkt->size;
2012 err:
         // On error, discard the prepared reference pointers.
2013     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2014     return ret;
2015 }
2016
/**
 * Decoder init callback: stores the codec context in the VP8Context, sets
 * the output pixel format to AV_PIX_FMT_YUV420P and initialises the video,
 * intra-prediction and VP8 DSP function tables.
 *
 * @param avctx  codec context; priv_data is the VP8Context
 * @return always 0
 */
2017 static av_cold int vp8_decode_init(AVCodecContext *avctx)
2018 {
2019     VP8Context *s = avctx->priv_data;
2020
2021     s->avctx = avctx;
2022     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2023
2024     ff_videodsp_init(&s->vdsp, 8);
2025     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2026     ff_vp8dsp_init(&s->vp8dsp);
2027
2028     return 0;
2029 }
2030
/**
 * Decoder close callback: calls vp8_decode_flush_impl() and then frees every
 * queued segmentation map (is_close = 1).
 *
 * @param avctx  codec context; priv_data is the VP8Context
 * @return always 0
 */
2031 static av_cold int vp8_decode_free(AVCodecContext *avctx)
2032 {
2033     vp8_decode_flush_impl(avctx, 0, 1, 1);
2034     release_queued_segmaps(avctx->priv_data, 1);
2035     return 0;
2036 }
2037
/**
 * init_thread_copy callback: points the VP8Context of this codec context
 * back at the context itself. Nothing else is initialised here.
 *
 * @param avctx  codec context; priv_data is the VP8Context
 * @return always 0
 */
2038 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2039 {
2040     VP8Context *s = avctx->priv_data;
2041
2042     s->avctx = avctx;
2043
2044     return 0;
2045 }
2046
/* Translate a frame pointer into s_src->frames[] to the same slot of
 * s->frames[]; NULL stays NULL. Reads s and s_src from the calling scope. */
2047 #define REBASE(pic) \
2048     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2049
/**
 * update_thread_context callback: copy the decoder state that the next
 * frame depends on from the source context into the destination context.
 *
 * @param dst  destination codec context (its VP8Context is updated)
 * @param src  source codec context (its VP8Context is read)
 * @return always 0
 */
2050 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2051 {
2052     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2053
         // If buffers are already allocated for different dimensions, drop
         // them, mark the queued segmentation maps invalid and adopt the
         // source dimensions.
2054     if (s->macroblocks_base &&
2055         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2056         free_buffers(s);
2057         s->maps_are_invalid = 1;
2058         s->mb_width  = s_src->mb_width;
2059         s->mb_height = s_src->mb_height;
2060     }
2061
         // Take prob[0] when the source frame updates the probabilities,
         // otherwise prob[1].
2062     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2063     s->segmentation = s_src->segmentation;
2064     s->lf_delta = s_src->lf_delta;
2065     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2066
         // Copy the frame array, then make framep[] point into the local copy
         // at the slots the source selected for its next frame.
2067     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
2068     s->framep[0] = REBASE(s_src->next_framep[0]);
2069     s->framep[1] = REBASE(s_src->next_framep[1]);
2070     s->framep[2] = REBASE(s_src->next_framep[2]);
2071     s->framep[3] = REBASE(s_src->next_framep[3]);
2072
2073     return 0;
2074 }
2075
/* Codec descriptor for the VP8 decoder: wires up the init/close/decode/flush
 * callbacks and the two thread-context callbacks, and advertises the DR1,
 * frame-threading and slice-threading capabilities. */
2076 AVCodec ff_vp8_decoder = {
2077     .name                  = "vp8",
2078     .type                  = AVMEDIA_TYPE_VIDEO,
2079     .id                    = AV_CODEC_ID_VP8,
2080     .priv_data_size        = sizeof(VP8Context),
2081     .init                  = vp8_decode_init,
2082     .close                 = vp8_decode_free,
2083     .decode                = vp8_decode_frame,
2084     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2085     .flush                 = vp8_decode_flush,
2086     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2087     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2088     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2089 };