git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of Libav.
  10  *
  11  * Libav is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * Libav is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with Libav; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43             av_freep(&s->thread_data[i].filter_strength);
  44             av_freep(&s->thread_data[i].edge_emu_buffer);
  45         }
  46     av_freep(&s->thread_data);
  47     av_freep(&s->macroblocks_base);
  48     av_freep(&s->intra4x4_pred_mode_top);
  49     av_freep(&s->top_nnz);
  50     av_freep(&s->top_border);
  51
  52     s->macroblocks = NULL;
  53 }
  54
  55 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  56 {
  57     int ret;
  58     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  59         return ret;
  60     if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
  61         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  62     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  63         ff_thread_release_buffer(s->avctx, f);
  64         return AVERROR(ENOMEM);
  65     }
  66     return 0;
  67 }
  68
  69 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  70 {
  71     if (f->ref_index[0]) {
  72         if (prefer_delayed_free) {
  73             /* Upon a size change, we want to free the maps but other threads may still
  74              * be using them, so queue them. Upon a seek, all threads are inactive so
  75              * we want to cache one to prevent re-allocation in the next decoding
  76              * iteration, but the rest we can free directly. */
  77             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  78             if (s->num_maps_to_be_freed < max_queued_maps) {
  79                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  80             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  81                 av_free(f->ref_index[0]);
  82             } /* else: MEMLEAK (should never happen, but better that than crash) */
  83             f->ref_index[0] = NULL;
  84         } else /* vp8_decode_free() */ {
  85             av_free(f->ref_index[0]);
  86         }
  87     }
  88     ff_thread_release_buffer(s->avctx, f);
  89 }
  90
  91 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  92                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  93 {
  94     VP8Context *s = avctx->priv_data;
  95     int i;
  96
  97     if (!avctx->internal->is_copy) {
  98         for (i = 0; i < 5; i++)
  99             if (s->frames[i].data[0])
 100                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
 101     }
 102     memset(s->framep, 0, sizeof(s->framep));
 103
 104     if (free_mem) {
 105         free_buffers(s);
 106         s->maps_are_invalid = 1;
 107     }
 108 }
 109
 110 static void vp8_decode_flush(AVCodecContext *avctx)
 111 {
 112     vp8_decode_flush_impl(avctx, 1, 1, 0);
 113 }
 114
 115 static int update_dimensions(VP8Context *s, int width, int height)
 116 {
 117     AVCodecContext *avctx = s->avctx;
 118     int i;
 119
 120     if (width  != s->avctx->width ||
 121         height != s->avctx->height) {
 122         if (av_image_check_size(width, height, 0, s->avctx))
 123             return AVERROR_INVALIDDATA;
 124
 125         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 126
 127         avcodec_set_dimensions(s->avctx, width, height);
 128     }
 129
 130     s->mb_width  = (s->avctx->coded_width +15) / 16;
 131     s->mb_height = (s->avctx->coded_height+15) / 16;
 132
 133     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 134     if (!s->mb_layout) { // Frame threading and one thread
 135         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 136         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 137     }
 138     else // Sliced threading
 139         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 140     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 141     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 142     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 143
 144     for (i = 0; i < MAX_THREADS; i++) {
 145         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 146 #if HAVE_THREADS
 147         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 148         pthread_cond_init(&s->thread_data[i].cond, NULL);
 149 #endif
 150     }
 151
 152     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 153         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 154         return AVERROR(ENOMEM);
 155
 156     s->macroblocks        = s->macroblocks_base + 1;
 157
 158     return 0;
 159 }
 160
 161 static void parse_segment_info(VP8Context *s)
 162 {
 163     VP56RangeCoder *c = &s->c;
 164     int i;
 165
 166     s->segmentation.update_map = vp8_rac_get(c);
 167
 168     if (vp8_rac_get(c)) { // update segment feature data
 169         s->segmentation.absolute_vals = vp8_rac_get(c);
 170
 171         for (i = 0; i < 4; i++)
 172             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 173
 174         for (i = 0; i < 4; i++)
 175             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 176     }
 177     if (s->segmentation.update_map)
 178         for (i = 0; i < 3; i++)
 179             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 180 }
 181
 182 static void update_lf_deltas(VP8Context *s)
 183 {
 184     VP56RangeCoder *c = &s->c;
 185     int i;
 186
 187     for (i = 0; i < 4; i++) {
 188         if (vp8_rac_get(c)) {
 189             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 190
 191             if (vp8_rac_get(c))
 192                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 193         }
 194     }
 195
 196     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 197         if (vp8_rac_get(c)) {
 198             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 199
 200             if (vp8_rac_get(c))
 201                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 202         }
 203     }
 204 }
 205
 206 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 207 {
 208     const uint8_t *sizes = buf;
 209     int i;
 210
 211     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 212
 213     buf      += 3*(s->num_coeff_partitions-1);
 214     buf_size -= 3*(s->num_coeff_partitions-1);
 215     if (buf_size < 0)
 216         return -1;
 217
 218     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 219         int size = AV_RL24(sizes + 3*i);
 220         if (buf_size - size < 0)
 221             return -1;
 222
 223         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 224         buf      += size;
 225         buf_size -= size;
 226     }
 227     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 228
 229     return 0;
 230 }
 231
 232 static void get_quants(VP8Context *s)
 233 {
 234     VP56RangeCoder *c = &s->c;
 235     int i, base_qi;
 236
 237     int yac_qi     = vp8_rac_get_uint(c, 7);
 238     int ydc_delta  = vp8_rac_get_sint(c, 4);
 239     int y2dc_delta = vp8_rac_get_sint(c, 4);
 240     int y2ac_delta = vp8_rac_get_sint(c, 4);
 241     int uvdc_delta = vp8_rac_get_sint(c, 4);
 242     int uvac_delta = vp8_rac_get_sint(c, 4);
 243
 244     for (i = 0; i < 4; i++) {
 245         if (s->segmentation.enabled) {
 246             base_qi = s->segmentation.base_quant[i];
 247             if (!s->segmentation.absolute_vals)
 248                 base_qi += yac_qi;
 249         } else
 250             base_qi = yac_qi;
 251
 252         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 253         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 254         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 255         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 256         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 257         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 258
 259         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 260         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 261     }
 262 }
 263
 264 /**
 265  * Determine which buffers golden and altref should be updated with after this frame.
 266  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 267  *
 268  * Intra frames update all 3 references
 269  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 270  * If the update (golden|altref) flag is set, it's updated with the current frame
 271  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 272  * If the flag is not set, the number read means:
 273  *      0: no update
 274  *      1: VP56_FRAME_PREVIOUS
 275  *      2: update golden with altref, or update altref with golden
 276  */
 277 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 278 {
 279     VP56RangeCoder *c = &s->c;
 280
 281     if (update)
 282         return VP56_FRAME_CURRENT;
 283
 284     switch (vp8_rac_get_uint(c, 2)) {
 285     case 1:
 286         return VP56_FRAME_PREVIOUS;
 287     case 2:
 288         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 289     }
 290     return VP56_FRAME_NONE;
 291 }
 292
 293 static void update_refs(VP8Context *s)
 294 {
 295     VP56RangeCoder *c = &s->c;
 296
 297     int update_golden = vp8_rac_get(c);
 298     int update_altref = vp8_rac_get(c);
 299
 300     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 301     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 302 }
 303
 304 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 305 {
 306     VP56RangeCoder *c = &s->c;
 307     int header_size, hscale, vscale, i, j, k, l, m, ret;
 308     int width  = s->avctx->width;
 309     int height = s->avctx->height;
 310
 311     s->keyframe  = !(buf[0] & 1);
 312     s->profile   =  (buf[0]>>1) & 7;
 313     s->invisible = !(buf[0] & 0x10);
 314     header_size  = AV_RL24(buf) >> 5;
 315     buf      += 3;
 316     buf_size -= 3;
 317
 318     if (s->profile > 3)
 319         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 320
 321     if (!s->profile)
 322         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 323     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 324         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 325
 326     if (header_size > buf_size - 7*s->keyframe) {
 327         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 328         return AVERROR_INVALIDDATA;
 329     }
 330
 331     if (s->keyframe) {
 332         if (AV_RL24(buf) != 0x2a019d) {
 333             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 334             return AVERROR_INVALIDDATA;
 335         }
 336         width  = AV_RL16(buf+3) & 0x3fff;
 337         height = AV_RL16(buf+5) & 0x3fff;
 338         hscale = buf[4] >> 6;
 339         vscale = buf[6] >> 6;
 340         buf      += 7;
 341         buf_size -= 7;
 342
 343         if (hscale || vscale)
 344             av_log_missing_feature(s->avctx, "Upscaling", 1);
 345
 346         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 347         for (i = 0; i < 4; i++)
 348             for (j = 0; j < 16; j++)
 349                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 350                        sizeof(s->prob->token[i][j]));
 351         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 352         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 353         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 354         memset(&s->segmentation, 0, sizeof(s->segmentation));
 355     }
 356
 357     ff_vp56_init_range_decoder(c, buf, header_size);
 358     buf      += header_size;
 359     buf_size -= header_size;
 360
 361     if (s->keyframe) {
 362         if (vp8_rac_get(c))
 363             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 364         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 365     }
 366
 367     if ((s->segmentation.enabled = vp8_rac_get(c)))
 368         parse_segment_info(s);
 369     else
 370         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 371
 372     s->filter.simple    = vp8_rac_get(c);
 373     s->filter.level     = vp8_rac_get_uint(c, 6);
 374     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 375
 376     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 377         if (vp8_rac_get(c))
 378             update_lf_deltas(s);
 379
 380     if (setup_partitions(s, buf, buf_size)) {
 381         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 382         return AVERROR_INVALIDDATA;
 383     }
 384
 385     if (!s->macroblocks_base || /* first frame */
 386         width != s->avctx->width || height != s->avctx->height) {
 387         if ((ret = update_dimensions(s, width, height)) < 0)
 388             return ret;
 389     }
 390
 391     get_quants(s);
 392
 393     if (!s->keyframe) {
 394         update_refs(s);
 395         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 396         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 397     }
 398
 399     // if we aren't saving this frame's probabilities for future frames,
 400     // make a copy of the current probabilities
 401     if (!(s->update_probabilities = vp8_rac_get(c)))
 402         s->prob[1] = s->prob[0];
 403
 404     s->update_last = s->keyframe || vp8_rac_get(c);
 405
 406     for (i = 0; i < 4; i++)
 407         for (j = 0; j < 8; j++)
 408             for (k = 0; k < 3; k++)
 409                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 410                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 411                         int prob = vp8_rac_get_uint(c, 8);
 412                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 413                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 414                     }
 415
 416     if ((s->mbskip_enabled = vp8_rac_get(c)))
 417         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 418
 419     if (!s->keyframe) {
 420         s->prob->intra  = vp8_rac_get_uint(c, 8);
 421         s->prob->last   = vp8_rac_get_uint(c, 8);
 422         s->prob->golden = vp8_rac_get_uint(c, 8);
 423
 424         if (vp8_rac_get(c))
 425             for (i = 0; i < 4; i++)
 426                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 427         if (vp8_rac_get(c))
 428             for (i = 0; i < 3; i++)
 429                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 430
 431         // 17.2 MV probability update
 432         for (i = 0; i < 2; i++)
 433             for (j = 0; j < 19; j++)
 434                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 435                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 436     }
 437
 438     return 0;
 439 }
 440
 441 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 442 {
 443     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 444     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 445 }
 446
 447 /**
 448  * Motion vector coding, 17.1.
 449  */
 450 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 451 {
 452     int bit, x = 0;
 453
 454     if (vp56_rac_get_prob_branchy(c, p[0])) {
 455         int i;
 456
 457         for (i = 0; i < 3; i++)
 458             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 459         for (i = 9; i > 3; i--)
 460             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 461         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 462             x += 8;
 463     } else {
 464         // small_mvtree
 465         const uint8_t *ps = p+2;
 466         bit = vp56_rac_get_prob(c, *ps);
 467         ps += 1 + 3*bit;
 468         x  += 4*bit;
 469         bit = vp56_rac_get_prob(c, *ps);
 470         ps += 1 + bit;
 471         x  += 2*bit;
 472         x  += vp56_rac_get_prob(c, *ps);
 473     }
 474
 475     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 476 }
 477
 478 static av_always_inline
 479 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 480 {
 481     if (left == top)
 482         return vp8_submv_prob[4-!!left];
 483     if (!top)
 484         return vp8_submv_prob[2];
 485     return vp8_submv_prob[1-!!left];
 486 }
 487
 488 /**
 489  * Split motion vector prediction, 16.4.
 490  * @returns the number of motion vectors parsed (2, 4 or 16)
 491  */
 492 static av_always_inline
 493 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 494 {
 495     int part_idx;
 496     int n, num;
 497     VP8Macroblock *top_mb;
 498     VP8Macroblock *left_mb = &mb[-1];
 499     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 500                   *mbsplits_top,
 501                   *mbsplits_cur, *firstidx;
 502     VP56mv *top_mv;
 503     VP56mv *left_mv = left_mb->bmv;
 504     VP56mv *cur_mv  = mb->bmv;
 505
 506     if (!layout) // layout is inlined, s->mb_layout is not
 507         top_mb = &mb[2];
 508     else
 509         top_mb = &mb[-s->mb_width-1];
 510     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 511     top_mv = top_mb->bmv;
 512
 513     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 514         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 515             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 516         } else {
 517             part_idx = VP8_SPLITMVMODE_8x8;
 518         }
 519     } else {
 520         part_idx = VP8_SPLITMVMODE_4x4;
 521     }
 522
 523     num = vp8_mbsplit_count[part_idx];
 524     mbsplits_cur = vp8_mbsplits[part_idx],
 525     firstidx = vp8_mbfirstidx[part_idx];
 526     mb->partitioning = part_idx;
 527
 528     for (n = 0; n < num; n++) {
 529         int k = firstidx[n];
 530         uint32_t left, above;
 531         const uint8_t *submv_prob;
 532
 533         if (!(k & 3))
 534             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 535         else
 536             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 537         if (k <= 3)
 538             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 539         else
 540             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 541
 542         submv_prob = get_submv_prob(left, above);
 543
 544         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 545             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 546                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 547                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 548                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 549                 } else {
 550                     AV_ZERO32(&mb->bmv[n]);
 551                 }
 552             } else {
 553                 AV_WN32A(&mb->bmv[n], above);
 554             }
 555         } else {
 556             AV_WN32A(&mb->bmv[n], left);
 557         }
 558     }
 559
 560     return num;
 561 }
 562
     /**
      * Decode the inter prediction mode and motion vector(s) of one macroblock.
      *
      * Motion vector candidates are gathered from the top, left and top-left
      * neighbours; the mode tree is then read from s->c with probabilities
      * taken from vp8_mode_contexts, indexed by the neighbour counts. On return
      * mb->mode, mb->partitioning, mb->mv and mb->bmv[] (entry 0, or every
      * split vector via decode_splitmvs()) have been written.
      *
      * @param s      decoder context (sign_bias, MV clamp limits, MV probabilities)
      * @param mb     macroblock being decoded; neighbours are addressed relative to it
      * @param mb_x   not referenced in this function
      * @param mb_y   not referenced in this function
      * @param layout selects where the neighbours live: 0 -> top at mb + 2 and
      *               top-left at mb + 1; otherwise top at mb - mb_width - 1 and
      *               top-left at mb - mb_width - 2
      */
 563 static av_always_inline
 564 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
 565 {
 566     VP8Macroblock *mb_edge[3] = { 0 /* top */,
 567                                   mb - 1 /* left */,
 568                                   0 /* top-left */ };
 569     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 570     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 571     int idx = CNT_ZERO;
 572     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 573     int8_t *sign_bias = s->sign_bias;
 574     VP56mv near_mv[4];
 575     uint8_t cnt[4] = { 0 };
 576     VP56RangeCoder *c = &s->c;
 577
 578     if (!layout) { // layout is inlined (s->mb_layout is not)
 579         mb_edge[0] = mb + 2;
 580         mb_edge[2] = mb + 1;
 581     }
 582     else {
 583         mb_edge[0] = mb - s->mb_width-1;
 584         mb_edge[2] = mb - s->mb_width-2;
 585     }
 586
 587     AV_ZERO32(&near_mv[0]);
 588     AV_ZERO32(&near_mv[1]);
 589     AV_ZERO32(&near_mv[2]);
 590
 591     /* Process MB on top, left and top-left */
     /* MV_EDGE_CHECK(n) folds neighbour n into the candidate list.
      * A neighbour whose ref_frame is VP56_FRAME_CURRENT is skipped entirely.
      * A zero vector only increases cnt[CNT_ZERO]. A non-zero vector is
      * negated (both packed 16-bit halves at once) when the neighbour's sign
      * bias differs from that of the current reference; it is then appended
      * to near_mv[] unless it equals the most recently stored entry (the
      * first neighbour is always appended), and the count of the slot it
      * ended up in is increased. The weight is 2 for top and left, 1 for
      * top-left. */
 592     #define MV_EDGE_CHECK(n)\
 593     {\
 594         VP8Macroblock *edge = mb_edge[n];\
 595         int edge_ref = edge->ref_frame;\
 596         if (edge_ref != VP56_FRAME_CURRENT) {\
 597             uint32_t mv = AV_RN32A(&edge->mv);\
 598             if (mv) {\
 599                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 600                     /* SWAR negate of the values in mv. */\
 601                     mv = ~mv;\
 602                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 603                 }\
 604                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 605                     AV_WN32A(&near_mv[++idx], mv);\
 606                 cnt[idx]      += 1 + (n != 2);\
 607             } else\
 608                 cnt[CNT_ZERO] += 1 + (n != 2);\
 609         }\
 610     }
 611
 612     MV_EDGE_CHECK(0)
 613     MV_EDGE_CHECK(1)
 614     MV_EDGE_CHECK(2)
 615
 616     mb->partitioning = VP8_SPLITMVMODE_NONE;
     /* Mode tree: ZERO, else NEAREST, else NEAR, else SPLIT or a new vector. */
 617     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 618         mb->mode = VP8_MVMODE_MV;
 619
 620         /* If we have three distinct MVs, merge first and last if they're the same */
 621         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 622             cnt[CNT_NEAREST] += 1;
 623
 624         /* Swap near and nearest if necessary */
 625         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 626             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 627             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 628         }
 629
 630         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 631             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 632
 633                 /* Choose the best mv out of 0,0 and the nearest mv */
 634                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
     /* cnt[CNT_SPLITMV] is reused as the split-mode context: 2 for each of
      * the left/top neighbours in split mode plus 1 for the top-left one. */
 635                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 636                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 637                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 638
 639                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 640                     mb->mode = VP8_MVMODE_SPLIT;
     /* The macroblock vector becomes the last split vector decoded. */
 641                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
 642                 } else {
     /* New vector: offsets are added to the clamped best vector, y first. */
 643                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 644                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 645                     mb->bmv[0] = mb->mv;
 646                 }
 647             } else {
 648                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 649                 mb->bmv[0] = mb->mv;
 650             }
 651         } else {
 652             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 653             mb->bmv[0] = mb->mv;
 654         }
 655     } else {
 656         mb->mode = VP8_MVMODE_ZERO;
 657         AV_ZERO32(&mb->mv);
 658         mb->bmv[0] = mb->mv;
 659     }
 660 }
 661
 662 static av_always_inline
 663 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 664                            int mb_x, int keyframe, int layout)
 665 {
 666     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 667
 668     if (layout == 1) {
 669         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 670         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 671     }
 672     if (keyframe) {
 673         int x, y;
 674         uint8_t* top;
 675         uint8_t* const left = s->intra4x4_pred_mode_left;
 676         if (layout == 1)
 677             top = mb->intra4x4_pred_mode_top;
 678         else
 679             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 680         for (y = 0; y < 4; y++) {
 681             for (x = 0; x < 4; x++) {
 682                 const uint8_t *ctx;
 683                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 684                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 685                 left[y] = top[x] = *intra4x4;
 686                 intra4x4++;
 687             }
 688         }
 689     } else {
 690         int i;
 691         for (i = 0; i < 16; i++)
 692             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 693     }
 694 }
 695
 696 static av_always_inline
 697 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 698                     uint8_t *segment, uint8_t *ref, int layout)
 699 {
 700     VP56RangeCoder *c = &s->c;
 701
 702     if (s->segmentation.update_map)
 703         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 704     else if (s->segmentation.enabled)
 705         *segment = ref ? *ref : *segment;
 706     mb->segment = *segment;
 707
 708     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 709
 710     if (s->keyframe) {
 711         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 712
 713         if (mb->mode == MODE_I4x4) {
 714             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 715         } else {
 716             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 717             if (s->mb_layout == 1)
 718                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 719             else
 720                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 721             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 722         }
 723
 724         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 725         mb->ref_frame = VP56_FRAME_CURRENT;
 726     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 727         // inter MB, 16.2
 728         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 729             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 730                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 731         else
 732             mb->ref_frame = VP56_FRAME_PREVIOUS;
 733         s->ref_count[mb->ref_frame-1]++;
 734
 735         // motion vectors, 16.3
 736         decode_mvs(s, mb, mb_x, mb_y, layout);
 737     } else {
 738         // intra MB, 16.1
 739         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 740
 741         if (mb->mode == MODE_I4x4)
 742             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 743
 744         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 745         mb->ref_frame = VP56_FRAME_CURRENT;
 746         mb->partitioning = VP8_SPLITMVMODE_NONE;
 747         AV_ZERO32(&mb->bmv[0]);
 748     }
 749 }
 750
 751 #ifndef decode_block_coeffs_internal
 752 /**
      * Decode the tokens of one block, starting after an end-of-block decision
      * that the caller has already consumed (decoding enters at skip_eob).
      * The range coder is copied into a local for the duration of the loop and
      * written back to *r before returning.
      *
      * @param r          arithmetic bitstream reader context
      * @param block      destination for block coefficients; each decoded value is
      *                   stored at zigzag_scan[i] after multiplication by qmul[!!i]
      * @param probs      probabilities to use when reading trees from the bitstream,
      *                   indexed [coefficient index][context][tree node]
      * @param i          initial coeff index, 0 unless a separate DC block is coded
      * @param token_prob probabilities for the first token to decode
      * @param qmul       array holding the dc/ac dequant factor at position 0/1
      * @return the coefficient index at which decoding stopped (index of the last
      *         coefficient decoded plus one); 16 if the block ran to its end
      *         without an end-of-block token
      */
 761 static int decode_block_coeffs_internal(VP56RangeCoder *r, DCTELEM block[16],
 762                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 763                                         int i, uint8_t *token_prob, int16_t qmul[2])
 764 {
 765     VP56RangeCoder c = *r;
     /* Jump into the loop body: the first EOB check is not repeated here. */
 766     goto skip_eob;
 767     do {
 768         int coeff;
 769         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
 770             break;
 771
 772 skip_eob:
 773         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
 774             if (++i == 16)
 775                 break; // invalid input; blocks should end with EOB
     /* After a zero, context 0 is used and no EOB check is made. */
 776             token_prob = probs[i][0];
 777             goto skip_eob;
 778         }
 779
 780         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
 781             coeff = 1;
     /* After a one, the next position uses context 1. */
 782             token_prob = probs[i+1][1];
 783         } else {
 784             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
 785                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
 786                 if (coeff)
 787                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
 788                 coeff += 2;
 789             } else {
 790                 // DCT_CAT*
 791                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
 792                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
 793                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
 794                     } else {                                    // DCT_CAT2
 795                         coeff  = 7;
 796                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
 797                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
 798                     }
 799                 } else {    // DCT_CAT3 and up
     /* Two bits select the category (0..3); its base value is
      * 3 + (8 << cat), to which the category's extra bits are added. */
 800                     int a = vp56_rac_get_prob(&c, token_prob[8]);
 801                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
 802                     int cat = (a<<1) + b;
 803                     coeff  = 3 + (8<<cat);
 804                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
 805                 }
 806             }
     /* After any value larger than one, the next position uses context 2. */
 807             token_prob = probs[i+1][2];
 808         }
     /* Read the sign bit, dequantise (DC factor for i == 0, AC otherwise)
      * and store at the de-zigzagged position. */
 809         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
 810     } while (++i < 16);
 811
 812     *r = c;
 813     return i;
 814 }
 815 #endif
 816
 817 /**
 818  * @param c arithmetic bitstream reader context
 819  * @param block destination for block coefficients
 820  * @param probs probabilities to use when reading trees from the bitstream
 821  * @param i initial coeff index, 0 unless a separate DC block is coded
 822  * @param zero_nhood the initial prediction context for number of surrounding
 823  *                   all-zero blocks (only left/top, so 0-2)
 824  * @param qmul array holding the dc/ac dequant factor at position 0/1
 825  * @return 0 if no coeffs were decoded
 826  *         otherwise, the index of the last coeff decoded plus one
 827  */
 828 static av_always_inline
 829 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 830                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 831                         int i, int zero_nhood, int16_t qmul[2])
 832 {
 833     uint8_t *token_prob = probs[i][zero_nhood];
 834     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 835         return 0;
 836     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 837 }
 838
 839 static av_always_inline
 840 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 841                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 842 {
 843     int i, x, y, luma_start = 0, luma_ctx = 3;
 844     int nnz_pred, nnz, nnz_total = 0;
 845     int segment = mb->segment;
 846     int block_dc = 0;
 847
 848     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 849         nnz_pred = t_nnz[8] + l_nnz[8];
 850
 851         // decode DC values and do hadamard
 852         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 853                                   s->qmat[segment].luma_dc_qmul);
 854         l_nnz[8] = t_nnz[8] = !!nnz;
 855         if (nnz) {
 856             nnz_total += nnz;
 857             block_dc = 1;
 858             if (nnz == 1)
 859                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 860             else
 861                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 862         }
 863         luma_start = 1;
 864         luma_ctx = 0;
 865     }
 866
 867     // luma blocks
 868     for (y = 0; y < 4; y++)
 869         for (x = 0; x < 4; x++) {
 870             nnz_pred = l_nnz[y] + t_nnz[x];
 871             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 872                                       nnz_pred, s->qmat[segment].luma_qmul);
 873             // nnz+block_dc may be one more than the actual last index, but we don't care
 874             td->non_zero_count_cache[y][x] = nnz + block_dc;
 875             t_nnz[x] = l_nnz[y] = !!nnz;
 876             nnz_total += nnz;
 877         }
 878
 879     // chroma blocks
 880     // TODO: what to do about dimensions? 2nd dim for luma is x,
 881     // but for chroma it's (y<<1)|x
 882     for (i = 4; i < 6; i++)
 883         for (y = 0; y < 2; y++)
 884             for (x = 0; x < 2; x++) {
 885                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 886                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 887                                           nnz_pred, s->qmat[segment].chroma_qmul);
 888                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 889                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 890                 nnz_total += nnz;
 891             }
 892
 893     // if there were no coded coeffs despite the macroblock not being marked skip,
 894     // we MUST not do the inner loop filter and should not do IDCT
 895     // Since skip isn't used for bitstream prediction, just manually set it.
 896     if (!nnz_total)
 897         mb->skip = 1;
 898 }
 899
 900 static av_always_inline
 901 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 902                       int linesize, int uvlinesize, int simple)
 903 {
 904     AV_COPY128(top_border, src_y + 15*linesize);
 905     if (!simple) {
 906         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 907         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 908     }
 909 }
 910
 911 static av_always_inline
 912 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 913                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 914                     int simple, int xchg)
 915 {
 916     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 917     src_y  -=   linesize;
 918     src_cb -= uvlinesize;
 919     src_cr -= uvlinesize;
 920
 921 #define XCHG(a,b,xchg) do {                     \
 922         if (xchg) AV_SWAP64(b,a);               \
 923         else      AV_COPY64(b,a);               \
 924     } while (0)
 925
 926     XCHG(top_border_m1+8, src_y-8, xchg);
 927     XCHG(top_border,      src_y,   xchg);
 928     XCHG(top_border+8,    src_y+8, 1);
 929     if (mb_x < mb_width-1)
 930         XCHG(top_border+32, src_y+16, 1);
 931
 932     // only copy chroma for normal loop filter
 933     // or to initialize the top row to 127
 934     if (!simple || !mb_y) {
 935         XCHG(top_border_m1+16, src_cb-8, xchg);
 936         XCHG(top_border_m1+24, src_cr-8, xchg);
 937         XCHG(top_border+16,    src_cb, 1);
 938         XCHG(top_border+24,    src_cr, 1);
 939     }
 940 }
 941
 942 static av_always_inline
 943 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 944 {
 945     if (!mb_x) {
 946         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 947     } else {
 948         return mb_y ? mode : LEFT_DC_PRED8x8;
 949     }
 950 }
 951
 952 static av_always_inline
 953 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 954 {
 955     if (!mb_x) {
 956         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 957     } else {
 958         return mb_y ? mode : HOR_PRED8x8;
 959     }
 960 }
 961
 962 static av_always_inline
 963 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 964 {
 965     if (mode == DC_PRED8x8) {
 966         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 967     } else {
 968         return mode;
 969     }
 970 }
 971
 972 static av_always_inline
 973 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 974 {
 975     switch (mode) {
 976     case DC_PRED8x8:
 977         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 978     case VERT_PRED8x8:
 979         return !mb_y ? DC_127_PRED8x8 : mode;
 980     case HOR_PRED8x8:
 981         return !mb_x ? DC_129_PRED8x8 : mode;
 982     case PLANE_PRED8x8 /*TM*/:
 983         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 984     }
 985     return mode;
 986 }
 987
 988 static av_always_inline
 989 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 990 {
 991     if (!mb_x) {
 992         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 993     } else {
 994         return mb_y ? mode : HOR_VP8_PRED;
 995     }
 996 }
 997
 998 static av_always_inline
 999 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1000 {
1001     switch (mode) {
1002     case VERT_PRED:
1003         if (!mb_x && mb_y) {
1004             *copy_buf = 1;
1005             return mode;
1006         }
1007         /* fall-through */
1008     case DIAG_DOWN_LEFT_PRED:
1009     case VERT_LEFT_PRED:
1010         return !mb_y ? DC_127_PRED : mode;
1011     case HOR_PRED:
1012         if (!mb_y) {
1013             *copy_buf = 1;
1014             return mode;
1015         }
1016         /* fall-through */
1017     case HOR_UP_PRED:
1018         return !mb_x ? DC_129_PRED : mode;
1019     case TM_VP8_PRED:
1020         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1021     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1022     case DIAG_DOWN_RIGHT_PRED:
1023     case VERT_RIGHT_PRED:
1024     case HOR_DOWN_PRED:
1025         if (!mb_y || !mb_x)
1026             *copy_buf = 1;
1027         return mode;
1028     }
1029     return mode;
1030 }
1031
/**
 * Run intra prediction for one macroblock.
 *
 * Luma is predicted either as a single 16x16 block or, for MODE_I4x4, as
 * sixteen 4x4 blocks with the residual added right after each sub-block is
 * predicted. Chroma is predicted as one 8x8 block per plane. Under the
 * condition tested below, xchg_mb_border() is called before and after
 * prediction.
 *
 * @param s    VP8 decoding context
 * @param td   per-thread data (non-zero counts, coefficient blocks, thread number)
 * @param dst  pointers to the macroblock in the Y, Cb and Cr planes
 * @param mb   macroblock being predicted
 * @param mb_x macroblock column
 * @param mb_y macroblock row
 */
static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        // whole-macroblock luma prediction
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge
        // of the macroblock above for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        // a skipped macroblock has no residual to add after prediction
        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                // scratch area with an 8-byte stride: the 4x4 block lives at
                // offset 12, its top row at 4..7, the top-left pixel at 3 and
                // the left column at 11/19/27/35
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        // predict inside copy_dst, with the missing frame edges
                        // replaced by 127 (top) and 129 (left)
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    // move the predicted 4x4 block from the scratch area into the frame
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                // add the residual right after this sub-block is predicted
                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr   += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    // chroma: the same mode is used for both planes
    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    // second xchg_mb_border() call, same condition as the one at the top but with xchg == 0
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
1153
/* Per-component lookup tables for motion compensation; the second index is
 * the sub-pixel position of the motion vector component (0-7). */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
                                // (left + right)
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1160
1161 /**
1162  * luma MC function
1163  *
1164  * @param s VP8 decoding context
1165  * @param dst target buffer for block data at block position
1166  * @param ref reference picture buffer at origin (0, 0)
1167  * @param mv motion vector (relative to block position) to get pixel data from
1168  * @param x_off horizontal position of block from origin (0, 0)
1169  * @param y_off vertical position of block from origin (0, 0)
1170  * @param block_w width of block (16, 8 or 4)
1171  * @param block_h height of block (always same as block_w)
1172  * @param width width of src/dst plane data
1173  * @param height height of src/dst plane data
1174  * @param linesize size of a single line of plane data, including padding
1175  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1176  */
1177 static av_always_inline
1178 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1179                  AVFrame *ref, const VP56mv *mv,
1180                  int x_off, int y_off, int block_w, int block_h,
1181                  int width, int height, int linesize,
1182                  vp8_mc_func mc_func[3][3])
1183 {
1184     uint8_t *src = ref->data[0];
1185
1186     if (AV_RN32A(mv)) {
1187
1188         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1189         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1190
1191         x_off += mv->x >> 2;
1192         y_off += mv->y >> 2;
1193
1194         // edge emulation
1195         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1196         src += y_off * linesize + x_off;
1197         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1198             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1199             s->dsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1200                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1201                                     x_off - mx_idx, y_off - my_idx, width, height);
1202             src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1203         }
1204         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1205     } else {
1206         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1207         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1208     }
1209 }
1210
1211 /**
1212  * chroma MC function
1213  *
1214  * @param s VP8 decoding context
1215  * @param dst1 target buffer for block data at block position (U plane)
1216  * @param dst2 target buffer for block data at block position (V plane)
1217  * @param ref reference picture buffer at origin (0, 0)
1218  * @param mv motion vector (relative to block position) to get pixel data from
1219  * @param x_off horizontal position of block from origin (0, 0)
1220  * @param y_off vertical position of block from origin (0, 0)
1221  * @param block_w width of block (16, 8 or 4)
1222  * @param block_h height of block (always same as block_w)
1223  * @param width width of src/dst plane data
1224  * @param height height of src/dst plane data
1225  * @param linesize size of a single line of plane data, including padding
1226  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1227  */
1228 static av_always_inline
1229 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1230                    AVFrame *ref, const VP56mv *mv, int x_off, int y_off,
1231                    int block_w, int block_h, int width, int height, int linesize,
1232                    vp8_mc_func mc_func[3][3])
1233 {
1234     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1235
1236     if (AV_RN32A(mv)) {
1237         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1238         int my = mv->y&7, my_idx = subpel_idx[0][my];
1239
1240         x_off += mv->x >> 3;
1241         y_off += mv->y >> 3;
1242
1243         // edge emulation
1244         src1 += y_off * linesize + x_off;
1245         src2 += y_off * linesize + x_off;
1246         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1247         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1248             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1249             s->dsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1250                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1251                                     x_off - mx_idx, y_off - my_idx, width, height);
1252             src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1253             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1254
1255             s->dsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1256                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1257                                     x_off - mx_idx, y_off - my_idx, width, height);
1258             src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1259             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1260         } else {
1261             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1262             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1263         }
1264     } else {
1265         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1266         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1267         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1268     }
1269 }
1270
1271 static av_always_inline
1272 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1273                  AVFrame *ref_frame, int x_off, int y_off,
1274                  int bx_off, int by_off,
1275                  int block_w, int block_h,
1276                  int width, int height, VP56mv *mv)
1277 {
1278     VP56mv uvmv = *mv;
1279
1280     /* Y */
1281     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1282                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1283                 block_w, block_h, width, height, s->linesize,
1284                 s->put_pixels_tab[block_w == 8]);
1285
1286     /* U/V */
1287     if (s->profile == 3) {
1288         uvmv.x &= ~7;
1289         uvmv.y &= ~7;
1290     }
1291     x_off   >>= 1; y_off   >>= 1;
1292     bx_off  >>= 1; by_off  >>= 1;
1293     width   >>= 1; height  >>= 1;
1294     block_w >>= 1; block_h >>= 1;
1295     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1296                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1297                   &uvmv, x_off + bx_off, y_off + by_off,
1298                   block_w, block_h, width, height, s->uvlinesize,
1299                   s->put_pixels_tab[1 + (block_w == 4)]);
1300 }
1301
1302 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1303  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1304 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1305 {
1306     /* Don't prefetch refs that haven't been used very often this frame. */
1307     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1308         int x_off = mb_x << 4, y_off = mb_y << 4;
1309         int mx = (mb->mv.x>>2) + x_off + 8;
1310         int my = (mb->mv.y>>2) + y_off;
1311         uint8_t **src= s->framep[ref]->data;
1312         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1313         /* For threading, a ff_thread_await_progress here might be useful, but
1314          * it actually slows down the decoder. Since a bad prefetch doesn't
1315          * generate bad decoder output, we don't run it here. */
1316         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1317         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1318         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1319     }
1320 }
1321
1322 /**
1323  * Apply motion vectors to prediction buffer, chapter 18.
1324  */
1325 static av_always_inline
1326 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1327                    VP8Macroblock *mb, int mb_x, int mb_y)
1328 {
1329     int x_off = mb_x << 4, y_off = mb_y << 4;
1330     int width = 16*s->mb_width, height = 16*s->mb_height;
1331     AVFrame *ref = s->framep[mb->ref_frame];
1332     VP56mv *bmv = mb->bmv;
1333
1334     switch (mb->partitioning) {
1335     case VP8_SPLITMVMODE_NONE:
1336         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1337                     0, 0, 16, 16, width, height, &mb->mv);
1338         break;
1339     case VP8_SPLITMVMODE_4x4: {
1340         int x, y;
1341         VP56mv uvmv;
1342
1343         /* Y */
1344         for (y = 0; y < 4; y++) {
1345             for (x = 0; x < 4; x++) {
1346                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1347                             ref, &bmv[4*y + x],
1348                             4*x + x_off, 4*y + y_off, 4, 4,
1349                             width, height, s->linesize,
1350                             s->put_pixels_tab[2]);
1351             }
1352         }
1353
1354         /* U/V */
1355         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1356         for (y = 0; y < 2; y++) {
1357             for (x = 0; x < 2; x++) {
1358                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1359                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1360                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1361                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1362                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1363                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1364                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1365                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1366                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1367                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1368                 if (s->profile == 3) {
1369                     uvmv.x &= ~7;
1370                     uvmv.y &= ~7;
1371                 }
1372                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1373                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1374                               4*x + x_off, 4*y + y_off, 4, 4,
1375                               width, height, s->uvlinesize,
1376                               s->put_pixels_tab[2]);
1377             }
1378         }
1379         break;
1380     }
1381     case VP8_SPLITMVMODE_16x8:
1382         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1383                     0, 0, 16, 8, width, height, &bmv[0]);
1384         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1385                     0, 8, 16, 8, width, height, &bmv[1]);
1386         break;
1387     case VP8_SPLITMVMODE_8x16:
1388         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1389                     0, 0, 8, 16, width, height, &bmv[0]);
1390         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1391                     8, 0, 8, 16, width, height, &bmv[1]);
1392         break;
1393     case VP8_SPLITMVMODE_8x8:
1394         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1395                     0, 0, 8, 8, width, height, &bmv[0]);
1396         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1397                     8, 0, 8, 8, width, height, &bmv[1]);
1398         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1399                     0, 8, 8, 8, width, height, &bmv[2]);
1400         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1401                     8, 8, 8, 8, width, height, &bmv[3]);
1402         break;
1403     }
1404 }
1405
1406 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1407                                      uint8_t *dst[3], VP8Macroblock *mb)
1408 {
1409     int x, y, ch;
1410
1411     if (mb->mode != MODE_I4x4) {
1412         uint8_t *y_dst = dst[0];
1413         for (y = 0; y < 4; y++) {
1414             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1415             if (nnz4) {
1416                 if (nnz4&~0x01010101) {
1417                     for (x = 0; x < 4; x++) {
1418                         if ((uint8_t)nnz4 == 1)
1419                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1420                         else if((uint8_t)nnz4 > 1)
1421                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1422                         nnz4 >>= 8;
1423                         if (!nnz4)
1424                             break;
1425                     }
1426                 } else {
1427                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1428                 }
1429             }
1430             y_dst += 4*s->linesize;
1431         }
1432     }
1433
1434     for (ch = 0; ch < 2; ch++) {
1435         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1436         if (nnz4) {
1437             uint8_t *ch_dst = dst[1+ch];
1438             if (nnz4&~0x01010101) {
1439                 for (y = 0; y < 2; y++) {
1440                     for (x = 0; x < 2; x++) {
1441                         if ((uint8_t)nnz4 == 1)
1442                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1443                         else if((uint8_t)nnz4 > 1)
1444                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1445                         nnz4 >>= 8;
1446                         if (!nnz4)
1447                             goto chroma_idct_end;
1448                     }
1449                     ch_dst += 4*s->uvlinesize;
1450                 }
1451             } else {
1452                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1453             }
1454         }
1455 chroma_idct_end: ;
1456     }
1457 }
1458
1459 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1460 {
1461     int interior_limit, filter_level;
1462
1463     if (s->segmentation.enabled) {
1464         filter_level = s->segmentation.filter_level[mb->segment];
1465         if (!s->segmentation.absolute_vals)
1466             filter_level += s->filter.level;
1467     } else
1468         filter_level = s->filter.level;
1469
1470     if (s->lf_delta.enabled) {
1471         filter_level += s->lf_delta.ref[mb->ref_frame];
1472         filter_level += s->lf_delta.mode[mb->mode];
1473     }
1474
1475     filter_level = av_clip_uintp2(filter_level, 6);
1476
1477     interior_limit = filter_level;
1478     if (s->filter.sharpness) {
1479         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1480         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1481     }
1482     interior_limit = FFMAX(interior_limit, 1);
1483
1484     f->filter_level = filter_level;
1485     f->inner_limit = interior_limit;
1486     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1487 }
1488
1489 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1490 {
1491     int mbedge_lim, bedge_lim, hev_thresh;
1492     int filter_level = f->filter_level;
1493     int inner_limit = f->inner_limit;
1494     int inner_filter = f->inner_filter;
1495     int linesize = s->linesize;
1496     int uvlinesize = s->uvlinesize;
1497     static const uint8_t hev_thresh_lut[2][64] = {
1498         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1499           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1500           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1501           3, 3, 3, 3 },
1502         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1503           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1504           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1505           2, 2, 2, 2 }
1506     };
1507
1508     if (!filter_level)
1509         return;
1510
1511      bedge_lim = 2*filter_level + inner_limit;
1512     mbedge_lim = bedge_lim + 4;
1513
1514     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1515
1516     if (mb_x) {
1517         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1518                                        mbedge_lim, inner_limit, hev_thresh);
1519         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1520                                        mbedge_lim, inner_limit, hev_thresh);
1521     }
1522
1523     if (inner_filter) {
1524         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1525                                              inner_limit, hev_thresh);
1526         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1527                                              inner_limit, hev_thresh);
1528         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1529                                              inner_limit, hev_thresh);
1530         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1531                                              uvlinesize,  bedge_lim,
1532                                              inner_limit, hev_thresh);
1533     }
1534
1535     if (mb_y) {
1536         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1537                                        mbedge_lim, inner_limit, hev_thresh);
1538         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1539                                        mbedge_lim, inner_limit, hev_thresh);
1540     }
1541
1542     if (inner_filter) {
1543         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1544                                              linesize,    bedge_lim,
1545                                              inner_limit, hev_thresh);
1546         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1547                                              linesize,    bedge_lim,
1548                                              inner_limit, hev_thresh);
1549         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1550                                              linesize,    bedge_lim,
1551                                              inner_limit, hev_thresh);
1552         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1553                                              dst[2] + 4 * uvlinesize,
1554                                              uvlinesize,  bedge_lim,
1555                                              inner_limit, hev_thresh);
1556     }
1557 }
1558
1559 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1560 {
1561     int mbedge_lim, bedge_lim;
1562     int filter_level = f->filter_level;
1563     int inner_limit = f->inner_limit;
1564     int inner_filter = f->inner_filter;
1565     int linesize = s->linesize;
1566
1567     if (!filter_level)
1568         return;
1569
1570      bedge_lim = 2*filter_level + inner_limit;
1571     mbedge_lim = bedge_lim + 4;
1572
1573     if (mb_x)
1574         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1575     if (inner_filter) {
1576         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1577         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1578         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1579     }
1580
1581     if (mb_y)
1582         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1583     if (inner_filter) {
1584         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1585         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1586         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1587     }
1588 }
1589
1590 static void release_queued_segmaps(VP8Context *s, int is_close)
1591 {
1592     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1593     while (s->num_maps_to_be_freed > leave_behind)
1594         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1595     s->maps_are_invalid = 0;
1596 }
1597
1598 #define MARGIN (16 << 2)
1599 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, AVFrame *curframe,
1600                                    AVFrame *prev_frame)
1601 {
1602     VP8Context *s = avctx->priv_data;
1603     int mb_x, mb_y;
1604
1605     s->mv_min.y = -MARGIN;
1606     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1607     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1608         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1609         int mb_xy = mb_y*s->mb_width;
1610
1611         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1612
1613         s->mv_min.x = -MARGIN;
1614         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1615         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1616             if (mb_y == 0)
1617                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1618             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1619                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 1);
1620             s->mv_min.x -= 64;
1621             s->mv_max.x -= 64;
1622         }
1623         s->mv_min.y -= 64;
1624         s->mv_max.y -= 64;
1625     }
1626 }
1627
#if HAVE_THREADS
/*
 * Positions exchanged between row threads are packed as
 * (mb_y << 16) | (mb_x & 0xFFFF).
 *
 * Both macros expand to a single statement without a trailing semicolon,
 * so every use must be written as `macro(...);`.
 */

/*
 * Block until the thread data `otd` has published a position of at least
 * (mb_y_check, mb_x_check). While waiting, the awaited position is stored
 * in td->wait_mb_pos; it is reset to INT_MAX once the wait is over.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
    do {\
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);\
        if ((otd)->thread_mb_pos < tmp) {\
            pthread_mutex_lock(&(otd)->lock);\
            (td)->wait_mb_pos = tmp;\
            while ((otd)->thread_mb_pos < tmp)\
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);\
            (td)->wait_mb_pos = INT_MAX;\
            pthread_mutex_unlock(&(otd)->lock);\
        }\
    } while (0)

/*
 * Publish (mb_y, mb_x) as the current position of `td` and, when slice
 * threading is active with more than one job, wake waiters if a neighbour
 * may be waiting for it.
 *
 * Relies on `avctx`, `num_jobs`, `next_td` and `prev_td` being in scope at
 * the call site. Note that td->thread_mb_pos is written, and the
 * neighbours' wait_mb_pos read, without holding any lock.
 */
#define update_pos(td, mb_y, mb_x)\
    do {\
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);\
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
        int is_null          = (next_td == NULL) || (prev_td == NULL);\
        int pos_check        = (is_null) ? 1 :\
                                (next_td != (td) && pos >= next_td->wait_mb_pos) ||\
                                (prev_td != (td) && pos >= prev_td->wait_mb_pos);\
        (td)->thread_mb_pos = pos;\
        if (sliced_threading && pos_check) {\
            pthread_mutex_lock(&(td)->lock);\
            pthread_cond_broadcast(&(td)->cond);\
            pthread_mutex_unlock(&(td)->lock);\
        }\
    } while (0)
#else
/* Without thread support both operations are no-op statements. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x)                        do { } while (0)
#endif
1664
1665 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1666                                         int jobnr, int threadnr)
1667 {
1668     VP8Context *s = avctx->priv_data;
1669     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1670     int mb_y = td->thread_mb_pos>>16;
1671     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1672     int num_jobs = s->num_jobs;
1673     AVFrame *curframe = s->curframe, *prev_frame = s->prev_frame;
1674     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1675     VP8Macroblock *mb;
1676     uint8_t *dst[3] = {
1677         curframe->data[0] + 16*mb_y*s->linesize,
1678         curframe->data[1] +  8*mb_y*s->uvlinesize,
1679         curframe->data[2] +  8*mb_y*s->uvlinesize
1680     };
1681     if (mb_y == 0) prev_td = td;
1682     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1683     if (mb_y == s->mb_height-1) next_td = td;
1684     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1685     if (s->mb_layout == 1)
1686         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1687     else {
1688         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1689         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1690         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1691     }
1692
1693     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1694     // left edge of 129 for intra prediction
1695     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1696         for (i = 0; i < 3; i++)
1697             for (y = 0; y < 16>>!!i; y++)
1698                 dst[i][y*curframe->linesize[i]-1] = 129;
1699         if (mb_y == 1) {
1700             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1701         }
1702     }
1703
1704     s->mv_min.x = -MARGIN;
1705     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1706
1707     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1708         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1709         if (prev_td != td) {
1710             if (threadnr != 0) {
1711                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1712             } else {
1713                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1714             }
1715         }
1716
1717         s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1718         s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1719
1720         if (!s->mb_layout)
1721             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1722                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 0);
1723
1724         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1725
1726         if (!mb->skip)
1727             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1728
1729         if (mb->mode <= MODE_I4x4)
1730             intra_predict(s, td, dst, mb, mb_x, mb_y);
1731         else
1732             inter_predict(s, td, dst, mb, mb_x, mb_y);
1733
1734         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1735
1736         if (!mb->skip) {
1737             idct_mb(s, td, dst, mb);
1738         } else {
1739             AV_ZERO64(td->left_nnz);
1740             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1741
1742             // Reset DC block predictors if they would exist if the mb had coefficients
1743             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1744                 td->left_nnz[8]     = 0;
1745                 s->top_nnz[mb_x][8] = 0;
1746             }
1747         }
1748
1749         if (s->deblock_filter)
1750             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1751
1752         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1753             if (s->filter.simple)
1754                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1755             else
1756                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1757         }
1758
1759         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1760
1761         dst[0] += 16;
1762         dst[1] += 8;
1763         dst[2] += 8;
1764         s->mv_min.x -= 64;
1765         s->mv_max.x -= 64;
1766
1767         if (mb_x == s->mb_width+1) {
1768             update_pos(td, mb_y, s->mb_width+3);
1769         } else {
1770             update_pos(td, mb_y, mb_x);
1771         }
1772     }
1773 }
1774
1775 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1776                               int jobnr, int threadnr)
1777 {
1778     VP8Context *s = avctx->priv_data;
1779     VP8ThreadData *td = &s->thread_data[threadnr];
1780     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1781     AVFrame *curframe = s->curframe;
1782     VP8Macroblock *mb;
1783     VP8ThreadData *prev_td, *next_td;
1784     uint8_t *dst[3] = {
1785         curframe->data[0] + 16*mb_y*s->linesize,
1786         curframe->data[1] +  8*mb_y*s->uvlinesize,
1787         curframe->data[2] +  8*mb_y*s->uvlinesize
1788     };
1789
1790     if (s->mb_layout == 1)
1791         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1792     else
1793         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1794
1795     if (mb_y == 0) prev_td = td;
1796     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1797     if (mb_y == s->mb_height-1) next_td = td;
1798     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1799
1800     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1801         VP8FilterStrength *f = &td->filter_strength[mb_x];
1802         if (prev_td != td) {
1803             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1804         }
1805         if (next_td != td)
1806             if (next_td != &s->thread_data[0]) {
1807                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1808             }
1809
1810         if (num_jobs == 1) {
1811             if (s->filter.simple)
1812                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1813             else
1814                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1815         }
1816
1817         if (s->filter.simple)
1818             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1819         else
1820             filter_mb(s, dst, f, mb_x, mb_y);
1821         dst[0] += 16;
1822         dst[1] += 8;
1823         dst[2] += 8;
1824
1825         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1826     }
1827 }
1828
1829 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1830                                     int jobnr, int threadnr)
1831 {
1832     VP8Context *s = avctx->priv_data;
1833     VP8ThreadData *td = &s->thread_data[jobnr];
1834     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1835     AVFrame *curframe = s->curframe;
1836     int mb_y, num_jobs = s->num_jobs;
1837     td->thread_nr = threadnr;
1838     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1839         if (mb_y >= s->mb_height) break;
1840         td->thread_mb_pos = mb_y<<16;
1841         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1842         if (s->deblock_filter)
1843             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
1844         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1845
1846         s->mv_min.y -= 64;
1847         s->mv_max.y -= 64;
1848
1849         if (avctx->active_thread_type == FF_THREAD_FRAME)
1850             ff_thread_report_progress(curframe, mb_y, 0);
1851     }
1852
1853     return 0;
1854 }
1855
1856 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1857                             AVPacket *avpkt)
1858 {
1859     VP8Context *s = avctx->priv_data;
1860     int ret, i, referenced, num_jobs;
1861     enum AVDiscard skip_thresh;
1862     AVFrame *av_uninit(curframe), *prev_frame;
1863
1864     release_queued_segmaps(s, 0);
1865
1866     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1867         goto err;
1868
1869     prev_frame = s->framep[VP56_FRAME_CURRENT];
1870
1871     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1872                                 || s->update_altref == VP56_FRAME_CURRENT;
1873
1874     skip_thresh = !referenced ? AVDISCARD_NONREF :
1875                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1876
1877     if (avctx->skip_frame >= skip_thresh) {
1878         s->invisible = 1;
1879         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1880         goto skip_decode;
1881     }
1882     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1883
1884     // release no longer referenced frames
1885     for (i = 0; i < 5; i++)
1886         if (s->frames[i].data[0] &&
1887             &s->frames[i] != prev_frame &&
1888             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1889             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1890             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1891             vp8_release_frame(s, &s->frames[i], 1, 0);
1892
1893     // find a free buffer
1894     for (i = 0; i < 5; i++)
1895         if (&s->frames[i] != prev_frame &&
1896             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1897             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1898             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1899             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1900             break;
1901         }
1902     if (i == 5) {
1903         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1904         abort();
1905     }
1906     if (curframe->data[0])
1907         vp8_release_frame(s, curframe, 1, 0);
1908
1909     // Given that arithmetic probabilities are updated every frame, it's quite likely
1910     // that the values we have on a random interframe are complete junk if we didn't
1911     // start decode on a keyframe. So just don't display anything rather than junk.
1912     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1913                          !s->framep[VP56_FRAME_GOLDEN] ||
1914                          !s->framep[VP56_FRAME_GOLDEN2])) {
1915         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1916         ret = AVERROR_INVALIDDATA;
1917         goto err;
1918     }
1919
1920     curframe->key_frame = s->keyframe;
1921     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1922     curframe->reference = referenced ? 3 : 0;
1923     if ((ret = vp8_alloc_frame(s, curframe))) {
1924         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1925         goto err;
1926     }
1927
1928     // check if golden and altref are swapped
1929     if (s->update_altref != VP56_FRAME_NONE) {
1930         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1931     } else {
1932         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1933     }
1934     if (s->update_golden != VP56_FRAME_NONE) {
1935         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1936     } else {
1937         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1938     }
1939     if (s->update_last) {
1940         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1941     } else {
1942         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1943     }
1944     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1945
1946     ff_thread_finish_setup(avctx);
1947
1948     s->linesize   = curframe->linesize[0];
1949     s->uvlinesize = curframe->linesize[1];
1950
1951     if (!s->thread_data[0].edge_emu_buffer)
1952         for (i = 0; i < MAX_THREADS; i++)
1953             s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);
1954
1955     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1956     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1957     if (!s->mb_layout)
1958         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1959     if (!s->mb_layout && s->keyframe)
1960         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1961
1962     // top edge of 127 for intra prediction
1963     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1964         s->top_border[0][15] = s->top_border[0][23] = 127;
1965         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1966     }
1967     memset(s->ref_count, 0, sizeof(s->ref_count));
1968
1969
1970     // Make sure the previous frame has read its segmentation map,
1971     // if we re-use the same map.
1972     if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1973         ff_thread_await_progress(prev_frame, 1, 0);
1974
1975     if (s->mb_layout == 1)
1976         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1977
1978     if (avctx->active_thread_type == FF_THREAD_FRAME)
1979         num_jobs = 1;
1980     else
1981         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1982     s->num_jobs   = num_jobs;
1983     s->curframe   = curframe;
1984     s->prev_frame = prev_frame;
1985     s->mv_min.y   = -MARGIN;
1986     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
1987     for (i = 0; i < MAX_THREADS; i++) {
1988         s->thread_data[i].thread_mb_pos = 0;
1989         s->thread_data[i].wait_mb_pos = INT_MAX;
1990     }
1991     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1992
1993     ff_thread_report_progress(curframe, INT_MAX, 0);
1994     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1995
1996 skip_decode:
1997     // if future frames don't use the updated probabilities,
1998     // reset them to the values we saved
1999     if (!s->update_probabilities)
2000         s->prob[0] = s->prob[1];
2001
2002     if (!s->invisible) {
2003         *(AVFrame*)data = *curframe;
2004         *data_size = sizeof(AVFrame);
2005     }
2006
2007     return avpkt->size;
2008 err:
2009     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2010     return ret;
2011 }
2012
2013 static av_cold int vp8_decode_init(AVCodecContext *avctx)
2014 {
2015     VP8Context *s = avctx->priv_data;
2016
2017     s->avctx = avctx;
2018     avctx->pix_fmt = PIX_FMT_YUV420P;
2019
2020     ff_dsputil_init(&s->dsp, avctx);
2021     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
2022     ff_vp8dsp_init(&s->vp8dsp);
2023
2024     return 0;
2025 }
2026
2027 static av_cold int vp8_decode_free(AVCodecContext *avctx)
2028 {
2029     vp8_decode_flush_impl(avctx, 0, 1, 1);
2030     release_queued_segmaps(avctx->priv_data, 1);
2031     return 0;
2032 }
2033
2034 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2035 {
2036     VP8Context *s = avctx->priv_data;
2037
2038     s->avctx = avctx;
2039
2040     return 0;
2041 }
2042
2043 #define REBASE(pic) \
2044     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2045
2046 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2047 {
2048     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2049
2050     if (s->macroblocks_base &&
2051         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2052         free_buffers(s);
2053         s->maps_are_invalid = 1;
2054         s->mb_width  = s_src->mb_width;
2055         s->mb_height = s_src->mb_height;
2056     }
2057
2058     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2059     s->segmentation = s_src->segmentation;
2060     s->lf_delta = s_src->lf_delta;
2061     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2062
2063     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
2064     s->framep[0] = REBASE(s_src->next_framep[0]);
2065     s->framep[1] = REBASE(s_src->next_framep[1]);
2066     s->framep[2] = REBASE(s_src->next_framep[2]);
2067     s->framep[3] = REBASE(s_src->next_framep[3]);
2068
2069     return 0;
2070 }
2071
2072 AVCodec ff_vp8_decoder = {
2073     .name                  = "vp8",
2074     .type                  = AVMEDIA_TYPE_VIDEO,
2075     .id                    = CODEC_ID_VP8,
2076     .priv_data_size        = sizeof(VP8Context),
2077     .init                  = vp8_decode_init,
2078     .close                 = vp8_decode_free,
2079     .decode                = vp8_decode_frame,
2080     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2081     .flush                 = vp8_decode_flush,
2082     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2083     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2084     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2085 };