git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of Libav.
  10  *
  11  * Libav is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * Libav is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with Libav; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43 #if HAVE_THREADS
  44             pthread_cond_destroy(&s->thread_data[i].cond);
  45             pthread_mutex_destroy(&s->thread_data[i].lock);
  46 #endif
  47             av_freep(&s->thread_data[i].filter_strength);
  48             av_freep(&s->thread_data[i].edge_emu_buffer);
  49         }
  50     av_freep(&s->thread_data);
  51     av_freep(&s->macroblocks_base);
  52     av_freep(&s->intra4x4_pred_mode_top);
  53     av_freep(&s->top_nnz);
  54     av_freep(&s->top_border);
  55
  56     s->macroblocks = NULL;
  57 }
  58
  59 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  60 {
  61     int ret;
  62     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  63                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  64         return ret;
  65     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  66         ff_thread_release_buffer(s->avctx, &f->tf);
  67         return AVERROR(ENOMEM);
  68     }
  69     return 0;
  70 }
  71
  72 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  73 {
  74     av_buffer_unref(&f->seg_map);
  75     ff_thread_release_buffer(s->avctx, &f->tf);
  76 }
  77
  78 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  79 {
  80     int ret;
  81
  82     vp8_release_frame(s, dst);
  83
  84     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  85         return ret;
  86     if (src->seg_map &&
  87         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  88         vp8_release_frame(s, dst);
  89         return AVERROR(ENOMEM);
  90     }
  91
  92     return 0;
  93 }
  94
  95
  96 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  97 {
  98     VP8Context *s = avctx->priv_data;
  99     int i;
 100
 101     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 102         vp8_release_frame(s, &s->frames[i]);
 103     memset(s->framep, 0, sizeof(s->framep));
 104
 105     if (free_mem)
 106         free_buffers(s);
 107 }
 108
 109 static void vp8_decode_flush(AVCodecContext *avctx)
 110 {
 111     vp8_decode_flush_impl(avctx, 0);
 112 }
 113
 114 static int update_dimensions(VP8Context *s, int width, int height)
 115 {
 116     AVCodecContext *avctx = s->avctx;
 117     int i, ret;
 118
 119     if (width  != s->avctx->width ||
 120         height != s->avctx->height) {
 121         vp8_decode_flush_impl(s->avctx, 1);
 122
 123         ret = ff_set_dimensions(s->avctx, width, height);
 124         if (ret < 0)
 125             return ret;
 126     }
 127
 128     s->mb_width  = (s->avctx->coded_width +15) / 16;
 129     s->mb_height = (s->avctx->coded_height+15) / 16;
 130
 131     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 132     if (!s->mb_layout) { // Frame threading and one thread
 133         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 134         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 135     }
 136     else // Sliced threading
 137         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 138     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 139     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 140     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 141
 142     for (i = 0; i < MAX_THREADS; i++) {
 143         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 144 #if HAVE_THREADS
 145         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 146         pthread_cond_init(&s->thread_data[i].cond, NULL);
 147 #endif
 148     }
 149
 150     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 151         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 152         return AVERROR(ENOMEM);
 153
 154     s->macroblocks        = s->macroblocks_base + 1;
 155
 156     return 0;
 157 }
 158
 159 static void parse_segment_info(VP8Context *s)
 160 {
 161     VP56RangeCoder *c = &s->c;
 162     int i;
 163
 164     s->segmentation.update_map = vp8_rac_get(c);
 165
 166     if (vp8_rac_get(c)) { // update segment feature data
 167         s->segmentation.absolute_vals = vp8_rac_get(c);
 168
 169         for (i = 0; i < 4; i++)
 170             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 171
 172         for (i = 0; i < 4; i++)
 173             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 174     }
 175     if (s->segmentation.update_map)
 176         for (i = 0; i < 3; i++)
 177             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 178 }
 179
 180 static void update_lf_deltas(VP8Context *s)
 181 {
 182     VP56RangeCoder *c = &s->c;
 183     int i;
 184
 185     for (i = 0; i < 4; i++) {
 186         if (vp8_rac_get(c)) {
 187             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 188
 189             if (vp8_rac_get(c))
 190                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 191         }
 192     }
 193
 194     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 195         if (vp8_rac_get(c)) {
 196             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 197
 198             if (vp8_rac_get(c))
 199                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 200         }
 201     }
 202 }
 203
 204 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 205 {
 206     const uint8_t *sizes = buf;
 207     int i;
 208
 209     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 210
 211     buf      += 3*(s->num_coeff_partitions-1);
 212     buf_size -= 3*(s->num_coeff_partitions-1);
 213     if (buf_size < 0)
 214         return -1;
 215
 216     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 217         int size = AV_RL24(sizes + 3*i);
 218         if (buf_size - size < 0)
 219             return -1;
 220
 221         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 222         buf      += size;
 223         buf_size -= size;
 224     }
 225     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 226
 227     return 0;
 228 }
 229
 230 static void get_quants(VP8Context *s)
 231 {
 232     VP56RangeCoder *c = &s->c;
 233     int i, base_qi;
 234
 235     int yac_qi     = vp8_rac_get_uint(c, 7);
 236     int ydc_delta  = vp8_rac_get_sint(c, 4);
 237     int y2dc_delta = vp8_rac_get_sint(c, 4);
 238     int y2ac_delta = vp8_rac_get_sint(c, 4);
 239     int uvdc_delta = vp8_rac_get_sint(c, 4);
 240     int uvac_delta = vp8_rac_get_sint(c, 4);
 241
 242     for (i = 0; i < 4; i++) {
 243         if (s->segmentation.enabled) {
 244             base_qi = s->segmentation.base_quant[i];
 245             if (!s->segmentation.absolute_vals)
 246                 base_qi += yac_qi;
 247         } else
 248             base_qi = yac_qi;
 249
 250         s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 251         s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 252         s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 253         /* 101581>>16 is equivalent to 155/100 */
 254         s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
 255         s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 256         s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 257
 258         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 259         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 260     }
 261 }
 262
 263 /**
 264  * Determine which buffers golden and altref should be updated with after this frame.
 265  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 266  *
 267  * Intra frames update all 3 references
 268  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 269  * If the update (golden|altref) flag is set, it's updated with the current frame
 270  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 271  * If the flag is not set, the number read means:
 272  *      0: no update
 273  *      1: VP56_FRAME_PREVIOUS
 274  *      2: update golden with altref, or update altref with golden
 275  */
 276 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 277 {
 278     VP56RangeCoder *c = &s->c;
 279
 280     if (update)
 281         return VP56_FRAME_CURRENT;
 282
 283     switch (vp8_rac_get_uint(c, 2)) {
 284     case 1:
 285         return VP56_FRAME_PREVIOUS;
 286     case 2:
 287         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 288     }
 289     return VP56_FRAME_NONE;
 290 }
 291
 292 static void update_refs(VP8Context *s)
 293 {
 294     VP56RangeCoder *c = &s->c;
 295
 296     int update_golden = vp8_rac_get(c);
 297     int update_altref = vp8_rac_get(c);
 298
 299     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 300     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 301 }
 302
 303 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 304 {
 305     VP56RangeCoder *c = &s->c;
 306     int header_size, hscale, vscale, i, j, k, l, m, ret;
 307     int width  = s->avctx->width;
 308     int height = s->avctx->height;
 309
 310     s->keyframe  = !(buf[0] & 1);
 311     s->profile   =  (buf[0]>>1) & 7;
 312     s->invisible = !(buf[0] & 0x10);
 313     header_size  = AV_RL24(buf) >> 5;
 314     buf      += 3;
 315     buf_size -= 3;
 316
 317     if (s->profile > 3)
 318         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 319
 320     if (!s->profile)
 321         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 322     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 323         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 324
 325     if (header_size > buf_size - 7*s->keyframe) {
 326         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 327         return AVERROR_INVALIDDATA;
 328     }
 329
 330     if (s->keyframe) {
 331         if (AV_RL24(buf) != 0x2a019d) {
 332             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 333             return AVERROR_INVALIDDATA;
 334         }
 335         width  = AV_RL16(buf+3) & 0x3fff;
 336         height = AV_RL16(buf+5) & 0x3fff;
 337         hscale = buf[4] >> 6;
 338         vscale = buf[6] >> 6;
 339         buf      += 7;
 340         buf_size -= 7;
 341
 342         if (hscale || vscale)
 343             avpriv_request_sample(s->avctx, "Upscaling");
 344
 345         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 346         for (i = 0; i < 4; i++)
 347             for (j = 0; j < 16; j++)
 348                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 349                        sizeof(s->prob->token[i][j]));
 350         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 351         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 352         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 353         memset(&s->segmentation, 0, sizeof(s->segmentation));
 354         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 355     }
 356
 357     ff_vp56_init_range_decoder(c, buf, header_size);
 358     buf      += header_size;
 359     buf_size -= header_size;
 360
 361     if (s->keyframe) {
 362         if (vp8_rac_get(c))
 363             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 364         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 365     }
 366
 367     if ((s->segmentation.enabled = vp8_rac_get(c)))
 368         parse_segment_info(s);
 369     else
 370         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 371
 372     s->filter.simple    = vp8_rac_get(c);
 373     s->filter.level     = vp8_rac_get_uint(c, 6);
 374     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 375
 376     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 377         if (vp8_rac_get(c))
 378             update_lf_deltas(s);
 379
 380     if (setup_partitions(s, buf, buf_size)) {
 381         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 382         return AVERROR_INVALIDDATA;
 383     }
 384
 385     if (!s->macroblocks_base || /* first frame */
 386         width != s->avctx->width || height != s->avctx->height) {
 387         if ((ret = update_dimensions(s, width, height)) < 0)
 388             return ret;
 389     }
 390
 391     get_quants(s);
 392
 393     if (!s->keyframe) {
 394         update_refs(s);
 395         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 396         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 397     }
 398
 399     // if we aren't saving this frame's probabilities for future frames,
 400     // make a copy of the current probabilities
 401     if (!(s->update_probabilities = vp8_rac_get(c)))
 402         s->prob[1] = s->prob[0];
 403
 404     s->update_last = s->keyframe || vp8_rac_get(c);
 405
 406     for (i = 0; i < 4; i++)
 407         for (j = 0; j < 8; j++)
 408             for (k = 0; k < 3; k++)
 409                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 410                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 411                         int prob = vp8_rac_get_uint(c, 8);
 412                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 413                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 414                     }
 415
 416     if ((s->mbskip_enabled = vp8_rac_get(c)))
 417         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 418
 419     if (!s->keyframe) {
 420         s->prob->intra  = vp8_rac_get_uint(c, 8);
 421         s->prob->last   = vp8_rac_get_uint(c, 8);
 422         s->prob->golden = vp8_rac_get_uint(c, 8);
 423
 424         if (vp8_rac_get(c))
 425             for (i = 0; i < 4; i++)
 426                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 427         if (vp8_rac_get(c))
 428             for (i = 0; i < 3; i++)
 429                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 430
 431         // 17.2 MV probability update
 432         for (i = 0; i < 2; i++)
 433             for (j = 0; j < 19; j++)
 434                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 435                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 436     }
 437
 438     return 0;
 439 }
 440
 441 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 442 {
 443     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 444     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 445 }
 446
 447 /**
 448  * Motion vector coding, 17.1.
 449  */
 450 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 451 {
 452     int bit, x = 0;
 453
 454     if (vp56_rac_get_prob_branchy(c, p[0])) {
 455         int i;
 456
 457         for (i = 0; i < 3; i++)
 458             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 459         for (i = 9; i > 3; i--)
 460             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 461         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 462             x += 8;
 463     } else {
 464         // small_mvtree
 465         const uint8_t *ps = p+2;
 466         bit = vp56_rac_get_prob(c, *ps);
 467         ps += 1 + 3*bit;
 468         x  += 4*bit;
 469         bit = vp56_rac_get_prob(c, *ps);
 470         ps += 1 + bit;
 471         x  += 2*bit;
 472         x  += vp56_rac_get_prob(c, *ps);
 473     }
 474
 475     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 476 }
 477
 478 static av_always_inline
 479 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 480 {
 481     if (left == top)
 482         return vp8_submv_prob[4-!!left];
 483     if (!top)
 484         return vp8_submv_prob[2];
 485     return vp8_submv_prob[1-!!left];
 486 }
 487
 488 /**
 489  * Split motion vector prediction, 16.4.
 490  * @returns the number of motion vectors parsed (2, 4 or 16)
 491  */
 492 static av_always_inline
 493 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 494 {
 495     int part_idx;
 496     int n, num;
 497     VP8Macroblock *top_mb;
 498     VP8Macroblock *left_mb = &mb[-1];
 499     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 500                   *mbsplits_top,
 501                   *mbsplits_cur, *firstidx;
 502     VP56mv *top_mv;
 503     VP56mv *left_mv = left_mb->bmv;
 504     VP56mv *cur_mv  = mb->bmv;
 505
 506     if (!layout) // layout is inlined, s->mb_layout is not
 507         top_mb = &mb[2];
 508     else
 509         top_mb = &mb[-s->mb_width-1];
 510     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 511     top_mv = top_mb->bmv;
 512
 513     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 514         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 515             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 516         } else {
 517             part_idx = VP8_SPLITMVMODE_8x8;
 518         }
 519     } else {
 520         part_idx = VP8_SPLITMVMODE_4x4;
 521     }
 522
 523     num = vp8_mbsplit_count[part_idx];
 524     mbsplits_cur = vp8_mbsplits[part_idx],
 525     firstidx = vp8_mbfirstidx[part_idx];
 526     mb->partitioning = part_idx;
 527
 528     for (n = 0; n < num; n++) {
 529         int k = firstidx[n];
 530         uint32_t left, above;
 531         const uint8_t *submv_prob;
 532
 533         if (!(k & 3))
 534             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 535         else
 536             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 537         if (k <= 3)
 538             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 539         else
 540             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 541
 542         submv_prob = get_submv_prob(left, above);
 543
 544         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 545             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 546                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 547                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 548                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 549                 } else {
 550                     AV_ZERO32(&mb->bmv[n]);
 551                 }
 552             } else {
 553                 AV_WN32A(&mb->bmv[n], above);
 554             }
 555         } else {
 556             AV_WN32A(&mb->bmv[n], left);
 557         }
 558     }
 559
 560     return num;
 561 }
 562
 563 static av_always_inline
 564 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
 565 {
 566     VP8Macroblock *mb_edge[3] = { 0 /* top */,
 567                                   mb - 1 /* left */,
 568                                   0 /* top-left */ };
 569     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 570     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 571     int idx = CNT_ZERO;
 572     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 573     int8_t *sign_bias = s->sign_bias;
 574     VP56mv near_mv[4];
 575     uint8_t cnt[4] = { 0 };
 576     VP56RangeCoder *c = &s->c;
 577
 578     if (!layout) { // layout is inlined (s->mb_layout is not)
 579         mb_edge[0] = mb + 2;
 580         mb_edge[2] = mb + 1;
 581     }
 582     else {
 583         mb_edge[0] = mb - s->mb_width-1;
 584         mb_edge[2] = mb - s->mb_width-2;
 585     }
 586
 587     AV_ZERO32(&near_mv[0]);
 588     AV_ZERO32(&near_mv[1]);
 589     AV_ZERO32(&near_mv[2]);
 590
 591     /* Process MB on top, left and top-left */
 592     #define MV_EDGE_CHECK(n)\
 593     {\
 594         VP8Macroblock *edge = mb_edge[n];\
 595         int edge_ref = edge->ref_frame;\
 596         if (edge_ref != VP56_FRAME_CURRENT) {\
 597             uint32_t mv = AV_RN32A(&edge->mv);\
 598             if (mv) {\
 599                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 600                     /* SWAR negate of the values in mv. */\
 601                     mv = ~mv;\
 602                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 603                 }\
 604                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 605                     AV_WN32A(&near_mv[++idx], mv);\
 606                 cnt[idx]      += 1 + (n != 2);\
 607             } else\
 608                 cnt[CNT_ZERO] += 1 + (n != 2);\
 609         }\
 610     }
 611
 612     MV_EDGE_CHECK(0)
 613     MV_EDGE_CHECK(1)
 614     MV_EDGE_CHECK(2)
 615
 616     mb->partitioning = VP8_SPLITMVMODE_NONE;
 617     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 618         mb->mode = VP8_MVMODE_MV;
 619
 620         /* If we have three distinct MVs, merge first and last if they're the same */
 621         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 622             cnt[CNT_NEAREST] += 1;
 623
 624         /* Swap near and nearest if necessary */
 625         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 626             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 627             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 628         }
 629
 630         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 631             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 632
 633                 /* Choose the best mv out of 0,0 and the nearest mv */
 634                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 635                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 636                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 637                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 638
 639                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 640                     mb->mode = VP8_MVMODE_SPLIT;
 641                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
 642                 } else {
 643                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 644                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 645                     mb->bmv[0] = mb->mv;
 646                 }
 647             } else {
 648                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 649                 mb->bmv[0] = mb->mv;
 650             }
 651         } else {
 652             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 653             mb->bmv[0] = mb->mv;
 654         }
 655     } else {
 656         mb->mode = VP8_MVMODE_ZERO;
 657         AV_ZERO32(&mb->mv);
 658         mb->bmv[0] = mb->mv;
 659     }
 660 }
 661
 662 static av_always_inline
 663 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 664                            int mb_x, int keyframe, int layout)
 665 {
 666     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 667
 668     if (layout == 1) {
 669         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 670         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 671     }
 672     if (keyframe) {
 673         int x, y;
 674         uint8_t* top;
 675         uint8_t* const left = s->intra4x4_pred_mode_left;
 676         if (layout == 1)
 677             top = mb->intra4x4_pred_mode_top;
 678         else
 679             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 680         for (y = 0; y < 4; y++) {
 681             for (x = 0; x < 4; x++) {
 682                 const uint8_t *ctx;
 683                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 684                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 685                 left[y] = top[x] = *intra4x4;
 686                 intra4x4++;
 687             }
 688         }
 689     } else {
 690         int i;
 691         for (i = 0; i < 16; i++)
 692             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 693     }
 694 }
 695
 696 static av_always_inline
 697 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 698                     uint8_t *segment, uint8_t *ref, int layout)
 699 {
 700     VP56RangeCoder *c = &s->c;
 701
 702     if (s->segmentation.update_map)
 703         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 704     else if (s->segmentation.enabled)
 705         *segment = ref ? *ref : *segment;
 706     mb->segment = *segment;
 707
 708     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 709
 710     if (s->keyframe) {
 711         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 712
 713         if (mb->mode == MODE_I4x4) {
 714             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 715         } else {
 716             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 717             if (s->mb_layout == 1)
 718                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 719             else
 720                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 721             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 722         }
 723
 724         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 725         mb->ref_frame = VP56_FRAME_CURRENT;
 726     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 727         // inter MB, 16.2
 728         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 729             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 730                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 731         else
 732             mb->ref_frame = VP56_FRAME_PREVIOUS;
 733         s->ref_count[mb->ref_frame-1]++;
 734
 735         // motion vectors, 16.3
 736         decode_mvs(s, mb, mb_x, mb_y, layout);
 737     } else {
 738         // intra MB, 16.1
 739         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 740
 741         if (mb->mode == MODE_I4x4)
 742             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 743
 744         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 745         mb->ref_frame = VP56_FRAME_CURRENT;
 746         mb->partitioning = VP8_SPLITMVMODE_NONE;
 747         AV_ZERO32(&mb->bmv[0]);
 748     }
 749 }
 750
 751 #ifndef decode_block_coeffs_internal
 752 /**
 753  * @param r arithmetic bitstream reader context
 754  * @param block destination for block coefficients
 755  * @param probs probabilities to use when reading trees from the bitstream
 756  * @param i initial coeff index, 0 unless a separate DC block is coded
 757  * @param qmul array holding the dc/ac dequant factor at position 0/1
 758  * @return 0 if no coeffs were decoded
 759  *         otherwise, the index of the last coeff decoded plus one
 760  */
 761 static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
 762                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 763                                         int i, uint8_t *token_prob, int16_t qmul[2])
 764 {
 765     VP56RangeCoder c = *r;
 766     goto skip_eob;
 767     do {
 768         int coeff;
 769         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
 770             break;
 771
 772 skip_eob:
 773         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
 774             if (++i == 16)
 775                 break; // invalid input; blocks should end with EOB
 776             token_prob = probs[i][0];
 777             goto skip_eob;
 778         }
 779
 780         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
 781             coeff = 1;
 782             token_prob = probs[i+1][1];
 783         } else {
 784             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
 785                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
 786                 if (coeff)
 787                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
 788                 coeff += 2;
 789             } else {
 790                 // DCT_CAT*
 791                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
 792                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
 793                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
 794                     } else {                                    // DCT_CAT2
 795                         coeff  = 7;
 796                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
 797                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
 798                     }
 799                 } else {    // DCT_CAT3 and up
 800                     int a = vp56_rac_get_prob(&c, token_prob[8]);
 801                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
 802                     int cat = (a<<1) + b;
 803                     coeff  = 3 + (8<<cat);
 804                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
 805                 }
 806             }
 807             token_prob = probs[i+1][2];
 808         }
 809         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
 810     } while (++i < 16);
 811
 812     *r = c;
 813     return i;
 814 }
 815 #endif
 816
 817 /**
 818  * @param c arithmetic bitstream reader context
 819  * @param block destination for block coefficients
 820  * @param probs probabilities to use when reading trees from the bitstream
 821  * @param i initial coeff index, 0 unless a separate DC block is coded
 822  * @param zero_nhood the initial prediction context for number of surrounding
 823  *                   all-zero blocks (only left/top, so 0-2)
 824  * @param qmul array holding the dc/ac dequant factor at position 0/1
 825  * @return 0 if no coeffs were decoded
 826  *         otherwise, the index of the last coeff decoded plus one
 827  */
 828 static av_always_inline
 829 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
 830                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 831                         int i, int zero_nhood, int16_t qmul[2])
 832 {
 833     uint8_t *token_prob = probs[i][zero_nhood];
 834     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 835         return 0;
 836     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 837 }
 838
 839 static av_always_inline
 840 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 841                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 842 {
 843     int i, x, y, luma_start = 0, luma_ctx = 3;
 844     int nnz_pred, nnz, nnz_total = 0;
 845     int segment = mb->segment;
 846     int block_dc = 0;
 847
 848     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 849         nnz_pred = t_nnz[8] + l_nnz[8];
 850
 851         // decode DC values and do hadamard
 852         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 853                                   s->qmat[segment].luma_dc_qmul);
 854         l_nnz[8] = t_nnz[8] = !!nnz;
 855         if (nnz) {
 856             nnz_total += nnz;
 857             block_dc = 1;
 858             if (nnz == 1)
 859                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 860             else
 861                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 862         }
 863         luma_start = 1;
 864         luma_ctx = 0;
 865     }
 866
 867     // luma blocks
 868     for (y = 0; y < 4; y++)
 869         for (x = 0; x < 4; x++) {
 870             nnz_pred = l_nnz[y] + t_nnz[x];
 871             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 872                                       nnz_pred, s->qmat[segment].luma_qmul);
 873             // nnz+block_dc may be one more than the actual last index, but we don't care
 874             td->non_zero_count_cache[y][x] = nnz + block_dc;
 875             t_nnz[x] = l_nnz[y] = !!nnz;
 876             nnz_total += nnz;
 877         }
 878
 879     // chroma blocks
 880     // TODO: what to do about dimensions? 2nd dim for luma is x,
 881     // but for chroma it's (y<<1)|x
 882     for (i = 4; i < 6; i++)
 883         for (y = 0; y < 2; y++)
 884             for (x = 0; x < 2; x++) {
 885                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 886                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 887                                           nnz_pred, s->qmat[segment].chroma_qmul);
 888                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 889                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 890                 nnz_total += nnz;
 891             }
 892
 893     // if there were no coded coeffs despite the macroblock not being marked skip,
 894     // we MUST not do the inner loop filter and should not do IDCT
 895     // Since skip isn't used for bitstream prediction, just manually set it.
 896     if (!nnz_total)
 897         mb->skip = 1;
 898 }
 899
 900 static av_always_inline
 901 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 902                       int linesize, int uvlinesize, int simple)
 903 {
 904     AV_COPY128(top_border, src_y + 15*linesize);
 905     if (!simple) {
 906         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 907         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 908     }
 909 }
 910
 911 static av_always_inline
 912 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 913                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 914                     int simple, int xchg)
 915 {
 916     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 917     src_y  -=   linesize;
 918     src_cb -= uvlinesize;
 919     src_cr -= uvlinesize;
 920
 921 #define XCHG(a,b,xchg) do {                     \
 922         if (xchg) AV_SWAP64(b,a);               \
 923         else      AV_COPY64(b,a);               \
 924     } while (0)
 925
 926     XCHG(top_border_m1+8, src_y-8, xchg);
 927     XCHG(top_border,      src_y,   xchg);
 928     XCHG(top_border+8,    src_y+8, 1);
 929     if (mb_x < mb_width-1)
 930         XCHG(top_border+32, src_y+16, 1);
 931
 932     // only copy chroma for normal loop filter
 933     // or to initialize the top row to 127
 934     if (!simple || !mb_y) {
 935         XCHG(top_border_m1+16, src_cb-8, xchg);
 936         XCHG(top_border_m1+24, src_cr-8, xchg);
 937         XCHG(top_border+16,    src_cb, 1);
 938         XCHG(top_border+24,    src_cr, 1);
 939     }
 940 }
 941
 942 static av_always_inline
 943 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 944 {
 945     if (!mb_x) {
 946         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 947     } else {
 948         return mb_y ? mode : LEFT_DC_PRED8x8;
 949     }
 950 }
 951
 952 static av_always_inline
 953 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 954 {
 955     if (!mb_x) {
 956         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 957     } else {
 958         return mb_y ? mode : HOR_PRED8x8;
 959     }
 960 }
 961
 962 static av_always_inline
 963 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 964 {
 965     if (mode == DC_PRED8x8) {
 966         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 967     } else {
 968         return mode;
 969     }
 970 }
 971
 972 static av_always_inline
 973 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 974 {
 975     switch (mode) {
 976     case DC_PRED8x8:
 977         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 978     case VERT_PRED8x8:
 979         return !mb_y ? DC_127_PRED8x8 : mode;
 980     case HOR_PRED8x8:
 981         return !mb_x ? DC_129_PRED8x8 : mode;
 982     case PLANE_PRED8x8 /*TM*/:
 983         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 984     }
 985     return mode;
 986 }
 987
 988 static av_always_inline
 989 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 990 {
 991     if (!mb_x) {
 992         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 993     } else {
 994         return mb_y ? mode : HOR_VP8_PRED;
 995     }
 996 }
 997
 998 static av_always_inline
 999 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1000 {
1001     switch (mode) {
1002     case VERT_PRED:
1003         if (!mb_x && mb_y) {
1004             *copy_buf = 1;
1005             return mode;
1006         }
1007         /* fall-through */
1008     case DIAG_DOWN_LEFT_PRED:
1009     case VERT_LEFT_PRED:
1010         return !mb_y ? DC_127_PRED : mode;
1011     case HOR_PRED:
1012         if (!mb_y) {
1013             *copy_buf = 1;
1014             return mode;
1015         }
1016         /* fall-through */
1017     case HOR_UP_PRED:
1018         return !mb_x ? DC_129_PRED : mode;
1019     case TM_VP8_PRED:
1020         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1021     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1022     case DIAG_DOWN_RIGHT_PRED:
1023     case VERT_RIGHT_PRED:
1024     case HOR_DOWN_PRED:
1025         if (!mb_y || !mb_x)
1026             *copy_buf = 1;
1027         return mode;
1028     }
1029     return mode;
1030 }
1031
1032 static av_always_inline
1033 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1034                    VP8Macroblock *mb, int mb_x, int mb_y)
1035 {
1036     AVCodecContext *avctx = s->avctx;
1037     int x, y, mode, nnz;
1038     uint32_t tr;
1039
1040     // for the first row, we need to run xchg_mb_border to init the top edge to 127
1041     // otherwise, skip it if we aren't going to deblock
1042     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1043         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1044                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1045                        s->filter.simple, 1);
1046
1047     if (mb->mode < MODE_I4x4) {
1048         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
1049             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
1050         } else {
1051             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1052         }
1053         s->hpc.pred16x16[mode](dst[0], s->linesize);
1054     } else {
1055         uint8_t *ptr = dst[0];
1056         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1057         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1058
1059         // all blocks on the right edge of the macroblock use bottom edge
1060         // the top macroblock for their topright edge
1061         uint8_t *tr_right = ptr - s->linesize + 16;
1062
1063         // if we're on the right edge of the frame, said edge is extended
1064         // from the top macroblock
1065         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1066             mb_x == s->mb_width-1) {
1067             tr = tr_right[-1]*0x01010101u;
1068             tr_right = (uint8_t *)&tr;
1069         }
1070
1071         if (mb->skip)
1072             AV_ZERO128(td->non_zero_count_cache);
1073
1074         for (y = 0; y < 4; y++) {
1075             uint8_t *topright = ptr + 4 - s->linesize;
1076             for (x = 0; x < 4; x++) {
1077                 int copy = 0, linesize = s->linesize;
1078                 uint8_t *dst = ptr+4*x;
1079                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1080
1081                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1082                     topright = tr_top;
1083                 } else if (x == 3)
1084                     topright = tr_right;
1085
1086                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1087                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1088                     if (copy) {
1089                         dst = copy_dst + 12;
1090                         linesize = 8;
1091                         if (!(mb_y + y)) {
1092                             copy_dst[3] = 127U;
1093                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1094                         } else {
1095                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1096                             if (!(mb_x + x)) {
1097                                 copy_dst[3] = 129U;
1098                             } else {
1099                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1100                             }
1101                         }
1102                         if (!(mb_x + x)) {
1103                             copy_dst[11] =
1104                             copy_dst[19] =
1105                             copy_dst[27] =
1106                             copy_dst[35] = 129U;
1107                         } else {
1108                             copy_dst[11] = ptr[4*x              -1];
1109                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1110                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1111                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1112                         }
1113                     }
1114                 } else {
1115                     mode = intra4x4[x];
1116                 }
1117                 s->hpc.pred4x4[mode](dst, topright, linesize);
1118                 if (copy) {
1119                     AV_COPY32(ptr+4*x              , copy_dst+12);
1120                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1121                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1122                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1123                 }
1124
1125                 nnz = td->non_zero_count_cache[y][x];
1126                 if (nnz) {
1127                     if (nnz == 1)
1128                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
1129                     else
1130                         s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
1131                 }
1132                 topright += 4;
1133             }
1134
1135             ptr   += 4*s->linesize;
1136             intra4x4 += 4;
1137         }
1138     }
1139
1140     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1141         mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
1142     } else {
1143         mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
1144     }
1145     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1146     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1147
1148     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1149         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1150                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1151                        s->filter.simple, 0);
1152 }
1153
/* Per-subpel-position lookup used by the motion compensation code.
 * The second index is the fractional motion vector position (0-7).
 * For every position, row 1 equals row 0 plus row 2. */
static const uint8_t subpel_idx[3][8] = {
    /* nr. of left extra pixels, also function pointer index */
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* nr. of extra pixels required */
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* nr. of right extra pixels */
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1160
1161 /**
1162  * luma MC function
1163  *
1164  * @param s VP8 decoding context
1165  * @param dst target buffer for block data at block position
1166  * @param ref reference picture buffer at origin (0, 0)
1167  * @param mv motion vector (relative to block position) to get pixel data from
1168  * @param x_off horizontal position of block from origin (0, 0)
1169  * @param y_off vertical position of block from origin (0, 0)
1170  * @param block_w width of block (16, 8 or 4)
1171  * @param block_h height of block (always same as block_w)
1172  * @param width width of src/dst plane data
1173  * @param height height of src/dst plane data
1174  * @param linesize size of a single line of plane data, including padding
1175  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1176  */
1177 static av_always_inline
1178 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1179                  ThreadFrame *ref, const VP56mv *mv,
1180                  int x_off, int y_off, int block_w, int block_h,
1181                  int width, int height, ptrdiff_t linesize,
1182                  vp8_mc_func mc_func[3][3])
1183 {
1184     uint8_t *src = ref->f->data[0];
1185
1186     if (AV_RN32A(mv)) {
1187
1188         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1189         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1190
1191         x_off += mv->x >> 2;
1192         y_off += mv->y >> 2;
1193
1194         // edge emulation
1195         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1196         src += y_off * linesize + x_off;
1197         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1198             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1199             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1200                                      src - my_idx * linesize - mx_idx,
1201                                      linesize, linesize,
1202                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1203                                      x_off - mx_idx, y_off - my_idx, width, height);
1204             src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1205         }
1206         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1207     } else {
1208         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1209         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1210     }
1211 }
1212
1213 /**
1214  * chroma MC function
1215  *
1216  * @param s VP8 decoding context
1217  * @param dst1 target buffer for block data at block position (U plane)
1218  * @param dst2 target buffer for block data at block position (V plane)
1219  * @param ref reference picture buffer at origin (0, 0)
1220  * @param mv motion vector (relative to block position) to get pixel data from
1221  * @param x_off horizontal position of block from origin (0, 0)
1222  * @param y_off vertical position of block from origin (0, 0)
1223  * @param block_w width of block (16, 8 or 4)
1224  * @param block_h height of block (always same as block_w)
1225  * @param width width of src/dst plane data
1226  * @param height height of src/dst plane data
1227  * @param linesize size of a single line of plane data, including padding
1228  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1229  */
1230 static av_always_inline
1231 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1232                    ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
1233                    int block_w, int block_h, int width, int height, ptrdiff_t linesize,
1234                    vp8_mc_func mc_func[3][3])
1235 {
1236     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1237
1238     if (AV_RN32A(mv)) {
1239         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1240         int my = mv->y&7, my_idx = subpel_idx[0][my];
1241
1242         x_off += mv->x >> 3;
1243         y_off += mv->y >> 3;
1244
1245         // edge emulation
1246         src1 += y_off * linesize + x_off;
1247         src2 += y_off * linesize + x_off;
1248         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1249         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1250             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1251             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1252                                      src1 - my_idx * linesize - mx_idx,
1253                                      linesize, linesize,
1254                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1255                                      x_off - mx_idx, y_off - my_idx, width, height);
1256             src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1257             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1258
1259             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1260                                      src2 - my_idx * linesize - mx_idx,
1261                                      linesize, linesize,
1262                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1263                                      x_off - mx_idx, y_off - my_idx, width, height);
1264             src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1265             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1266         } else {
1267             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1268             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1269         }
1270     } else {
1271         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1272         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1273         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1274     }
1275 }
1276
1277 static av_always_inline
1278 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1279                  ThreadFrame *ref_frame, int x_off, int y_off,
1280                  int bx_off, int by_off,
1281                  int block_w, int block_h,
1282                  int width, int height, VP56mv *mv)
1283 {
1284     VP56mv uvmv = *mv;
1285
1286     /* Y */
1287     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1288                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1289                 block_w, block_h, width, height, s->linesize,
1290                 s->put_pixels_tab[block_w == 8]);
1291
1292     /* U/V */
1293     if (s->profile == 3) {
1294         uvmv.x &= ~7;
1295         uvmv.y &= ~7;
1296     }
1297     x_off   >>= 1; y_off   >>= 1;
1298     bx_off  >>= 1; by_off  >>= 1;
1299     width   >>= 1; height  >>= 1;
1300     block_w >>= 1; block_h >>= 1;
1301     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1302                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1303                   &uvmv, x_off + bx_off, y_off + by_off,
1304                   block_w, block_h, width, height, s->uvlinesize,
1305                   s->put_pixels_tab[1 + (block_w == 4)]);
1306 }
1307
1308 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1309  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1310 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1311 {
1312     /* Don't prefetch refs that haven't been used very often this frame. */
1313     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1314         int x_off = mb_x << 4, y_off = mb_y << 4;
1315         int mx = (mb->mv.x>>2) + x_off + 8;
1316         int my = (mb->mv.y>>2) + y_off;
1317         uint8_t **src= s->framep[ref]->tf.f->data;
1318         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1319         /* For threading, a ff_thread_await_progress here might be useful, but
1320          * it actually slows down the decoder. Since a bad prefetch doesn't
1321          * generate bad decoder output, we don't run it here. */
1322         s->vdsp.prefetch(src[0]+off, s->linesize, 4);
1323         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1324         s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
1325     }
1326 }
1327
1328 /**
1329  * Apply motion vectors to prediction buffer, chapter 18.
1330  */
1331 static av_always_inline
1332 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1333                    VP8Macroblock *mb, int mb_x, int mb_y)
1334 {
1335     int x_off = mb_x << 4, y_off = mb_y << 4;
1336     int width = 16*s->mb_width, height = 16*s->mb_height;
1337     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1338     VP56mv *bmv = mb->bmv;
1339
1340     switch (mb->partitioning) {
1341     case VP8_SPLITMVMODE_NONE:
1342         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1343                     0, 0, 16, 16, width, height, &mb->mv);
1344         break;
1345     case VP8_SPLITMVMODE_4x4: {
1346         int x, y;
1347         VP56mv uvmv;
1348
1349         /* Y */
1350         for (y = 0; y < 4; y++) {
1351             for (x = 0; x < 4; x++) {
1352                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1353                             ref, &bmv[4*y + x],
1354                             4*x + x_off, 4*y + y_off, 4, 4,
1355                             width, height, s->linesize,
1356                             s->put_pixels_tab[2]);
1357             }
1358         }
1359
1360         /* U/V */
1361         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1362         for (y = 0; y < 2; y++) {
1363             for (x = 0; x < 2; x++) {
1364                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1365                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1366                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1367                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1368                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1369                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1370                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1371                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1372                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1373                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1374                 if (s->profile == 3) {
1375                     uvmv.x &= ~7;
1376                     uvmv.y &= ~7;
1377                 }
1378                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1379                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1380                               4*x + x_off, 4*y + y_off, 4, 4,
1381                               width, height, s->uvlinesize,
1382                               s->put_pixels_tab[2]);
1383             }
1384         }
1385         break;
1386     }
1387     case VP8_SPLITMVMODE_16x8:
1388         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1389                     0, 0, 16, 8, width, height, &bmv[0]);
1390         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1391                     0, 8, 16, 8, width, height, &bmv[1]);
1392         break;
1393     case VP8_SPLITMVMODE_8x16:
1394         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1395                     0, 0, 8, 16, width, height, &bmv[0]);
1396         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1397                     8, 0, 8, 16, width, height, &bmv[1]);
1398         break;
1399     case VP8_SPLITMVMODE_8x8:
1400         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1401                     0, 0, 8, 8, width, height, &bmv[0]);
1402         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1403                     8, 0, 8, 8, width, height, &bmv[1]);
1404         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1405                     0, 8, 8, 8, width, height, &bmv[2]);
1406         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1407                     8, 8, 8, 8, width, height, &bmv[3]);
1408         break;
1409     }
1410 }
1411
1412 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1413                                      uint8_t *dst[3], VP8Macroblock *mb)
1414 {
1415     int x, y, ch;
1416
1417     if (mb->mode != MODE_I4x4) {
1418         uint8_t *y_dst = dst[0];
1419         for (y = 0; y < 4; y++) {
1420             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1421             if (nnz4) {
1422                 if (nnz4&~0x01010101) {
1423                     for (x = 0; x < 4; x++) {
1424                         if ((uint8_t)nnz4 == 1)
1425                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1426                         else if((uint8_t)nnz4 > 1)
1427                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1428                         nnz4 >>= 8;
1429                         if (!nnz4)
1430                             break;
1431                     }
1432                 } else {
1433                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1434                 }
1435             }
1436             y_dst += 4*s->linesize;
1437         }
1438     }
1439
1440     for (ch = 0; ch < 2; ch++) {
1441         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1442         if (nnz4) {
1443             uint8_t *ch_dst = dst[1+ch];
1444             if (nnz4&~0x01010101) {
1445                 for (y = 0; y < 2; y++) {
1446                     for (x = 0; x < 2; x++) {
1447                         if ((uint8_t)nnz4 == 1)
1448                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1449                         else if((uint8_t)nnz4 > 1)
1450                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1451                         nnz4 >>= 8;
1452                         if (!nnz4)
1453                             goto chroma_idct_end;
1454                     }
1455                     ch_dst += 4*s->uvlinesize;
1456                 }
1457             } else {
1458                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1459             }
1460         }
1461 chroma_idct_end: ;
1462     }
1463 }
1464
1465 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1466 {
1467     int interior_limit, filter_level;
1468
1469     if (s->segmentation.enabled) {
1470         filter_level = s->segmentation.filter_level[mb->segment];
1471         if (!s->segmentation.absolute_vals)
1472             filter_level += s->filter.level;
1473     } else
1474         filter_level = s->filter.level;
1475
1476     if (s->lf_delta.enabled) {
1477         filter_level += s->lf_delta.ref[mb->ref_frame];
1478         filter_level += s->lf_delta.mode[mb->mode];
1479     }
1480
1481     filter_level = av_clip_uintp2(filter_level, 6);
1482
1483     interior_limit = filter_level;
1484     if (s->filter.sharpness) {
1485         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1486         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1487     }
1488     interior_limit = FFMAX(interior_limit, 1);
1489
1490     f->filter_level = filter_level;
1491     f->inner_limit = interior_limit;
1492     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1493 }
1494
1495 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1496 {
1497     int mbedge_lim, bedge_lim, hev_thresh;
1498     int filter_level = f->filter_level;
1499     int inner_limit = f->inner_limit;
1500     int inner_filter = f->inner_filter;
1501     int linesize = s->linesize;
1502     int uvlinesize = s->uvlinesize;
1503     static const uint8_t hev_thresh_lut[2][64] = {
1504         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1505           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1506           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1507           3, 3, 3, 3 },
1508         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1509           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1510           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1511           2, 2, 2, 2 }
1512     };
1513
1514     if (!filter_level)
1515         return;
1516
1517      bedge_lim = 2*filter_level + inner_limit;
1518     mbedge_lim = bedge_lim + 4;
1519
1520     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1521
1522     if (mb_x) {
1523         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1524                                        mbedge_lim, inner_limit, hev_thresh);
1525         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1526                                        mbedge_lim, inner_limit, hev_thresh);
1527     }
1528
1529     if (inner_filter) {
1530         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1531                                              inner_limit, hev_thresh);
1532         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1533                                              inner_limit, hev_thresh);
1534         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1535                                              inner_limit, hev_thresh);
1536         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1537                                              uvlinesize,  bedge_lim,
1538                                              inner_limit, hev_thresh);
1539     }
1540
1541     if (mb_y) {
1542         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1543                                        mbedge_lim, inner_limit, hev_thresh);
1544         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1545                                        mbedge_lim, inner_limit, hev_thresh);
1546     }
1547
1548     if (inner_filter) {
1549         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1550                                              linesize,    bedge_lim,
1551                                              inner_limit, hev_thresh);
1552         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1553                                              linesize,    bedge_lim,
1554                                              inner_limit, hev_thresh);
1555         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1556                                              linesize,    bedge_lim,
1557                                              inner_limit, hev_thresh);
1558         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1559                                              dst[2] + 4 * uvlinesize,
1560                                              uvlinesize,  bedge_lim,
1561                                              inner_limit, hev_thresh);
1562     }
1563 }
1564
1565 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1566 {
1567     int mbedge_lim, bedge_lim;
1568     int filter_level = f->filter_level;
1569     int inner_limit = f->inner_limit;
1570     int inner_filter = f->inner_filter;
1571     int linesize = s->linesize;
1572
1573     if (!filter_level)
1574         return;
1575
1576      bedge_lim = 2*filter_level + inner_limit;
1577     mbedge_lim = bedge_lim + 4;
1578
1579     if (mb_x)
1580         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1581     if (inner_filter) {
1582         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1583         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1584         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1585     }
1586
1587     if (mb_y)
1588         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1589     if (inner_filter) {
1590         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1591         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1592         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1593     }
1594 }
1595
1596 #define MARGIN (16 << 2)
1597 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
1598                                    VP8Frame *prev_frame)
1599 {
1600     VP8Context *s = avctx->priv_data;
1601     int mb_x, mb_y;
1602
1603     s->mv_min.y = -MARGIN;
1604     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1605     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1606         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1607         int mb_xy = mb_y*s->mb_width;
1608
1609         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1610
1611         s->mv_min.x = -MARGIN;
1612         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1613         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1614             if (mb_y == 0)
1615                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1616             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1617                            prev_frame && prev_frame->seg_map ?
1618                            prev_frame->seg_map->data + mb_xy : NULL, 1);
1619             s->mv_min.x -= 64;
1620             s->mv_max.x -= 64;
1621         }
1622         s->mv_min.y -= 64;
1623         s->mv_max.y -= 64;
1624     }
1625 }
1626
#if HAVE_THREADS
/*
 * Positions are packed as (mb_y << 16) | (mb_x & 0xFFFF) so that a plain
 * integer comparison orders them row-major.
 *
 * check_thread_pos(td, otd, mb_x_check, mb_y_check)
 *   Block the calling thread (td) until the other thread (otd) has published
 *   a position >= the requested one. While waiting, td->wait_mb_pos holds
 *   the awaited position; it is reset to INT_MAX afterwards.
 *
 * update_pos(td, mb_y, mb_x)
 *   Publish td's new position and, with slice threading and more than one
 *   job, wake waiters on td->cond when a neighbour may be waiting for it.
 *   This macro reads `avctx`, `num_jobs`, `next_td` and `prev_td` from the
 *   scope it is expanded in.
 *
 * Both macros expand to a single statement WITHOUT a trailing semicolon, so
 * they can be used as `if (c) update_pos(...); else ...`. All arguments are
 * parenthesised in the expansion.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
    do {\
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);\
        if ((otd)->thread_mb_pos < tmp) {\
            pthread_mutex_lock(&(otd)->lock);\
            (td)->wait_mb_pos = tmp;\
            do {\
                if ((otd)->thread_mb_pos >= tmp)\
                    break;\
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);\
            } while (1);\
            (td)->wait_mb_pos = INT_MAX;\
            pthread_mutex_unlock(&(otd)->lock);\
        }\
    } while (0)

#define update_pos(td, mb_y, mb_x)\
    do {\
    int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);\
    int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
    int is_null          = (next_td == NULL) || (prev_td == NULL);\
    int pos_check        = (is_null) ? 1 :\
                            (next_td != (td) && pos >= next_td->wait_mb_pos) ||\
                            (prev_td != (td) && pos >= prev_td->wait_mb_pos);\
    (td)->thread_mb_pos = pos;\
    if (sliced_threading && pos_check) {\
        pthread_mutex_lock(&(td)->lock);\
        pthread_cond_broadcast(&(td)->cond);\
        pthread_mutex_unlock(&(td)->lock);\
    }\
    } while (0)
#else
/* Without thread support there is nothing to wait for or to signal. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
1663
1664 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1665                                         int jobnr, int threadnr)
1666 {
1667     VP8Context *s = avctx->priv_data;
1668     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1669     int mb_y = td->thread_mb_pos>>16;
1670     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1671     int num_jobs = s->num_jobs;
1672     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
1673     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1674     VP8Macroblock *mb;
1675     uint8_t *dst[3] = {
1676         curframe->tf.f->data[0] + 16*mb_y*s->linesize,
1677         curframe->tf.f->data[1] +  8*mb_y*s->uvlinesize,
1678         curframe->tf.f->data[2] +  8*mb_y*s->uvlinesize
1679     };
1680     if (mb_y == 0) prev_td = td;
1681     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1682     if (mb_y == s->mb_height-1) next_td = td;
1683     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1684     if (s->mb_layout == 1)
1685         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1686     else {
1687         // Make sure the previous frame has read its segmentation map,
1688         // if we re-use the same map.
1689         if (prev_frame && s->segmentation.enabled &&
1690             !s->segmentation.update_map)
1691             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
1692         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1693         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1694         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1695     }
1696
1697     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1698     // left edge of 129 for intra prediction
1699     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1700         for (i = 0; i < 3; i++)
1701             for (y = 0; y < 16>>!!i; y++)
1702                 dst[i][y*curframe->tf.f->linesize[i]-1] = 129;
1703         if (mb_y == 1) {
1704             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1705         }
1706     }
1707
1708     s->mv_min.x = -MARGIN;
1709     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1710
1711     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1712         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1713         if (prev_td != td) {
1714             if (threadnr != 0) {
1715                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1716             } else {
1717                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1718             }
1719         }
1720
1721         s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1722         s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1723
1724         if (!s->mb_layout)
1725             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1726                            prev_frame && prev_frame->seg_map ?
1727                            prev_frame->seg_map->data + mb_xy : NULL, 0);
1728
1729         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1730
1731         if (!mb->skip)
1732             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1733
1734         if (mb->mode <= MODE_I4x4)
1735             intra_predict(s, td, dst, mb, mb_x, mb_y);
1736         else
1737             inter_predict(s, td, dst, mb, mb_x, mb_y);
1738
1739         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1740
1741         if (!mb->skip) {
1742             idct_mb(s, td, dst, mb);
1743         } else {
1744             AV_ZERO64(td->left_nnz);
1745             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1746
1747             // Reset DC block predictors if they would exist if the mb had coefficients
1748             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1749                 td->left_nnz[8]     = 0;
1750                 s->top_nnz[mb_x][8] = 0;
1751             }
1752         }
1753
1754         if (s->deblock_filter)
1755             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1756
1757         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1758             if (s->filter.simple)
1759                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1760             else
1761                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1762         }
1763
1764         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1765
1766         dst[0] += 16;
1767         dst[1] += 8;
1768         dst[2] += 8;
1769         s->mv_min.x -= 64;
1770         s->mv_max.x -= 64;
1771
1772         if (mb_x == s->mb_width+1) {
1773             update_pos(td, mb_y, s->mb_width+3);
1774         } else {
1775             update_pos(td, mb_y, mb_x);
1776         }
1777     }
1778 }
1779
1780 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1781                               int jobnr, int threadnr)
1782 {
1783     VP8Context *s = avctx->priv_data;
1784     VP8ThreadData *td = &s->thread_data[threadnr];
1785     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1786     AVFrame *curframe = s->curframe->tf.f;
1787     VP8Macroblock *mb;
1788     VP8ThreadData *prev_td, *next_td;
1789     uint8_t *dst[3] = {
1790         curframe->data[0] + 16*mb_y*s->linesize,
1791         curframe->data[1] +  8*mb_y*s->uvlinesize,
1792         curframe->data[2] +  8*mb_y*s->uvlinesize
1793     };
1794
1795     if (s->mb_layout == 1)
1796         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1797     else
1798         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1799
1800     if (mb_y == 0) prev_td = td;
1801     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1802     if (mb_y == s->mb_height-1) next_td = td;
1803     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1804
1805     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1806         VP8FilterStrength *f = &td->filter_strength[mb_x];
1807         if (prev_td != td) {
1808             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1809         }
1810         if (next_td != td)
1811             if (next_td != &s->thread_data[0]) {
1812                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1813             }
1814
1815         if (num_jobs == 1) {
1816             if (s->filter.simple)
1817                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1818             else
1819                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1820         }
1821
1822         if (s->filter.simple)
1823             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1824         else
1825             filter_mb(s, dst, f, mb_x, mb_y);
1826         dst[0] += 16;
1827         dst[1] += 8;
1828         dst[2] += 8;
1829
1830         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1831     }
1832 }
1833
1834 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1835                                     int jobnr, int threadnr)
1836 {
1837     VP8Context *s = avctx->priv_data;
1838     VP8ThreadData *td = &s->thread_data[jobnr];
1839     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1840     VP8Frame *curframe = s->curframe;
1841     int mb_y, num_jobs = s->num_jobs;
1842     td->thread_nr = threadnr;
1843     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1844         if (mb_y >= s->mb_height) break;
1845         td->thread_mb_pos = mb_y<<16;
1846         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1847         if (s->deblock_filter)
1848             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
1849         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1850
1851         s->mv_min.y -= 64;
1852         s->mv_max.y -= 64;
1853
1854         if (avctx->active_thread_type == FF_THREAD_FRAME)
1855             ff_thread_report_progress(&curframe->tf, mb_y, 0);
1856     }
1857
1858     return 0;
1859 }
1860
1861 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
1862                         AVPacket *avpkt)
1863 {
1864     VP8Context *s = avctx->priv_data;
1865     int ret, i, referenced, num_jobs;
1866     enum AVDiscard skip_thresh;
1867     VP8Frame *av_uninit(curframe), *prev_frame;
1868
1869     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1870         goto err;
1871
1872     prev_frame = s->framep[VP56_FRAME_CURRENT];
1873
1874     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1875                                 || s->update_altref == VP56_FRAME_CURRENT;
1876
1877     skip_thresh = !referenced ? AVDISCARD_NONREF :
1878                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1879
1880     if (avctx->skip_frame >= skip_thresh) {
1881         s->invisible = 1;
1882         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1883         goto skip_decode;
1884     }
1885     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1886
1887     // release no longer referenced frames
1888     for (i = 0; i < 5; i++)
1889         if (s->frames[i].tf.f->data[0] &&
1890             &s->frames[i] != prev_frame &&
1891             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1892             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1893             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1894             vp8_release_frame(s, &s->frames[i]);
1895
1896     // find a free buffer
1897     for (i = 0; i < 5; i++)
1898         if (&s->frames[i] != prev_frame &&
1899             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1900             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1901             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1902             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1903             break;
1904         }
1905     if (i == 5) {
1906         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1907         abort();
1908     }
1909     if (curframe->tf.f->data[0])
1910         vp8_release_frame(s, curframe);
1911
1912     // Given that arithmetic probabilities are updated every frame, it's quite likely
1913     // that the values we have on a random interframe are complete junk if we didn't
1914     // start decode on a keyframe. So just don't display anything rather than junk.
1915     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1916                          !s->framep[VP56_FRAME_GOLDEN] ||
1917                          !s->framep[VP56_FRAME_GOLDEN2])) {
1918         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1919         ret = AVERROR_INVALIDDATA;
1920         goto err;
1921     }
1922
1923     curframe->tf.f->key_frame = s->keyframe;
1924     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1925     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
1926         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1927         goto err;
1928     }
1929
1930     // check if golden and altref are swapped
1931     if (s->update_altref != VP56_FRAME_NONE) {
1932         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1933     } else {
1934         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1935     }
1936     if (s->update_golden != VP56_FRAME_NONE) {
1937         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1938     } else {
1939         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1940     }
1941     if (s->update_last) {
1942         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1943     } else {
1944         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1945     }
1946     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1947
1948     ff_thread_finish_setup(avctx);
1949
1950     s->linesize   = curframe->tf.f->linesize[0];
1951     s->uvlinesize = curframe->tf.f->linesize[1];
1952
1953     if (!s->thread_data[0].edge_emu_buffer)
1954         for (i = 0; i < MAX_THREADS; i++)
1955             s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);
1956
1957     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1958     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1959     if (!s->mb_layout)
1960         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1961     if (!s->mb_layout && s->keyframe)
1962         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1963
1964     // top edge of 127 for intra prediction
1965     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1966         s->top_border[0][15] = s->top_border[0][23] = 127;
1967         s->top_border[0][31] = 127;
1968         memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1969     }
1970     memset(s->ref_count, 0, sizeof(s->ref_count));
1971
1972
1973     if (s->mb_layout == 1) {
1974         // Make sure the previous frame has read its segmentation map,
1975         // if we re-use the same map.
1976         if (prev_frame && s->segmentation.enabled &&
1977             !s->segmentation.update_map)
1978             ff_thread_await_progress(&prev_frame->tf, 1, 0);
1979         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1980     }
1981
1982     if (avctx->active_thread_type == FF_THREAD_FRAME)
1983         num_jobs = 1;
1984     else
1985         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1986     s->num_jobs   = num_jobs;
1987     s->curframe   = curframe;
1988     s->prev_frame = prev_frame;
1989     s->mv_min.y   = -MARGIN;
1990     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
1991     for (i = 0; i < MAX_THREADS; i++) {
1992         s->thread_data[i].thread_mb_pos = 0;
1993         s->thread_data[i].wait_mb_pos = INT_MAX;
1994     }
1995     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1996
1997     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
1998     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1999
2000 skip_decode:
2001     // if future frames don't use the updated probabilities,
2002     // reset them to the values we saved
2003     if (!s->update_probabilities)
2004         s->prob[0] = s->prob[1];
2005
2006     if (!s->invisible) {
2007         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2008             return ret;
2009         *got_frame      = 1;
2010     }
2011
2012     return avpkt->size;
2013 err:
2014     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2015     return ret;
2016 }
2017
2018 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2019 {
2020     VP8Context *s = avctx->priv_data;
2021     int i;
2022
2023     vp8_decode_flush_impl(avctx, 1);
2024     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2025         av_frame_free(&s->frames[i].tf.f);
2026
2027     return 0;
2028 }
2029
2030 static av_cold int vp8_init_frames(VP8Context *s)
2031 {
2032     int i;
2033     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2034         s->frames[i].tf.f = av_frame_alloc();
2035         if (!s->frames[i].tf.f)
2036             return AVERROR(ENOMEM);
2037     }
2038     return 0;
2039 }
2040
2041 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2042 {
2043     VP8Context *s = avctx->priv_data;
2044     int ret;
2045
2046     s->avctx = avctx;
2047     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2048     avctx->internal->allocate_progress = 1;
2049
2050     ff_videodsp_init(&s->vdsp, 8);
2051     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2052     ff_vp8dsp_init(&s->vp8dsp);
2053
2054     if ((ret = vp8_init_frames(s)) < 0) {
2055         ff_vp8_decode_free(avctx);
2056         return ret;
2057     }
2058
2059     return 0;
2060 }
2061
2062 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2063 {
2064     VP8Context *s = avctx->priv_data;
2065     int ret;
2066
2067     s->avctx = avctx;
2068
2069     if ((ret = vp8_init_frames(s)) < 0) {
2070         ff_vp8_decode_free(avctx);
2071         return ret;
2072     }
2073
2074     return 0;
2075 }
2076
/*
 * Translate a frame pointer that points into s_src->frames[] into the
 * pointer to the same slot of s->frames[]; NULL stays NULL.
 * Reads `s` and `s_src` from the scope it is expanded in. The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define REBASE(pic) \
    ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2079
2080 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2081 {
2082     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2083     int i;
2084
2085     if (s->macroblocks_base &&
2086         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2087         free_buffers(s);
2088         s->mb_width  = s_src->mb_width;
2089         s->mb_height = s_src->mb_height;
2090     }
2091
2092     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2093     s->segmentation = s_src->segmentation;
2094     s->lf_delta = s_src->lf_delta;
2095     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2096
2097     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2098         if (s_src->frames[i].tf.f->data[0]) {
2099             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2100             if (ret < 0)
2101                 return ret;
2102         }
2103     }
2104
2105     s->framep[0] = REBASE(s_src->next_framep[0]);
2106     s->framep[1] = REBASE(s_src->next_framep[1]);
2107     s->framep[2] = REBASE(s_src->next_framep[2]);
2108     s->framep[3] = REBASE(s_src->next_framep[3]);
2109
2110     return 0;
2111 }
2112
2113 AVCodec ff_vp8_decoder = {
2114     .name                  = "vp8",
2115     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2116     .type                  = AVMEDIA_TYPE_VIDEO,
2117     .id                    = AV_CODEC_ID_VP8,
2118     .priv_data_size        = sizeof(VP8Context),
2119     .init                  = ff_vp8_decode_init,
2120     .close                 = ff_vp8_decode_free,
2121     .decode                = ff_vp8_decode_frame,
2122     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2123     .flush                 = vp8_decode_flush,
2124     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2125     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2126 };