git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of FFmpeg.
  10  *
  11  * FFmpeg is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * FFmpeg is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with FFmpeg; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43 #if HAVE_THREADS
  44             pthread_cond_destroy(&s->thread_data[i].cond);
  45             pthread_mutex_destroy(&s->thread_data[i].lock);
  46 #endif
  47             av_freep(&s->thread_data[i].filter_strength);
  48             av_freep(&s->thread_data[i].edge_emu_buffer);
  49         }
  50     av_freep(&s->thread_data);
  51     av_freep(&s->macroblocks_base);
  52     av_freep(&s->intra4x4_pred_mode_top);
  53     av_freep(&s->top_nnz);
  54     av_freep(&s->top_border);
  55
  56     s->macroblocks = NULL;
  57 }
  58
  59 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  60 {
  61     int ret;
  62     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  63                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  64         return ret;
  65     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  66         ff_thread_release_buffer(s->avctx, &f->tf);
  67         return AVERROR(ENOMEM);
  68     }
  69     return 0;
  70 }
  71
  72 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  73 {
  74     av_buffer_unref(&f->seg_map);
  75     ff_thread_release_buffer(s->avctx, &f->tf);
  76 }
  77
  78 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  79 {
  80     int ret;
  81
  82     vp8_release_frame(s, dst);
  83
  84     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  85         return ret;
  86     if (src->seg_map &&
  87         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  88         vp8_release_frame(s, dst);
  89         return AVERROR(ENOMEM);
  90     }
  91
  92     return 0;
  93 }
  94
  95
  96 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  97 {
  98     VP8Context *s = avctx->priv_data;
  99     int i;
 100
 101     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 102         vp8_release_frame(s, &s->frames[i]);
 103     memset(s->framep, 0, sizeof(s->framep));
 104
 105     if (free_mem)
 106         free_buffers(s);
 107 }
 108
 109 static void vp8_decode_flush(AVCodecContext *avctx)
 110 {
 111     vp8_decode_flush_impl(avctx, 0);
 112 }
 113
 114 static int update_dimensions(VP8Context *s, int width, int height)
 115 {
 116     AVCodecContext *avctx = s->avctx;
 117     int i;
 118
 119     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 120         height != s->avctx->height) {
 121         if (av_image_check_size(width, height, 0, s->avctx))
 122             return AVERROR_INVALIDDATA;
 123
 124         vp8_decode_flush_impl(s->avctx, 1);
 125
 126         avcodec_set_dimensions(s->avctx, width, height);
 127     }
 128
 129     s->mb_width  = (s->avctx->coded_width +15) / 16;
 130     s->mb_height = (s->avctx->coded_height+15) / 16;
 131
 132     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 133     if (!s->mb_layout) { // Frame threading and one thread
 134         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 135         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 136     }
 137     else // Sliced threading
 138         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 139     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 140     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 141     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 142
 143     for (i = 0; i < MAX_THREADS; i++) {
 144         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 145 #if HAVE_THREADS
 146         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 147         pthread_cond_init(&s->thread_data[i].cond, NULL);
 148 #endif
 149     }
 150
 151     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 152         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 153         return AVERROR(ENOMEM);
 154
 155     s->macroblocks        = s->macroblocks_base + 1;
 156
 157     return 0;
 158 }
 159
 160 static void parse_segment_info(VP8Context *s)
 161 {
 162     VP56RangeCoder *c = &s->c;
 163     int i;
 164
 165     s->segmentation.update_map = vp8_rac_get(c);
 166
 167     if (vp8_rac_get(c)) { // update segment feature data
 168         s->segmentation.absolute_vals = vp8_rac_get(c);
 169
 170         for (i = 0; i < 4; i++)
 171             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 172
 173         for (i = 0; i < 4; i++)
 174             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 175     }
 176     if (s->segmentation.update_map)
 177         for (i = 0; i < 3; i++)
 178             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 179 }
 180
 181 static void update_lf_deltas(VP8Context *s)
 182 {
 183     VP56RangeCoder *c = &s->c;
 184     int i;
 185
 186     for (i = 0; i < 4; i++) {
 187         if (vp8_rac_get(c)) {
 188             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 189
 190             if (vp8_rac_get(c))
 191                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 192         }
 193     }
 194
 195     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 196         if (vp8_rac_get(c)) {
 197             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 198
 199             if (vp8_rac_get(c))
 200                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 201         }
 202     }
 203 }
 204
 205 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 206 {
 207     const uint8_t *sizes = buf;
 208     int i;
 209
 210     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 211
 212     buf      += 3*(s->num_coeff_partitions-1);
 213     buf_size -= 3*(s->num_coeff_partitions-1);
 214     if (buf_size < 0)
 215         return -1;
 216
 217     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 218         int size = AV_RL24(sizes + 3*i);
 219         if (buf_size - size < 0)
 220             return -1;
 221
 222         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 223         buf      += size;
 224         buf_size -= size;
 225     }
 226     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 227
 228     return 0;
 229 }
 230
 231 static void get_quants(VP8Context *s)
 232 {
 233     VP56RangeCoder *c = &s->c;
 234     int i, base_qi;
 235
 236     int yac_qi     = vp8_rac_get_uint(c, 7);
 237     int ydc_delta  = vp8_rac_get_sint(c, 4);
 238     int y2dc_delta = vp8_rac_get_sint(c, 4);
 239     int y2ac_delta = vp8_rac_get_sint(c, 4);
 240     int uvdc_delta = vp8_rac_get_sint(c, 4);
 241     int uvac_delta = vp8_rac_get_sint(c, 4);
 242
 243     for (i = 0; i < 4; i++) {
 244         if (s->segmentation.enabled) {
 245             base_qi = s->segmentation.base_quant[i];
 246             if (!s->segmentation.absolute_vals)
 247                 base_qi += yac_qi;
 248         } else
 249             base_qi = yac_qi;
 250
 251         s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 252         s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 253         s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 254         /* 101581>>16 is equivalent to 155/100 */
 255         s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
 256         s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 257         s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 258
 259         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 260         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 261     }
 262 }
 263
 264 /**
 265  * Determine which buffers golden and altref should be updated with after this frame.
 266  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 267  *
 268  * Intra frames update all 3 references
 269  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 270  * If the update (golden|altref) flag is set, it's updated with the current frame
 271  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 272  * If the flag is not set, the number read means:
 273  *      0: no update
 274  *      1: VP56_FRAME_PREVIOUS
 275  *      2: update golden with altref, or update altref with golden
 276  */
 277 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 278 {
 279     VP56RangeCoder *c = &s->c;
 280
 281     if (update)
 282         return VP56_FRAME_CURRENT;
 283
 284     switch (vp8_rac_get_uint(c, 2)) {
 285     case 1:
 286         return VP56_FRAME_PREVIOUS;
 287     case 2:
 288         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 289     }
 290     return VP56_FRAME_NONE;
 291 }
 292
 293 static void update_refs(VP8Context *s)
 294 {
 295     VP56RangeCoder *c = &s->c;
 296
 297     int update_golden = vp8_rac_get(c);
 298     int update_altref = vp8_rac_get(c);
 299
 300     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 301     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 302 }
 303
 304 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 305 {
 306     VP56RangeCoder *c = &s->c;
 307     int header_size, hscale, vscale, i, j, k, l, m, ret;
 308     int width  = s->avctx->width;
 309     int height = s->avctx->height;
 310
 311     s->keyframe  = !(buf[0] & 1);
 312     s->profile   =  (buf[0]>>1) & 7;
 313     s->invisible = !(buf[0] & 0x10);
 314     header_size  = AV_RL24(buf) >> 5;
 315     buf      += 3;
 316     buf_size -= 3;
 317
 318     if (s->profile > 3)
 319         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 320
 321     if (!s->profile)
 322         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 323     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 324         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 325
 326     if (header_size > buf_size - 7*s->keyframe) {
 327         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 328         return AVERROR_INVALIDDATA;
 329     }
 330
 331     if (s->keyframe) {
 332         if (AV_RL24(buf) != 0x2a019d) {
 333             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 334             return AVERROR_INVALIDDATA;
 335         }
 336         width  = AV_RL16(buf+3) & 0x3fff;
 337         height = AV_RL16(buf+5) & 0x3fff;
 338         hscale = buf[4] >> 6;
 339         vscale = buf[6] >> 6;
 340         buf      += 7;
 341         buf_size -= 7;
 342
 343         if (hscale || vscale)
 344             avpriv_request_sample(s->avctx, "Upscaling");
 345
 346         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 347         for (i = 0; i < 4; i++)
 348             for (j = 0; j < 16; j++)
 349                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 350                        sizeof(s->prob->token[i][j]));
 351         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 352         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 353         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 354         memset(&s->segmentation, 0, sizeof(s->segmentation));
 355         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 356     }
 357
 358     ff_vp56_init_range_decoder(c, buf, header_size);
 359     buf      += header_size;
 360     buf_size -= header_size;
 361
 362     if (s->keyframe) {
 363         if (vp8_rac_get(c))
 364             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 365         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 366     }
 367
 368     if ((s->segmentation.enabled = vp8_rac_get(c)))
 369         parse_segment_info(s);
 370     else
 371         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 372
 373     s->filter.simple    = vp8_rac_get(c);
 374     s->filter.level     = vp8_rac_get_uint(c, 6);
 375     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 376
 377     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 378         if (vp8_rac_get(c))
 379             update_lf_deltas(s);
 380
 381     if (setup_partitions(s, buf, buf_size)) {
 382         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 383         return AVERROR_INVALIDDATA;
 384     }
 385
 386     if (!s->macroblocks_base || /* first frame */
 387         width != s->avctx->width || height != s->avctx->height || (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) {
 388         if ((ret = update_dimensions(s, width, height)) < 0)
 389             return ret;
 390     }
 391
 392     get_quants(s);
 393
 394     if (!s->keyframe) {
 395         update_refs(s);
 396         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 397         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 398     }
 399
 400     // if we aren't saving this frame's probabilities for future frames,
 401     // make a copy of the current probabilities
 402     if (!(s->update_probabilities = vp8_rac_get(c)))
 403         s->prob[1] = s->prob[0];
 404
 405     s->update_last = s->keyframe || vp8_rac_get(c);
 406
 407     for (i = 0; i < 4; i++)
 408         for (j = 0; j < 8; j++)
 409             for (k = 0; k < 3; k++)
 410                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 411                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 412                         int prob = vp8_rac_get_uint(c, 8);
 413                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 414                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 415                     }
 416
 417     if ((s->mbskip_enabled = vp8_rac_get(c)))
 418         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 419
 420     if (!s->keyframe) {
 421         s->prob->intra  = vp8_rac_get_uint(c, 8);
 422         s->prob->last   = vp8_rac_get_uint(c, 8);
 423         s->prob->golden = vp8_rac_get_uint(c, 8);
 424
 425         if (vp8_rac_get(c))
 426             for (i = 0; i < 4; i++)
 427                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 428         if (vp8_rac_get(c))
 429             for (i = 0; i < 3; i++)
 430                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 431
 432         // 17.2 MV probability update
 433         for (i = 0; i < 2; i++)
 434             for (j = 0; j < 19; j++)
 435                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 436                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 437     }
 438
 439     return 0;
 440 }
 441
 442 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 443 {
 444     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 445     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 446 }
 447
 448 /**
 449  * Motion vector coding, 17.1.
 450  */
 451 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 452 {
 453     int bit, x = 0;
 454
 455     if (vp56_rac_get_prob_branchy(c, p[0])) {
 456         int i;
 457
 458         for (i = 0; i < 3; i++)
 459             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 460         for (i = 9; i > 3; i--)
 461             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 462         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 463             x += 8;
 464     } else {
 465         // small_mvtree
 466         const uint8_t *ps = p+2;
 467         bit = vp56_rac_get_prob(c, *ps);
 468         ps += 1 + 3*bit;
 469         x  += 4*bit;
 470         bit = vp56_rac_get_prob(c, *ps);
 471         ps += 1 + bit;
 472         x  += 2*bit;
 473         x  += vp56_rac_get_prob(c, *ps);
 474     }
 475
 476     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 477 }
 478
 479 static av_always_inline
 480 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 481 {
 482     if (left == top)
 483         return vp8_submv_prob[4-!!left];
 484     if (!top)
 485         return vp8_submv_prob[2];
 486     return vp8_submv_prob[1-!!left];
 487 }
 488
 489 /**
 490  * Split motion vector prediction, 16.4.
 491  * @returns the number of motion vectors parsed (2, 4 or 16)
 492  */
 493 static av_always_inline
 494 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 495 {
 496     int part_idx;
 497     int n, num;
 498     VP8Macroblock *top_mb;
 499     VP8Macroblock *left_mb = &mb[-1];
 500     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 501                   *mbsplits_top,
 502                   *mbsplits_cur, *firstidx;
 503     VP56mv *top_mv;
 504     VP56mv *left_mv = left_mb->bmv;
 505     VP56mv *cur_mv  = mb->bmv;
 506
 507     if (!layout) // layout is inlined, s->mb_layout is not
 508         top_mb = &mb[2];
 509     else
 510         top_mb = &mb[-s->mb_width-1];
 511     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 512     top_mv = top_mb->bmv;
 513
 514     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 515         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 516             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 517         } else {
 518             part_idx = VP8_SPLITMVMODE_8x8;
 519         }
 520     } else {
 521         part_idx = VP8_SPLITMVMODE_4x4;
 522     }
 523
 524     num = vp8_mbsplit_count[part_idx];
 525     mbsplits_cur = vp8_mbsplits[part_idx],
 526     firstidx = vp8_mbfirstidx[part_idx];
 527     mb->partitioning = part_idx;
 528
 529     for (n = 0; n < num; n++) {
 530         int k = firstidx[n];
 531         uint32_t left, above;
 532         const uint8_t *submv_prob;
 533
 534         if (!(k & 3))
 535             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 536         else
 537             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 538         if (k <= 3)
 539             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 540         else
 541             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 542
 543         submv_prob = get_submv_prob(left, above);
 544
 545         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 546             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 547                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 548                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 549                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 550                 } else {
 551                     AV_ZERO32(&mb->bmv[n]);
 552                 }
 553             } else {
 554                 AV_WN32A(&mb->bmv[n], above);
 555             }
 556         } else {
 557             AV_WN32A(&mb->bmv[n], left);
 558         }
 559     }
 560
 561     return num;
 562 }
 563
 564 static av_always_inline
 565 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
 566 {
 567     VP8Macroblock *mb_edge[3] = { 0 /* top */,
 568                                   mb - 1 /* left */,
 569                                   0 /* top-left */ };
 570     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 571     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 572     int idx = CNT_ZERO;
 573     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 574     int8_t *sign_bias = s->sign_bias;
 575     VP56mv near_mv[4];
 576     uint8_t cnt[4] = { 0 };
 577     VP56RangeCoder *c = &s->c;
 578
 579     if (!layout) { // layout is inlined (s->mb_layout is not)
 580         mb_edge[0] = mb + 2;
 581         mb_edge[2] = mb + 1;
 582     }
 583     else {
 584         mb_edge[0] = mb - s->mb_width-1;
 585         mb_edge[2] = mb - s->mb_width-2;
 586     }
 587
 588     AV_ZERO32(&near_mv[0]);
 589     AV_ZERO32(&near_mv[1]);
 590     AV_ZERO32(&near_mv[2]);
 591
 592     /* Process MB on top, left and top-left */
 593     #define MV_EDGE_CHECK(n)\
 594     {\
 595         VP8Macroblock *edge = mb_edge[n];\
 596         int edge_ref = edge->ref_frame;\
 597         if (edge_ref != VP56_FRAME_CURRENT) {\
 598             uint32_t mv = AV_RN32A(&edge->mv);\
 599             if (mv) {\
 600                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 601                     /* SWAR negate of the values in mv. */\
 602                     mv = ~mv;\
 603                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 604                 }\
 605                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 606                     AV_WN32A(&near_mv[++idx], mv);\
 607                 cnt[idx]      += 1 + (n != 2);\
 608             } else\
 609                 cnt[CNT_ZERO] += 1 + (n != 2);\
 610         }\
 611     }
 612
 613     MV_EDGE_CHECK(0)
 614     MV_EDGE_CHECK(1)
 615     MV_EDGE_CHECK(2)
 616
 617     mb->partitioning = VP8_SPLITMVMODE_NONE;
 618     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 619         mb->mode = VP8_MVMODE_MV;
 620
 621         /* If we have three distinct MVs, merge first and last if they're the same */
 622         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 623             cnt[CNT_NEAREST] += 1;
 624
 625         /* Swap near and nearest if necessary */
 626         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 627             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 628             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 629         }
 630
 631         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 632             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 633
 634                 /* Choose the best mv out of 0,0 and the nearest mv */
 635                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 636                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 637                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 638                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 639
 640                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 641                     mb->mode = VP8_MVMODE_SPLIT;
 642                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
 643                 } else {
 644                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 645                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 646                     mb->bmv[0] = mb->mv;
 647                 }
 648             } else {
 649                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 650                 mb->bmv[0] = mb->mv;
 651             }
 652         } else {
 653             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 654             mb->bmv[0] = mb->mv;
 655         }
 656     } else {
 657         mb->mode = VP8_MVMODE_ZERO;
 658         AV_ZERO32(&mb->mv);
 659         mb->bmv[0] = mb->mv;
 660     }
 661 }
 662
 663 static av_always_inline
 664 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 665                            int mb_x, int keyframe, int layout)
 666 {
 667     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 668
 669     if (layout == 1) {
 670         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 671         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 672     }
 673     if (keyframe) {
 674         int x, y;
 675         uint8_t* top;
 676         uint8_t* const left = s->intra4x4_pred_mode_left;
 677         if (layout == 1)
 678             top = mb->intra4x4_pred_mode_top;
 679         else
 680             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 681         for (y = 0; y < 4; y++) {
 682             for (x = 0; x < 4; x++) {
 683                 const uint8_t *ctx;
 684                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 685                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 686                 left[y] = top[x] = *intra4x4;
 687                 intra4x4++;
 688             }
 689         }
 690     } else {
 691         int i;
 692         for (i = 0; i < 16; i++)
 693             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 694     }
 695 }
 696
 697 static av_always_inline
 698 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 699                     uint8_t *segment, uint8_t *ref, int layout)
 700 {
 701     VP56RangeCoder *c = &s->c;
 702
 703     if (s->segmentation.update_map) {
 704         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
 705         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
 706     } else if (s->segmentation.enabled)
 707         *segment = ref ? *ref : *segment;
 708     mb->segment = *segment;
 709
 710     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 711
 712     if (s->keyframe) {
 713         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 714
 715         if (mb->mode == MODE_I4x4) {
 716             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 717         } else {
 718             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 719             if (s->mb_layout == 1)
 720                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 721             else
 722                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 723             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 724         }
 725
 726         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 727         mb->ref_frame = VP56_FRAME_CURRENT;
 728     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 729         // inter MB, 16.2
 730         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 731             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 732                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 733         else
 734             mb->ref_frame = VP56_FRAME_PREVIOUS;
 735         s->ref_count[mb->ref_frame-1]++;
 736
 737         // motion vectors, 16.3
 738         decode_mvs(s, mb, mb_x, mb_y, layout);
 739     } else {
 740         // intra MB, 16.1
 741         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 742
 743         if (mb->mode == MODE_I4x4)
 744             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 745
 746         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 747         mb->ref_frame = VP56_FRAME_CURRENT;
 748         mb->partitioning = VP8_SPLITMVMODE_NONE;
 749         AV_ZERO32(&mb->bmv[0]);
 750     }
 751 }
 752
 753 #ifndef decode_block_coeffs_internal
 754 /**
 755  * @param r arithmetic bitstream reader context
 756  * @param block destination for block coefficients
 757  * @param probs probabilities to use when reading trees from the bitstream
 758  * @param i initial coeff index, 0 unless a separate DC block is coded
 759  * @param qmul array holding the dc/ac dequant factor at position 0/1
 760  * @return 0 if no coeffs were decoded
 761  *         otherwise, the index of the last coeff decoded plus one
 762  */
 763 static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
 764                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 765                                         int i, uint8_t *token_prob, int16_t qmul[2])
 766 {
 767     VP56RangeCoder c = *r;
 768     goto skip_eob;
 769     do {
 770         int coeff;
 771         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
 772             break;
 773
 774 skip_eob:
 775         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
 776             if (++i == 16)
 777                 break; // invalid input; blocks should end with EOB
 778             token_prob = probs[i][0];
 779             goto skip_eob;
 780         }
 781
 782         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
 783             coeff = 1;
 784             token_prob = probs[i+1][1];
 785         } else {
 786             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
 787                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
 788                 if (coeff)
 789                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
 790                 coeff += 2;
 791             } else {
 792                 // DCT_CAT*
 793                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
 794                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
 795                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
 796                     } else {                                    // DCT_CAT2
 797                         coeff  = 7;
 798                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
 799                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
 800                     }
 801                 } else {    // DCT_CAT3 and up
 802                     int a = vp56_rac_get_prob(&c, token_prob[8]);
 803                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
 804                     int cat = (a<<1) + b;
 805                     coeff  = 3 + (8<<cat);
 806                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
 807                 }
 808             }
 809             token_prob = probs[i+1][2];
 810         }
 811         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
 812     } while (++i < 16);
 813
 814     *r = c;
 815     return i;
 816 }
 817 #endif
 818
 819 /**
 820  * @param c arithmetic bitstream reader context
 821  * @param block destination for block coefficients
 822  * @param probs probabilities to use when reading trees from the bitstream
 823  * @param i initial coeff index, 0 unless a separate DC block is coded
 824  * @param zero_nhood the initial prediction context for number of surrounding
 825  *                   all-zero blocks (only left/top, so 0-2)
 826  * @param qmul array holding the dc/ac dequant factor at position 0/1
 827  * @return 0 if no coeffs were decoded
 828  *         otherwise, the index of the last coeff decoded plus one
 829  */
 830 static av_always_inline
 831 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
 832                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 833                         int i, int zero_nhood, int16_t qmul[2])
 834 {
 835     uint8_t *token_prob = probs[i][zero_nhood];
 836     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 837         return 0;
 838     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 839 }
 840
 841 static av_always_inline
 842 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 843                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 844 {
 845     int i, x, y, luma_start = 0, luma_ctx = 3;
 846     int nnz_pred, nnz, nnz_total = 0;
 847     int segment = mb->segment;
 848     int block_dc = 0;
 849
 850     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 851         nnz_pred = t_nnz[8] + l_nnz[8];
 852
 853         // decode DC values and do hadamard
 854         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 855                                   s->qmat[segment].luma_dc_qmul);
 856         l_nnz[8] = t_nnz[8] = !!nnz;
 857         if (nnz) {
 858             nnz_total += nnz;
 859             block_dc = 1;
 860             if (nnz == 1)
 861                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 862             else
 863                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 864         }
 865         luma_start = 1;
 866         luma_ctx = 0;
 867     }
 868
 869     // luma blocks
 870     for (y = 0; y < 4; y++)
 871         for (x = 0; x < 4; x++) {
 872             nnz_pred = l_nnz[y] + t_nnz[x];
 873             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 874                                       nnz_pred, s->qmat[segment].luma_qmul);
 875             // nnz+block_dc may be one more than the actual last index, but we don't care
 876             td->non_zero_count_cache[y][x] = nnz + block_dc;
 877             t_nnz[x] = l_nnz[y] = !!nnz;
 878             nnz_total += nnz;
 879         }
 880
 881     // chroma blocks
 882     // TODO: what to do about dimensions? 2nd dim for luma is x,
 883     // but for chroma it's (y<<1)|x
 884     for (i = 4; i < 6; i++)
 885         for (y = 0; y < 2; y++)
 886             for (x = 0; x < 2; x++) {
 887                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 888                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 889                                           nnz_pred, s->qmat[segment].chroma_qmul);
 890                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 891                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 892                 nnz_total += nnz;
 893             }
 894
 895     // if there were no coded coeffs despite the macroblock not being marked skip,
 896     // we MUST not do the inner loop filter and should not do IDCT
 897     // Since skip isn't used for bitstream prediction, just manually set it.
 898     if (!nnz_total)
 899         mb->skip = 1;
 900 }
 901
 902 static av_always_inline
 903 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 904                       int linesize, int uvlinesize, int simple)
 905 {
 906     AV_COPY128(top_border, src_y + 15*linesize);
 907     if (!simple) {
 908         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 909         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 910     }
 911 }
 912
 913 static av_always_inline
 914 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 915                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 916                     int simple, int xchg)
 917 {
 918     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 919     src_y  -=   linesize;
 920     src_cb -= uvlinesize;
 921     src_cr -= uvlinesize;
 922
 923 #define XCHG(a,b,xchg) do {                     \
 924         if (xchg) AV_SWAP64(b,a);               \
 925         else      AV_COPY64(b,a);               \
 926     } while (0)
 927
 928     XCHG(top_border_m1+8, src_y-8, xchg);
 929     XCHG(top_border,      src_y,   xchg);
 930     XCHG(top_border+8,    src_y+8, 1);
 931     if (mb_x < mb_width-1)
 932         XCHG(top_border+32, src_y+16, 1);
 933
 934     // only copy chroma for normal loop filter
 935     // or to initialize the top row to 127
 936     if (!simple || !mb_y) {
 937         XCHG(top_border_m1+16, src_cb-8, xchg);
 938         XCHG(top_border_m1+24, src_cr-8, xchg);
 939         XCHG(top_border+16,    src_cb, 1);
 940         XCHG(top_border+24,    src_cr, 1);
 941     }
 942 }
 943
 944 static av_always_inline
 945 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 946 {
 947     if (!mb_x) {
 948         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 949     } else {
 950         return mb_y ? mode : LEFT_DC_PRED8x8;
 951     }
 952 }
 953
 954 static av_always_inline
 955 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 956 {
 957     if (!mb_x) {
 958         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 959     } else {
 960         return mb_y ? mode : HOR_PRED8x8;
 961     }
 962 }
 963
 964 static av_always_inline
 965 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 966 {
 967     if (mode == DC_PRED8x8) {
 968         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 969     } else {
 970         return mode;
 971     }
 972 }
 973
 974 static av_always_inline
 975 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 976 {
 977     switch (mode) {
 978     case DC_PRED8x8:
 979         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 980     case VERT_PRED8x8:
 981         return !mb_y ? DC_127_PRED8x8 : mode;
 982     case HOR_PRED8x8:
 983         return !mb_x ? DC_129_PRED8x8 : mode;
 984     case PLANE_PRED8x8 /*TM*/:
 985         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 986     }
 987     return mode;
 988 }
 989
 990 static av_always_inline
 991 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 992 {
 993     if (!mb_x) {
 994         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 995     } else {
 996         return mb_y ? mode : HOR_VP8_PRED;
 997     }
 998 }
 999
1000 static av_always_inline
1001 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1002 {
1003     switch (mode) {
1004     case VERT_PRED:
1005         if (!mb_x && mb_y) {
1006             *copy_buf = 1;
1007             return mode;
1008         }
1009         /* fall-through */
1010     case DIAG_DOWN_LEFT_PRED:
1011     case VERT_LEFT_PRED:
1012         return !mb_y ? DC_127_PRED : mode;
1013     case HOR_PRED:
1014         if (!mb_y) {
1015             *copy_buf = 1;
1016             return mode;
1017         }
1018         /* fall-through */
1019     case HOR_UP_PRED:
1020         return !mb_x ? DC_129_PRED : mode;
1021     case TM_VP8_PRED:
1022         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1023     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1024     case DIAG_DOWN_RIGHT_PRED:
1025     case VERT_RIGHT_PRED:
1026     case HOR_DOWN_PRED:
1027         if (!mb_y || !mb_x)
1028             *copy_buf = 1;
1029         return mode;
1030     }
1031     return mode;
1032 }
1033
1034 static av_always_inline
1035 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1036                    VP8Macroblock *mb, int mb_x, int mb_y)
1037 {
1038     AVCodecContext *avctx = s->avctx;
1039     int x, y, mode, nnz;
1040     uint32_t tr;
1041
1042     // for the first row, we need to run xchg_mb_border to init the top edge to 127
1043     // otherwise, skip it if we aren't going to deblock
1044     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1045         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1046                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1047                        s->filter.simple, 1);
1048
1049     if (mb->mode < MODE_I4x4) {
1050         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
1051             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
1052         } else {
1053             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1054         }
1055         s->hpc.pred16x16[mode](dst[0], s->linesize);
1056     } else {
1057         uint8_t *ptr = dst[0];
1058         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1059         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1060
1061         // all blocks on the right edge of the macroblock use bottom edge
1062         // the top macroblock for their topright edge
1063         uint8_t *tr_right = ptr - s->linesize + 16;
1064
1065         // if we're on the right edge of the frame, said edge is extended
1066         // from the top macroblock
1067         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1068             mb_x == s->mb_width-1) {
1069             tr = tr_right[-1]*0x01010101u;
1070             tr_right = (uint8_t *)&tr;
1071         }
1072
1073         if (mb->skip)
1074             AV_ZERO128(td->non_zero_count_cache);
1075
1076         for (y = 0; y < 4; y++) {
1077             uint8_t *topright = ptr + 4 - s->linesize;
1078             for (x = 0; x < 4; x++) {
1079                 int copy = 0, linesize = s->linesize;
1080                 uint8_t *dst = ptr+4*x;
1081                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1082
1083                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1084                     topright = tr_top;
1085                 } else if (x == 3)
1086                     topright = tr_right;
1087
1088                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1089                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1090                     if (copy) {
1091                         dst = copy_dst + 12;
1092                         linesize = 8;
1093                         if (!(mb_y + y)) {
1094                             copy_dst[3] = 127U;
1095                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1096                         } else {
1097                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1098                             if (!(mb_x + x)) {
1099                                 copy_dst[3] = 129U;
1100                             } else {
1101                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1102                             }
1103                         }
1104                         if (!(mb_x + x)) {
1105                             copy_dst[11] =
1106                             copy_dst[19] =
1107                             copy_dst[27] =
1108                             copy_dst[35] = 129U;
1109                         } else {
1110                             copy_dst[11] = ptr[4*x              -1];
1111                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1112                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1113                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1114                         }
1115                     }
1116                 } else {
1117                     mode = intra4x4[x];
1118                 }
1119                 s->hpc.pred4x4[mode](dst, topright, linesize);
1120                 if (copy) {
1121                     AV_COPY32(ptr+4*x              , copy_dst+12);
1122                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1123                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1124                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1125                 }
1126
1127                 nnz = td->non_zero_count_cache[y][x];
1128                 if (nnz) {
1129                     if (nnz == 1)
1130                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
1131                     else
1132                         s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
1133                 }
1134                 topright += 4;
1135             }
1136
1137             ptr   += 4*s->linesize;
1138             intra4x4 += 4;
1139         }
1140     }
1141
1142     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1143         mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
1144     } else {
1145         mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
1146     }
1147     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1148     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1149
1150     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1151         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1152                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1153                        s->filter.simple, 0);
1154 }
1155
/* Per-subpel-position lookup, indexed by the 3-bit fractional MV component. */
static const uint8_t subpel_idx[3][8] = {
    /* number of extra pixels to the left; doubles as the MC function index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* total number of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* number of extra pixels to the right */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1162
1163 /**
1164  * luma MC function
1165  *
1166  * @param s VP8 decoding context
1167  * @param dst target buffer for block data at block position
1168  * @param ref reference picture buffer at origin (0, 0)
1169  * @param mv motion vector (relative to block position) to get pixel data from
1170  * @param x_off horizontal position of block from origin (0, 0)
1171  * @param y_off vertical position of block from origin (0, 0)
1172  * @param block_w width of block (16, 8 or 4)
1173  * @param block_h height of block (always same as block_w)
1174  * @param width width of src/dst plane data
1175  * @param height height of src/dst plane data
1176  * @param linesize size of a single line of plane data, including padding
1177  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1178  */
1179 static av_always_inline
1180 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1181                  ThreadFrame *ref, const VP56mv *mv,
1182                  int x_off, int y_off, int block_w, int block_h,
1183                  int width, int height, int linesize,
1184                  vp8_mc_func mc_func[3][3])
1185 {
1186     uint8_t *src = ref->f->data[0];
1187
1188     if (AV_RN32A(mv)) {
1189
1190         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1191         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1192
1193         x_off += mv->x >> 2;
1194         y_off += mv->y >> 2;
1195
1196         // edge emulation
1197         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1198         src += y_off * linesize + x_off;
1199         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1200             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1201             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1202                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1203                                      x_off - mx_idx, y_off - my_idx, width, height);
1204             src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1205         }
1206         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1207     } else {
1208         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1209         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1210     }
1211 }
1212
1213 /**
1214  * chroma MC function
1215  *
1216  * @param s VP8 decoding context
1217  * @param dst1 target buffer for block data at block position (U plane)
1218  * @param dst2 target buffer for block data at block position (V plane)
1219  * @param ref reference picture buffer at origin (0, 0)
1220  * @param mv motion vector (relative to block position) to get pixel data from
1221  * @param x_off horizontal position of block from origin (0, 0)
1222  * @param y_off vertical position of block from origin (0, 0)
1223  * @param block_w width of block (16, 8 or 4)
1224  * @param block_h height of block (always same as block_w)
1225  * @param width width of src/dst plane data
1226  * @param height height of src/dst plane data
1227  * @param linesize size of a single line of plane data, including padding
1228  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1229  */
1230 static av_always_inline
1231 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1232                    ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
1233                    int block_w, int block_h, int width, int height, int linesize,
1234                    vp8_mc_func mc_func[3][3])
1235 {
1236     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1237
1238     if (AV_RN32A(mv)) {
1239         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1240         int my = mv->y&7, my_idx = subpel_idx[0][my];
1241
1242         x_off += mv->x >> 3;
1243         y_off += mv->y >> 3;
1244
1245         // edge emulation
1246         src1 += y_off * linesize + x_off;
1247         src2 += y_off * linesize + x_off;
1248         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1249         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1250             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1251             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1252                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1253                                      x_off - mx_idx, y_off - my_idx, width, height);
1254             src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1255             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1256
1257             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1258                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1259                                      x_off - mx_idx, y_off - my_idx, width, height);
1260             src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1261             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1262         } else {
1263             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1264             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1265         }
1266     } else {
1267         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1268         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1269         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1270     }
1271 }
1272
1273 static av_always_inline
1274 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1275                  ThreadFrame *ref_frame, int x_off, int y_off,
1276                  int bx_off, int by_off,
1277                  int block_w, int block_h,
1278                  int width, int height, VP56mv *mv)
1279 {
1280     VP56mv uvmv = *mv;
1281
1282     /* Y */
1283     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1284                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1285                 block_w, block_h, width, height, s->linesize,
1286                 s->put_pixels_tab[block_w == 8]);
1287
1288     /* U/V */
1289     if (s->profile == 3) {
1290         uvmv.x &= ~7;
1291         uvmv.y &= ~7;
1292     }
1293     x_off   >>= 1; y_off   >>= 1;
1294     bx_off  >>= 1; by_off  >>= 1;
1295     width   >>= 1; height  >>= 1;
1296     block_w >>= 1; block_h >>= 1;
1297     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1298                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1299                   &uvmv, x_off + bx_off, y_off + by_off,
1300                   block_w, block_h, width, height, s->uvlinesize,
1301                   s->put_pixels_tab[1 + (block_w == 4)]);
1302 }
1303
1304 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1305  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1306 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1307 {
1308     /* Don't prefetch refs that haven't been used very often this frame. */
1309     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1310         int x_off = mb_x << 4, y_off = mb_y << 4;
1311         int mx = (mb->mv.x>>2) + x_off + 8;
1312         int my = (mb->mv.y>>2) + y_off;
1313         uint8_t **src= s->framep[ref]->tf.f->data;
1314         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1315         /* For threading, a ff_thread_await_progress here might be useful, but
1316          * it actually slows down the decoder. Since a bad prefetch doesn't
1317          * generate bad decoder output, we don't run it here. */
1318         s->vdsp.prefetch(src[0]+off, s->linesize, 4);
1319         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1320         s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
1321     }
1322 }
1323
1324 /**
1325  * Apply motion vectors to prediction buffer, chapter 18.
1326  */
1327 static av_always_inline
1328 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1329                    VP8Macroblock *mb, int mb_x, int mb_y)
1330 {
1331     int x_off = mb_x << 4, y_off = mb_y << 4;
1332     int width = 16*s->mb_width, height = 16*s->mb_height;
1333     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1334     VP56mv *bmv = mb->bmv;
1335
1336     switch (mb->partitioning) {
1337     case VP8_SPLITMVMODE_NONE:
1338         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1339                     0, 0, 16, 16, width, height, &mb->mv);
1340         break;
1341     case VP8_SPLITMVMODE_4x4: {
1342         int x, y;
1343         VP56mv uvmv;
1344
1345         /* Y */
1346         for (y = 0; y < 4; y++) {
1347             for (x = 0; x < 4; x++) {
1348                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1349                             ref, &bmv[4*y + x],
1350                             4*x + x_off, 4*y + y_off, 4, 4,
1351                             width, height, s->linesize,
1352                             s->put_pixels_tab[2]);
1353             }
1354         }
1355
1356         /* U/V */
1357         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1358         for (y = 0; y < 2; y++) {
1359             for (x = 0; x < 2; x++) {
1360                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1361                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1362                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1363                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1364                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1365                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1366                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1367                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1368                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1369                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1370                 if (s->profile == 3) {
1371                     uvmv.x &= ~7;
1372                     uvmv.y &= ~7;
1373                 }
1374                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1375                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1376                               4*x + x_off, 4*y + y_off, 4, 4,
1377                               width, height, s->uvlinesize,
1378                               s->put_pixels_tab[2]);
1379             }
1380         }
1381         break;
1382     }
1383     case VP8_SPLITMVMODE_16x8:
1384         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1385                     0, 0, 16, 8, width, height, &bmv[0]);
1386         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1387                     0, 8, 16, 8, width, height, &bmv[1]);
1388         break;
1389     case VP8_SPLITMVMODE_8x16:
1390         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1391                     0, 0, 8, 16, width, height, &bmv[0]);
1392         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1393                     8, 0, 8, 16, width, height, &bmv[1]);
1394         break;
1395     case VP8_SPLITMVMODE_8x8:
1396         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1397                     0, 0, 8, 8, width, height, &bmv[0]);
1398         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1399                     8, 0, 8, 8, width, height, &bmv[1]);
1400         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1401                     0, 8, 8, 8, width, height, &bmv[2]);
1402         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1403                     8, 8, 8, 8, width, height, &bmv[3]);
1404         break;
1405     }
1406 }
1407
1408 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1409                                      uint8_t *dst[3], VP8Macroblock *mb)
1410 {
1411     int x, y, ch;
1412
1413     if (mb->mode != MODE_I4x4) {
1414         uint8_t *y_dst = dst[0];
1415         for (y = 0; y < 4; y++) {
1416             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1417             if (nnz4) {
1418                 if (nnz4&~0x01010101) {
1419                     for (x = 0; x < 4; x++) {
1420                         if ((uint8_t)nnz4 == 1)
1421                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1422                         else if((uint8_t)nnz4 > 1)
1423                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1424                         nnz4 >>= 8;
1425                         if (!nnz4)
1426                             break;
1427                     }
1428                 } else {
1429                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1430                 }
1431             }
1432             y_dst += 4*s->linesize;
1433         }
1434     }
1435
1436     for (ch = 0; ch < 2; ch++) {
1437         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1438         if (nnz4) {
1439             uint8_t *ch_dst = dst[1+ch];
1440             if (nnz4&~0x01010101) {
1441                 for (y = 0; y < 2; y++) {
1442                     for (x = 0; x < 2; x++) {
1443                         if ((uint8_t)nnz4 == 1)
1444                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1445                         else if((uint8_t)nnz4 > 1)
1446                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1447                         nnz4 >>= 8;
1448                         if (!nnz4)
1449                             goto chroma_idct_end;
1450                     }
1451                     ch_dst += 4*s->uvlinesize;
1452                 }
1453             } else {
1454                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1455             }
1456         }
1457 chroma_idct_end: ;
1458     }
1459 }
1460
1461 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1462 {
1463     int interior_limit, filter_level;
1464
1465     if (s->segmentation.enabled) {
1466         filter_level = s->segmentation.filter_level[mb->segment];
1467         if (!s->segmentation.absolute_vals)
1468             filter_level += s->filter.level;
1469     } else
1470         filter_level = s->filter.level;
1471
1472     if (s->lf_delta.enabled) {
1473         filter_level += s->lf_delta.ref[mb->ref_frame];
1474         filter_level += s->lf_delta.mode[mb->mode];
1475     }
1476
1477     filter_level = av_clip_uintp2(filter_level, 6);
1478
1479     interior_limit = filter_level;
1480     if (s->filter.sharpness) {
1481         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1482         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1483     }
1484     interior_limit = FFMAX(interior_limit, 1);
1485
1486     f->filter_level = filter_level;
1487     f->inner_limit = interior_limit;
1488     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1489 }
1490
1491 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1492 {
1493     int mbedge_lim, bedge_lim, hev_thresh;
1494     int filter_level = f->filter_level;
1495     int inner_limit = f->inner_limit;
1496     int inner_filter = f->inner_filter;
1497     int linesize = s->linesize;
1498     int uvlinesize = s->uvlinesize;
1499     static const uint8_t hev_thresh_lut[2][64] = {
1500         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1501           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1502           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1503           3, 3, 3, 3 },
1504         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1505           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1506           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1507           2, 2, 2, 2 }
1508     };
1509
1510     if (!filter_level)
1511         return;
1512
1513      bedge_lim = 2*filter_level + inner_limit;
1514     mbedge_lim = bedge_lim + 4;
1515
1516     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1517
1518     if (mb_x) {
1519         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1520                                        mbedge_lim, inner_limit, hev_thresh);
1521         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1522                                        mbedge_lim, inner_limit, hev_thresh);
1523     }
1524
1525     if (inner_filter) {
1526         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1527                                              inner_limit, hev_thresh);
1528         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1529                                              inner_limit, hev_thresh);
1530         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1531                                              inner_limit, hev_thresh);
1532         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1533                                              uvlinesize,  bedge_lim,
1534                                              inner_limit, hev_thresh);
1535     }
1536
1537     if (mb_y) {
1538         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1539                                        mbedge_lim, inner_limit, hev_thresh);
1540         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1541                                        mbedge_lim, inner_limit, hev_thresh);
1542     }
1543
1544     if (inner_filter) {
1545         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1546                                              linesize,    bedge_lim,
1547                                              inner_limit, hev_thresh);
1548         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1549                                              linesize,    bedge_lim,
1550                                              inner_limit, hev_thresh);
1551         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1552                                              linesize,    bedge_lim,
1553                                              inner_limit, hev_thresh);
1554         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1555                                              dst[2] + 4 * uvlinesize,
1556                                              uvlinesize,  bedge_lim,
1557                                              inner_limit, hev_thresh);
1558     }
1559 }
1560
1561 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1562 {
1563     int mbedge_lim, bedge_lim;
1564     int filter_level = f->filter_level;
1565     int inner_limit = f->inner_limit;
1566     int inner_filter = f->inner_filter;
1567     int linesize = s->linesize;
1568
1569     if (!filter_level)
1570         return;
1571
1572      bedge_lim = 2*filter_level + inner_limit;
1573     mbedge_lim = bedge_lim + 4;
1574
1575     if (mb_x)
1576         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1577     if (inner_filter) {
1578         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1579         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1580         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1581     }
1582
1583     if (mb_y)
1584         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1585     if (inner_filter) {
1586         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1587         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1588         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1589     }
1590 }
1591
1592 #define MARGIN (16 << 2)
1593 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
1594                                    VP8Frame *prev_frame)
1595 {
1596     VP8Context *s = avctx->priv_data;
1597     int mb_x, mb_y;
1598
1599     s->mv_min.y = -MARGIN;
1600     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1601     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1602         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1603         int mb_xy = mb_y*s->mb_width;
1604
1605         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1606
1607         s->mv_min.x = -MARGIN;
1608         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1609         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1610             if (mb_y == 0)
1611                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1612             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1613                            prev_frame && prev_frame->seg_map ?
1614                            prev_frame->seg_map->data + mb_xy : NULL, 1);
1615             s->mv_min.x -= 64;
1616             s->mv_max.x -= 64;
1617         }
1618         s->mv_min.y -= 64;
1619         s->mv_max.y -= 64;
1620     }
1621 }
1622
#if HAVE_THREADS
/*
 * Wait until the thread described by otd has published a position of at
 * least (mb_x_check, mb_y_check). Positions are packed as (mb_y << 16) | mb_x.
 * While waiting, td->wait_mb_pos holds the awaited position so that
 * update_pos() in the other thread knows a wake-up is needed; it is reset to
 * INT_MAX afterwards.
 *
 * All arguments are parenthesized in the expansion and the macro expands to a
 * single statement without a trailing semicolon; callers supply the ';'.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
    do {\
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);\
        if ((otd)->thread_mb_pos < tmp) {\
            pthread_mutex_lock(&(otd)->lock);\
            (td)->wait_mb_pos = tmp;\
            do {\
                if ((otd)->thread_mb_pos >= tmp)\
                    break;\
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);\
            } while (1);\
            (td)->wait_mb_pos = INT_MAX;\
            pthread_mutex_unlock(&(otd)->lock);\
        }\
    } while (0)

/*
 * Publish (mb_x, mb_y) as the current position of td. When slice threading
 * with more than one job is active, waiters on td->cond are woken if a
 * neighbouring thread is waiting for a position <= the new one, or
 * unconditionally if either neighbour pointer is NULL.
 *
 * Relies on avctx, num_jobs, prev_td and next_td being in scope at the
 * call site.
 */
#define update_pos(td, mb_y, mb_x)\
    do {\
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);\
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
        int is_null          = (next_td == NULL) || (prev_td == NULL);\
        int pos_check        = (is_null) ? 1 :\
                                (next_td != (td) && pos >= next_td->wait_mb_pos) ||\
                                (prev_td != (td) && pos >= prev_td->wait_mb_pos);\
        (td)->thread_mb_pos = pos;\
        if (sliced_threading && pos_check) {\
            pthread_mutex_lock(&(td)->lock);\
            pthread_cond_broadcast(&(td)->cond);\
            pthread_mutex_unlock(&(td)->lock);\
        }\
    } while (0)
#else
/* Without thread support no synchronization is needed; both expand to nothing. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
1659
1660 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1661                                         int jobnr, int threadnr)
1662 {
1663     VP8Context *s = avctx->priv_data;
1664     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1665     int mb_y = td->thread_mb_pos>>16;
1666     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1667     int num_jobs = s->num_jobs;
1668     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
1669     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1670     VP8Macroblock *mb;
1671     uint8_t *dst[3] = {
1672         curframe->tf.f->data[0] + 16*mb_y*s->linesize,
1673         curframe->tf.f->data[1] +  8*mb_y*s->uvlinesize,
1674         curframe->tf.f->data[2] +  8*mb_y*s->uvlinesize
1675     };
1676     if (mb_y == 0) prev_td = td;
1677     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1678     if (mb_y == s->mb_height-1) next_td = td;
1679     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1680     if (s->mb_layout == 1)
1681         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1682     else {
1683         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1684         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1685         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1686     }
1687
1688     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1689     // left edge of 129 for intra prediction
1690     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1691         for (i = 0; i < 3; i++)
1692             for (y = 0; y < 16>>!!i; y++)
1693                 dst[i][y*curframe->tf.f->linesize[i]-1] = 129;
1694         if (mb_y == 1) {
1695             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1696         }
1697     }
1698
1699     s->mv_min.x = -MARGIN;
1700     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1701
1702     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1703         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1704         if (prev_td != td) {
1705             if (threadnr != 0) {
1706                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1707             } else {
1708                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1709             }
1710         }
1711
1712         s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1713         s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1714
1715         if (!s->mb_layout)
1716             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1717                            prev_frame && prev_frame->seg_map ?
1718                            prev_frame->seg_map->data + mb_xy : NULL, 0);
1719
1720         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1721
1722         if (!mb->skip)
1723             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1724
1725         if (mb->mode <= MODE_I4x4)
1726             intra_predict(s, td, dst, mb, mb_x, mb_y);
1727         else
1728             inter_predict(s, td, dst, mb, mb_x, mb_y);
1729
1730         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1731
1732         if (!mb->skip) {
1733             idct_mb(s, td, dst, mb);
1734         } else {
1735             AV_ZERO64(td->left_nnz);
1736             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1737
1738             // Reset DC block predictors if they would exist if the mb had coefficients
1739             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1740                 td->left_nnz[8]     = 0;
1741                 s->top_nnz[mb_x][8] = 0;
1742             }
1743         }
1744
1745         if (s->deblock_filter)
1746             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1747
1748         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1749             if (s->filter.simple)
1750                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1751             else
1752                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1753         }
1754
1755         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1756
1757         dst[0] += 16;
1758         dst[1] += 8;
1759         dst[2] += 8;
1760         s->mv_min.x -= 64;
1761         s->mv_max.x -= 64;
1762
1763         if (mb_x == s->mb_width+1) {
1764             update_pos(td, mb_y, s->mb_width+3);
1765         } else {
1766             update_pos(td, mb_y, mb_x);
1767         }
1768     }
1769 }
1770
1771 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1772                               int jobnr, int threadnr)
1773 {
1774     VP8Context *s = avctx->priv_data;
1775     VP8ThreadData *td = &s->thread_data[threadnr];
1776     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1777     AVFrame *curframe = s->curframe->tf.f;
1778     VP8Macroblock *mb;
1779     VP8ThreadData *prev_td, *next_td;
1780     uint8_t *dst[3] = {
1781         curframe->data[0] + 16*mb_y*s->linesize,
1782         curframe->data[1] +  8*mb_y*s->uvlinesize,
1783         curframe->data[2] +  8*mb_y*s->uvlinesize
1784     };
1785
1786     if (s->mb_layout == 1)
1787         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1788     else
1789         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1790
1791     if (mb_y == 0) prev_td = td;
1792     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1793     if (mb_y == s->mb_height-1) next_td = td;
1794     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1795
1796     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1797         VP8FilterStrength *f = &td->filter_strength[mb_x];
1798         if (prev_td != td) {
1799             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1800         }
1801         if (next_td != td)
1802             if (next_td != &s->thread_data[0]) {
1803                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1804             }
1805
1806         if (num_jobs == 1) {
1807             if (s->filter.simple)
1808                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1809             else
1810                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1811         }
1812
1813         if (s->filter.simple)
1814             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1815         else
1816             filter_mb(s, dst, f, mb_x, mb_y);
1817         dst[0] += 16;
1818         dst[1] += 8;
1819         dst[2] += 8;
1820
1821         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1822     }
1823 }
1824
1825 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1826                                     int jobnr, int threadnr)
1827 {
1828     VP8Context *s = avctx->priv_data;
1829     VP8ThreadData *td = &s->thread_data[jobnr];
1830     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1831     VP8Frame *curframe = s->curframe;
1832     int mb_y, num_jobs = s->num_jobs;
1833     td->thread_nr = threadnr;
1834     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1835         if (mb_y >= s->mb_height) break;
1836         td->thread_mb_pos = mb_y<<16;
1837         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1838         if (s->deblock_filter)
1839             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
1840         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1841
1842         s->mv_min.y -= 64;
1843         s->mv_max.y -= 64;
1844
1845         if (avctx->active_thread_type == FF_THREAD_FRAME)
1846             ff_thread_report_progress(&curframe->tf, mb_y, 0);
1847     }
1848
1849     return 0;
1850 }
1851
1852 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
1853                             AVPacket *avpkt)
1854 {
1855     VP8Context *s = avctx->priv_data;
1856     int ret, i, referenced, num_jobs;
1857     enum AVDiscard skip_thresh;
1858     VP8Frame *av_uninit(curframe), *prev_frame;
1859
1860     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1861         goto err;
1862
1863     prev_frame = s->framep[VP56_FRAME_CURRENT];
1864
1865     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1866                                 || s->update_altref == VP56_FRAME_CURRENT;
1867
1868     skip_thresh = !referenced ? AVDISCARD_NONREF :
1869                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1870
1871     if (avctx->skip_frame >= skip_thresh) {
1872         s->invisible = 1;
1873         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1874         goto skip_decode;
1875     }
1876     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1877
1878     // release no longer referenced frames
1879     for (i = 0; i < 5; i++)
1880         if (s->frames[i].tf.f->data[0] &&
1881             &s->frames[i] != prev_frame &&
1882             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1883             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1884             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1885             vp8_release_frame(s, &s->frames[i]);
1886
1887     // find a free buffer
1888     for (i = 0; i < 5; i++)
1889         if (&s->frames[i] != prev_frame &&
1890             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1891             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1892             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1893             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1894             break;
1895         }
1896     if (i == 5) {
1897         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1898         abort();
1899     }
1900     if (curframe->tf.f->data[0])
1901         vp8_release_frame(s, curframe);
1902
1903     // Given that arithmetic probabilities are updated every frame, it's quite likely
1904     // that the values we have on a random interframe are complete junk if we didn't
1905     // start decode on a keyframe. So just don't display anything rather than junk.
1906     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1907                          !s->framep[VP56_FRAME_GOLDEN] ||
1908                          !s->framep[VP56_FRAME_GOLDEN2])) {
1909         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1910         ret = AVERROR_INVALIDDATA;
1911         goto err;
1912     }
1913
1914     curframe->tf.f->key_frame = s->keyframe;
1915     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1916     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
1917         goto err;
1918
1919     // check if golden and altref are swapped
1920     if (s->update_altref != VP56_FRAME_NONE) {
1921         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1922     } else {
1923         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1924     }
1925     if (s->update_golden != VP56_FRAME_NONE) {
1926         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1927     } else {
1928         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1929     }
1930     if (s->update_last) {
1931         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1932     } else {
1933         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1934     }
1935     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1936
1937     ff_thread_finish_setup(avctx);
1938
1939     s->linesize   = curframe->tf.f->linesize[0];
1940     s->uvlinesize = curframe->tf.f->linesize[1];
1941
1942     if (!s->thread_data[0].edge_emu_buffer)
1943         for (i = 0; i < MAX_THREADS; i++)
1944             s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);
1945
1946     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1947     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1948     if (!s->mb_layout)
1949         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1950     if (!s->mb_layout && s->keyframe)
1951         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1952
1953     // top edge of 127 for intra prediction
1954     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1955         s->top_border[0][15] = s->top_border[0][23] = 127;
1956         s->top_border[0][31] = 127;
1957         memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1958     }
1959     memset(s->ref_count, 0, sizeof(s->ref_count));
1960
1961
1962     // Make sure the previous frame has read its segmentation map,
1963     // if we re-use the same map.
1964     if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1965         ff_thread_await_progress(&prev_frame->tf, 1, 0);
1966
1967     if (s->mb_layout == 1)
1968         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1969
1970     if (avctx->active_thread_type == FF_THREAD_FRAME)
1971         num_jobs = 1;
1972     else
1973         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1974     s->num_jobs   = num_jobs;
1975     s->curframe   = curframe;
1976     s->prev_frame = prev_frame;
1977     s->mv_min.y   = -MARGIN;
1978     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
1979     for (i = 0; i < MAX_THREADS; i++) {
1980         s->thread_data[i].thread_mb_pos = 0;
1981         s->thread_data[i].wait_mb_pos = INT_MAX;
1982     }
1983     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1984
1985     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
1986     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1987
1988 skip_decode:
1989     // if future frames don't use the updated probabilities,
1990     // reset them to the values we saved
1991     if (!s->update_probabilities)
1992         s->prob[0] = s->prob[1];
1993
1994     if (!s->invisible) {
1995         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
1996             return ret;
1997         *got_frame      = 1;
1998     }
1999
2000     return avpkt->size;
2001 err:
2002     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2003     return ret;
2004 }
2005
2006 static av_cold int vp8_decode_free(AVCodecContext *avctx)
2007 {
2008     VP8Context *s = avctx->priv_data;
2009     int i;
2010
2011     vp8_decode_flush_impl(avctx, 1);
2012     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2013         av_frame_free(&s->frames[i].tf.f);
2014
2015     return 0;
2016 }
2017
2018 static av_cold int vp8_init_frames(VP8Context *s)
2019 {
2020     int i;
2021     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2022         s->frames[i].tf.f = av_frame_alloc();
2023         if (!s->frames[i].tf.f)
2024             return AVERROR(ENOMEM);
2025     }
2026     return 0;
2027 }
2028
2029 static av_cold int vp8_decode_init(AVCodecContext *avctx)
2030 {
2031     VP8Context *s = avctx->priv_data;
2032     int ret;
2033
2034     s->avctx = avctx;
2035     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2036     avctx->internal->allocate_progress = 1;
2037
2038     ff_videodsp_init(&s->vdsp, 8);
2039     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2040     ff_vp8dsp_init(&s->vp8dsp);
2041
2042     if ((ret = vp8_init_frames(s)) < 0) {
2043         vp8_decode_free(avctx);
2044         return ret;
2045     }
2046
2047     return 0;
2048 }
2049
2050 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2051 {
2052     VP8Context *s = avctx->priv_data;
2053     int ret;
2054
2055     s->avctx = avctx;
2056
2057     if ((ret = vp8_init_frames(s)) < 0) {
2058         vp8_decode_free(avctx);
2059         return ret;
2060     }
2061
2062     return 0;
2063 }
2064
2065 #define REBASE(pic) \
2066     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2067
2068 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2069 {
2070     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2071     int i;
2072
2073     if (s->macroblocks_base &&
2074         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2075         free_buffers(s);
2076         s->mb_width  = s_src->mb_width;
2077         s->mb_height = s_src->mb_height;
2078     }
2079
2080     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2081     s->segmentation = s_src->segmentation;
2082     s->lf_delta = s_src->lf_delta;
2083     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2084
2085     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2086         if (s_src->frames[i].tf.f->data[0]) {
2087             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2088             if (ret < 0)
2089                 return ret;
2090         }
2091     }
2092
2093     s->framep[0] = REBASE(s_src->next_framep[0]);
2094     s->framep[1] = REBASE(s_src->next_framep[1]);
2095     s->framep[2] = REBASE(s_src->next_framep[2]);
2096     s->framep[3] = REBASE(s_src->next_framep[3]);
2097
2098     return 0;
2099 }
2100
/* Round size up to the next even value (unsigned wrap-around applies). */
static unsigned apply_padding(unsigned size)
{
    return (size + 1u) & ~1u;
}
2102
2103 static int webp_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
2104                              AVPacket *avpkt)
2105 {
2106     const uint8_t *buf = avpkt->data;
2107     int buf_size       = avpkt->size;
2108     AVPacket pkt       = *avpkt;
2109
2110     if (buf_size >= 16
2111         && AV_RL32(buf   ) == AV_RL32("RIFF")
2112         && AV_RL32(buf+ 8) == AV_RL32("WEBP")) {
2113         unsigned riff_size = apply_padding(AV_RL32(buf+4)) + 8;
2114         buf += 12;   // Skip over main header
2115         buf_size -= 12;
2116         if (buf_size < 8 || riff_size < 8) {
2117             av_log(avctx, AV_LOG_ERROR, "Incomplete header.\n");
2118             return AVERROR_INVALIDDATA;
2119         }
2120         if (AV_RL32(buf) == AV_RL32("VP8L")) {
2121             av_log(avctx, AV_LOG_ERROR, "Unsupported WebP lossless format.\n");
2122             return AVERROR_PATCHWELCOME;
2123         }
2124         if (AV_RL32(buf) == AV_RL32("VP8X") && AV_RL32(buf+4) < (unsigned)buf_size) {
2125             unsigned size = apply_padding(AV_RL32(buf+4) + 8);
2126             buf      += size;
2127             buf_size -= size;
2128         }
2129         if (buf_size >= 8
2130             && AV_RL32(buf) == AV_RL32("ALPH") && AV_RL32(buf+4) < (unsigned)buf_size) {
2131             unsigned size = apply_padding(AV_RL32(buf+4) + 8);
2132             buf      += size;
2133             buf_size -= size;
2134             av_log(avctx, AV_LOG_WARNING, "Skipping alpha plane\n");
2135         }
2136         if (buf_size >= 8 && AV_RL32(buf) == AV_RL32("VP8 ")) {
2137             buf      += 8;
2138             buf_size -= 8;
2139         }
2140     }
2141     pkt.data = buf;
2142     pkt.size = buf_size;
2143
2144     return vp8_decode_frame(avctx, data, data_size, &pkt);
2145 }
2146
2147 AVCodec ff_vp8_decoder = {
2148     .name                  = "vp8",
2149     .type                  = AVMEDIA_TYPE_VIDEO,
2150     .id                    = AV_CODEC_ID_VP8,
2151     .priv_data_size        = sizeof(VP8Context),
2152     .init                  = vp8_decode_init,
2153     .close                 = vp8_decode_free,
2154     .decode                = vp8_decode_frame,
2155     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2156     .flush                 = vp8_decode_flush,
2157     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2158     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2159     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2160 };
2161
2162 AVCodec ff_webp_decoder = {
2163     .name                  = "webp",
2164     .type                  = AVMEDIA_TYPE_VIDEO,
2165     .id                    = AV_CODEC_ID_WEBP,
2166     .priv_data_size        = sizeof(VP8Context),
2167     .init                  = vp8_decode_init,
2168     .close                 = vp8_decode_free,
2169     .decode                = webp_decode_frame,
2170     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2171     .flush                 = vp8_decode_flush,
2172     .long_name             = NULL_IF_CONFIG_SMALL("WebP"),
2173     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2174     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2175 };