git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of FFmpeg.
  10  *
  11  * FFmpeg is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * FFmpeg is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with FFmpeg; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43 #if HAVE_THREADS
  44             pthread_cond_destroy(&s->thread_data[i].cond);
  45             pthread_mutex_destroy(&s->thread_data[i].lock);
  46 #endif
  47             av_freep(&s->thread_data[i].filter_strength);
  48         }
  49     av_freep(&s->thread_data);
  50     av_freep(&s->macroblocks_base);
  51     av_freep(&s->intra4x4_pred_mode_top);
  52     av_freep(&s->top_nnz);
  53     av_freep(&s->top_border);
  54
  55     s->macroblocks = NULL;
  56 }
  57
  58 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  59 {
  60     int ret;
  61     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  62                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  63         return ret;
  64     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  65         ff_thread_release_buffer(s->avctx, &f->tf);
  66         return AVERROR(ENOMEM);
  67     }
  68     return 0;
  69 }
  70
  71 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  72 {
  73     av_buffer_unref(&f->seg_map);
  74     ff_thread_release_buffer(s->avctx, &f->tf);
  75 }
  76
  77 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  78 {
  79     int ret;
  80
  81     vp8_release_frame(s, dst);
  82
  83     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  84         return ret;
  85     if (src->seg_map &&
  86         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  87         vp8_release_frame(s, dst);
  88         return AVERROR(ENOMEM);
  89     }
  90
  91     return 0;
  92 }
  93
  94
  95 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  96 {
  97     VP8Context *s = avctx->priv_data;
  98     int i;
  99
 100     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 101         vp8_release_frame(s, &s->frames[i]);
 102     memset(s->framep, 0, sizeof(s->framep));
 103
 104     if (free_mem)
 105         free_buffers(s);
 106 }
 107
 108 static void vp8_decode_flush(AVCodecContext *avctx)
 109 {
 110     vp8_decode_flush_impl(avctx, 0);
 111 }
 112
 113 static int update_dimensions(VP8Context *s, int width, int height)
 114 {
 115     AVCodecContext *avctx = s->avctx;
 116     int i, ret;
 117
 118     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 119         height != s->avctx->height) {
 120         vp8_decode_flush_impl(s->avctx, 1);
 121
 122         ret = ff_set_dimensions(s->avctx, width, height);
 123         if (ret < 0)
 124             return ret;
 125     }
 126
 127     s->mb_width  = (s->avctx->coded_width +15) / 16;
 128     s->mb_height = (s->avctx->coded_height+15) / 16;
 129
 130     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 131     if (!s->mb_layout) { // Frame threading and one thread
 132         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 133         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 134     }
 135     else // Sliced threading
 136         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 137     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 138     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 139     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 140
 141     for (i = 0; i < MAX_THREADS; i++) {
 142         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 143 #if HAVE_THREADS
 144         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 145         pthread_cond_init(&s->thread_data[i].cond, NULL);
 146 #endif
 147     }
 148
 149     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 150         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 151         return AVERROR(ENOMEM);
 152
 153     s->macroblocks        = s->macroblocks_base + 1;
 154
 155     return 0;
 156 }
 157
 158 static void parse_segment_info(VP8Context *s)
 159 {
 160     VP56RangeCoder *c = &s->c;
 161     int i;
 162
 163     s->segmentation.update_map = vp8_rac_get(c);
 164
 165     if (vp8_rac_get(c)) { // update segment feature data
 166         s->segmentation.absolute_vals = vp8_rac_get(c);
 167
 168         for (i = 0; i < 4; i++)
 169             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 170
 171         for (i = 0; i < 4; i++)
 172             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 173     }
 174     if (s->segmentation.update_map)
 175         for (i = 0; i < 3; i++)
 176             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 177 }
 178
 179 static void update_lf_deltas(VP8Context *s)
 180 {
 181     VP56RangeCoder *c = &s->c;
 182     int i;
 183
 184     for (i = 0; i < 4; i++) {
 185         if (vp8_rac_get(c)) {
 186             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 187
 188             if (vp8_rac_get(c))
 189                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 190         }
 191     }
 192
 193     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 194         if (vp8_rac_get(c)) {
 195             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 196
 197             if (vp8_rac_get(c))
 198                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 199         }
 200     }
 201 }
 202
 203 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 204 {
 205     const uint8_t *sizes = buf;
 206     int i;
 207
 208     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 209
 210     buf      += 3*(s->num_coeff_partitions-1);
 211     buf_size -= 3*(s->num_coeff_partitions-1);
 212     if (buf_size < 0)
 213         return -1;
 214
 215     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 216         int size = AV_RL24(sizes + 3*i);
 217         if (buf_size - size < 0)
 218             return -1;
 219
 220         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 221         buf      += size;
 222         buf_size -= size;
 223     }
 224     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 225
 226     return 0;
 227 }
 228
 229 static void get_quants(VP8Context *s)
 230 {
 231     VP56RangeCoder *c = &s->c;
 232     int i, base_qi;
 233
 234     int yac_qi     = vp8_rac_get_uint(c, 7);
 235     int ydc_delta  = vp8_rac_get_sint(c, 4);
 236     int y2dc_delta = vp8_rac_get_sint(c, 4);
 237     int y2ac_delta = vp8_rac_get_sint(c, 4);
 238     int uvdc_delta = vp8_rac_get_sint(c, 4);
 239     int uvac_delta = vp8_rac_get_sint(c, 4);
 240
 241     for (i = 0; i < 4; i++) {
 242         if (s->segmentation.enabled) {
 243             base_qi = s->segmentation.base_quant[i];
 244             if (!s->segmentation.absolute_vals)
 245                 base_qi += yac_qi;
 246         } else
 247             base_qi = yac_qi;
 248
 249         s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 250         s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 251         s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 252         /* 101581>>16 is equivalent to 155/100 */
 253         s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
 254         s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 255         s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 256
 257         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 258         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 259     }
 260 }
 261
 262 /**
 263  * Determine which buffers golden and altref should be updated with after this frame.
 264  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 265  *
 266  * Intra frames update all 3 references
 267  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 268  * If the update (golden|altref) flag is set, it's updated with the current frame
 269  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 270  * If the flag is not set, the number read means:
 271  *      0: no update
 272  *      1: VP56_FRAME_PREVIOUS
 273  *      2: update golden with altref, or update altref with golden
 274  */
 275 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 276 {
 277     VP56RangeCoder *c = &s->c;
 278
 279     if (update)
 280         return VP56_FRAME_CURRENT;
 281
 282     switch (vp8_rac_get_uint(c, 2)) {
 283     case 1:
 284         return VP56_FRAME_PREVIOUS;
 285     case 2:
 286         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 287     }
 288     return VP56_FRAME_NONE;
 289 }
 290
 291 static void update_refs(VP8Context *s)
 292 {
 293     VP56RangeCoder *c = &s->c;
 294
 295     int update_golden = vp8_rac_get(c);
 296     int update_altref = vp8_rac_get(c);
 297
 298     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 299     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 300 }
 301
 302 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 303 {
 304     VP56RangeCoder *c = &s->c;
 305     int header_size, hscale, vscale, i, j, k, l, m, ret;
 306     int width  = s->avctx->width;
 307     int height = s->avctx->height;
 308
 309     s->keyframe  = !(buf[0] & 1);
 310     s->profile   =  (buf[0]>>1) & 7;
 311     s->invisible = !(buf[0] & 0x10);
 312     header_size  = AV_RL24(buf) >> 5;
 313     buf      += 3;
 314     buf_size -= 3;
 315
 316     if (s->profile > 3)
 317         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 318
 319     if (!s->profile)
 320         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 321     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 322         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 323
 324     if (header_size > buf_size - 7*s->keyframe) {
 325         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 326         return AVERROR_INVALIDDATA;
 327     }
 328
 329     if (s->keyframe) {
 330         if (AV_RL24(buf) != 0x2a019d) {
 331             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 332             return AVERROR_INVALIDDATA;
 333         }
 334         width  = AV_RL16(buf+3) & 0x3fff;
 335         height = AV_RL16(buf+5) & 0x3fff;
 336         hscale = buf[4] >> 6;
 337         vscale = buf[6] >> 6;
 338         buf      += 7;
 339         buf_size -= 7;
 340
 341         if (hscale || vscale)
 342             avpriv_request_sample(s->avctx, "Upscaling");
 343
 344         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 345         for (i = 0; i < 4; i++)
 346             for (j = 0; j < 16; j++)
 347                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 348                        sizeof(s->prob->token[i][j]));
 349         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 350         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 351         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 352         memset(&s->segmentation, 0, sizeof(s->segmentation));
 353         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 354     }
 355
 356     ff_vp56_init_range_decoder(c, buf, header_size);
 357     buf      += header_size;
 358     buf_size -= header_size;
 359
 360     if (s->keyframe) {
 361         if (vp8_rac_get(c))
 362             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 363         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 364     }
 365
 366     if ((s->segmentation.enabled = vp8_rac_get(c)))
 367         parse_segment_info(s);
 368     else
 369         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 370
 371     s->filter.simple    = vp8_rac_get(c);
 372     s->filter.level     = vp8_rac_get_uint(c, 6);
 373     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 374
 375     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 376         if (vp8_rac_get(c))
 377             update_lf_deltas(s);
 378
 379     if (setup_partitions(s, buf, buf_size)) {
 380         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 381         return AVERROR_INVALIDDATA;
 382     }
 383
 384     if (!s->macroblocks_base || /* first frame */
 385         width != s->avctx->width || height != s->avctx->height || (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) {
 386         if ((ret = update_dimensions(s, width, height)) < 0)
 387             return ret;
 388     }
 389
 390     get_quants(s);
 391
 392     if (!s->keyframe) {
 393         update_refs(s);
 394         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 395         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 396     }
 397
 398     // if we aren't saving this frame's probabilities for future frames,
 399     // make a copy of the current probabilities
 400     if (!(s->update_probabilities = vp8_rac_get(c)))
 401         s->prob[1] = s->prob[0];
 402
 403     s->update_last = s->keyframe || vp8_rac_get(c);
 404
 405     for (i = 0; i < 4; i++)
 406         for (j = 0; j < 8; j++)
 407             for (k = 0; k < 3; k++)
 408                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 409                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 410                         int prob = vp8_rac_get_uint(c, 8);
 411                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 412                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 413                     }
 414
 415     if ((s->mbskip_enabled = vp8_rac_get(c)))
 416         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 417
 418     if (!s->keyframe) {
 419         s->prob->intra  = vp8_rac_get_uint(c, 8);
 420         s->prob->last   = vp8_rac_get_uint(c, 8);
 421         s->prob->golden = vp8_rac_get_uint(c, 8);
 422
 423         if (vp8_rac_get(c))
 424             for (i = 0; i < 4; i++)
 425                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 426         if (vp8_rac_get(c))
 427             for (i = 0; i < 3; i++)
 428                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 429
 430         // 17.2 MV probability update
 431         for (i = 0; i < 2; i++)
 432             for (j = 0; j < 19; j++)
 433                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 434                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 435     }
 436
 437     return 0;
 438 }
 439
 440 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 441 {
 442     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 443     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 444 }
 445
 446 /**
 447  * Motion vector coding, 17.1.
 448  */
 449 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 450 {
 451     int bit, x = 0;
 452
 453     if (vp56_rac_get_prob_branchy(c, p[0])) {
 454         int i;
 455
 456         for (i = 0; i < 3; i++)
 457             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 458         for (i = 9; i > 3; i--)
 459             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 460         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 461             x += 8;
 462     } else {
 463         // small_mvtree
 464         const uint8_t *ps = p+2;
 465         bit = vp56_rac_get_prob(c, *ps);
 466         ps += 1 + 3*bit;
 467         x  += 4*bit;
 468         bit = vp56_rac_get_prob(c, *ps);
 469         ps += 1 + bit;
 470         x  += 2*bit;
 471         x  += vp56_rac_get_prob(c, *ps);
 472     }
 473
 474     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 475 }
 476
 477 static av_always_inline
 478 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 479 {
 480     if (left == top)
 481         return vp8_submv_prob[4-!!left];
 482     if (!top)
 483         return vp8_submv_prob[2];
 484     return vp8_submv_prob[1-!!left];
 485 }
 486
 487 /**
 488  * Split motion vector prediction, 16.4.
 489  * @returns the number of motion vectors parsed (2, 4 or 16)
 490  */
 491 static av_always_inline
 492 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 493 {
 494     int part_idx;
 495     int n, num;
 496     VP8Macroblock *top_mb;
 497     VP8Macroblock *left_mb = &mb[-1];
 498     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 499                   *mbsplits_top,
 500                   *mbsplits_cur, *firstidx;
 501     VP56mv *top_mv;
 502     VP56mv *left_mv = left_mb->bmv;
 503     VP56mv *cur_mv  = mb->bmv;
 504
 505     if (!layout) // layout is inlined, s->mb_layout is not
 506         top_mb = &mb[2];
 507     else
 508         top_mb = &mb[-s->mb_width-1];
 509     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 510     top_mv = top_mb->bmv;
 511
 512     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 513         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 514             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 515         } else {
 516             part_idx = VP8_SPLITMVMODE_8x8;
 517         }
 518     } else {
 519         part_idx = VP8_SPLITMVMODE_4x4;
 520     }
 521
 522     num = vp8_mbsplit_count[part_idx];
 523     mbsplits_cur = vp8_mbsplits[part_idx],
 524     firstidx = vp8_mbfirstidx[part_idx];
 525     mb->partitioning = part_idx;
 526
 527     for (n = 0; n < num; n++) {
 528         int k = firstidx[n];
 529         uint32_t left, above;
 530         const uint8_t *submv_prob;
 531
 532         if (!(k & 3))
 533             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 534         else
 535             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 536         if (k <= 3)
 537             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 538         else
 539             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 540
 541         submv_prob = get_submv_prob(left, above);
 542
 543         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 544             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 545                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 546                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 547                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 548                 } else {
 549                     AV_ZERO32(&mb->bmv[n]);
 550                 }
 551             } else {
 552                 AV_WN32A(&mb->bmv[n], above);
 553             }
 554         } else {
 555             AV_WN32A(&mb->bmv[n], left);
 556         }
 557     }
 558
 559     return num;
 560 }
 561
 562 static av_always_inline
 563 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
 564 {
 565     VP8Macroblock *mb_edge[3] = { 0 /* top */,
 566                                   mb - 1 /* left */,
 567                                   0 /* top-left */ };
 568     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 569     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 570     int idx = CNT_ZERO;
 571     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 572     int8_t *sign_bias = s->sign_bias;
 573     VP56mv near_mv[4];
 574     uint8_t cnt[4] = { 0 };
 575     VP56RangeCoder *c = &s->c;
 576
 577     if (!layout) { // layout is inlined (s->mb_layout is not)
 578         mb_edge[0] = mb + 2;
 579         mb_edge[2] = mb + 1;
 580     }
 581     else {
 582         mb_edge[0] = mb - s->mb_width-1;
 583         mb_edge[2] = mb - s->mb_width-2;
 584     }
 585
 586     AV_ZERO32(&near_mv[0]);
 587     AV_ZERO32(&near_mv[1]);
 588     AV_ZERO32(&near_mv[2]);
 589
 590     /* Process MB on top, left and top-left */
 591     #define MV_EDGE_CHECK(n)\
 592     {\
 593         VP8Macroblock *edge = mb_edge[n];\
 594         int edge_ref = edge->ref_frame;\
 595         if (edge_ref != VP56_FRAME_CURRENT) {\
 596             uint32_t mv = AV_RN32A(&edge->mv);\
 597             if (mv) {\
 598                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 599                     /* SWAR negate of the values in mv. */\
 600                     mv = ~mv;\
 601                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 602                 }\
 603                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 604                     AV_WN32A(&near_mv[++idx], mv);\
 605                 cnt[idx]      += 1 + (n != 2);\
 606             } else\
 607                 cnt[CNT_ZERO] += 1 + (n != 2);\
 608         }\
 609     }
 610
 611     MV_EDGE_CHECK(0)
 612     MV_EDGE_CHECK(1)
 613     MV_EDGE_CHECK(2)
 614
 615     mb->partitioning = VP8_SPLITMVMODE_NONE;
 616     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 617         mb->mode = VP8_MVMODE_MV;
 618
 619         /* If we have three distinct MVs, merge first and last if they're the same */
 620         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 621             cnt[CNT_NEAREST] += 1;
 622
 623         /* Swap near and nearest if necessary */
 624         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 625             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 626             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 627         }
 628
 629         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 630             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 631
 632                 /* Choose the best mv out of 0,0 and the nearest mv */
 633                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 634                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 635                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 636                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 637
 638                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 639                     mb->mode = VP8_MVMODE_SPLIT;
 640                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
 641                 } else {
 642                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 643                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 644                     mb->bmv[0] = mb->mv;
 645                 }
 646             } else {
 647                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 648                 mb->bmv[0] = mb->mv;
 649             }
 650         } else {
 651             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 652             mb->bmv[0] = mb->mv;
 653         }
 654     } else {
 655         mb->mode = VP8_MVMODE_ZERO;
 656         AV_ZERO32(&mb->mv);
 657         mb->bmv[0] = mb->mv;
 658     }
 659 }
 660
 661 static av_always_inline
 662 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 663                            int mb_x, int keyframe, int layout)
 664 {
 665     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 666
 667     if (layout == 1) {
 668         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 669         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 670     }
 671     if (keyframe) {
 672         int x, y;
 673         uint8_t* top;
 674         uint8_t* const left = s->intra4x4_pred_mode_left;
 675         if (layout == 1)
 676             top = mb->intra4x4_pred_mode_top;
 677         else
 678             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 679         for (y = 0; y < 4; y++) {
 680             for (x = 0; x < 4; x++) {
 681                 const uint8_t *ctx;
 682                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 683                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 684                 left[y] = top[x] = *intra4x4;
 685                 intra4x4++;
 686             }
 687         }
 688     } else {
 689         int i;
 690         for (i = 0; i < 16; i++)
 691             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 692     }
 693 }
 694
 695 static av_always_inline
 696 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 697                     uint8_t *segment, uint8_t *ref, int layout)
 698 {
 699     VP56RangeCoder *c = &s->c;
 700
 701     if (s->segmentation.update_map) {
 702         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
 703         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
 704     } else if (s->segmentation.enabled)
 705         *segment = ref ? *ref : *segment;
 706     mb->segment = *segment;
 707
 708     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 709
 710     if (s->keyframe) {
 711         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 712
 713         if (mb->mode == MODE_I4x4) {
 714             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 715         } else {
 716             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 717             if (s->mb_layout == 1)
 718                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 719             else
 720                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 721             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 722         }
 723
 724         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 725         mb->ref_frame = VP56_FRAME_CURRENT;
 726     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 727         // inter MB, 16.2
 728         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 729             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 730                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 731         else
 732             mb->ref_frame = VP56_FRAME_PREVIOUS;
 733         s->ref_count[mb->ref_frame-1]++;
 734
 735         // motion vectors, 16.3
 736         decode_mvs(s, mb, mb_x, mb_y, layout);
 737     } else {
 738         // intra MB, 16.1
 739         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 740
 741         if (mb->mode == MODE_I4x4)
 742             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 743
 744         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 745         mb->ref_frame = VP56_FRAME_CURRENT;
 746         mb->partitioning = VP8_SPLITMVMODE_NONE;
 747         AV_ZERO32(&mb->bmv[0]);
 748     }
 749 }
 750
 751 #ifndef decode_block_coeffs_internal
 752 /**
 753  * @param r arithmetic bitstream reader context
 754  * @param block destination for block coefficients
 755  * @param probs probabilities to use when reading trees from the bitstream
 756  * @param i initial coeff index, 0 unless a separate DC block is coded
 757  * @param qmul array holding the dc/ac dequant factor at position 0/1
 758  * @return 0 if no coeffs were decoded
 759  *         otherwise, the index of the last coeff decoded plus one
 760  */
 761 static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
 762                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 763                                         int i, uint8_t *token_prob, int16_t qmul[2])
 764 {
 765     VP56RangeCoder c = *r;
 766     goto skip_eob;
 767     do {
 768         int coeff;
 769         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
 770             break;
 771
 772 skip_eob:
 773         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
 774             if (++i == 16)
 775                 break; // invalid input; blocks should end with EOB
 776             token_prob = probs[i][0];
 777             goto skip_eob;
 778         }
 779
 780         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
 781             coeff = 1;
 782             token_prob = probs[i+1][1];
 783         } else {
 784             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
 785                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
 786                 if (coeff)
 787                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
 788                 coeff += 2;
 789             } else {
 790                 // DCT_CAT*
 791                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
 792                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
 793                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
 794                     } else {                                    // DCT_CAT2
 795                         coeff  = 7;
 796                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
 797                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
 798                     }
 799                 } else {    // DCT_CAT3 and up
 800                     int a = vp56_rac_get_prob(&c, token_prob[8]);
 801                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
 802                     int cat = (a<<1) + b;
 803                     coeff  = 3 + (8<<cat);
 804                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
 805                 }
 806             }
 807             token_prob = probs[i+1][2];
 808         }
 809         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
 810     } while (++i < 16);
 811
 812     *r = c;
 813     return i;
 814 }
 815 #endif
 816
 817 /**
 818  * @param c arithmetic bitstream reader context
 819  * @param block destination for block coefficients
 820  * @param probs probabilities to use when reading trees from the bitstream
 821  * @param i initial coeff index, 0 unless a separate DC block is coded
 822  * @param zero_nhood the initial prediction context for number of surrounding
 823  *                   all-zero blocks (only left/top, so 0-2)
 824  * @param qmul array holding the dc/ac dequant factor at position 0/1
 825  * @return 0 if no coeffs were decoded
 826  *         otherwise, the index of the last coeff decoded plus one
 827  */
 828 static av_always_inline
 829 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
 830                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 831                         int i, int zero_nhood, int16_t qmul[2])
 832 {
 833     uint8_t *token_prob = probs[i][zero_nhood];
 834     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 835         return 0;
 836     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 837 }
 838
 839 static av_always_inline
 840 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 841                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 842 {
 843     int i, x, y, luma_start = 0, luma_ctx = 3;
 844     int nnz_pred, nnz, nnz_total = 0;
 845     int segment = mb->segment;
 846     int block_dc = 0;
 847
 848     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 849         nnz_pred = t_nnz[8] + l_nnz[8];
 850
 851         // decode DC values and do hadamard
 852         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 853                                   s->qmat[segment].luma_dc_qmul);
 854         l_nnz[8] = t_nnz[8] = !!nnz;
 855         if (nnz) {
 856             nnz_total += nnz;
 857             block_dc = 1;
 858             if (nnz == 1)
 859                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 860             else
 861                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 862         }
 863         luma_start = 1;
 864         luma_ctx = 0;
 865     }
 866
 867     // luma blocks
 868     for (y = 0; y < 4; y++)
 869         for (x = 0; x < 4; x++) {
 870             nnz_pred = l_nnz[y] + t_nnz[x];
 871             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 872                                       nnz_pred, s->qmat[segment].luma_qmul);
 873             // nnz+block_dc may be one more than the actual last index, but we don't care
 874             td->non_zero_count_cache[y][x] = nnz + block_dc;
 875             t_nnz[x] = l_nnz[y] = !!nnz;
 876             nnz_total += nnz;
 877         }
 878
 879     // chroma blocks
 880     // TODO: what to do about dimensions? 2nd dim for luma is x,
 881     // but for chroma it's (y<<1)|x
 882     for (i = 4; i < 6; i++)
 883         for (y = 0; y < 2; y++)
 884             for (x = 0; x < 2; x++) {
 885                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 886                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 887                                           nnz_pred, s->qmat[segment].chroma_qmul);
 888                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 889                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 890                 nnz_total += nnz;
 891             }
 892
 893     // if there were no coded coeffs despite the macroblock not being marked skip,
 894     // we MUST not do the inner loop filter and should not do IDCT
 895     // Since skip isn't used for bitstream prediction, just manually set it.
 896     if (!nnz_total)
 897         mb->skip = 1;
 898 }
 899
 900 static av_always_inline
 901 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 902                       int linesize, int uvlinesize, int simple)
 903 {
 904     AV_COPY128(top_border, src_y + 15*linesize);
 905     if (!simple) {
 906         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 907         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 908     }
 909 }
 910
 911 static av_always_inline
 912 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 913                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 914                     int simple, int xchg)
 915 {
 916     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 917     src_y  -=   linesize;
 918     src_cb -= uvlinesize;
 919     src_cr -= uvlinesize;
 920
 921 #define XCHG(a,b,xchg) do {                     \
 922         if (xchg) AV_SWAP64(b,a);               \
 923         else      AV_COPY64(b,a);               \
 924     } while (0)
 925
 926     XCHG(top_border_m1+8, src_y-8, xchg);
 927     XCHG(top_border,      src_y,   xchg);
 928     XCHG(top_border+8,    src_y+8, 1);
 929     if (mb_x < mb_width-1)
 930         XCHG(top_border+32, src_y+16, 1);
 931
 932     // only copy chroma for normal loop filter
 933     // or to initialize the top row to 127
 934     if (!simple || !mb_y) {
 935         XCHG(top_border_m1+16, src_cb-8, xchg);
 936         XCHG(top_border_m1+24, src_cr-8, xchg);
 937         XCHG(top_border+16,    src_cb, 1);
 938         XCHG(top_border+24,    src_cr, 1);
 939     }
 940 }
 941
 942 static av_always_inline
 943 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 944 {
 945     if (!mb_x) {
 946         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 947     } else {
 948         return mb_y ? mode : LEFT_DC_PRED8x8;
 949     }
 950 }
 951
 952 static av_always_inline
 953 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 954 {
 955     if (!mb_x) {
 956         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 957     } else {
 958         return mb_y ? mode : HOR_PRED8x8;
 959     }
 960 }
 961
 962 static av_always_inline
 963 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 964 {
 965     if (mode == DC_PRED8x8) {
 966         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 967     } else {
 968         return mode;
 969     }
 970 }
 971
 972 static av_always_inline
 973 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 974 {
 975     switch (mode) {
 976     case DC_PRED8x8:
 977         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 978     case VERT_PRED8x8:
 979         return !mb_y ? DC_127_PRED8x8 : mode;
 980     case HOR_PRED8x8:
 981         return !mb_x ? DC_129_PRED8x8 : mode;
 982     case PLANE_PRED8x8 /*TM*/:
 983         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 984     }
 985     return mode;
 986 }
 987
 988 static av_always_inline
 989 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 990 {
 991     if (!mb_x) {
 992         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 993     } else {
 994         return mb_y ? mode : HOR_VP8_PRED;
 995     }
 996 }
 997
 998 static av_always_inline
 999 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1000 {
1001     switch (mode) {
1002     case VERT_PRED:
1003         if (!mb_x && mb_y) {
1004             *copy_buf = 1;
1005             return mode;
1006         }
1007         /* fall-through */
1008     case DIAG_DOWN_LEFT_PRED:
1009     case VERT_LEFT_PRED:
1010         return !mb_y ? DC_127_PRED : mode;
1011     case HOR_PRED:
1012         if (!mb_y) {
1013             *copy_buf = 1;
1014             return mode;
1015         }
1016         /* fall-through */
1017     case HOR_UP_PRED:
1018         return !mb_x ? DC_129_PRED : mode;
1019     case TM_VP8_PRED:
1020         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1021     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1022     case DIAG_DOWN_RIGHT_PRED:
1023     case VERT_RIGHT_PRED:
1024     case HOR_DOWN_PRED:
1025         if (!mb_y || !mb_x)
1026             *copy_buf = 1;
1027         return mode;
1028     }
1029     return mode;
1030 }
1031
/**
 * Intra-predict one macroblock.
 *
 * Luma is predicted either as a single 16x16 block or, for MODE_I4x4, as
 * sixteen 4x4 blocks whose residual is added immediately after each block
 * is predicted. Both chroma planes are then predicted with the 8x8
 * predictors.
 *
 * @param s    VP8 decoding context
 * @param td   per-thread data (non-zero count cache and coefficient blocks)
 * @param dst  destination pointers for the Y, U and V planes of this macroblock
 * @param mb   macroblock being predicted
 * @param mb_x horizontal macroblock position
 * @param mb_y vertical macroblock position
 */
static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        // whole-macroblock luma prediction; the mode is remapped at frame edges
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        // constant top-right edge used on the first macroblock row with EMU_EDGE
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use bottom edge
        // the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            // replicate the last pixel above the macroblock into all four bytes
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        // a skipped macroblock has no residual: clear the counts so that
        // no IDCT is run in the loop below
        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                // scratch block with a stride of 8 bytes:
                //   [3]                    top-left pixel
                //   [4..7]                 top row
                //   [11],[19],[27],[35]    left column
                //   [12],[20],[28],[36]    start of the four predicted rows
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        // predict into the scratch block, with the edges built by hand
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            // no row above: top-left and top row are 127
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            // no column to the left: left edge is 129
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    // copy the four predicted rows from the scratch block into the frame
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                // add the residual right away; a count of 1 selects the DC-only IDCT
                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr   += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    // chroma: one shared mode for both planes, remapped at frame edges
    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    // same condition as at the top; xchg_mb_border is run again, this time with xchg = 0
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
1153
/* Per-subpel-position lookup used by the MC functions; the second index is
 * the fractional motion vector position (0-7, 0 meaning full-pel). */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1160
1161 /**
1162  * luma MC function
1163  *
1164  * @param s VP8 decoding context
1165  * @param dst target buffer for block data at block position
1166  * @param ref reference picture buffer at origin (0, 0)
1167  * @param mv motion vector (relative to block position) to get pixel data from
1168  * @param x_off horizontal position of block from origin (0, 0)
1169  * @param y_off vertical position of block from origin (0, 0)
1170  * @param block_w width of block (16, 8 or 4)
1171  * @param block_h height of block (always same as block_w)
1172  * @param width width of src/dst plane data
1173  * @param height height of src/dst plane data
1174  * @param linesize size of a single line of plane data, including padding
1175  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1176  */
1177 static av_always_inline
1178 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1179                  ThreadFrame *ref, const VP56mv *mv,
1180                  int x_off, int y_off, int block_w, int block_h,
1181                  int width, int height, ptrdiff_t linesize,
1182                  vp8_mc_func mc_func[3][3])
1183 {
1184     uint8_t *src = ref->f->data[0];
1185
1186     if (AV_RN32A(mv)) {
1187         int src_linesize = linesize;
1188         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1189         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1190
1191         x_off += mv->x >> 2;
1192         y_off += mv->y >> 2;
1193
1194         // edge emulation
1195         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1196         src += y_off * linesize + x_off;
1197         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1198             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1199             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32,
1200                                      src - my_idx * linesize - mx_idx, linesize,
1201                                      block_w + subpel_idx[1][mx],
1202                                      block_h + subpel_idx[1][my],
1203                                      x_off - mx_idx, y_off - my_idx, width, height);
1204             src = td->edge_emu_buffer + mx_idx + 32 * my_idx;
1205             src_linesize = 32;
1206         }
1207         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1208     } else {
1209         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1210         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1211     }
1212 }
1213
1214 /**
1215  * chroma MC function
1216  *
1217  * @param s VP8 decoding context
1218  * @param dst1 target buffer for block data at block position (U plane)
1219  * @param dst2 target buffer for block data at block position (V plane)
1220  * @param ref reference picture buffer at origin (0, 0)
1221  * @param mv motion vector (relative to block position) to get pixel data from
1222  * @param x_off horizontal position of block from origin (0, 0)
1223  * @param y_off vertical position of block from origin (0, 0)
1224  * @param block_w width of block (16, 8 or 4)
1225  * @param block_h height of block (always same as block_w)
1226  * @param width width of src/dst plane data
1227  * @param height height of src/dst plane data
1228  * @param linesize size of a single line of plane data, including padding
1229  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1230  */
1231 static av_always_inline
1232 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1233                    ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
1234                    int block_w, int block_h, int width, int height, ptrdiff_t linesize,
1235                    vp8_mc_func mc_func[3][3])
1236 {
1237     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1238
1239     if (AV_RN32A(mv)) {
1240         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1241         int my = mv->y&7, my_idx = subpel_idx[0][my];
1242
1243         x_off += mv->x >> 3;
1244         y_off += mv->y >> 3;
1245
1246         // edge emulation
1247         src1 += y_off * linesize + x_off;
1248         src2 += y_off * linesize + x_off;
1249         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1250         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1251             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1252             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32,
1253                                      src1 - my_idx * linesize - mx_idx, linesize,
1254                                      block_w + subpel_idx[1][mx],
1255                                      block_h + subpel_idx[1][my],
1256                                      x_off - mx_idx, y_off - my_idx, width, height);
1257             src1 = td->edge_emu_buffer + mx_idx + 32 * my_idx;
1258             mc_func[my_idx][mx_idx](dst1, linesize, src1, 32, block_h, mx, my);
1259
1260             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32,
1261                                      src2 - my_idx * linesize - mx_idx, linesize,
1262                                      block_w + subpel_idx[1][mx],
1263                                      block_h + subpel_idx[1][my],
1264                                      x_off - mx_idx, y_off - my_idx, width, height);
1265             src2 = td->edge_emu_buffer + mx_idx + 32 * my_idx;
1266             mc_func[my_idx][mx_idx](dst2, linesize, src2, 32, block_h, mx, my);
1267         } else {
1268             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1269             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1270         }
1271     } else {
1272         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1273         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1274         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1275     }
1276 }
1277
1278 static av_always_inline
1279 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1280                  ThreadFrame *ref_frame, int x_off, int y_off,
1281                  int bx_off, int by_off,
1282                  int block_w, int block_h,
1283                  int width, int height, VP56mv *mv)
1284 {
1285     VP56mv uvmv = *mv;
1286
1287     /* Y */
1288     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1289                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1290                 block_w, block_h, width, height, s->linesize,
1291                 s->put_pixels_tab[block_w == 8]);
1292
1293     /* U/V */
1294     if (s->profile == 3) {
1295         uvmv.x &= ~7;
1296         uvmv.y &= ~7;
1297     }
1298     x_off   >>= 1; y_off   >>= 1;
1299     bx_off  >>= 1; by_off  >>= 1;
1300     width   >>= 1; height  >>= 1;
1301     block_w >>= 1; block_h >>= 1;
1302     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1303                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1304                   &uvmv, x_off + bx_off, y_off + by_off,
1305                   block_w, block_h, width, height, s->uvlinesize,
1306                   s->put_pixels_tab[1 + (block_w == 4)]);
1307 }
1308
1309 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1310  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1311 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1312 {
1313     /* Don't prefetch refs that haven't been used very often this frame. */
1314     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1315         int x_off = mb_x << 4, y_off = mb_y << 4;
1316         int mx = (mb->mv.x>>2) + x_off + 8;
1317         int my = (mb->mv.y>>2) + y_off;
1318         uint8_t **src= s->framep[ref]->tf.f->data;
1319         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1320         /* For threading, a ff_thread_await_progress here might be useful, but
1321          * it actually slows down the decoder. Since a bad prefetch doesn't
1322          * generate bad decoder output, we don't run it here. */
1323         s->vdsp.prefetch(src[0]+off, s->linesize, 4);
1324         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1325         s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
1326     }
1327 }
1328
1329 /**
1330  * Apply motion vectors to prediction buffer, chapter 18.
1331  */
1332 static av_always_inline
1333 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1334                    VP8Macroblock *mb, int mb_x, int mb_y)
1335 {
1336     int x_off = mb_x << 4, y_off = mb_y << 4;
1337     int width = 16*s->mb_width, height = 16*s->mb_height;
1338     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1339     VP56mv *bmv = mb->bmv;
1340
1341     switch (mb->partitioning) {
1342     case VP8_SPLITMVMODE_NONE:
1343         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1344                     0, 0, 16, 16, width, height, &mb->mv);
1345         break;
1346     case VP8_SPLITMVMODE_4x4: {
1347         int x, y;
1348         VP56mv uvmv;
1349
1350         /* Y */
1351         for (y = 0; y < 4; y++) {
1352             for (x = 0; x < 4; x++) {
1353                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1354                             ref, &bmv[4*y + x],
1355                             4*x + x_off, 4*y + y_off, 4, 4,
1356                             width, height, s->linesize,
1357                             s->put_pixels_tab[2]);
1358             }
1359         }
1360
1361         /* U/V */
1362         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1363         for (y = 0; y < 2; y++) {
1364             for (x = 0; x < 2; x++) {
1365                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1366                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1367                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1368                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1369                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1370                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1371                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1372                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1373                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1374                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1375                 if (s->profile == 3) {
1376                     uvmv.x &= ~7;
1377                     uvmv.y &= ~7;
1378                 }
1379                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1380                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1381                               4*x + x_off, 4*y + y_off, 4, 4,
1382                               width, height, s->uvlinesize,
1383                               s->put_pixels_tab[2]);
1384             }
1385         }
1386         break;
1387     }
1388     case VP8_SPLITMVMODE_16x8:
1389         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1390                     0, 0, 16, 8, width, height, &bmv[0]);
1391         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1392                     0, 8, 16, 8, width, height, &bmv[1]);
1393         break;
1394     case VP8_SPLITMVMODE_8x16:
1395         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1396                     0, 0, 8, 16, width, height, &bmv[0]);
1397         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1398                     8, 0, 8, 16, width, height, &bmv[1]);
1399         break;
1400     case VP8_SPLITMVMODE_8x8:
1401         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1402                     0, 0, 8, 8, width, height, &bmv[0]);
1403         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1404                     8, 0, 8, 8, width, height, &bmv[1]);
1405         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1406                     0, 8, 8, 8, width, height, &bmv[2]);
1407         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1408                     8, 8, 8, 8, width, height, &bmv[3]);
1409         break;
1410     }
1411 }
1412
1413 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1414                                      uint8_t *dst[3], VP8Macroblock *mb)
1415 {
1416     int x, y, ch;
1417
1418     if (mb->mode != MODE_I4x4) {
1419         uint8_t *y_dst = dst[0];
1420         for (y = 0; y < 4; y++) {
1421             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1422             if (nnz4) {
1423                 if (nnz4&~0x01010101) {
1424                     for (x = 0; x < 4; x++) {
1425                         if ((uint8_t)nnz4 == 1)
1426                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1427                         else if((uint8_t)nnz4 > 1)
1428                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1429                         nnz4 >>= 8;
1430                         if (!nnz4)
1431                             break;
1432                     }
1433                 } else {
1434                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1435                 }
1436             }
1437             y_dst += 4*s->linesize;
1438         }
1439     }
1440
1441     for (ch = 0; ch < 2; ch++) {
1442         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1443         if (nnz4) {
1444             uint8_t *ch_dst = dst[1+ch];
1445             if (nnz4&~0x01010101) {
1446                 for (y = 0; y < 2; y++) {
1447                     for (x = 0; x < 2; x++) {
1448                         if ((uint8_t)nnz4 == 1)
1449                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1450                         else if((uint8_t)nnz4 > 1)
1451                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1452                         nnz4 >>= 8;
1453                         if (!nnz4)
1454                             goto chroma_idct_end;
1455                     }
1456                     ch_dst += 4*s->uvlinesize;
1457                 }
1458             } else {
1459                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1460             }
1461         }
1462 chroma_idct_end: ;
1463     }
1464 }
1465
1466 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1467 {
1468     int interior_limit, filter_level;
1469
1470     if (s->segmentation.enabled) {
1471         filter_level = s->segmentation.filter_level[mb->segment];
1472         if (!s->segmentation.absolute_vals)
1473             filter_level += s->filter.level;
1474     } else
1475         filter_level = s->filter.level;
1476
1477     if (s->lf_delta.enabled) {
1478         filter_level += s->lf_delta.ref[mb->ref_frame];
1479         filter_level += s->lf_delta.mode[mb->mode];
1480     }
1481
1482     filter_level = av_clip_uintp2(filter_level, 6);
1483
1484     interior_limit = filter_level;
1485     if (s->filter.sharpness) {
1486         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1487         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1488     }
1489     interior_limit = FFMAX(interior_limit, 1);
1490
1491     f->filter_level = filter_level;
1492     f->inner_limit = interior_limit;
1493     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1494 }
1495
1496 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1497 {
1498     int mbedge_lim, bedge_lim, hev_thresh;
1499     int filter_level = f->filter_level;
1500     int inner_limit = f->inner_limit;
1501     int inner_filter = f->inner_filter;
1502     int linesize = s->linesize;
1503     int uvlinesize = s->uvlinesize;
1504     static const uint8_t hev_thresh_lut[2][64] = {
1505         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1506           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1507           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1508           3, 3, 3, 3 },
1509         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1510           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1511           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1512           2, 2, 2, 2 }
1513     };
1514
1515     if (!filter_level)
1516         return;
1517
1518      bedge_lim = 2*filter_level + inner_limit;
1519     mbedge_lim = bedge_lim + 4;
1520
1521     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1522
1523     if (mb_x) {
1524         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1525                                        mbedge_lim, inner_limit, hev_thresh);
1526         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1527                                        mbedge_lim, inner_limit, hev_thresh);
1528     }
1529
1530     if (inner_filter) {
1531         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1532                                              inner_limit, hev_thresh);
1533         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1534                                              inner_limit, hev_thresh);
1535         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1536                                              inner_limit, hev_thresh);
1537         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1538                                              uvlinesize,  bedge_lim,
1539                                              inner_limit, hev_thresh);
1540     }
1541
1542     if (mb_y) {
1543         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1544                                        mbedge_lim, inner_limit, hev_thresh);
1545         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1546                                        mbedge_lim, inner_limit, hev_thresh);
1547     }
1548
1549     if (inner_filter) {
1550         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1551                                              linesize,    bedge_lim,
1552                                              inner_limit, hev_thresh);
1553         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1554                                              linesize,    bedge_lim,
1555                                              inner_limit, hev_thresh);
1556         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1557                                              linesize,    bedge_lim,
1558                                              inner_limit, hev_thresh);
1559         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1560                                              dst[2] + 4 * uvlinesize,
1561                                              uvlinesize,  bedge_lim,
1562                                              inner_limit, hev_thresh);
1563     }
1564 }
1565
1566 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1567 {
1568     int mbedge_lim, bedge_lim;
1569     int filter_level = f->filter_level;
1570     int inner_limit = f->inner_limit;
1571     int inner_filter = f->inner_filter;
1572     int linesize = s->linesize;
1573
1574     if (!filter_level)
1575         return;
1576
1577      bedge_lim = 2*filter_level + inner_limit;
1578     mbedge_lim = bedge_lim + 4;
1579
1580     if (mb_x)
1581         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1582     if (inner_filter) {
1583         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1584         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1585         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1586     }
1587
1588     if (mb_y)
1589         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1590     if (inner_filter) {
1591         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1592         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1593         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1594     }
1595 }
1596
1597 #define MARGIN (16 << 2)
1598 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
1599                                    VP8Frame *prev_frame)
1600 {
1601     VP8Context *s = avctx->priv_data;
1602     int mb_x, mb_y;
1603
1604     s->mv_min.y = -MARGIN;
1605     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1606     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1607         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1608         int mb_xy = mb_y*s->mb_width;
1609
1610         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1611
1612         s->mv_min.x = -MARGIN;
1613         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1614         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1615             if (mb_y == 0)
1616                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1617             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1618                            prev_frame && prev_frame->seg_map ?
1619                            prev_frame->seg_map->data + mb_xy : NULL, 1);
1620             s->mv_min.x -= 64;
1621             s->mv_max.x -= 64;
1622         }
1623         s->mv_min.y -= 64;
1624         s->mv_max.y -= 64;
1625     }
1626 }
1627
#if HAVE_THREADS
/**
 * Block the calling thread until thread @p otd has published a position that
 * is at least (mb_y_check, mb_x_check).
 *
 * Positions are packed as (row << 16) | (column & 0xFFFF).  While waiting,
 * the target position is stored in td->wait_mb_pos so that update_pos() can
 * tell whether a wake-up is needed; it is reset to INT_MAX afterwards.
 *
 * The expansion is a single statement without a trailing semicolon, so the
 * macro can be used like a function call, including in unbraced if/else.
 *
 * NOTE(review): the first read of otd->thread_mb_pos is done without holding
 * otd->lock.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
    do {\
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);\
        if ((otd)->thread_mb_pos < tmp) {\
            pthread_mutex_lock(&(otd)->lock);\
            (td)->wait_mb_pos = tmp;\
            do {\
                if ((otd)->thread_mb_pos >= tmp)\
                    break;\
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);\
            } while (1);\
            (td)->wait_mb_pos = INT_MAX;\
            pthread_mutex_unlock(&(otd)->lock);\
        }\
    } while (0)

/**
 * Publish (mb_y, mb_x) as the current position of thread @p td and wake up
 * waiters when needed.
 *
 * Besides its arguments the macro reads avctx, num_jobs, next_td and prev_td
 * from the enclosing scope.  A broadcast on td->cond is issued only when slice
 * threading is active with more than one job and either one of next_td /
 * prev_td is NULL, or a neighbour other than td itself is waiting for a
 * position that has now been reached.
 */
#define update_pos(td, mb_y, mb_x)\
    do {\
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);\
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
        int is_null          = (next_td == NULL) || (prev_td == NULL);\
        int pos_check        = (is_null) ? 1 :\
                                (next_td != (td) && pos >= next_td->wait_mb_pos) ||\
                                (prev_td != (td) && pos >= prev_td->wait_mb_pos);\
        (td)->thread_mb_pos = pos;\
        if (sliced_threading && pos_check) {\
            pthread_mutex_lock(&(td)->lock);\
            pthread_cond_broadcast(&(td)->cond);\
            pthread_mutex_unlock(&(td)->lock);\
        }\
    } while (0)
#else
/* Without thread support both synchronisation points expand to nothing. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
1664
1665 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1666                                         int jobnr, int threadnr)
1667 {
1668     VP8Context *s = avctx->priv_data;
1669     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1670     int mb_y = td->thread_mb_pos>>16;
1671     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1672     int num_jobs = s->num_jobs;
1673     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
1674     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1675     VP8Macroblock *mb;
1676     uint8_t *dst[3] = {
1677         curframe->tf.f->data[0] + 16*mb_y*s->linesize,
1678         curframe->tf.f->data[1] +  8*mb_y*s->uvlinesize,
1679         curframe->tf.f->data[2] +  8*mb_y*s->uvlinesize
1680     };
1681     if (mb_y == 0) prev_td = td;
1682     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1683     if (mb_y == s->mb_height-1) next_td = td;
1684     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1685     if (s->mb_layout == 1)
1686         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1687     else {
1688         // Make sure the previous frame has read its segmentation map,
1689         // if we re-use the same map.
1690         if (prev_frame && s->segmentation.enabled &&
1691             !s->segmentation.update_map)
1692             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
1693         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1694         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1695         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1696     }
1697
1698     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1699     // left edge of 129 for intra prediction
1700     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1701         for (i = 0; i < 3; i++)
1702             for (y = 0; y < 16>>!!i; y++)
1703                 dst[i][y*curframe->tf.f->linesize[i]-1] = 129;
1704         if (mb_y == 1) {
1705             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1706         }
1707     }
1708
1709     s->mv_min.x = -MARGIN;
1710     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1711
1712     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1713         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1714         if (prev_td != td) {
1715             if (threadnr != 0) {
1716                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1717             } else {
1718                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1719             }
1720         }
1721
1722         s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1723         s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1724
1725         if (!s->mb_layout)
1726             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1727                            prev_frame && prev_frame->seg_map ?
1728                            prev_frame->seg_map->data + mb_xy : NULL, 0);
1729
1730         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1731
1732         if (!mb->skip)
1733             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1734
1735         if (mb->mode <= MODE_I4x4)
1736             intra_predict(s, td, dst, mb, mb_x, mb_y);
1737         else
1738             inter_predict(s, td, dst, mb, mb_x, mb_y);
1739
1740         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1741
1742         if (!mb->skip) {
1743             idct_mb(s, td, dst, mb);
1744         } else {
1745             AV_ZERO64(td->left_nnz);
1746             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1747
1748             // Reset DC block predictors if they would exist if the mb had coefficients
1749             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1750                 td->left_nnz[8]     = 0;
1751                 s->top_nnz[mb_x][8] = 0;
1752             }
1753         }
1754
1755         if (s->deblock_filter)
1756             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1757
1758         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1759             if (s->filter.simple)
1760                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1761             else
1762                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1763         }
1764
1765         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1766
1767         dst[0] += 16;
1768         dst[1] += 8;
1769         dst[2] += 8;
1770         s->mv_min.x -= 64;
1771         s->mv_max.x -= 64;
1772
1773         if (mb_x == s->mb_width+1) {
1774             update_pos(td, mb_y, s->mb_width+3);
1775         } else {
1776             update_pos(td, mb_y, mb_x);
1777         }
1778     }
1779 }
1780
1781 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1782                               int jobnr, int threadnr)
1783 {
1784     VP8Context *s = avctx->priv_data;
1785     VP8ThreadData *td = &s->thread_data[threadnr];
1786     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1787     AVFrame *curframe = s->curframe->tf.f;
1788     VP8Macroblock *mb;
1789     VP8ThreadData *prev_td, *next_td;
1790     uint8_t *dst[3] = {
1791         curframe->data[0] + 16*mb_y*s->linesize,
1792         curframe->data[1] +  8*mb_y*s->uvlinesize,
1793         curframe->data[2] +  8*mb_y*s->uvlinesize
1794     };
1795
1796     if (s->mb_layout == 1)
1797         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1798     else
1799         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1800
1801     if (mb_y == 0) prev_td = td;
1802     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1803     if (mb_y == s->mb_height-1) next_td = td;
1804     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1805
1806     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1807         VP8FilterStrength *f = &td->filter_strength[mb_x];
1808         if (prev_td != td) {
1809             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1810         }
1811         if (next_td != td)
1812             if (next_td != &s->thread_data[0]) {
1813                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1814             }
1815
1816         if (num_jobs == 1) {
1817             if (s->filter.simple)
1818                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1819             else
1820                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1821         }
1822
1823         if (s->filter.simple)
1824             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1825         else
1826             filter_mb(s, dst, f, mb_x, mb_y);
1827         dst[0] += 16;
1828         dst[1] += 8;
1829         dst[2] += 8;
1830
1831         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1832     }
1833 }
1834
1835 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1836                                     int jobnr, int threadnr)
1837 {
1838     VP8Context *s = avctx->priv_data;
1839     VP8ThreadData *td = &s->thread_data[jobnr];
1840     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1841     VP8Frame *curframe = s->curframe;
1842     int mb_y, num_jobs = s->num_jobs;
1843     td->thread_nr = threadnr;
1844     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1845         if (mb_y >= s->mb_height) break;
1846         td->thread_mb_pos = mb_y<<16;
1847         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1848         if (s->deblock_filter)
1849             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
1850         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1851
1852         s->mv_min.y -= 64;
1853         s->mv_max.y -= 64;
1854
1855         if (avctx->active_thread_type == FF_THREAD_FRAME)
1856             ff_thread_report_progress(&curframe->tf, mb_y, 0);
1857     }
1858
1859     return 0;
1860 }
1861
1862 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
1863                         AVPacket *avpkt)
1864 {
1865     VP8Context *s = avctx->priv_data;
1866     int ret, i, referenced, num_jobs;
1867     enum AVDiscard skip_thresh;
1868     VP8Frame *av_uninit(curframe), *prev_frame;
1869
1870     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1871         goto err;
1872
1873     prev_frame = s->framep[VP56_FRAME_CURRENT];
1874
1875     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1876                                 || s->update_altref == VP56_FRAME_CURRENT;
1877
1878     skip_thresh = !referenced ? AVDISCARD_NONREF :
1879                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1880
1881     if (avctx->skip_frame >= skip_thresh) {
1882         s->invisible = 1;
1883         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1884         goto skip_decode;
1885     }
1886     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1887
1888     // release no longer referenced frames
1889     for (i = 0; i < 5; i++)
1890         if (s->frames[i].tf.f->data[0] &&
1891             &s->frames[i] != prev_frame &&
1892             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1893             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1894             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1895             vp8_release_frame(s, &s->frames[i]);
1896
1897     // find a free buffer
1898     for (i = 0; i < 5; i++)
1899         if (&s->frames[i] != prev_frame &&
1900             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1901             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1902             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1903             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1904             break;
1905         }
1906     if (i == 5) {
1907         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1908         abort();
1909     }
1910     if (curframe->tf.f->data[0])
1911         vp8_release_frame(s, curframe);
1912
1913     // Given that arithmetic probabilities are updated every frame, it's quite likely
1914     // that the values we have on a random interframe are complete junk if we didn't
1915     // start decode on a keyframe. So just don't display anything rather than junk.
1916     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1917                          !s->framep[VP56_FRAME_GOLDEN] ||
1918                          !s->framep[VP56_FRAME_GOLDEN2])) {
1919         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1920         ret = AVERROR_INVALIDDATA;
1921         goto err;
1922     }
1923
1924     curframe->tf.f->key_frame = s->keyframe;
1925     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1926     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
1927         goto err;
1928
1929     // check if golden and altref are swapped
1930     if (s->update_altref != VP56_FRAME_NONE) {
1931         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1932     } else {
1933         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1934     }
1935     if (s->update_golden != VP56_FRAME_NONE) {
1936         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1937     } else {
1938         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1939     }
1940     if (s->update_last) {
1941         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1942     } else {
1943         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1944     }
1945     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1946
1947     ff_thread_finish_setup(avctx);
1948
1949     s->linesize   = curframe->tf.f->linesize[0];
1950     s->uvlinesize = curframe->tf.f->linesize[1];
1951
1952     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1953     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1954     if (!s->mb_layout)
1955         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1956     if (!s->mb_layout && s->keyframe)
1957         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1958
1959     // top edge of 127 for intra prediction
1960     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1961         s->top_border[0][15] = s->top_border[0][23] = 127;
1962         s->top_border[0][31] = 127;
1963         memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1964     }
1965     memset(s->ref_count, 0, sizeof(s->ref_count));
1966
1967
1968     if (s->mb_layout == 1) {
1969         // Make sure the previous frame has read its segmentation map,
1970         // if we re-use the same map.
1971         if (prev_frame && s->segmentation.enabled &&
1972             !s->segmentation.update_map)
1973             ff_thread_await_progress(&prev_frame->tf, 1, 0);
1974         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1975     }
1976
1977     if (avctx->active_thread_type == FF_THREAD_FRAME)
1978         num_jobs = 1;
1979     else
1980         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1981     s->num_jobs   = num_jobs;
1982     s->curframe   = curframe;
1983     s->prev_frame = prev_frame;
1984     s->mv_min.y   = -MARGIN;
1985     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
1986     for (i = 0; i < MAX_THREADS; i++) {
1987         s->thread_data[i].thread_mb_pos = 0;
1988         s->thread_data[i].wait_mb_pos = INT_MAX;
1989     }
1990     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1991
1992     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
1993     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1994
1995 skip_decode:
1996     // if future frames don't use the updated probabilities,
1997     // reset them to the values we saved
1998     if (!s->update_probabilities)
1999         s->prob[0] = s->prob[1];
2000
2001     if (!s->invisible) {
2002         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2003             return ret;
2004         *got_frame      = 1;
2005     }
2006
2007     return avpkt->size;
2008 err:
2009     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2010     return ret;
2011 }
2012
2013 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2014 {
2015     VP8Context *s = avctx->priv_data;
2016     int i;
2017
2018     vp8_decode_flush_impl(avctx, 1);
2019     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2020         av_frame_free(&s->frames[i].tf.f);
2021
2022     return 0;
2023 }
2024
2025 static av_cold int vp8_init_frames(VP8Context *s)
2026 {
2027     int i;
2028     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2029         s->frames[i].tf.f = av_frame_alloc();
2030         if (!s->frames[i].tf.f)
2031             return AVERROR(ENOMEM);
2032     }
2033     return 0;
2034 }
2035
2036 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2037 {
2038     VP8Context *s = avctx->priv_data;
2039     int ret;
2040
2041     s->avctx = avctx;
2042     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2043     avctx->internal->allocate_progress = 1;
2044
2045     ff_videodsp_init(&s->vdsp, 8);
2046     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2047     ff_vp8dsp_init(&s->vp8dsp);
2048
2049     if ((ret = vp8_init_frames(s)) < 0) {
2050         ff_vp8_decode_free(avctx);
2051         return ret;
2052     }
2053
2054     return 0;
2055 }
2056
2057 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2058 {
2059     VP8Context *s = avctx->priv_data;
2060     int ret;
2061
2062     s->avctx = avctx;
2063
2064     if ((ret = vp8_init_frames(s)) < 0) {
2065         ff_vp8_decode_free(avctx);
2066         return ret;
2067     }
2068
2069     return 0;
2070 }
2071
/*
 * Translate a frame pointer into s_src->frames[] to the element with the same
 * index in s->frames[]; a NULL pointer stays NULL.  Relies on s and s_src
 * being in scope at the point of use.  Argument and result are parenthesized
 * so the macro can be combined with other operators.
 */
#define REBASE(pic) \
    ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2074
2075 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2076 {
2077     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2078     int i;
2079
2080     if (s->macroblocks_base &&
2081         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2082         free_buffers(s);
2083         s->mb_width  = s_src->mb_width;
2084         s->mb_height = s_src->mb_height;
2085     }
2086
2087     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2088     s->segmentation = s_src->segmentation;
2089     s->lf_delta = s_src->lf_delta;
2090     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2091
2092     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2093         if (s_src->frames[i].tf.f->data[0]) {
2094             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2095             if (ret < 0)
2096                 return ret;
2097         }
2098     }
2099
2100     s->framep[0] = REBASE(s_src->next_framep[0]);
2101     s->framep[1] = REBASE(s_src->next_framep[1]);
2102     s->framep[2] = REBASE(s_src->next_framep[2]);
2103     s->framep[3] = REBASE(s_src->next_framep[3]);
2104
2105     return 0;
2106 }
2107
/* Round size up to the next even value (unsigned arithmetic, so the largest
 * odd value wraps around to 0). */
static unsigned apply_padding(unsigned size)
{
    const unsigned odd = size % 2u;

    return size + odd;
}
2109
2110 static int webp_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
2111                              AVPacket *avpkt)
2112 {
2113     const uint8_t *buf = avpkt->data;
2114     int buf_size       = avpkt->size;
2115     AVPacket pkt       = *avpkt;
2116
2117     if (buf_size >= 16
2118         && AV_RL32(buf   ) == AV_RL32("RIFF")
2119         && AV_RL32(buf+ 8) == AV_RL32("WEBP")) {
2120         unsigned riff_size = apply_padding(AV_RL32(buf+4)) + 8;
2121         buf += 12;   // Skip over main header
2122         buf_size -= 12;
2123         if (buf_size < 8 || riff_size < 8) {
2124             av_log(avctx, AV_LOG_ERROR, "Incomplete header.\n");
2125             return AVERROR_INVALIDDATA;
2126         }
2127         if (AV_RL32(buf) == AV_RL32("VP8L")) {
2128             av_log(avctx, AV_LOG_ERROR, "Unsupported WebP lossless format.\n");
2129             return AVERROR_PATCHWELCOME;
2130         }
2131         if (AV_RL32(buf) == AV_RL32("VP8X") && AV_RL32(buf+4) < (unsigned)buf_size) {
2132             unsigned size = apply_padding(AV_RL32(buf+4) + 8);
2133             buf      += size;
2134             buf_size -= size;
2135         }
2136         if (buf_size >= 8
2137             && AV_RL32(buf) == AV_RL32("ALPH") && AV_RL32(buf+4) < (unsigned)buf_size) {
2138             unsigned size = apply_padding(AV_RL32(buf+4) + 8);
2139             buf      += size;
2140             buf_size -= size;
2141             av_log(avctx, AV_LOG_WARNING, "Skipping alpha plane\n");
2142         }
2143         if (buf_size >= 8 && AV_RL32(buf) == AV_RL32("VP8 ")) {
2144             buf      += 8;
2145             buf_size -= 8;
2146         }
2147     }
2148     pkt.data = buf;
2149     pkt.size = buf_size;
2150
2151     return ff_vp8_decode_frame(avctx, data, data_size, &pkt);
2152 }
2153
2154 AVCodec ff_vp8_decoder = {
2155     .name                  = "vp8",
2156     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2157     .type                  = AVMEDIA_TYPE_VIDEO,
2158     .id                    = AV_CODEC_ID_VP8,
2159     .priv_data_size        = sizeof(VP8Context),
2160     .init                  = ff_vp8_decode_init,
2161     .close                 = ff_vp8_decode_free,
2162     .decode                = ff_vp8_decode_frame,
2163     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2164     .flush                 = vp8_decode_flush,
2165     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2166     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2167 };
2168
2169 // AVCodec ff_webp_decoder = {
2170 //     .name                  = "webp",
2171 //     .long_name             = NULL_IF_CONFIG_SMALL("WebP"),
2172 //     .type                  = AVMEDIA_TYPE_VIDEO,
2173 //     .id                    = AV_CODEC_ID_WEBP,
2174 //     .priv_data_size        = sizeof(VP8Context),
2175 //     .init                  = vp8_decode_init,
2176 //     .close                 = vp8_decode_free,
2177 //     .decode                = webp_decode_frame,
2178 //     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2179 //     .flush                 = vp8_decode_flush,
2180 //     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2181 //     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2182 // };