git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "internal.h"
  28 #include "vp8.h"
  29 #include "vp8data.h"
  30 #include "rectangle.h"
  31 #include "thread.h"
  32
  33 #if ARCH_ARM
  34 #   include "arm/vp8.h"
  35 #endif
  36
  37 static void free_buffers(VP8Context *s)
  38 {
  39     av_freep(&s->macroblocks_base);
  40     av_freep(&s->filter_strength);
  41     av_freep(&s->intra4x4_pred_mode_top);
  42     av_freep(&s->top_nnz);
  43     av_freep(&s->edge_emu_buffer);
  44     av_freep(&s->top_border);
  45
  46     s->macroblocks = NULL;
  47 }
  48
  49 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  50 {
  51     int ret;
  52     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  53         return ret;
  54     if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
  55         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  56     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  57         ff_thread_release_buffer(s->avctx, f);
  58         return AVERROR(ENOMEM);
  59     }
  60     return 0;
  61 }
  62
  63 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  64 {
  65     if (f->ref_index[0]) {
  66         if (prefer_delayed_free) {
  67             /* Upon a size change, we want to free the maps but other threads may still
  68              * be using them, so queue them. Upon a seek, all threads are inactive so
  69              * we want to cache one to prevent re-allocation in the next decoding
  70              * iteration, but the rest we can free directly. */
  71             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  72             if (s->num_maps_to_be_freed < max_queued_maps) {
  73                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  74             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  75                 av_free(f->ref_index[0]);
  76             } /* else: MEMLEAK (should never happen, but better that than crash) */
  77             f->ref_index[0] = NULL;
  78         } else /* vp8_decode_free() */ {
  79             av_free(f->ref_index[0]);
  80         }
  81     }
  82     ff_thread_release_buffer(s->avctx, f);
  83 }
  84
  85 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  86                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  87 {
  88     VP8Context *s = avctx->priv_data;
  89     int i;
  90
  91     if (!avctx->internal->is_copy) {
  92         for (i = 0; i < 5; i++)
  93             if (s->frames[i].data[0])
  94                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
  95     }
  96     memset(s->framep, 0, sizeof(s->framep));
  97
  98     if (free_mem) {
  99         free_buffers(s);
 100         s->maps_are_invalid = 1;
 101     }
 102 }
 103
 104 static void vp8_decode_flush(AVCodecContext *avctx)
 105 {
 106     vp8_decode_flush_impl(avctx, 1, 1, 0);
 107 }
 108
 109 static int update_dimensions(VP8Context *s, int width, int height)
 110 {
 111     if (width  != s->avctx->width ||
 112         height != s->avctx->height) {
 113         if (av_image_check_size(width, height, 0, s->avctx))
 114             return AVERROR_INVALIDDATA;
 115
 116         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 117
 118         avcodec_set_dimensions(s->avctx, width, height);
 119     }
 120
 121     s->mb_width  = (s->avctx->coded_width +15) / 16;
 122     s->mb_height = (s->avctx->coded_height+15) / 16;
 123
 124     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 125     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
 126     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
 127     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 128     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 129
 130     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
 131         !s->top_nnz || !s->top_border)
 132         return AVERROR(ENOMEM);
 133
 134     s->macroblocks        = s->macroblocks_base + 1;
 135
 136     return 0;
 137 }
 138
 139 static void parse_segment_info(VP8Context *s)
 140 {
 141     VP56RangeCoder *c = &s->c;
 142     int i;
 143
 144     s->segmentation.update_map = vp8_rac_get(c);
 145
 146     if (vp8_rac_get(c)) { // update segment feature data
 147         s->segmentation.absolute_vals = vp8_rac_get(c);
 148
 149         for (i = 0; i < 4; i++)
 150             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 151
 152         for (i = 0; i < 4; i++)
 153             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 154     }
 155     if (s->segmentation.update_map)
 156         for (i = 0; i < 3; i++)
 157             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 158 }
 159
 160 static void update_lf_deltas(VP8Context *s)
 161 {
 162     VP56RangeCoder *c = &s->c;
 163     int i;
 164
 165     for (i = 0; i < 4; i++) {
 166         if (vp8_rac_get(c)) {
 167             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 168
 169             if (vp8_rac_get(c))
 170                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 171         }
 172     }
 173
 174     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 175         if (vp8_rac_get(c)) {
 176             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 177
 178             if (vp8_rac_get(c))
 179                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 180         }
 181     }
 182 }
 183
 184 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 185 {
 186     const uint8_t *sizes = buf;
 187     int i;
 188
 189     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 190
 191     buf      += 3*(s->num_coeff_partitions-1);
 192     buf_size -= 3*(s->num_coeff_partitions-1);
 193     if (buf_size < 0)
 194         return -1;
 195
 196     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 197         int size = AV_RL24(sizes + 3*i);
 198         if (buf_size - size < 0)
 199             return -1;
 200
 201         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 202         buf      += size;
 203         buf_size -= size;
 204     }
 205     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 206
 207     return 0;
 208 }
 209
 210 static void get_quants(VP8Context *s)
 211 {
 212     VP56RangeCoder *c = &s->c;
 213     int i, base_qi;
 214
 215     int yac_qi     = vp8_rac_get_uint(c, 7);
 216     int ydc_delta  = vp8_rac_get_sint(c, 4);
 217     int y2dc_delta = vp8_rac_get_sint(c, 4);
 218     int y2ac_delta = vp8_rac_get_sint(c, 4);
 219     int uvdc_delta = vp8_rac_get_sint(c, 4);
 220     int uvac_delta = vp8_rac_get_sint(c, 4);
 221
 222     for (i = 0; i < 4; i++) {
 223         if (s->segmentation.enabled) {
 224             base_qi = s->segmentation.base_quant[i];
 225             if (!s->segmentation.absolute_vals)
 226                 base_qi += yac_qi;
 227         } else
 228             base_qi = yac_qi;
 229
 230         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 231         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 232         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 233         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 234         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 235         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 236
 237         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 238         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 239     }
 240 }
 241
 242 /**
 243  * Determine which buffers golden and altref should be updated with after this frame.
 244  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 245  *
 246  * Intra frames update all 3 references
 247  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 248  * If the update (golden|altref) flag is set, it's updated with the current frame
 249  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 250  * If the flag is not set, the number read means:
 251  *      0: no update
 252  *      1: VP56_FRAME_PREVIOUS
 253  *      2: update golden with altref, or update altref with golden
 254  */
 255 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 256 {
 257     VP56RangeCoder *c = &s->c;
 258
 259     if (update)
 260         return VP56_FRAME_CURRENT;
 261
 262     switch (vp8_rac_get_uint(c, 2)) {
 263     case 1:
 264         return VP56_FRAME_PREVIOUS;
 265     case 2:
 266         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 267     }
 268     return VP56_FRAME_NONE;
 269 }
 270
 271 static void update_refs(VP8Context *s)
 272 {
 273     VP56RangeCoder *c = &s->c;
 274
 275     int update_golden = vp8_rac_get(c);
 276     int update_altref = vp8_rac_get(c);
 277
 278     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 279     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 280 }
 281
 282 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 283 {
 284     VP56RangeCoder *c = &s->c;
 285     int header_size, hscale, vscale, i, j, k, l, m, ret;
 286     int width  = s->avctx->width;
 287     int height = s->avctx->height;
 288
 289     s->keyframe  = !(buf[0] & 1);
 290     s->profile   =  (buf[0]>>1) & 7;
 291     s->invisible = !(buf[0] & 0x10);
 292     header_size  = AV_RL24(buf) >> 5;
 293     buf      += 3;
 294     buf_size -= 3;
 295
 296     if (s->profile > 3)
 297         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 298
 299     if (!s->profile)
 300         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 301     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 302         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 303
 304     if (header_size > buf_size - 7*s->keyframe) {
 305         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 306         return AVERROR_INVALIDDATA;
 307     }
 308
 309     if (s->keyframe) {
 310         if (AV_RL24(buf) != 0x2a019d) {
 311             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 312             return AVERROR_INVALIDDATA;
 313         }
 314         width  = AV_RL16(buf+3) & 0x3fff;
 315         height = AV_RL16(buf+5) & 0x3fff;
 316         hscale = buf[4] >> 6;
 317         vscale = buf[6] >> 6;
 318         buf      += 7;
 319         buf_size -= 7;
 320
 321         if (hscale || vscale)
 322             av_log_missing_feature(s->avctx, "Upscaling", 1);
 323
 324         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 325         for (i = 0; i < 4; i++)
 326             for (j = 0; j < 16; j++)
 327                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 328                        sizeof(s->prob->token[i][j]));
 329         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 330         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 331         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 332         memset(&s->segmentation, 0, sizeof(s->segmentation));
 333     }
 334
 335     if (!s->macroblocks_base || /* first frame */
 336         width != s->avctx->width || height != s->avctx->height) {
 337         if ((ret = update_dimensions(s, width, height)) < 0)
 338             return ret;
 339     }
 340
 341     ff_vp56_init_range_decoder(c, buf, header_size);
 342     buf      += header_size;
 343     buf_size -= header_size;
 344
 345     if (s->keyframe) {
 346         if (vp8_rac_get(c))
 347             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 348         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 349     }
 350
 351     if ((s->segmentation.enabled = vp8_rac_get(c)))
 352         parse_segment_info(s);
 353     else
 354         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 355
 356     s->filter.simple    = vp8_rac_get(c);
 357     s->filter.level     = vp8_rac_get_uint(c, 6);
 358     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 359
 360     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 361         if (vp8_rac_get(c))
 362             update_lf_deltas(s);
 363
 364     if (setup_partitions(s, buf, buf_size)) {
 365         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 366         return AVERROR_INVALIDDATA;
 367     }
 368
 369     get_quants(s);
 370
 371     if (!s->keyframe) {
 372         update_refs(s);
 373         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 374         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 375     }
 376
 377     // if we aren't saving this frame's probabilities for future frames,
 378     // make a copy of the current probabilities
 379     if (!(s->update_probabilities = vp8_rac_get(c)))
 380         s->prob[1] = s->prob[0];
 381
 382     s->update_last = s->keyframe || vp8_rac_get(c);
 383
 384     for (i = 0; i < 4; i++)
 385         for (j = 0; j < 8; j++)
 386             for (k = 0; k < 3; k++)
 387                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 388                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 389                         int prob = vp8_rac_get_uint(c, 8);
 390                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 391                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 392                     }
 393
 394     if ((s->mbskip_enabled = vp8_rac_get(c)))
 395         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 396
 397     if (!s->keyframe) {
 398         s->prob->intra  = vp8_rac_get_uint(c, 8);
 399         s->prob->last   = vp8_rac_get_uint(c, 8);
 400         s->prob->golden = vp8_rac_get_uint(c, 8);
 401
 402         if (vp8_rac_get(c))
 403             for (i = 0; i < 4; i++)
 404                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 405         if (vp8_rac_get(c))
 406             for (i = 0; i < 3; i++)
 407                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 408
 409         // 17.2 MV probability update
 410         for (i = 0; i < 2; i++)
 411             for (j = 0; j < 19; j++)
 412                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 413                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 414     }
 415
 416     return 0;
 417 }
 418
 419 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 420 {
 421     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 422     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 423 }
 424
 425 /**
 426  * Motion vector coding, 17.1.
 427  */
 428 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 429 {
 430     int bit, x = 0;
 431
 432     if (vp56_rac_get_prob_branchy(c, p[0])) {
 433         int i;
 434
 435         for (i = 0; i < 3; i++)
 436             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 437         for (i = 9; i > 3; i--)
 438             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 439         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 440             x += 8;
 441     } else {
 442         // small_mvtree
 443         const uint8_t *ps = p+2;
 444         bit = vp56_rac_get_prob(c, *ps);
 445         ps += 1 + 3*bit;
 446         x  += 4*bit;
 447         bit = vp56_rac_get_prob(c, *ps);
 448         ps += 1 + bit;
 449         x  += 2*bit;
 450         x  += vp56_rac_get_prob(c, *ps);
 451     }
 452
 453     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 454 }
 455
 456 static av_always_inline
 457 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 458 {
 459     if (left == top)
 460         return vp8_submv_prob[4-!!left];
 461     if (!top)
 462         return vp8_submv_prob[2];
 463     return vp8_submv_prob[1-!!left];
 464 }
 465
 466 /**
 467  * Split motion vector prediction, 16.4.
 468  * @returns the number of motion vectors parsed (2, 4 or 16)
 469  */
 470 static av_always_inline
 471 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
 472 {
 473     int part_idx;
 474     int n, num;
 475     VP8Macroblock *top_mb  = &mb[2];
 476     VP8Macroblock *left_mb = &mb[-1];
 477     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 478                   *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
 479                   *mbsplits_cur, *firstidx;
 480     VP56mv *top_mv  = top_mb->bmv;
 481     VP56mv *left_mv = left_mb->bmv;
 482     VP56mv *cur_mv  = mb->bmv;
 483
 484     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 485         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 486             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 487         } else {
 488             part_idx = VP8_SPLITMVMODE_8x8;
 489         }
 490     } else {
 491         part_idx = VP8_SPLITMVMODE_4x4;
 492     }
 493
 494     num = vp8_mbsplit_count[part_idx];
 495     mbsplits_cur = vp8_mbsplits[part_idx],
 496     firstidx = vp8_mbfirstidx[part_idx];
 497     mb->partitioning = part_idx;
 498
 499     for (n = 0; n < num; n++) {
 500         int k = firstidx[n];
 501         uint32_t left, above;
 502         const uint8_t *submv_prob;
 503
 504         if (!(k & 3))
 505             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 506         else
 507             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 508         if (k <= 3)
 509             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 510         else
 511             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 512
 513         submv_prob = get_submv_prob(left, above);
 514
 515         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 516             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 517                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 518                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 519                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 520                 } else {
 521                     AV_ZERO32(&mb->bmv[n]);
 522                 }
 523             } else {
 524                 AV_WN32A(&mb->bmv[n], above);
 525             }
 526         } else {
 527             AV_WN32A(&mb->bmv[n], left);
 528         }
 529     }
 530
 531     return num;
 532 }
 533
/**
 * Decode the inter prediction mode and motion vector of a macroblock.
 *
 * Candidate vectors are gathered from the top, left and top-left
 * neighbours; the resulting counts select the probabilities used to read
 * the mode (zero / nearest / near / new / split) from s->c. On return
 * mb->mode, mb->partitioning, mb->mv and mb->bmv[] are set.
 *
 * @param s    decoder context
 * @param mb   current macroblock; mb->ref_frame must already be set, since
 *             it selects the sign bias used for the neighbours
 * @param mb_x not used by this function
 * @param mb_y not used by this function
 */
static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
{
    VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
                                  mb - 1 /* left */,
                                  mb + 1 /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    /* near_mv[3] is only read once idx has reached 3, i.e. after it has
     * been written, so it is not cleared here */
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For each inter-coded neighbour: a zero MV adds to cnt[CNT_ZERO];
     * a non-zero MV is sign-corrected if its reference has a different
     * sign bias, appended to near_mv[] unless it repeats the most recent
     * entry, and adds to that entry's count. Top and left weigh 2,
     * top-left weighs 1. */
    #define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    /* first decision: zero MV or anything else */
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        /* second decision: nearest MV or anything else */
        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            /* third decision: near MV or a coded (new/split) MV */
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* cnt[CNT_SPLITMV] is reused as the split context: the
                 * left/top neighbours count double, top-left once */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the MB-level MV becomes the last sub-block MV decoded */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
                } else {
                    /* new MV: deltas (y first, then x) on top of the best MV */
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
 623
 624 static av_always_inline
 625 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 626                            int mb_x, int keyframe)
 627 {
 628     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 629     if (keyframe) {
 630         int x, y;
 631         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 632         uint8_t* const left = s->intra4x4_pred_mode_left;
 633         for (y = 0; y < 4; y++) {
 634             for (x = 0; x < 4; x++) {
 635                 const uint8_t *ctx;
 636                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 637                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 638                 left[y] = top[x] = *intra4x4;
 639                 intra4x4++;
 640             }
 641         }
 642     } else {
 643         int i;
 644         for (i = 0; i < 16; i++)
 645             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 646     }
 647 }
 648
 649 static av_always_inline
 650 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
 651 {
 652     VP56RangeCoder *c = &s->c;
 653
 654     if (s->segmentation.update_map)
 655         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 656     else if (s->segmentation.enabled)
 657         *segment = ref ? *ref : *segment;
 658     s->segment = *segment;
 659
 660     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 661
 662     if (s->keyframe) {
 663         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 664
 665         if (mb->mode == MODE_I4x4) {
 666             decode_intra4x4_modes(s, c, mb_x, 1);
 667         } else {
 668             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 669             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 670             AV_WN32A(s->intra4x4_pred_mode_left, modes);
 671         }
 672
 673         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 674         mb->ref_frame = VP56_FRAME_CURRENT;
 675     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 676         // inter MB, 16.2
 677         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 678             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 679                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 680         else
 681             mb->ref_frame = VP56_FRAME_PREVIOUS;
 682         s->ref_count[mb->ref_frame-1]++;
 683
 684         // motion vectors, 16.3
 685         decode_mvs(s, mb, mb_x, mb_y);
 686     } else {
 687         // intra MB, 16.1
 688         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 689
 690         if (mb->mode == MODE_I4x4)
 691             decode_intra4x4_modes(s, c, mb_x, 0);
 692
 693         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 694         mb->ref_frame = VP56_FRAME_CURRENT;
 695         mb->partitioning = VP8_SPLITMVMODE_NONE;
 696         AV_ZERO32(&mb->bmv[0]);
 697     }
 698 }
 699
 700 #ifndef decode_block_coeffs_internal
/**
 * Decode the coefficients of one 4x4 block, starting after the first
 * end-of-block decision (which decode_block_coeffs() has already read).
 *
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token, i.e. the entry of
 *                   probs selected by the caller for index i
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    /* the caller already consumed the EOB decision for the first token */
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
            return i;

skip_eob:
        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
            if (++i == 16)
                return i; // invalid input; blocks should end with EOB
            /* after a zero, the next token uses context 0 and cannot be
             * EOB, so the EOB check is skipped again */
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
            coeff = 1;
            /* next token uses context 1 (previous magnitude was one) */
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    /* two bits select category 3..6 (cat = 0..3); the base
                     * value is 3 + (8 << cat), the extra bits are read by
                     * vp8_rac_get_coeff() */
                    int a = vp56_rac_get_prob(c, token_prob[8]);
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            /* next token uses context 2 (previous magnitude was above one) */
            token_prob = probs[i+1][2];
        }
        /* read the sign, dequantize (DC factor for index 0, AC otherwise)
         * and store at the de-zigzagged position */
        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    return i;
}
 762 #endif
 763
 764 /**
 765  * @param c arithmetic bitstream reader context
 766  * @param block destination for block coefficients
 767  * @param probs probabilities to use when reading trees from the bitstream
 768  * @param i initial coeff index, 0 unless a separate DC block is coded
 769  * @param zero_nhood the initial prediction context for number of surrounding
 770  *                   all-zero blocks (only left/top, so 0-2)
 771  * @param qmul array holding the dc/ac dequant factor at position 0/1
 772  * @return 0 if no coeffs were decoded
 773  *         otherwise, the index of the last coeff decoded plus one
 774  */
 775 static av_always_inline
 776 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 777                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 778                         int i, int zero_nhood, int16_t qmul[2])
 779 {
 780     uint8_t *token_prob = probs[i][zero_nhood];
 781     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 782         return 0;
 783     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 784 }
 785
 786 static av_always_inline
 787 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 788                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 789 {
 790     int i, x, y, luma_start = 0, luma_ctx = 3;
 791     int nnz_pred, nnz, nnz_total = 0;
 792     int segment = s->segment;
 793     int block_dc = 0;
 794
 795     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 796         nnz_pred = t_nnz[8] + l_nnz[8];
 797
 798         // decode DC values and do hadamard
 799         nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
 800                                   s->qmat[segment].luma_dc_qmul);
 801         l_nnz[8] = t_nnz[8] = !!nnz;
 802         if (nnz) {
 803             nnz_total += nnz;
 804             block_dc = 1;
 805             if (nnz == 1)
 806                 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
 807             else
 808                 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
 809         }
 810         luma_start = 1;
 811         luma_ctx = 0;
 812     }
 813
 814     // luma blocks
 815     for (y = 0; y < 4; y++)
 816         for (x = 0; x < 4; x++) {
 817             nnz_pred = l_nnz[y] + t_nnz[x];
 818             nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
 819                                       nnz_pred, s->qmat[segment].luma_qmul);
 820             // nnz+block_dc may be one more than the actual last index, but we don't care
 821             s->non_zero_count_cache[y][x] = nnz + block_dc;
 822             t_nnz[x] = l_nnz[y] = !!nnz;
 823             nnz_total += nnz;
 824         }
 825
 826     // chroma blocks
 827     // TODO: what to do about dimensions? 2nd dim for luma is x,
 828     // but for chroma it's (y<<1)|x
 829     for (i = 4; i < 6; i++)
 830         for (y = 0; y < 2; y++)
 831             for (x = 0; x < 2; x++) {
 832                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 833                 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
 834                                           nnz_pred, s->qmat[segment].chroma_qmul);
 835                 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
 836                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 837                 nnz_total += nnz;
 838             }
 839
 840     // if there were no coded coeffs despite the macroblock not being marked skip,
 841     // we MUST not do the inner loop filter and should not do IDCT
 842     // Since skip isn't used for bitstream prediction, just manually set it.
 843     if (!nnz_total)
 844         mb->skip = 1;
 845 }
 846
 847 static av_always_inline
 848 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 849                       int linesize, int uvlinesize, int simple)
 850 {
 851     AV_COPY128(top_border, src_y + 15*linesize);
 852     if (!simple) {
 853         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 854         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 855     }
 856 }
 857
 858 static av_always_inline
 859 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 860                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 861                     int simple, int xchg)
 862 {
 863     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 864     src_y  -=   linesize;
 865     src_cb -= uvlinesize;
 866     src_cr -= uvlinesize;
 867
 868 #define XCHG(a,b,xchg) do {                     \
 869         if (xchg) AV_SWAP64(b,a);               \
 870         else      AV_COPY64(b,a);               \
 871     } while (0)
 872
 873     XCHG(top_border_m1+8, src_y-8, xchg);
 874     XCHG(top_border,      src_y,   xchg);
 875     XCHG(top_border+8,    src_y+8, 1);
 876     if (mb_x < mb_width-1)
 877         XCHG(top_border+32, src_y+16, 1);
 878
 879     // only copy chroma for normal loop filter
 880     // or to initialize the top row to 127
 881     if (!simple || !mb_y) {
 882         XCHG(top_border_m1+16, src_cb-8, xchg);
 883         XCHG(top_border_m1+24, src_cr-8, xchg);
 884         XCHG(top_border+16,    src_cb, 1);
 885         XCHG(top_border+24,    src_cr, 1);
 886     }
 887 }
 888
 889 static av_always_inline
 890 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 891 {
 892     if (!mb_x) {
 893         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 894     } else {
 895         return mb_y ? mode : LEFT_DC_PRED8x8;
 896     }
 897 }
 898
 899 static av_always_inline
 900 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 901 {
 902     if (!mb_x) {
 903         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 904     } else {
 905         return mb_y ? mode : HOR_PRED8x8;
 906     }
 907 }
 908
 909 static av_always_inline
 910 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 911 {
 912     if (mode == DC_PRED8x8) {
 913         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 914     } else {
 915         return mode;
 916     }
 917 }
 918
 919 static av_always_inline
 920 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 921 {
 922     switch (mode) {
 923     case DC_PRED8x8:
 924         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 925     case VERT_PRED8x8:
 926         return !mb_y ? DC_127_PRED8x8 : mode;
 927     case HOR_PRED8x8:
 928         return !mb_x ? DC_129_PRED8x8 : mode;
 929     case PLANE_PRED8x8 /*TM*/:
 930         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 931     }
 932     return mode;
 933 }
 934
 935 static av_always_inline
 936 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 937 {
 938     if (!mb_x) {
 939         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 940     } else {
 941         return mb_y ? mode : HOR_VP8_PRED;
 942     }
 943 }
 944
 945 static av_always_inline
 946 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 947 {
 948     switch (mode) {
 949     case VERT_PRED:
 950         if (!mb_x && mb_y) {
 951             *copy_buf = 1;
 952             return mode;
 953         }
 954         /* fall-through */
 955     case DIAG_DOWN_LEFT_PRED:
 956     case VERT_LEFT_PRED:
 957         return !mb_y ? DC_127_PRED : mode;
 958     case HOR_PRED:
 959         if (!mb_y) {
 960             *copy_buf = 1;
 961             return mode;
 962         }
 963         /* fall-through */
 964     case HOR_UP_PRED:
 965         return !mb_x ? DC_129_PRED : mode;
 966     case TM_VP8_PRED:
 967         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 968     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 969     case DIAG_DOWN_RIGHT_PRED:
 970     case VERT_RIGHT_PRED:
 971     case HOR_DOWN_PRED:
 972         if (!mb_y || !mb_x)
 973             *copy_buf = 1;
 974         return mode;
 975     }
 976     return mode;
 977 }
 978
 979 static av_always_inline
 980 void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
 981                    int mb_x, int mb_y)
 982 {
 983     AVCodecContext *avctx = s->avctx;
 984     int x, y, mode, nnz;
 985     uint32_t tr;
 986
 987     // for the first row, we need to run xchg_mb_border to init the top edge to 127
 988     // otherwise, skip it if we aren't going to deblock
 989     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
 990         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
 991                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
 992                        s->filter.simple, 1);
 993
 994     if (mb->mode < MODE_I4x4) {
 995         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
 996             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
 997         } else {
 998             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
 999         }
1000         s->hpc.pred16x16[mode](dst[0], s->linesize);
1001     } else {
1002         uint8_t *ptr = dst[0];
1003         uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
1004         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1005
1006         // all blocks on the right edge of the macroblock use bottom edge
1007         // the top macroblock for their topright edge
1008         uint8_t *tr_right = ptr - s->linesize + 16;
1009
1010         // if we're on the right edge of the frame, said edge is extended
1011         // from the top macroblock
1012         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1013             mb_x == s->mb_width-1) {
1014             tr = tr_right[-1]*0x01010101u;
1015             tr_right = (uint8_t *)&tr;
1016         }
1017
1018         if (mb->skip)
1019             AV_ZERO128(s->non_zero_count_cache);
1020
1021         for (y = 0; y < 4; y++) {
1022             uint8_t *topright = ptr + 4 - s->linesize;
1023             for (x = 0; x < 4; x++) {
1024                 int copy = 0, linesize = s->linesize;
1025                 uint8_t *dst = ptr+4*x;
1026                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1027
1028                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1029                     topright = tr_top;
1030                 } else if (x == 3)
1031                     topright = tr_right;
1032
1033                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1034                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1035                     if (copy) {
1036                         dst = copy_dst + 12;
1037                         linesize = 8;
1038                         if (!(mb_y + y)) {
1039                             copy_dst[3] = 127U;
1040                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1041                         } else {
1042                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1043                             if (!(mb_x + x)) {
1044                                 copy_dst[3] = 129U;
1045                             } else {
1046                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1047                             }
1048                         }
1049                         if (!(mb_x + x)) {
1050                             copy_dst[11] =
1051                             copy_dst[19] =
1052                             copy_dst[27] =
1053                             copy_dst[35] = 129U;
1054                         } else {
1055                             copy_dst[11] = ptr[4*x              -1];
1056                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1057                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1058                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1059                         }
1060                     }
1061                 } else {
1062                     mode = intra4x4[x];
1063                 }
1064                 s->hpc.pred4x4[mode](dst, topright, linesize);
1065                 if (copy) {
1066                     AV_COPY32(ptr+4*x              , copy_dst+12);
1067                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1068                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1069                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1070                 }
1071
1072                 nnz = s->non_zero_count_cache[y][x];
1073                 if (nnz) {
1074                     if (nnz == 1)
1075                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
1076                     else
1077                         s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1078                 }
1079                 topright += 4;
1080             }
1081
1082             ptr   += 4*s->linesize;
1083             intra4x4 += 4;
1084         }
1085     }
1086
1087     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1088         mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1089     } else {
1090         mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1091     }
1092     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1093     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1094
1095     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1096         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1097                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1098                        s->filter.simple, 0);
1099 }
1100
/* Lookup by fractional motion vector position (a value in 0..7). */
static const uint8_t subpel_idx[3][8] = {
    /* [0]: nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1]: nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2]: nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1107
1108 /**
1109  * luma MC function
1110  *
1111  * @param s VP8 decoding context
1112  * @param dst target buffer for block data at block position
1113  * @param ref reference picture buffer at origin (0, 0)
1114  * @param mv motion vector (relative to block position) to get pixel data from
1115  * @param x_off horizontal position of block from origin (0, 0)
1116  * @param y_off vertical position of block from origin (0, 0)
1117  * @param block_w width of block (16, 8 or 4)
1118  * @param block_h height of block (always same as block_w)
1119  * @param width width of src/dst plane data
1120  * @param height height of src/dst plane data
1121  * @param linesize size of a single line of plane data, including padding
1122  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1123  */
1124 static av_always_inline
1125 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1126                  int x_off, int y_off, int block_w, int block_h,
1127                  int width, int height, int linesize,
1128                  vp8_mc_func mc_func[3][3])
1129 {
1130     uint8_t *src = ref->data[0];
1131
1132     if (AV_RN32A(mv)) {
1133
1134         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1135         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1136
1137         x_off += mv->x >> 2;
1138         y_off += mv->y >> 2;
1139
1140         // edge emulation
1141         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1142         src += y_off * linesize + x_off;
1143         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1144             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1145             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1146                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1147                                     x_off - mx_idx, y_off - my_idx, width, height);
1148             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1149         }
1150         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1151     } else {
1152         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1153         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1154     }
1155 }
1156
1157 /**
1158  * chroma MC function
1159  *
1160  * @param s VP8 decoding context
1161  * @param dst1 target buffer for block data at block position (U plane)
1162  * @param dst2 target buffer for block data at block position (V plane)
1163  * @param ref reference picture buffer at origin (0, 0)
1164  * @param mv motion vector (relative to block position) to get pixel data from
1165  * @param x_off horizontal position of block from origin (0, 0)
1166  * @param y_off vertical position of block from origin (0, 0)
1167  * @param block_w width of block (16, 8 or 4)
1168  * @param block_h height of block (always same as block_w)
1169  * @param width width of src/dst plane data
1170  * @param height height of src/dst plane data
1171  * @param linesize size of a single line of plane data, including padding
1172  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1173  */
1174 static av_always_inline
1175 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1176                    const VP56mv *mv, int x_off, int y_off,
1177                    int block_w, int block_h, int width, int height, int linesize,
1178                    vp8_mc_func mc_func[3][3])
1179 {
1180     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1181
1182     if (AV_RN32A(mv)) {
1183         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1184         int my = mv->y&7, my_idx = subpel_idx[0][my];
1185
1186         x_off += mv->x >> 3;
1187         y_off += mv->y >> 3;
1188
1189         // edge emulation
1190         src1 += y_off * linesize + x_off;
1191         src2 += y_off * linesize + x_off;
1192         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1193         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1194             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1195             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1196                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1197                                     x_off - mx_idx, y_off - my_idx, width, height);
1198             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1199             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1200
1201             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1202                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1203                                     x_off - mx_idx, y_off - my_idx, width, height);
1204             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1205             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1206         } else {
1207             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1208             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1209         }
1210     } else {
1211         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1212         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1213         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1214     }
1215 }
1216
1217 static av_always_inline
1218 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1219                  AVFrame *ref_frame, int x_off, int y_off,
1220                  int bx_off, int by_off,
1221                  int block_w, int block_h,
1222                  int width, int height, VP56mv *mv)
1223 {
1224     VP56mv uvmv = *mv;
1225
1226     /* Y */
1227     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1228                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1229                 block_w, block_h, width, height, s->linesize,
1230                 s->put_pixels_tab[block_w == 8]);
1231
1232     /* U/V */
1233     if (s->profile == 3) {
1234         uvmv.x &= ~7;
1235         uvmv.y &= ~7;
1236     }
1237     x_off   >>= 1; y_off   >>= 1;
1238     bx_off  >>= 1; by_off  >>= 1;
1239     width   >>= 1; height  >>= 1;
1240     block_w >>= 1; block_h >>= 1;
1241     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1242                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1243                   &uvmv, x_off + bx_off, y_off + by_off,
1244                   block_w, block_h, width, height, s->uvlinesize,
1245                   s->put_pixels_tab[1 + (block_w == 4)]);
1246 }
1247
1248 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1249  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1250 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1251 {
1252     /* Don't prefetch refs that haven't been used very often this frame. */
1253     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1254         int x_off = mb_x << 4, y_off = mb_y << 4;
1255         int mx = (mb->mv.x>>2) + x_off + 8;
1256         int my = (mb->mv.y>>2) + y_off;
1257         uint8_t **src= s->framep[ref]->data;
1258         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1259         /* For threading, a ff_thread_await_progress here might be useful, but
1260          * it actually slows down the decoder. Since a bad prefetch doesn't
1261          * generate bad decoder output, we don't run it here. */
1262         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1263         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1264         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1265     }
1266 }
1267
1268 /**
1269  * Apply motion vectors to prediction buffer, chapter 18.
1270  */
1271 static av_always_inline
1272 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1273                    int mb_x, int mb_y)
1274 {
1275     int x_off = mb_x << 4, y_off = mb_y << 4;
1276     int width = 16*s->mb_width, height = 16*s->mb_height;
1277     AVFrame *ref = s->framep[mb->ref_frame];
1278     VP56mv *bmv = mb->bmv;
1279
1280     switch (mb->partitioning) {
1281     case VP8_SPLITMVMODE_NONE:
1282         vp8_mc_part(s, dst, ref, x_off, y_off,
1283                     0, 0, 16, 16, width, height, &mb->mv);
1284         break;
1285     case VP8_SPLITMVMODE_4x4: {
1286         int x, y;
1287         VP56mv uvmv;
1288
1289         /* Y */
1290         for (y = 0; y < 4; y++) {
1291             for (x = 0; x < 4; x++) {
1292                 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1293                             ref, &bmv[4*y + x],
1294                             4*x + x_off, 4*y + y_off, 4, 4,
1295                             width, height, s->linesize,
1296                             s->put_pixels_tab[2]);
1297             }
1298         }
1299
1300         /* U/V */
1301         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1302         for (y = 0; y < 2; y++) {
1303             for (x = 0; x < 2; x++) {
1304                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1305                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1306                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1307                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1308                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1309                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1310                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1311                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1312                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1313                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1314                 if (s->profile == 3) {
1315                     uvmv.x &= ~7;
1316                     uvmv.y &= ~7;
1317                 }
1318                 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1319                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1320                               4*x + x_off, 4*y + y_off, 4, 4,
1321                               width, height, s->uvlinesize,
1322                               s->put_pixels_tab[2]);
1323             }
1324         }
1325         break;
1326     }
1327     case VP8_SPLITMVMODE_16x8:
1328         vp8_mc_part(s, dst, ref, x_off, y_off,
1329                     0, 0, 16, 8, width, height, &bmv[0]);
1330         vp8_mc_part(s, dst, ref, x_off, y_off,
1331                     0, 8, 16, 8, width, height, &bmv[1]);
1332         break;
1333     case VP8_SPLITMVMODE_8x16:
1334         vp8_mc_part(s, dst, ref, x_off, y_off,
1335                     0, 0, 8, 16, width, height, &bmv[0]);
1336         vp8_mc_part(s, dst, ref, x_off, y_off,
1337                     8, 0, 8, 16, width, height, &bmv[1]);
1338         break;
1339     case VP8_SPLITMVMODE_8x8:
1340         vp8_mc_part(s, dst, ref, x_off, y_off,
1341                     0, 0, 8, 8, width, height, &bmv[0]);
1342         vp8_mc_part(s, dst, ref, x_off, y_off,
1343                     8, 0, 8, 8, width, height, &bmv[1]);
1344         vp8_mc_part(s, dst, ref, x_off, y_off,
1345                     0, 8, 8, 8, width, height, &bmv[2]);
1346         vp8_mc_part(s, dst, ref, x_off, y_off,
1347                     8, 8, 8, 8, width, height, &bmv[3]);
1348         break;
1349     }
1350 }
1351
1352 static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
1353 {
1354     int x, y, ch;
1355
1356     if (mb->mode != MODE_I4x4) {
1357         uint8_t *y_dst = dst[0];
1358         for (y = 0; y < 4; y++) {
1359             uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
1360             if (nnz4) {
1361                 if (nnz4&~0x01010101) {
1362                     for (x = 0; x < 4; x++) {
1363                         if ((uint8_t)nnz4 == 1)
1364                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1365                         else if((uint8_t)nnz4 > 1)
1366                             s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1367                         nnz4 >>= 8;
1368                         if (!nnz4)
1369                             break;
1370                     }
1371                 } else {
1372                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1373                 }
1374             }
1375             y_dst += 4*s->linesize;
1376         }
1377     }
1378
1379     for (ch = 0; ch < 2; ch++) {
1380         uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1381         if (nnz4) {
1382             uint8_t *ch_dst = dst[1+ch];
1383             if (nnz4&~0x01010101) {
1384                 for (y = 0; y < 2; y++) {
1385                     for (x = 0; x < 2; x++) {
1386                         if ((uint8_t)nnz4 == 1)
1387                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1388                         else if((uint8_t)nnz4 > 1)
1389                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1390                         nnz4 >>= 8;
1391                         if (!nnz4)
1392                             goto chroma_idct_end;
1393                     }
1394                     ch_dst += 4*s->uvlinesize;
1395                 }
1396             } else {
1397                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
1398             }
1399         }
1400 chroma_idct_end: ;
1401     }
1402 }
1403
1404 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1405 {
1406     int interior_limit, filter_level;
1407
1408     if (s->segmentation.enabled) {
1409         filter_level = s->segmentation.filter_level[s->segment];
1410         if (!s->segmentation.absolute_vals)
1411             filter_level += s->filter.level;
1412     } else
1413         filter_level = s->filter.level;
1414
1415     if (s->lf_delta.enabled) {
1416         filter_level += s->lf_delta.ref[mb->ref_frame];
1417         filter_level += s->lf_delta.mode[mb->mode];
1418     }
1419
1420     filter_level = av_clip_uintp2(filter_level, 6);
1421
1422     interior_limit = filter_level;
1423     if (s->filter.sharpness) {
1424         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1425         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1426     }
1427     interior_limit = FFMAX(interior_limit, 1);
1428
1429     f->filter_level = filter_level;
1430     f->inner_limit = interior_limit;
1431     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1432 }
1433
1434 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1435 {
1436     int mbedge_lim, bedge_lim, hev_thresh;
1437     int filter_level = f->filter_level;
1438     int inner_limit = f->inner_limit;
1439     int inner_filter = f->inner_filter;
1440     int linesize = s->linesize;
1441     int uvlinesize = s->uvlinesize;
1442     static const uint8_t hev_thresh_lut[2][64] = {
1443         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1444           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1445           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1446           3, 3, 3, 3 },
1447         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1448           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1449           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1450           2, 2, 2, 2 }
1451     };
1452
1453     if (!filter_level)
1454         return;
1455
1456      bedge_lim = 2*filter_level + inner_limit;
1457     mbedge_lim = bedge_lim + 4;
1458
1459     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1460
1461     if (mb_x) {
1462         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1463                                        mbedge_lim, inner_limit, hev_thresh);
1464         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1465                                        mbedge_lim, inner_limit, hev_thresh);
1466     }
1467
1468     if (inner_filter) {
1469         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1470                                              inner_limit, hev_thresh);
1471         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1472                                              inner_limit, hev_thresh);
1473         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1474                                              inner_limit, hev_thresh);
1475         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1476                                              uvlinesize,  bedge_lim,
1477                                              inner_limit, hev_thresh);
1478     }
1479
1480     if (mb_y) {
1481         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1482                                        mbedge_lim, inner_limit, hev_thresh);
1483         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1484                                        mbedge_lim, inner_limit, hev_thresh);
1485     }
1486
1487     if (inner_filter) {
1488         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1489                                              linesize,    bedge_lim,
1490                                              inner_limit, hev_thresh);
1491         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1492                                              linesize,    bedge_lim,
1493                                              inner_limit, hev_thresh);
1494         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1495                                              linesize,    bedge_lim,
1496                                              inner_limit, hev_thresh);
1497         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1498                                              dst[2] + 4 * uvlinesize,
1499                                              uvlinesize,  bedge_lim,
1500                                              inner_limit, hev_thresh);
1501     }
1502 }
1503
1504 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1505 {
1506     int mbedge_lim, bedge_lim;
1507     int filter_level = f->filter_level;
1508     int inner_limit = f->inner_limit;
1509     int inner_filter = f->inner_filter;
1510     int linesize = s->linesize;
1511
1512     if (!filter_level)
1513         return;
1514
1515      bedge_lim = 2*filter_level + inner_limit;
1516     mbedge_lim = bedge_lim + 4;
1517
1518     if (mb_x)
1519         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1520     if (inner_filter) {
1521         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1522         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1523         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1524     }
1525
1526     if (mb_y)
1527         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1528     if (inner_filter) {
1529         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1530         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1531         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1532     }
1533 }
1534
1535 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1536 {
1537     VP8FilterStrength *f = s->filter_strength;
1538     uint8_t *dst[3] = {
1539         curframe->data[0] + 16*mb_y*s->linesize,
1540         curframe->data[1] +  8*mb_y*s->uvlinesize,
1541         curframe->data[2] +  8*mb_y*s->uvlinesize
1542     };
1543     int mb_x;
1544
1545     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1546         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1547         filter_mb(s, dst, f++, mb_x, mb_y);
1548         dst[0] += 16;
1549         dst[1] += 8;
1550         dst[2] += 8;
1551     }
1552 }
1553
1554 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1555 {
1556     VP8FilterStrength *f = s->filter_strength;
1557     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1558     int mb_x;
1559
1560     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1561         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1562         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1563         dst += 16;
1564     }
1565 }
1566
1567 static void release_queued_segmaps(VP8Context *s, int is_close)
1568 {
1569     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1570     while (s->num_maps_to_be_freed > leave_behind)
1571         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1572     s->maps_are_invalid = 0;
1573 }
1574
1575 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1576                             AVPacket *avpkt)
1577 {
1578     VP8Context *s = avctx->priv_data;
1579     int ret, mb_x, mb_y, i, y, referenced;
1580     enum AVDiscard skip_thresh;
1581     AVFrame *av_uninit(curframe), *prev_frame;
1582
1583     release_queued_segmaps(s, 0);
1584
1585     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1586         goto err;
1587
1588     prev_frame = s->framep[VP56_FRAME_CURRENT];
1589
1590     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1591                                 || s->update_altref == VP56_FRAME_CURRENT;
1592
1593     skip_thresh = !referenced ? AVDISCARD_NONREF :
1594                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1595
1596     if (avctx->skip_frame >= skip_thresh) {
1597         s->invisible = 1;
1598         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1599         goto skip_decode;
1600     }
1601     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1602
1603     // release no longer referenced frames
1604     for (i = 0; i < 5; i++)
1605         if (s->frames[i].data[0] &&
1606             &s->frames[i] != prev_frame &&
1607             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1608             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1609             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1610             vp8_release_frame(s, &s->frames[i], 1, 0);
1611
1612     // find a free buffer
1613     for (i = 0; i < 5; i++)
1614         if (&s->frames[i] != prev_frame &&
1615             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1616             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1617             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1618             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1619             break;
1620         }
1621     if (i == 5) {
1622         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1623         abort();
1624     }
1625     if (curframe->data[0])
1626         vp8_release_frame(s, curframe, 1, 0);
1627
1628     // Given that arithmetic probabilities are updated every frame, it's quite likely
1629     // that the values we have on a random interframe are complete junk if we didn't
1630     // start decode on a keyframe. So just don't display anything rather than junk.
1631     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1632                          !s->framep[VP56_FRAME_GOLDEN] ||
1633                          !s->framep[VP56_FRAME_GOLDEN2])) {
1634         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1635         ret = AVERROR_INVALIDDATA;
1636         goto err;
1637     }
1638
1639     curframe->key_frame = s->keyframe;
1640     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1641     curframe->reference = referenced ? 3 : 0;
1642     if ((ret = vp8_alloc_frame(s, curframe))) {
1643         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1644         goto err;
1645     }
1646
1647     // check if golden and altref are swapped
1648     if (s->update_altref != VP56_FRAME_NONE) {
1649         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1650     } else {
1651         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1652     }
1653     if (s->update_golden != VP56_FRAME_NONE) {
1654         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1655     } else {
1656         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1657     }
1658     if (s->update_last) {
1659         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1660     } else {
1661         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1662     }
1663     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1664
1665     ff_thread_finish_setup(avctx);
1666
1667     s->linesize   = curframe->linesize[0];
1668     s->uvlinesize = curframe->linesize[1];
1669
1670     if (!s->edge_emu_buffer)
1671         s->edge_emu_buffer = av_malloc(21*s->linesize);
1672
1673     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1674
1675     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1676     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1677
1678     // top edge of 127 for intra prediction
1679     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1680         s->top_border[0][15] = s->top_border[0][23] = 127;
1681         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1682     }
1683     memset(s->ref_count, 0, sizeof(s->ref_count));
1684     if (s->keyframe)
1685         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1686
1687 #define MARGIN (16 << 2)
1688     s->mv_min.y = -MARGIN;
1689     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1690
1691     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1692         VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1693         VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1694         int mb_xy = mb_y*s->mb_width;
1695         uint8_t *dst[3] = {
1696             curframe->data[0] + 16*mb_y*s->linesize,
1697             curframe->data[1] +  8*mb_y*s->uvlinesize,
1698             curframe->data[2] +  8*mb_y*s->uvlinesize
1699         };
1700
1701         memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1702         memset(s->left_nnz, 0, sizeof(s->left_nnz));
1703         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1704
1705         // left edge of 129 for intra prediction
1706         if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1707             for (i = 0; i < 3; i++)
1708                 for (y = 0; y < 16>>!!i; y++)
1709                     dst[i][y*curframe->linesize[i]-1] = 129;
1710             if (mb_y == 1) // top left edge is also 129
1711                 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1712         }
1713
1714         s->mv_min.x = -MARGIN;
1715         s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1716         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1717             ff_thread_await_progress(prev_frame, mb_y, 0);
1718
1719         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1720             /* Prefetch the current frame, 4 MBs ahead */
1721             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1722             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1723
1724             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1725                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
1726
1727             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1728
1729             if (!mb->skip)
1730                 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1731
1732             if (mb->mode <= MODE_I4x4)
1733                 intra_predict(s, dst, mb, mb_x, mb_y);
1734             else
1735                 inter_predict(s, dst, mb, mb_x, mb_y);
1736
1737             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1738
1739             if (!mb->skip) {
1740                 idct_mb(s, dst, mb);
1741             } else {
1742                 AV_ZERO64(s->left_nnz);
1743                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1744
1745                 // Reset DC block predictors if they would exist if the mb had coefficients
1746                 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1747                     s->left_nnz[8]      = 0;
1748                     s->top_nnz[mb_x][8] = 0;
1749                 }
1750             }
1751
1752             if (s->deblock_filter)
1753                 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1754
1755             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1756
1757             dst[0] += 16;
1758             dst[1] += 8;
1759             dst[2] += 8;
1760             s->mv_min.x -= 64;
1761             s->mv_max.x -= 64;
1762         }
1763         if (s->deblock_filter) {
1764             if (s->filter.simple)
1765                 filter_mb_row_simple(s, curframe, mb_y);
1766             else
1767                 filter_mb_row(s, curframe, mb_y);
1768         }
1769         s->mv_min.y -= 64;
1770         s->mv_max.y -= 64;
1771
1772         ff_thread_report_progress(curframe, mb_y, 0);
1773     }
1774
1775     ff_thread_report_progress(curframe, INT_MAX, 0);
1776     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1777
1778 skip_decode:
1779     // if future frames don't use the updated probabilities,
1780     // reset them to the values we saved
1781     if (!s->update_probabilities)
1782         s->prob[0] = s->prob[1];
1783
1784     if (!s->invisible) {
1785         *(AVFrame*)data = *curframe;
1786         *data_size = sizeof(AVFrame);
1787     }
1788
1789     return avpkt->size;
1790 err:
1791     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1792     return ret;
1793 }
1794
1795 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1796 {
1797     VP8Context *s = avctx->priv_data;
1798
1799     s->avctx = avctx;
1800     avctx->pix_fmt = PIX_FMT_YUV420P;
1801
1802     ff_dsputil_init(&s->dsp, avctx);
1803     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
1804     ff_vp8dsp_init(&s->vp8dsp);
1805
1806     return 0;
1807 }
1808
1809 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1810 {
1811     vp8_decode_flush_impl(avctx, 0, 1, 1);
1812     release_queued_segmaps(avctx->priv_data, 1);
1813     return 0;
1814 }
1815
1816 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1817 {
1818     VP8Context *s = avctx->priv_data;
1819
1820     s->avctx = avctx;
1821
1822     return 0;
1823 }
1824
1825 #define REBASE(pic) \
1826     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
1827
1828 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1829 {
1830     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1831
1832     if (s->macroblocks_base &&
1833         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
1834         free_buffers(s);
1835         s->maps_are_invalid = 1;
1836         s->mb_width  = s_src->mb_width;
1837         s->mb_height = s_src->mb_height;
1838     }
1839
1840     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1841     s->segmentation = s_src->segmentation;
1842     s->lf_delta = s_src->lf_delta;
1843     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1844
1845     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1846     s->framep[0] = REBASE(s_src->next_framep[0]);
1847     s->framep[1] = REBASE(s_src->next_framep[1]);
1848     s->framep[2] = REBASE(s_src->next_framep[2]);
1849     s->framep[3] = REBASE(s_src->next_framep[3]);
1850
1851     return 0;
1852 }
1853
1854 AVCodec ff_vp8_decoder = {
1855     .name                  = "vp8",
1856     .type                  = AVMEDIA_TYPE_VIDEO,
1857     .id                    = CODEC_ID_VP8,
1858     .priv_data_size        = sizeof(VP8Context),
1859     .init                  = vp8_decode_init,
1860     .close                 = vp8_decode_free,
1861     .decode                = vp8_decode_frame,
1862     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1863     .flush                 = vp8_decode_flush,
1864     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
1865     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1866     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1867 };