git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /**
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "internal.h"
  28 #include "vp8.h"
  29 #include "vp8data.h"
  30 #include "rectangle.h"
  31 #include "thread.h"
  32
  33 #if ARCH_ARM
  34 #   include "arm/vp8.h"
  35 #endif
  36
  37 static void free_buffers(VP8Context *s)
  38 {
  39     av_freep(&s->macroblocks_base);
  40     av_freep(&s->filter_strength);
  41     av_freep(&s->intra4x4_pred_mode_top);
  42     av_freep(&s->top_nnz);
  43     av_freep(&s->edge_emu_buffer);
  44     av_freep(&s->top_border);
  45
  46     s->macroblocks = NULL;
  47 }
  48
  49 static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
  50 {
  51     int ret;
  52     if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
  53         return ret;
  54     if (s->num_maps_to_be_freed) {
  55         assert(!s->maps_are_invalid);
  56         f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
  57     } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
  58         ff_thread_release_buffer(s->avctx, f);
  59         return AVERROR(ENOMEM);
  60     }
  61     return 0;
  62 }
  63
  64 static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
  65 {
  66     if (f->ref_index[0]) {
  67         if (prefer_delayed_free) {
  68             /* Upon a size change, we want to free the maps but other threads may still
  69              * be using them, so queue them. Upon a seek, all threads are inactive so
  70              * we want to cache one to prevent re-allocation in the next decoding
  71              * iteration, but the rest we can free directly. */
  72             int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
  73             if (s->num_maps_to_be_freed < max_queued_maps) {
  74                 s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
  75             } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
  76                 av_free(f->ref_index[0]);
  77             } /* else: MEMLEAK (should never happen, but better that than crash) */
  78             f->ref_index[0] = NULL;
  79         } else /* vp8_decode_free() */ {
  80             av_free(f->ref_index[0]);
  81         }
  82     }
  83     ff_thread_release_buffer(s->avctx, f);
  84 }
  85
  86 static void vp8_decode_flush_impl(AVCodecContext *avctx,
  87                                   int prefer_delayed_free, int can_direct_free, int free_mem)
  88 {
  89     VP8Context *s = avctx->priv_data;
  90     int i;
  91
  92     if (!avctx->internal->is_copy) {
  93         for (i = 0; i < 5; i++)
  94             if (s->frames[i].data[0])
  95                 vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
  96     }
  97     memset(s->framep, 0, sizeof(s->framep));
  98
  99     if (free_mem) {
 100         free_buffers(s);
 101         s->maps_are_invalid = 1;
 102     }
 103 }
 104
 105 static void vp8_decode_flush(AVCodecContext *avctx)
 106 {
 107     vp8_decode_flush_impl(avctx, 1, 1, 0);
 108 }
 109
 110 static int update_dimensions(VP8Context *s, int width, int height)
 111 {
 112     if (width  != s->avctx->width ||
 113         height != s->avctx->height) {
 114         if (av_image_check_size(width, height, 0, s->avctx))
 115             return AVERROR_INVALIDDATA;
 116
 117         vp8_decode_flush_impl(s->avctx, 1, 0, 1);
 118
 119         avcodec_set_dimensions(s->avctx, width, height);
 120     }
 121
 122     s->mb_width  = (s->avctx->coded_width +15) / 16;
 123     s->mb_height = (s->avctx->coded_height+15) / 16;
 124
 125     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 126     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
 127     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
 128     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 129     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 130
 131     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
 132         !s->top_nnz || !s->top_border)
 133         return AVERROR(ENOMEM);
 134
 135     s->macroblocks        = s->macroblocks_base + 1;
 136
 137     return 0;
 138 }
 139
 140 static void parse_segment_info(VP8Context *s)
 141 {
 142     VP56RangeCoder *c = &s->c;
 143     int i;
 144
 145     s->segmentation.update_map = vp8_rac_get(c);
 146
 147     if (vp8_rac_get(c)) { // update segment feature data
 148         s->segmentation.absolute_vals = vp8_rac_get(c);
 149
 150         for (i = 0; i < 4; i++)
 151             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 152
 153         for (i = 0; i < 4; i++)
 154             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 155     }
 156     if (s->segmentation.update_map)
 157         for (i = 0; i < 3; i++)
 158             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 159 }
 160
 161 static void update_lf_deltas(VP8Context *s)
 162 {
 163     VP56RangeCoder *c = &s->c;
 164     int i;
 165
 166     for (i = 0; i < 4; i++)
 167         s->lf_delta.ref[i]  = vp8_rac_get_sint(c, 6);
 168
 169     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
 170         s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
 171 }
 172
 173 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 174 {
 175     const uint8_t *sizes = buf;
 176     int i;
 177
 178     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 179
 180     buf      += 3*(s->num_coeff_partitions-1);
 181     buf_size -= 3*(s->num_coeff_partitions-1);
 182     if (buf_size < 0)
 183         return -1;
 184
 185     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 186         int size = AV_RL24(sizes + 3*i);
 187         if (buf_size - size < 0)
 188             return -1;
 189
 190         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 191         buf      += size;
 192         buf_size -= size;
 193     }
 194     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 195
 196     return 0;
 197 }
 198
 199 static void get_quants(VP8Context *s)
 200 {
 201     VP56RangeCoder *c = &s->c;
 202     int i, base_qi;
 203
 204     int yac_qi     = vp8_rac_get_uint(c, 7);
 205     int ydc_delta  = vp8_rac_get_sint(c, 4);
 206     int y2dc_delta = vp8_rac_get_sint(c, 4);
 207     int y2ac_delta = vp8_rac_get_sint(c, 4);
 208     int uvdc_delta = vp8_rac_get_sint(c, 4);
 209     int uvac_delta = vp8_rac_get_sint(c, 4);
 210
 211     for (i = 0; i < 4; i++) {
 212         if (s->segmentation.enabled) {
 213             base_qi = s->segmentation.base_quant[i];
 214             if (!s->segmentation.absolute_vals)
 215                 base_qi += yac_qi;
 216         } else
 217             base_qi = yac_qi;
 218
 219         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 220         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 221         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 222         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 223         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 224         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 225
 226         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 227         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 228     }
 229 }
 230
 231 /**
 232  * Determine which buffers golden and altref should be updated with after this frame.
 233  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 234  *
 235  * Intra frames update all 3 references
 236  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 237  * If the update (golden|altref) flag is set, it's updated with the current frame
 238  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 239  * If the flag is not set, the number read means:
 240  *      0: no update
 241  *      1: VP56_FRAME_PREVIOUS
 242  *      2: update golden with altref, or update altref with golden
 243  */
 244 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 245 {
 246     VP56RangeCoder *c = &s->c;
 247
 248     if (update)
 249         return VP56_FRAME_CURRENT;
 250
 251     switch (vp8_rac_get_uint(c, 2)) {
 252     case 1:
 253         return VP56_FRAME_PREVIOUS;
 254     case 2:
 255         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 256     }
 257     return VP56_FRAME_NONE;
 258 }
 259
 260 static void update_refs(VP8Context *s)
 261 {
 262     VP56RangeCoder *c = &s->c;
 263
 264     int update_golden = vp8_rac_get(c);
 265     int update_altref = vp8_rac_get(c);
 266
 267     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 268     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 269 }
 270
 271 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 272 {
 273     VP56RangeCoder *c = &s->c;
 274     int header_size, hscale, vscale, i, j, k, l, m, ret;
 275     int width  = s->avctx->width;
 276     int height = s->avctx->height;
 277
 278     s->keyframe  = !(buf[0] & 1);
 279     s->profile   =  (buf[0]>>1) & 7;
 280     s->invisible = !(buf[0] & 0x10);
 281     header_size  = AV_RL24(buf) >> 5;
 282     buf      += 3;
 283     buf_size -= 3;
 284
 285     if (s->profile > 3)
 286         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 287
 288     if (!s->profile)
 289         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 290     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 291         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 292
 293     if (header_size > buf_size - 7*s->keyframe) {
 294         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 295         return AVERROR_INVALIDDATA;
 296     }
 297
 298     if (s->keyframe) {
 299         if (AV_RL24(buf) != 0x2a019d) {
 300             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 301             return AVERROR_INVALIDDATA;
 302         }
 303         width  = AV_RL16(buf+3) & 0x3fff;
 304         height = AV_RL16(buf+5) & 0x3fff;
 305         hscale = buf[4] >> 6;
 306         vscale = buf[6] >> 6;
 307         buf      += 7;
 308         buf_size -= 7;
 309
 310         if (hscale || vscale)
 311             av_log_missing_feature(s->avctx, "Upscaling", 1);
 312
 313         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 314         for (i = 0; i < 4; i++)
 315             for (j = 0; j < 16; j++)
 316                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 317                        sizeof(s->prob->token[i][j]));
 318         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 319         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 320         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 321         memset(&s->segmentation, 0, sizeof(s->segmentation));
 322     }
 323
 324     if (!s->macroblocks_base || /* first frame */
 325         width != s->avctx->width || height != s->avctx->height) {
 326         if ((ret = update_dimensions(s, width, height)) < 0)
 327             return ret;
 328     }
 329
 330     ff_vp56_init_range_decoder(c, buf, header_size);
 331     buf      += header_size;
 332     buf_size -= header_size;
 333
 334     if (s->keyframe) {
 335         if (vp8_rac_get(c))
 336             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 337         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 338     }
 339
 340     if ((s->segmentation.enabled = vp8_rac_get(c)))
 341         parse_segment_info(s);
 342     else
 343         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 344
 345     s->filter.simple    = vp8_rac_get(c);
 346     s->filter.level     = vp8_rac_get_uint(c, 6);
 347     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 348
 349     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 350         if (vp8_rac_get(c))
 351             update_lf_deltas(s);
 352
 353     if (setup_partitions(s, buf, buf_size)) {
 354         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 355         return AVERROR_INVALIDDATA;
 356     }
 357
 358     get_quants(s);
 359
 360     if (!s->keyframe) {
 361         update_refs(s);
 362         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 363         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 364     }
 365
 366     // if we aren't saving this frame's probabilities for future frames,
 367     // make a copy of the current probabilities
 368     if (!(s->update_probabilities = vp8_rac_get(c)))
 369         s->prob[1] = s->prob[0];
 370
 371     s->update_last = s->keyframe || vp8_rac_get(c);
 372
 373     for (i = 0; i < 4; i++)
 374         for (j = 0; j < 8; j++)
 375             for (k = 0; k < 3; k++)
 376                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 377                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 378                         int prob = vp8_rac_get_uint(c, 8);
 379                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 380                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 381                     }
 382
 383     if ((s->mbskip_enabled = vp8_rac_get(c)))
 384         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 385
 386     if (!s->keyframe) {
 387         s->prob->intra  = vp8_rac_get_uint(c, 8);
 388         s->prob->last   = vp8_rac_get_uint(c, 8);
 389         s->prob->golden = vp8_rac_get_uint(c, 8);
 390
 391         if (vp8_rac_get(c))
 392             for (i = 0; i < 4; i++)
 393                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 394         if (vp8_rac_get(c))
 395             for (i = 0; i < 3; i++)
 396                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 397
 398         // 17.2 MV probability update
 399         for (i = 0; i < 2; i++)
 400             for (j = 0; j < 19; j++)
 401                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 402                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 403     }
 404
 405     return 0;
 406 }
 407
 408 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 409 {
 410     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 411     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 412 }
 413
 414 /**
 415  * Motion vector coding, 17.1.
 416  */
 417 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 418 {
 419     int bit, x = 0;
 420
 421     if (vp56_rac_get_prob_branchy(c, p[0])) {
 422         int i;
 423
 424         for (i = 0; i < 3; i++)
 425             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 426         for (i = 9; i > 3; i--)
 427             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 428         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 429             x += 8;
 430     } else {
 431         // small_mvtree
 432         const uint8_t *ps = p+2;
 433         bit = vp56_rac_get_prob(c, *ps);
 434         ps += 1 + 3*bit;
 435         x  += 4*bit;
 436         bit = vp56_rac_get_prob(c, *ps);
 437         ps += 1 + bit;
 438         x  += 2*bit;
 439         x  += vp56_rac_get_prob(c, *ps);
 440     }
 441
 442     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 443 }
 444
 445 static av_always_inline
 446 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 447 {
 448     if (left == top)
 449         return vp8_submv_prob[4-!!left];
 450     if (!top)
 451         return vp8_submv_prob[2];
 452     return vp8_submv_prob[1-!!left];
 453 }
 454
 455 /**
 456  * Split motion vector prediction, 16.4.
 457  * @returns the number of motion vectors parsed (2, 4 or 16)
 458  */
 459 static av_always_inline
 460 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
 461 {
 462     int part_idx;
 463     int n, num;
 464     VP8Macroblock *top_mb  = &mb[2];
 465     VP8Macroblock *left_mb = &mb[-1];
 466     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 467                   *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
 468                   *mbsplits_cur, *firstidx;
 469     VP56mv *top_mv  = top_mb->bmv;
 470     VP56mv *left_mv = left_mb->bmv;
 471     VP56mv *cur_mv  = mb->bmv;
 472
 473     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 474         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 475             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 476         } else {
 477             part_idx = VP8_SPLITMVMODE_8x8;
 478         }
 479     } else {
 480         part_idx = VP8_SPLITMVMODE_4x4;
 481     }
 482
 483     num = vp8_mbsplit_count[part_idx];
 484     mbsplits_cur = vp8_mbsplits[part_idx],
 485     firstidx = vp8_mbfirstidx[part_idx];
 486     mb->partitioning = part_idx;
 487
 488     for (n = 0; n < num; n++) {
 489         int k = firstidx[n];
 490         uint32_t left, above;
 491         const uint8_t *submv_prob;
 492
 493         if (!(k & 3))
 494             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 495         else
 496             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 497         if (k <= 3)
 498             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 499         else
 500             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 501
 502         submv_prob = get_submv_prob(left, above);
 503
 504         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 505             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 506                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 507                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 508                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 509                 } else {
 510                     AV_ZERO32(&mb->bmv[n]);
 511                 }
 512             } else {
 513                 AV_WN32A(&mb->bmv[n], above);
 514             }
 515         } else {
 516             AV_WN32A(&mb->bmv[n], left);
 517         }
 518     }
 519
 520     return num;
 521 }
 522
/**
 * Decode the inter prediction mode and motion vector(s) of a macroblock.
 *
 * The motion vectors of the top, left and top-left neighbours are collected
 * into near_mv[] (sign-flipped when the neighbour's reference has a
 * different sign bias than the current one) together with weights in cnt[]:
 * top and left count 2, top-left counts 1. These weights select the
 * probability contexts used to read the mode, which ends up as one of
 * VP8_MVMODE_ZERO, nearest/near (both stored as VP8_MVMODE_MV), a new
 * explicitly coded vector (also VP8_MVMODE_MV), or VP8_MVMODE_SPLIT.
 *
 * On return mb->mode, mb->mv, mb->partitioning and mb->bmv[] are set.
 * mb->ref_frame must be set by the caller beforehand. mb_x and mb_y are not
 * used by this function.
 */
static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
{
    VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
                                  mb - 1 /* left */,
                                  mb + 1 /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    /* near_mv[3] is left uninitialized here; it is only written by the
     * edge check below when a third distinct vector is found. */
    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For inter-coded neighbour n: a nonzero vector either adds weight to
     * the most recently recorded entry (if identical) or is appended as a
     * new entry; a zero vector adds weight to cnt[CNT_ZERO]. Intra-coded
     * neighbours (ref_frame == VP56_FRAME_CURRENT) are ignored. */
    #define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        /* cnt[CNT_SPLITMV] is nonzero only when a third entry was recorded,
         * so near_mv[3] is initialized whenever it is read here. */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* cnt[CNT_SPLITMV] is reused as the split-mode context:
                 * left/top neighbours in split mode weigh 2, top-left 1. */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* The macroblock-level mv becomes the last sub mv. */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
                } else {
                    /* New mv: delta on top of the clamped best mv, y first. */
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
 612
 613 static av_always_inline
 614 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 615                            int mb_x, int keyframe)
 616 {
 617     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 618     if (keyframe) {
 619         int x, y;
 620         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 621         uint8_t* const left = s->intra4x4_pred_mode_left;
 622         for (y = 0; y < 4; y++) {
 623             for (x = 0; x < 4; x++) {
 624                 const uint8_t *ctx;
 625                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 626                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 627                 left[y] = top[x] = *intra4x4;
 628                 intra4x4++;
 629             }
 630         }
 631     } else {
 632         int i;
 633         for (i = 0; i < 16; i++)
 634             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 635     }
 636 }
 637
 638 static av_always_inline
 639 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
 640 {
 641     VP56RangeCoder *c = &s->c;
 642
 643     if (s->segmentation.update_map) {
 644         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
 645         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
 646     } else
 647         *segment = ref ? *ref : *segment;
 648     s->segment = *segment;
 649
 650     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 651
 652     if (s->keyframe) {
 653         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 654
 655         if (mb->mode == MODE_I4x4) {
 656             decode_intra4x4_modes(s, c, mb_x, 1);
 657         } else {
 658             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 659             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 660             AV_WN32A(s->intra4x4_pred_mode_left, modes);
 661         }
 662
 663         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 664         mb->ref_frame = VP56_FRAME_CURRENT;
 665     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 666         // inter MB, 16.2
 667         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 668             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 669                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 670         else
 671             mb->ref_frame = VP56_FRAME_PREVIOUS;
 672         s->ref_count[mb->ref_frame-1]++;
 673
 674         // motion vectors, 16.3
 675         decode_mvs(s, mb, mb_x, mb_y);
 676     } else {
 677         // intra MB, 16.1
 678         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 679
 680         if (mb->mode == MODE_I4x4)
 681             decode_intra4x4_modes(s, c, mb_x, 0);
 682
 683         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 684         mb->ref_frame = VP56_FRAME_CURRENT;
 685         mb->partitioning = VP8_SPLITMVMODE_NONE;
 686         AV_ZERO32(&mb->bmv[0]);
 687     }
 688 }
 689
#ifndef decode_block_coeffs_internal
/**
 * Decode the coefficient tokens of one 4x4 block, after the caller has
 * already established that the block does not start with an end-of-block
 * token (hence the initial jump past the EOB check).
 *
 * Only compiled when decode_block_coeffs_internal is not already defined as
 * a macro.
 *
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token, i.e. the entry of
 *                   probs selected by the caller for position i
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return the index of the last coeff decoded plus one; 16 if the block
 *         runs to the end without an EOB token
 */
static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    /* The first EOB decision was consumed by the caller. */
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
            return i;

skip_eob:
        /* After a zero token no EOB check is made, so loop back here. The
         * context for the next position is 0 (previous token was zero). */
        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
            if (++i == 16)
                return i; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
            coeff = 1;
            /* Context 1: previous token had magnitude one.
             * NOTE(review): for i == 15 this forms probs[16], one past the
             * array; the value is never read because the loop ends. */
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    /* Two bits select the category; each category's base
                     * value is 3 + (8 << cat), extra bits are added on top. */
                    int a = vp56_rac_get_prob(c, token_prob[8]);
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            /* Context 2: previous token had magnitude greater than one. */
            token_prob = probs[i+1][2];
        }
        /* Sign bit, then dequantize: qmul[0] for position 0, qmul[1] for
         * every other position; stored in zigzag order. */
        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    return i;
}
#endif
 753
 754 /**
 755  * @param c arithmetic bitstream reader context
 756  * @param block destination for block coefficients
 757  * @param probs probabilities to use when reading trees from the bitstream
 758  * @param i initial coeff index, 0 unless a separate DC block is coded
 759  * @param zero_nhood the initial prediction context for number of surrounding
 760  *                   all-zero blocks (only left/top, so 0-2)
 761  * @param qmul array holding the dc/ac dequant factor at position 0/1
 762  * @return 0 if no coeffs were decoded
 763  *         otherwise, the index of the last coeff decoded plus one
 764  */
 765 static av_always_inline
 766 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 767                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 768                         int i, int zero_nhood, int16_t qmul[2])
 769 {
 770     uint8_t *token_prob = probs[i][zero_nhood];
 771     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 772         return 0;
 773     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 774 }
 775
 776 static av_always_inline
 777 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 778                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 779 {
 780     int i, x, y, luma_start = 0, luma_ctx = 3;
 781     int nnz_pred, nnz, nnz_total = 0;
 782     int segment = s->segment;
 783     int block_dc = 0;
 784
 785     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 786         nnz_pred = t_nnz[8] + l_nnz[8];
 787
 788         // decode DC values and do hadamard
 789         nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
 790                                   s->qmat[segment].luma_dc_qmul);
 791         l_nnz[8] = t_nnz[8] = !!nnz;
 792         if (nnz) {
 793             nnz_total += nnz;
 794             block_dc = 1;
 795             if (nnz == 1)
 796                 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
 797             else
 798                 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
 799         }
 800         luma_start = 1;
 801         luma_ctx = 0;
 802     }
 803
 804     // luma blocks
 805     for (y = 0; y < 4; y++)
 806         for (x = 0; x < 4; x++) {
 807             nnz_pred = l_nnz[y] + t_nnz[x];
 808             nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
 809                                       nnz_pred, s->qmat[segment].luma_qmul);
 810             // nnz+block_dc may be one more than the actual last index, but we don't care
 811             s->non_zero_count_cache[y][x] = nnz + block_dc;
 812             t_nnz[x] = l_nnz[y] = !!nnz;
 813             nnz_total += nnz;
 814         }
 815
 816     // chroma blocks
 817     // TODO: what to do about dimensions? 2nd dim for luma is x,
 818     // but for chroma it's (y<<1)|x
 819     for (i = 4; i < 6; i++)
 820         for (y = 0; y < 2; y++)
 821             for (x = 0; x < 2; x++) {
 822                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 823                 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
 824                                           nnz_pred, s->qmat[segment].chroma_qmul);
 825                 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
 826                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 827                 nnz_total += nnz;
 828             }
 829
 830     // if there were no coded coeffs despite the macroblock not being marked skip,
 831     // we MUST not do the inner loop filter and should not do IDCT
 832     // Since skip isn't used for bitstream prediction, just manually set it.
 833     if (!nnz_total)
 834         mb->skip = 1;
 835 }
 836
 837 static av_always_inline
 838 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 839                       int linesize, int uvlinesize, int simple)
 840 {
 841     AV_COPY128(top_border, src_y + 15*linesize);
 842     if (!simple) {
 843         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 844         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 845     }
 846 }
 847
 848 static av_always_inline
 849 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 850                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 851                     int simple, int xchg)
 852 {
 853     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 854     src_y  -=   linesize;
 855     src_cb -= uvlinesize;
 856     src_cr -= uvlinesize;
 857
 858 #define XCHG(a,b,xchg) do {                     \
 859         if (xchg) AV_SWAP64(b,a);               \
 860         else      AV_COPY64(b,a);               \
 861     } while (0)
 862
 863     XCHG(top_border_m1+8, src_y-8, xchg);
 864     XCHG(top_border,      src_y,   xchg);
 865     XCHG(top_border+8,    src_y+8, 1);
 866     if (mb_x < mb_width-1)
 867         XCHG(top_border+32, src_y+16, 1);
 868
 869     // only copy chroma for normal loop filter
 870     // or to initialize the top row to 127
 871     if (!simple || !mb_y) {
 872         XCHG(top_border_m1+16, src_cb-8, xchg);
 873         XCHG(top_border_m1+24, src_cr-8, xchg);
 874         XCHG(top_border+16,    src_cb, 1);
 875         XCHG(top_border+24,    src_cr, 1);
 876     }
 877 }
 878
 879 static av_always_inline
 880 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 881 {
 882     if (!mb_x) {
 883         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 884     } else {
 885         return mb_y ? mode : LEFT_DC_PRED8x8;
 886     }
 887 }
 888
 889 static av_always_inline
 890 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 891 {
 892     if (!mb_x) {
 893         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 894     } else {
 895         return mb_y ? mode : HOR_PRED8x8;
 896     }
 897 }
 898
 899 static av_always_inline
 900 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 901 {
 902     if (mode == DC_PRED8x8) {
 903         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 904     } else {
 905         return mode;
 906     }
 907 }
 908
 909 static av_always_inline
 910 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 911 {
 912     switch (mode) {
 913     case DC_PRED8x8:
 914         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 915     case VERT_PRED8x8:
 916         return !mb_y ? DC_127_PRED8x8 : mode;
 917     case HOR_PRED8x8:
 918         return !mb_x ? DC_129_PRED8x8 : mode;
 919     case PLANE_PRED8x8 /*TM*/:
 920         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 921     }
 922     return mode;
 923 }
 924
 925 static av_always_inline
 926 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 927 {
 928     if (!mb_x) {
 929         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 930     } else {
 931         return mb_y ? mode : HOR_VP8_PRED;
 932     }
 933 }
 934
 935 static av_always_inline
 936 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 937 {
 938     switch (mode) {
 939     case VERT_PRED:
 940         if (!mb_x && mb_y) {
 941             *copy_buf = 1;
 942             return mode;
 943         }
 944         /* fall-through */
 945     case DIAG_DOWN_LEFT_PRED:
 946     case VERT_LEFT_PRED:
 947         return !mb_y ? DC_127_PRED : mode;
 948     case HOR_PRED:
 949         if (!mb_y) {
 950             *copy_buf = 1;
 951             return mode;
 952         }
 953         /* fall-through */
 954     case HOR_UP_PRED:
 955         return !mb_x ? DC_129_PRED : mode;
 956     case TM_VP8_PRED:
 957         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 958     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 959     case DIAG_DOWN_RIGHT_PRED:
 960     case VERT_RIGHT_PRED:
 961     case HOR_DOWN_PRED:
 962         if (!mb_y || !mb_x)
 963             *copy_buf = 1;
 964         return mode;
 965     }
 966     return mode;
 967 }
 968
/**
 * Intra-predict one macroblock: luma first (either one 16x16 prediction or
 * sixteen 4x4 predictions, the latter with the residual added immediately),
 * then both chroma planes.
 *
 * @param s    decoder context
 * @param dst  destination pointers for the luma and the two chroma planes
 * @param mb   macroblock info (prediction mode and skip flag are read)
 * @param mb_x horizontal macroblock position, used for edge handling
 * @param mb_y vertical macroblock position, used for edge handling
 */
static av_always_inline
void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        // whole-macroblock luma prediction; the mode is adjusted at picture edges
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
        // substitute top-right pixels used on the first row with edge emulation
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use bottom edge
        // the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            // replicate the last pixel above this macroblock into all four bytes
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        // skipped macroblock: zero the cached counts so the nnz checks below
        // never trigger an IDCT
        if (mb->skip)
            AV_ZERO128(s->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                // temporary 8-byte-stride buffer: the 4x4 block starts at offset 12,
                // its top row at 4..7, top-left at 3 and left column at 11/19/27/35
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        // predict into copy_dst instead of the picture, after
                        // filling in its top and left edges by hand
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            // no row above: top-left and top row are 127
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            // no column to the left: left edge is 129
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    // move the four predicted rows from the temporary into the picture
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                // add the residual right away: a count of 1 takes the DC-only path
                nnz = s->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr   += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    // chroma: one mode shared by both planes, adjusted at picture edges
    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    // same condition as at the top; this second call passes xchg=0
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
1090
/* Per-row lookup tables, all indexed by the 3-bit fractional motion vector
 * position (the mx/my values computed in vp8_mc_luma/vp8_mc_chroma). */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1097
1098 /**
1099  * luma MC function
1100  *
1101  * @param s VP8 decoding context
1102  * @param dst target buffer for block data at block position
1103  * @param ref reference picture buffer at origin (0, 0)
1104  * @param mv motion vector (relative to block position) to get pixel data from
1105  * @param x_off horizontal position of block from origin (0, 0)
1106  * @param y_off vertical position of block from origin (0, 0)
1107  * @param block_w width of block (16, 8 or 4)
1108  * @param block_h height of block (always same as block_w)
1109  * @param width width of src/dst plane data
1110  * @param height height of src/dst plane data
1111  * @param linesize size of a single line of plane data, including padding
1112  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1113  */
1114 static av_always_inline
1115 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1116                  int x_off, int y_off, int block_w, int block_h,
1117                  int width, int height, int linesize,
1118                  vp8_mc_func mc_func[3][3])
1119 {
1120     uint8_t *src = ref->data[0];
1121
1122     if (AV_RN32A(mv)) {
1123
1124         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1125         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1126
1127         x_off += mv->x >> 2;
1128         y_off += mv->y >> 2;
1129
1130         // edge emulation
1131         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1132         src += y_off * linesize + x_off;
1133         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1134             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1135             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1136                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1137                                     x_off - mx_idx, y_off - my_idx, width, height);
1138             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1139         }
1140         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1141     } else {
1142         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1143         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1144     }
1145 }
1146
1147 /**
1148  * chroma MC function
1149  *
1150  * @param s VP8 decoding context
1151  * @param dst1 target buffer for block data at block position (U plane)
1152  * @param dst2 target buffer for block data at block position (V plane)
1153  * @param ref reference picture buffer at origin (0, 0)
1154  * @param mv motion vector (relative to block position) to get pixel data from
1155  * @param x_off horizontal position of block from origin (0, 0)
1156  * @param y_off vertical position of block from origin (0, 0)
1157  * @param block_w width of block (16, 8 or 4)
1158  * @param block_h height of block (always same as block_w)
1159  * @param width width of src/dst plane data
1160  * @param height height of src/dst plane data
1161  * @param linesize size of a single line of plane data, including padding
1162  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1163  */
1164 static av_always_inline
1165 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1166                    const VP56mv *mv, int x_off, int y_off,
1167                    int block_w, int block_h, int width, int height, int linesize,
1168                    vp8_mc_func mc_func[3][3])
1169 {
1170     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1171
1172     if (AV_RN32A(mv)) {
1173         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1174         int my = mv->y&7, my_idx = subpel_idx[0][my];
1175
1176         x_off += mv->x >> 3;
1177         y_off += mv->y >> 3;
1178
1179         // edge emulation
1180         src1 += y_off * linesize + x_off;
1181         src2 += y_off * linesize + x_off;
1182         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1183         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1184             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1185             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1186                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1187                                     x_off - mx_idx, y_off - my_idx, width, height);
1188             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1189             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1190
1191             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1192                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1193                                     x_off - mx_idx, y_off - my_idx, width, height);
1194             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1195             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1196         } else {
1197             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1198             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1199         }
1200     } else {
1201         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1202         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1203         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1204     }
1205 }
1206
1207 static av_always_inline
1208 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1209                  AVFrame *ref_frame, int x_off, int y_off,
1210                  int bx_off, int by_off,
1211                  int block_w, int block_h,
1212                  int width, int height, VP56mv *mv)
1213 {
1214     VP56mv uvmv = *mv;
1215
1216     /* Y */
1217     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1218                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1219                 block_w, block_h, width, height, s->linesize,
1220                 s->put_pixels_tab[block_w == 8]);
1221
1222     /* U/V */
1223     if (s->profile == 3) {
1224         uvmv.x &= ~7;
1225         uvmv.y &= ~7;
1226     }
1227     x_off   >>= 1; y_off   >>= 1;
1228     bx_off  >>= 1; by_off  >>= 1;
1229     width   >>= 1; height  >>= 1;
1230     block_w >>= 1; block_h >>= 1;
1231     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1232                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1233                   &uvmv, x_off + bx_off, y_off + by_off,
1234                   block_w, block_h, width, height, s->uvlinesize,
1235                   s->put_pixels_tab[1 + (block_w == 4)]);
1236 }
1237
1238 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1239  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1240 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1241 {
1242     /* Don't prefetch refs that haven't been used very often this frame. */
1243     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1244         int x_off = mb_x << 4, y_off = mb_y << 4;
1245         int mx = (mb->mv.x>>2) + x_off + 8;
1246         int my = (mb->mv.y>>2) + y_off;
1247         uint8_t **src= s->framep[ref]->data;
1248         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1249         /* For threading, a ff_thread_await_progress here might be useful, but
1250          * it actually slows down the decoder. Since a bad prefetch doesn't
1251          * generate bad decoder output, we don't run it here. */
1252         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1253         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1254         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1255     }
1256 }
1257
1258 /**
1259  * Apply motion vectors to prediction buffer, chapter 18.
1260  */
1261 static av_always_inline
1262 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1263                    int mb_x, int mb_y)
1264 {
1265     int x_off = mb_x << 4, y_off = mb_y << 4;
1266     int width = 16*s->mb_width, height = 16*s->mb_height;
1267     AVFrame *ref = s->framep[mb->ref_frame];
1268     VP56mv *bmv = mb->bmv;
1269
1270     switch (mb->partitioning) {
1271     case VP8_SPLITMVMODE_NONE:
1272         vp8_mc_part(s, dst, ref, x_off, y_off,
1273                     0, 0, 16, 16, width, height, &mb->mv);
1274         break;
1275     case VP8_SPLITMVMODE_4x4: {
1276         int x, y;
1277         VP56mv uvmv;
1278
1279         /* Y */
1280         for (y = 0; y < 4; y++) {
1281             for (x = 0; x < 4; x++) {
1282                 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1283                             ref, &bmv[4*y + x],
1284                             4*x + x_off, 4*y + y_off, 4, 4,
1285                             width, height, s->linesize,
1286                             s->put_pixels_tab[2]);
1287             }
1288         }
1289
1290         /* U/V */
1291         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1292         for (y = 0; y < 2; y++) {
1293             for (x = 0; x < 2; x++) {
1294                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1295                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1296                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1297                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1298                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1299                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1300                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1301                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1302                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1303                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1304                 if (s->profile == 3) {
1305                     uvmv.x &= ~7;
1306                     uvmv.y &= ~7;
1307                 }
1308                 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1309                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1310                               4*x + x_off, 4*y + y_off, 4, 4,
1311                               width, height, s->uvlinesize,
1312                               s->put_pixels_tab[2]);
1313             }
1314         }
1315         break;
1316     }
1317     case VP8_SPLITMVMODE_16x8:
1318         vp8_mc_part(s, dst, ref, x_off, y_off,
1319                     0, 0, 16, 8, width, height, &bmv[0]);
1320         vp8_mc_part(s, dst, ref, x_off, y_off,
1321                     0, 8, 16, 8, width, height, &bmv[1]);
1322         break;
1323     case VP8_SPLITMVMODE_8x16:
1324         vp8_mc_part(s, dst, ref, x_off, y_off,
1325                     0, 0, 8, 16, width, height, &bmv[0]);
1326         vp8_mc_part(s, dst, ref, x_off, y_off,
1327                     8, 0, 8, 16, width, height, &bmv[1]);
1328         break;
1329     case VP8_SPLITMVMODE_8x8:
1330         vp8_mc_part(s, dst, ref, x_off, y_off,
1331                     0, 0, 8, 8, width, height, &bmv[0]);
1332         vp8_mc_part(s, dst, ref, x_off, y_off,
1333                     8, 0, 8, 8, width, height, &bmv[1]);
1334         vp8_mc_part(s, dst, ref, x_off, y_off,
1335                     0, 8, 8, 8, width, height, &bmv[2]);
1336         vp8_mc_part(s, dst, ref, x_off, y_off,
1337                     8, 8, 8, 8, width, height, &bmv[3]);
1338         break;
1339     }
1340 }
1341
/**
 * Add the inverse-transformed residual of a macroblock to dst.
 *
 * The per-block counts in non_zero_count_cache are read four at a time as
 * one 32-bit word. If every count in the word is 0 or 1 a combined DC-only
 * routine handles the whole group, otherwise each block is handled on its own.
 * Luma is skipped for MODE_I4x4 (intra_predict adds that residual per
 * sub-block).
 *
 * @param s   decoder context (coefficients, non-zero counts, dsp functions)
 * @param dst destination pointers for the luma and the two chroma planes
 * @param mb  macroblock info (only the mode is read)
 */
static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            // counts of the four luma blocks in this row, lowest byte = x 0
            uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
            if (nnz4) {
                // any bit outside 0x01 per byte means some block has a count above 1
                if (nnz4&~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        // nothing left in the remaining blocks of this row
                        if (!nnz4)
                            break;
                    }
                } else {
                    // every block in the row is empty or DC-only
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        // counts of the four blocks of this chroma plane, byte index (y<<1)+x
        uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        // leave both loops once the remaining blocks are empty
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                // every block in the plane is empty or DC-only
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
            }
        }
chroma_idct_end: ;
    }
}
1393
1394 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1395 {
1396     int interior_limit, filter_level;
1397
1398     if (s->segmentation.enabled) {
1399         filter_level = s->segmentation.filter_level[s->segment];
1400         if (!s->segmentation.absolute_vals)
1401             filter_level += s->filter.level;
1402     } else
1403         filter_level = s->filter.level;
1404
1405     if (s->lf_delta.enabled) {
1406         filter_level += s->lf_delta.ref[mb->ref_frame];
1407         filter_level += s->lf_delta.mode[mb->mode];
1408     }
1409
1410     filter_level = av_clip_uintp2(filter_level, 6);
1411
1412     interior_limit = filter_level;
1413     if (s->filter.sharpness) {
1414         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1415         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1416     }
1417     interior_limit = FFMAX(interior_limit, 1);
1418
1419     f->filter_level = filter_level;
1420     f->inner_limit = interior_limit;
1421     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1422 }
1423
1424 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1425 {
1426     int mbedge_lim, bedge_lim, hev_thresh;
1427     int filter_level = f->filter_level;
1428     int inner_limit = f->inner_limit;
1429     int inner_filter = f->inner_filter;
1430     int linesize = s->linesize;
1431     int uvlinesize = s->uvlinesize;
1432     static const uint8_t hev_thresh_lut[2][64] = {
1433         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1434           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1435           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1436           3, 3, 3, 3 },
1437         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1438           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1439           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1440           2, 2, 2, 2 }
1441     };
1442
1443     if (!filter_level)
1444         return;
1445
1446      bedge_lim = 2*filter_level + inner_limit;
1447     mbedge_lim = bedge_lim + 4;
1448
1449     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1450
1451     if (mb_x) {
1452         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1453                                        mbedge_lim, inner_limit, hev_thresh);
1454         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1455                                        mbedge_lim, inner_limit, hev_thresh);
1456     }
1457
1458     if (inner_filter) {
1459         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1460                                              inner_limit, hev_thresh);
1461         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1462                                              inner_limit, hev_thresh);
1463         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1464                                              inner_limit, hev_thresh);
1465         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1466                                              uvlinesize,  bedge_lim,
1467                                              inner_limit, hev_thresh);
1468     }
1469
1470     if (mb_y) {
1471         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1472                                        mbedge_lim, inner_limit, hev_thresh);
1473         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1474                                        mbedge_lim, inner_limit, hev_thresh);
1475     }
1476
1477     if (inner_filter) {
1478         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1479                                              linesize,    bedge_lim,
1480                                              inner_limit, hev_thresh);
1481         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1482                                              linesize,    bedge_lim,
1483                                              inner_limit, hev_thresh);
1484         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1485                                              linesize,    bedge_lim,
1486                                              inner_limit, hev_thresh);
1487         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1488                                              dst[2] + 4 * uvlinesize,
1489                                              uvlinesize,  bedge_lim,
1490                                              inner_limit, hev_thresh);
1491     }
1492 }
1493
1494 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1495 {
1496     int mbedge_lim, bedge_lim;
1497     int filter_level = f->filter_level;
1498     int inner_limit = f->inner_limit;
1499     int inner_filter = f->inner_filter;
1500     int linesize = s->linesize;
1501
1502     if (!filter_level)
1503         return;
1504
1505      bedge_lim = 2*filter_level + inner_limit;
1506     mbedge_lim = bedge_lim + 4;
1507
1508     if (mb_x)
1509         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1510     if (inner_filter) {
1511         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1512         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1513         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1514     }
1515
1516     if (mb_y)
1517         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1518     if (inner_filter) {
1519         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1520         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1521         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1522     }
1523 }
1524
1525 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1526 {
1527     VP8FilterStrength *f = s->filter_strength;
1528     uint8_t *dst[3] = {
1529         curframe->data[0] + 16*mb_y*s->linesize,
1530         curframe->data[1] +  8*mb_y*s->uvlinesize,
1531         curframe->data[2] +  8*mb_y*s->uvlinesize
1532     };
1533     int mb_x;
1534
1535     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1536         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1537         filter_mb(s, dst, f++, mb_x, mb_y);
1538         dst[0] += 16;
1539         dst[1] += 8;
1540         dst[2] += 8;
1541     }
1542 }
1543
1544 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1545 {
1546     VP8FilterStrength *f = s->filter_strength;
1547     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1548     int mb_x;
1549
1550     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1551         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1552         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1553         dst += 16;
1554     }
1555 }
1556
1557 static void release_queued_segmaps(VP8Context *s, int is_close)
1558 {
1559     int leave_behind = is_close ? 0 : !s->maps_are_invalid;
1560     while (s->num_maps_to_be_freed > leave_behind)
1561         av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
1562     s->maps_are_invalid = 0;
1563 }
1564
1565 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1566                             AVPacket *avpkt)
1567 {
1568     VP8Context *s = avctx->priv_data;
1569     int ret, mb_x, mb_y, i, y, referenced;
1570     enum AVDiscard skip_thresh;
1571     AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
1572
1573     release_queued_segmaps(s, 0);
1574
1575     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1576         return ret;
1577
1578     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1579                                 || s->update_altref == VP56_FRAME_CURRENT;
1580
1581     skip_thresh = !referenced ? AVDISCARD_NONREF :
1582                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1583
1584     if (avctx->skip_frame >= skip_thresh) {
1585         s->invisible = 1;
1586         goto skip_decode;
1587     }
1588     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1589
1590     // release no longer referenced frames
1591     for (i = 0; i < 5; i++)
1592         if (s->frames[i].data[0] &&
1593             &s->frames[i] != prev_frame &&
1594             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1595             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1596             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1597             vp8_release_frame(s, &s->frames[i], 1, 0);
1598
1599     // find a free buffer
1600     for (i = 0; i < 5; i++)
1601         if (&s->frames[i] != prev_frame &&
1602             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1603             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1604             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1605             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1606             break;
1607         }
1608     if (i == 5) {
1609         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1610         abort();
1611     }
1612     if (curframe->data[0])
1613         vp8_release_frame(s, curframe, 1, 0);
1614
1615     curframe->key_frame = s->keyframe;
1616     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1617     curframe->reference = referenced ? 3 : 0;
1618     if ((ret = vp8_alloc_frame(s, curframe))) {
1619         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1620         return ret;
1621     }
1622
1623     // check if golden and altref are swapped
1624     if (s->update_altref != VP56_FRAME_NONE) {
1625         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1626     } else {
1627         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1628     }
1629     if (s->update_golden != VP56_FRAME_NONE) {
1630         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1631     } else {
1632         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1633     }
1634     if (s->update_last) {
1635         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1636     } else {
1637         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1638     }
1639     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1640
1641     ff_thread_finish_setup(avctx);
1642
1643     // Given that arithmetic probabilities are updated every frame, it's quite likely
1644     // that the values we have on a random interframe are complete junk if we didn't
1645     // start decode on a keyframe. So just don't display anything rather than junk.
1646     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1647                          !s->framep[VP56_FRAME_GOLDEN] ||
1648                          !s->framep[VP56_FRAME_GOLDEN2])) {
1649         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1650         return AVERROR_INVALIDDATA;
1651     }
1652
1653     s->linesize   = curframe->linesize[0];
1654     s->uvlinesize = curframe->linesize[1];
1655
1656     if (!s->edge_emu_buffer)
1657         s->edge_emu_buffer = av_malloc(21*s->linesize);
1658
1659     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1660
1661     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1662     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1663
1664     // top edge of 127 for intra prediction
1665     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1666         s->top_border[0][15] = s->top_border[0][23] = 127;
1667         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1668     }
1669     memset(s->ref_count, 0, sizeof(s->ref_count));
1670     if (s->keyframe)
1671         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1672
1673 #define MARGIN (16 << 2)
1674     s->mv_min.y = -MARGIN;
1675     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1676
1677     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1678         VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1679         VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1680         int mb_xy = mb_y*s->mb_width;
1681         uint8_t *dst[3] = {
1682             curframe->data[0] + 16*mb_y*s->linesize,
1683             curframe->data[1] +  8*mb_y*s->uvlinesize,
1684             curframe->data[2] +  8*mb_y*s->uvlinesize
1685         };
1686
1687         memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1688         memset(s->left_nnz, 0, sizeof(s->left_nnz));
1689         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1690
1691         // left edge of 129 for intra prediction
1692         if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1693             for (i = 0; i < 3; i++)
1694                 for (y = 0; y < 16>>!!i; y++)
1695                     dst[i][y*curframe->linesize[i]-1] = 129;
1696             if (mb_y == 1) // top left edge is also 129
1697                 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1698         }
1699
1700         s->mv_min.x = -MARGIN;
1701         s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1702         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1703             ff_thread_await_progress(prev_frame, mb_y, 0);
1704
1705         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1706             /* Prefetch the current frame, 4 MBs ahead */
1707             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1708             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1709
1710             decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1711                            prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
1712
1713             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1714
1715             if (!mb->skip)
1716                 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1717
1718             if (mb->mode <= MODE_I4x4)
1719                 intra_predict(s, dst, mb, mb_x, mb_y);
1720             else
1721                 inter_predict(s, dst, mb, mb_x, mb_y);
1722
1723             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1724
1725             if (!mb->skip) {
1726                 idct_mb(s, dst, mb);
1727             } else {
1728                 AV_ZERO64(s->left_nnz);
1729                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1730
1731                 // Reset DC block predictors if they would exist if the mb had coefficients
1732                 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1733                     s->left_nnz[8]      = 0;
1734                     s->top_nnz[mb_x][8] = 0;
1735                 }
1736             }
1737
1738             if (s->deblock_filter)
1739                 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1740
1741             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1742
1743             dst[0] += 16;
1744             dst[1] += 8;
1745             dst[2] += 8;
1746             s->mv_min.x -= 64;
1747             s->mv_max.x -= 64;
1748         }
1749         if (s->deblock_filter) {
1750             if (s->filter.simple)
1751                 filter_mb_row_simple(s, curframe, mb_y);
1752             else
1753                 filter_mb_row(s, curframe, mb_y);
1754         }
1755         s->mv_min.y -= 64;
1756         s->mv_max.y -= 64;
1757
1758         ff_thread_report_progress(curframe, mb_y, 0);
1759     }
1760
1761     ff_thread_report_progress(curframe, INT_MAX, 0);
1762 skip_decode:
1763     // if future frames don't use the updated probabilities,
1764     // reset them to the values we saved
1765     if (!s->update_probabilities)
1766         s->prob[0] = s->prob[1];
1767
1768     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1769
1770     if (!s->invisible) {
1771         *(AVFrame*)data = *curframe;
1772         *data_size = sizeof(AVFrame);
1773     }
1774
1775     return avpkt->size;
1776 }
1777
1778 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1779 {
1780     VP8Context *s = avctx->priv_data;
1781
1782     s->avctx = avctx;
1783     avctx->pix_fmt = PIX_FMT_YUV420P;
1784
1785     dsputil_init(&s->dsp, avctx);
1786     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
1787     ff_vp8dsp_init(&s->vp8dsp);
1788
1789     return 0;
1790 }
1791
1792 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1793 {
1794     vp8_decode_flush_impl(avctx, 0, 1, 1);
1795     release_queued_segmaps(avctx->priv_data, 1);
1796     return 0;
1797 }
1798
1799 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1800 {
1801     VP8Context *s = avctx->priv_data;
1802
1803     s->avctx = avctx;
1804
1805     return 0;
1806 }
1807
/*
 * Translate a pointer into s_src->frames[] to the element with the same index
 * in s->frames[]; a NULL pointer stays NULL.
 * Relies on variables named s and s_src being in scope at the point of use.
 * Argument and expansion are parenthesized so the macro can be used safely
 * inside larger expressions.
 */
#define REBASE(pic) \
    ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
1810
1811 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1812 {
1813     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1814
1815     if (s->macroblocks_base &&
1816         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
1817         free_buffers(s);
1818     }
1819
1820     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1821     s->segmentation = s_src->segmentation;
1822     s->lf_delta = s_src->lf_delta;
1823     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1824
1825     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1826     s->framep[0] = REBASE(s_src->next_framep[0]);
1827     s->framep[1] = REBASE(s_src->next_framep[1]);
1828     s->framep[2] = REBASE(s_src->next_framep[2]);
1829     s->framep[3] = REBASE(s_src->next_framep[3]);
1830
1831     return 0;
1832 }
1833
1834 AVCodec ff_vp8_decoder = {
1835     .name           = "vp8",
1836     .type           = AVMEDIA_TYPE_VIDEO,
1837     .id             = CODEC_ID_VP8,
1838     .priv_data_size = sizeof(VP8Context),
1839     .init           = vp8_decode_init,
1840     .close          = vp8_decode_free,
1841     .decode         = vp8_decode_frame,
1842     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1843     .flush = vp8_decode_flush,
1844     .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1845     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1846     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1847 };