git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  *
   9  * This file is part of Libav.
  10  *
  11  * Libav is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * Libav is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with Libav; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/imgutils.h"
  27 #include "avcodec.h"
  28 #include "internal.h"
  29 #include "vp8.h"
  30 #include "vp8data.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33
  34 #if ARCH_ARM
  35 #   include "arm/vp8.h"
  36 #endif
  37
  38 static void free_buffers(VP8Context *s)
  39 {
  40     int i;
  41     if (s->thread_data)
  42         for (i = 0; i < MAX_THREADS; i++) {
  43 #if HAVE_THREADS
  44             pthread_cond_destroy(&s->thread_data[i].cond);
  45             pthread_mutex_destroy(&s->thread_data[i].lock);
  46 #endif
  47             av_freep(&s->thread_data[i].filter_strength);
  48             av_freep(&s->thread_data[i].edge_emu_buffer);
  49         }
  50     av_freep(&s->thread_data);
  51     av_freep(&s->macroblocks_base);
  52     av_freep(&s->intra4x4_pred_mode_top);
  53     av_freep(&s->top_nnz);
  54     av_freep(&s->top_border);
  55
  56     s->macroblocks = NULL;
  57 }
  58
  59 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  60 {
  61     int ret;
  62     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  63                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  64         return ret;
  65     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  66         ff_thread_release_buffer(s->avctx, &f->tf);
  67         return AVERROR(ENOMEM);
  68     }
  69     return 0;
  70 }
  71
  72 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  73 {
  74     av_buffer_unref(&f->seg_map);
  75     ff_thread_release_buffer(s->avctx, &f->tf);
  76 }
  77
  78 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  79 {
  80     int ret;
  81
  82     vp8_release_frame(s, dst);
  83
  84     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  85         return ret;
  86     if (src->seg_map &&
  87         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  88         vp8_release_frame(s, dst);
  89         return AVERROR(ENOMEM);
  90     }
  91
  92     return 0;
  93 }
  94
  95
  96 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  97 {
  98     VP8Context *s = avctx->priv_data;
  99     int i;
 100
 101     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 102         vp8_release_frame(s, &s->frames[i]);
 103     memset(s->framep, 0, sizeof(s->framep));
 104
 105     if (free_mem)
 106         free_buffers(s);
 107 }
 108
 109 static void vp8_decode_flush(AVCodecContext *avctx)
 110 {
 111     vp8_decode_flush_impl(avctx, 0);
 112 }
 113
 114 static int update_dimensions(VP8Context *s, int width, int height)
 115 {
 116     AVCodecContext *avctx = s->avctx;
 117     int i;
 118
 119     if (width  != s->avctx->width ||
 120         height != s->avctx->height) {
 121         if (av_image_check_size(width, height, 0, s->avctx))
 122             return AVERROR_INVALIDDATA;
 123
 124         vp8_decode_flush_impl(s->avctx, 1);
 125
 126         avcodec_set_dimensions(s->avctx, width, height);
 127     }
 128
 129     s->mb_width  = (s->avctx->coded_width +15) / 16;
 130     s->mb_height = (s->avctx->coded_height+15) / 16;
 131
 132     s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
 133     if (!s->mb_layout) { // Frame threading and one thread
 134         s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
 135         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
 136     }
 137     else // Sliced threading
 138         s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
 139     s->top_nnz                    = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
 140     s->top_border                 = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
 141     s->thread_data                = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 142
 143     for (i = 0; i < MAX_THREADS; i++) {
 144         s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
 145 #if HAVE_THREADS
 146         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 147         pthread_cond_init(&s->thread_data[i].cond, NULL);
 148 #endif
 149     }
 150
 151     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 152         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 153         return AVERROR(ENOMEM);
 154
 155     s->macroblocks        = s->macroblocks_base + 1;
 156
 157     return 0;
 158 }
 159
 160 static void parse_segment_info(VP8Context *s)
 161 {
 162     VP56RangeCoder *c = &s->c;
 163     int i;
 164
 165     s->segmentation.update_map = vp8_rac_get(c);
 166
 167     if (vp8_rac_get(c)) { // update segment feature data
 168         s->segmentation.absolute_vals = vp8_rac_get(c);
 169
 170         for (i = 0; i < 4; i++)
 171             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 172
 173         for (i = 0; i < 4; i++)
 174             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 175     }
 176     if (s->segmentation.update_map)
 177         for (i = 0; i < 3; i++)
 178             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 179 }
 180
 181 static void update_lf_deltas(VP8Context *s)
 182 {
 183     VP56RangeCoder *c = &s->c;
 184     int i;
 185
 186     for (i = 0; i < 4; i++) {
 187         if (vp8_rac_get(c)) {
 188             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 189
 190             if (vp8_rac_get(c))
 191                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 192         }
 193     }
 194
 195     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 196         if (vp8_rac_get(c)) {
 197             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 198
 199             if (vp8_rac_get(c))
 200                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 201         }
 202     }
 203 }
 204
 205 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 206 {
 207     const uint8_t *sizes = buf;
 208     int i;
 209
 210     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 211
 212     buf      += 3*(s->num_coeff_partitions-1);
 213     buf_size -= 3*(s->num_coeff_partitions-1);
 214     if (buf_size < 0)
 215         return -1;
 216
 217     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 218         int size = AV_RL24(sizes + 3*i);
 219         if (buf_size - size < 0)
 220             return -1;
 221
 222         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 223         buf      += size;
 224         buf_size -= size;
 225     }
 226     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 227
 228     return 0;
 229 }
 230
 231 static void get_quants(VP8Context *s)
 232 {
 233     VP56RangeCoder *c = &s->c;
 234     int i, base_qi;
 235
 236     int yac_qi     = vp8_rac_get_uint(c, 7);
 237     int ydc_delta  = vp8_rac_get_sint(c, 4);
 238     int y2dc_delta = vp8_rac_get_sint(c, 4);
 239     int y2ac_delta = vp8_rac_get_sint(c, 4);
 240     int uvdc_delta = vp8_rac_get_sint(c, 4);
 241     int uvac_delta = vp8_rac_get_sint(c, 4);
 242
 243     for (i = 0; i < 4; i++) {
 244         if (s->segmentation.enabled) {
 245             base_qi = s->segmentation.base_quant[i];
 246             if (!s->segmentation.absolute_vals)
 247                 base_qi += yac_qi;
 248         } else
 249             base_qi = yac_qi;
 250
 251         s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 252         s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 253         s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 254         /* 101581>>16 is equivalent to 155/100 */
 255         s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
 256         s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 257         s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 258
 259         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 260         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 261     }
 262 }
 263
 264 /**
 265  * Determine which buffers golden and altref should be updated with after this frame.
 266  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 267  *
 268  * Intra frames update all 3 references
 269  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 270  * If the update (golden|altref) flag is set, it's updated with the current frame
 271  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 272  * If the flag is not set, the number read means:
 273  *      0: no update
 274  *      1: VP56_FRAME_PREVIOUS
 275  *      2: update golden with altref, or update altref with golden
 276  */
 277 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 278 {
 279     VP56RangeCoder *c = &s->c;
 280
 281     if (update)
 282         return VP56_FRAME_CURRENT;
 283
 284     switch (vp8_rac_get_uint(c, 2)) {
 285     case 1:
 286         return VP56_FRAME_PREVIOUS;
 287     case 2:
 288         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 289     }
 290     return VP56_FRAME_NONE;
 291 }
 292
 293 static void update_refs(VP8Context *s)
 294 {
 295     VP56RangeCoder *c = &s->c;
 296
 297     int update_golden = vp8_rac_get(c);
 298     int update_altref = vp8_rac_get(c);
 299
 300     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 301     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 302 }
 303
 304 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 305 {
 306     VP56RangeCoder *c = &s->c;
 307     int header_size, hscale, vscale, i, j, k, l, m, ret;
 308     int width  = s->avctx->width;
 309     int height = s->avctx->height;
 310
 311     s->keyframe  = !(buf[0] & 1);
 312     s->profile   =  (buf[0]>>1) & 7;
 313     s->invisible = !(buf[0] & 0x10);
 314     header_size  = AV_RL24(buf) >> 5;
 315     buf      += 3;
 316     buf_size -= 3;
 317
 318     if (s->profile > 3)
 319         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 320
 321     if (!s->profile)
 322         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 323     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 324         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 325
 326     if (header_size > buf_size - 7*s->keyframe) {
 327         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 328         return AVERROR_INVALIDDATA;
 329     }
 330
 331     if (s->keyframe) {
 332         if (AV_RL24(buf) != 0x2a019d) {
 333             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 334             return AVERROR_INVALIDDATA;
 335         }
 336         width  = AV_RL16(buf+3) & 0x3fff;
 337         height = AV_RL16(buf+5) & 0x3fff;
 338         hscale = buf[4] >> 6;
 339         vscale = buf[6] >> 6;
 340         buf      += 7;
 341         buf_size -= 7;
 342
 343         if (hscale || vscale)
 344             avpriv_request_sample(s->avctx, "Upscaling");
 345
 346         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 347         for (i = 0; i < 4; i++)
 348             for (j = 0; j < 16; j++)
 349                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 350                        sizeof(s->prob->token[i][j]));
 351         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 352         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 353         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 354         memset(&s->segmentation, 0, sizeof(s->segmentation));
 355         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 356     }
 357
 358     ff_vp56_init_range_decoder(c, buf, header_size);
 359     buf      += header_size;
 360     buf_size -= header_size;
 361
 362     if (s->keyframe) {
 363         if (vp8_rac_get(c))
 364             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 365         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 366     }
 367
 368     if ((s->segmentation.enabled = vp8_rac_get(c)))
 369         parse_segment_info(s);
 370     else
 371         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 372
 373     s->filter.simple    = vp8_rac_get(c);
 374     s->filter.level     = vp8_rac_get_uint(c, 6);
 375     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 376
 377     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 378         if (vp8_rac_get(c))
 379             update_lf_deltas(s);
 380
 381     if (setup_partitions(s, buf, buf_size)) {
 382         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 383         return AVERROR_INVALIDDATA;
 384     }
 385
 386     if (!s->macroblocks_base || /* first frame */
 387         width != s->avctx->width || height != s->avctx->height) {
 388         if ((ret = update_dimensions(s, width, height)) < 0)
 389             return ret;
 390     }
 391
 392     get_quants(s);
 393
 394     if (!s->keyframe) {
 395         update_refs(s);
 396         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 397         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 398     }
 399
 400     // if we aren't saving this frame's probabilities for future frames,
 401     // make a copy of the current probabilities
 402     if (!(s->update_probabilities = vp8_rac_get(c)))
 403         s->prob[1] = s->prob[0];
 404
 405     s->update_last = s->keyframe || vp8_rac_get(c);
 406
 407     for (i = 0; i < 4; i++)
 408         for (j = 0; j < 8; j++)
 409             for (k = 0; k < 3; k++)
 410                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 411                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 412                         int prob = vp8_rac_get_uint(c, 8);
 413                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 414                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 415                     }
 416
 417     if ((s->mbskip_enabled = vp8_rac_get(c)))
 418         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 419
 420     if (!s->keyframe) {
 421         s->prob->intra  = vp8_rac_get_uint(c, 8);
 422         s->prob->last   = vp8_rac_get_uint(c, 8);
 423         s->prob->golden = vp8_rac_get_uint(c, 8);
 424
 425         if (vp8_rac_get(c))
 426             for (i = 0; i < 4; i++)
 427                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 428         if (vp8_rac_get(c))
 429             for (i = 0; i < 3; i++)
 430                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 431
 432         // 17.2 MV probability update
 433         for (i = 0; i < 2; i++)
 434             for (j = 0; j < 19; j++)
 435                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 436                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 437     }
 438
 439     return 0;
 440 }
 441
 442 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 443 {
 444     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 445     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 446 }
 447
 448 /**
 449  * Motion vector coding, 17.1.
 450  */
 451 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 452 {
 453     int bit, x = 0;
 454
 455     if (vp56_rac_get_prob_branchy(c, p[0])) {
 456         int i;
 457
 458         for (i = 0; i < 3; i++)
 459             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 460         for (i = 9; i > 3; i--)
 461             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 462         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 463             x += 8;
 464     } else {
 465         // small_mvtree
 466         const uint8_t *ps = p+2;
 467         bit = vp56_rac_get_prob(c, *ps);
 468         ps += 1 + 3*bit;
 469         x  += 4*bit;
 470         bit = vp56_rac_get_prob(c, *ps);
 471         ps += 1 + bit;
 472         x  += 2*bit;
 473         x  += vp56_rac_get_prob(c, *ps);
 474     }
 475
 476     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 477 }
 478
 479 static av_always_inline
 480 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 481 {
 482     if (left == top)
 483         return vp8_submv_prob[4-!!left];
 484     if (!top)
 485         return vp8_submv_prob[2];
 486     return vp8_submv_prob[1-!!left];
 487 }
 488
 489 /**
 490  * Split motion vector prediction, 16.4.
 491  * @returns the number of motion vectors parsed (2, 4 or 16)
 492  */
 493 static av_always_inline
 494 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 495 {
 496     int part_idx;
 497     int n, num;
 498     VP8Macroblock *top_mb;
 499     VP8Macroblock *left_mb = &mb[-1];
 500     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 501                   *mbsplits_top,
 502                   *mbsplits_cur, *firstidx;
 503     VP56mv *top_mv;
 504     VP56mv *left_mv = left_mb->bmv;
 505     VP56mv *cur_mv  = mb->bmv;
 506
 507     if (!layout) // layout is inlined, s->mb_layout is not
 508         top_mb = &mb[2];
 509     else
 510         top_mb = &mb[-s->mb_width-1];
 511     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 512     top_mv = top_mb->bmv;
 513
 514     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 515         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 516             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 517         } else {
 518             part_idx = VP8_SPLITMVMODE_8x8;
 519         }
 520     } else {
 521         part_idx = VP8_SPLITMVMODE_4x4;
 522     }
 523
 524     num = vp8_mbsplit_count[part_idx];
 525     mbsplits_cur = vp8_mbsplits[part_idx],
 526     firstidx = vp8_mbfirstidx[part_idx];
 527     mb->partitioning = part_idx;
 528
 529     for (n = 0; n < num; n++) {
 530         int k = firstidx[n];
 531         uint32_t left, above;
 532         const uint8_t *submv_prob;
 533
 534         if (!(k & 3))
 535             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 536         else
 537             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 538         if (k <= 3)
 539             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 540         else
 541             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 542
 543         submv_prob = get_submv_prob(left, above);
 544
 545         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 546             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 547                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 548                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 549                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 550                 } else {
 551                     AV_ZERO32(&mb->bmv[n]);
 552                 }
 553             } else {
 554                 AV_WN32A(&mb->bmv[n], above);
 555             }
 556         } else {
 557             AV_WN32A(&mb->bmv[n], left);
 558         }
 559     }
 560
 561     return num;
 562 }
 563
 564 static av_always_inline
 565 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
 566 {
         /*
          * Decode the motion vector mode and the vector(s) of an inter
          * macroblock from the header range coder (s->c).
          *
          * The top, left and top-left neighbours are scanned to collect
          * candidate vectors in near_mv[] and a weight for each in cnt[].
          * The weights index vp8_mode_contexts to select the probabilities
          * used while reading the mode. On return mb->mode,
          * mb->partitioning, mb->mv and mb->bmv[0] (or, for split mode, the
          * vectors written by decode_splitmvs()) are set.
          *
          * layout selects how neighbours are addressed relative to mb:
          *   0:         top is mb + 2, top-left is mb + 1
          *   otherwise: top is mb - s->mb_width - 1,
          *              top-left is mb - s->mb_width - 2
          * mb_x and mb_y are not used in this function body.
          */
 567     VP8Macroblock *mb_edge[3] = { 0 /* top */,
 568                                   mb - 1 /* left */,
 569                                   0 /* top-left */ };
 570     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 571     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
         /* idx: slot in near_mv[]/cnt[] of the most recently added candidate */
 572     int idx = CNT_ZERO;
 573     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 574     int8_t *sign_bias = s->sign_bias;
 575     VP56mv near_mv[4];
 576     uint8_t cnt[4] = { 0 };
 577     VP56RangeCoder *c = &s->c;
 578
 579     if (!layout) { // layout is inlined (s->mb_layout is not)
 580         mb_edge[0] = mb + 2;
 581         mb_edge[2] = mb + 1;
 582     }
 583     else {
 584         mb_edge[0] = mb - s->mb_width-1;
 585         mb_edge[2] = mb - s->mb_width-2;
 586     }
 587
         /* near_mv[3] is left unset here; it is only written if the scan
          * below adds a third distinct candidate */
 588     AV_ZERO32(&near_mv[0]);
 589     AV_ZERO32(&near_mv[1]);
 590     AV_ZERO32(&near_mv[2]);
 591
 592     /* Process MB on top, left and top-left */
         /*
          * MV_EDGE_CHECK(n) handles neighbour n:
          *  - a neighbour whose ref_frame is VP56_FRAME_CURRENT (the value
          *    decode_mb_mode() stores for intra macroblocks) is ignored;
          *  - a zero vector adds to cnt[CNT_ZERO];
          *  - a non-zero vector is negated (both 16-bit halves at once) if
          *    the neighbour's reference has a different sign bias, then
          *    either opens a new near_mv[] slot or, if equal to the latest
          *    slot, adds to that slot's count;
          *  - the weight added is 2 for top and left, 1 for top-left.
          */
 593     #define MV_EDGE_CHECK(n)\
 594     {\
 595         VP8Macroblock *edge = mb_edge[n];\
 596         int edge_ref = edge->ref_frame;\
 597         if (edge_ref != VP56_FRAME_CURRENT) {\
 598             uint32_t mv = AV_RN32A(&edge->mv);\
 599             if (mv) {\
 600                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 601                     /* SWAR negate of the values in mv. */\
 602                     mv = ~mv;\
 603                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 604                 }\
 605                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 606                     AV_WN32A(&near_mv[++idx], mv);\
 607                 cnt[idx]      += 1 + (n != 2);\
 608             } else\
 609                 cnt[CNT_ZERO] += 1 + (n != 2);\
 610         }\
 611     }
 612
 613     MV_EDGE_CHECK(0)
 614     MV_EDGE_CHECK(1)
 615     MV_EDGE_CHECK(2)
 616
         /* default; decode_splitmvs() overwrites it for split mode */
 617     mb->partitioning = VP8_SPLITMVMODE_NONE;
         /* mode tree: zero / nearest / near / (split | new), each decision
          * using a probability selected by one of the cnt[] weights */
 618     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 619         mb->mode = VP8_MVMODE_MV;
 620
 621         /* If we have three distinct MVs, merge first and last if they're the same */
 622         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 623             cnt[CNT_NEAREST] += 1;
 624
 625         /* Swap near and nearest if necessary */
 626         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 627             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 628             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 629         }
 630
 631         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 632             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 633
 634                 /* Choose the best mv out of 0,0 and the nearest mv */
 635                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                     /* cnt[CNT_SPLITMV] is reused as the split context:
                      * left and top neighbours in split mode count 2 each,
                      * top-left counts 1 */
 636                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 637                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 638                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 639
 640                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 641                     mb->mode = VP8_MVMODE_SPLIT;
                         /* the macroblock vector is that of the last partition */
 642                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
 643                 } else {
                         /* new vector: coded delta added to the clamped best mv */
 644                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 645                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 646                     mb->bmv[0] = mb->mv;
 647                 }
 648             } else {
 649                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 650                 mb->bmv[0] = mb->mv;
 651             }
 652         } else {
 653             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 654             mb->bmv[0] = mb->mv;
 655         }
 656     } else {
 657         mb->mode = VP8_MVMODE_ZERO;
 658         AV_ZERO32(&mb->mv);
 659         mb->bmv[0] = mb->mv;
 660     }
 661 }
 662
 663 static av_always_inline
 664 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 665                            int mb_x, int keyframe, int layout)
 666 {
 667     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 668
 669     if (layout == 1) {
 670         VP8Macroblock *mb_top = mb - s->mb_width - 1;
 671         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
 672     }
 673     if (keyframe) {
 674         int x, y;
 675         uint8_t* top;
 676         uint8_t* const left = s->intra4x4_pred_mode_left;
 677         if (layout == 1)
 678             top = mb->intra4x4_pred_mode_top;
 679         else
 680             top = s->intra4x4_pred_mode_top + 4 * mb_x;
 681         for (y = 0; y < 4; y++) {
 682             for (x = 0; x < 4; x++) {
 683                 const uint8_t *ctx;
 684                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 685                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 686                 left[y] = top[x] = *intra4x4;
 687                 intra4x4++;
 688             }
 689         }
 690     } else {
 691         int i;
 692         for (i = 0; i < 16; i++)
 693             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 694     }
 695 }
 696
 697 static av_always_inline
 698 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
 699                     uint8_t *segment, uint8_t *ref, int layout)
 700 {
 701     VP56RangeCoder *c = &s->c;
 702
 703     if (s->segmentation.update_map)
 704         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 705     else if (s->segmentation.enabled)
 706         *segment = ref ? *ref : *segment;
 707     mb->segment = *segment;
 708
 709     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 710
 711     if (s->keyframe) {
 712         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 713
 714         if (mb->mode == MODE_I4x4) {
 715             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
 716         } else {
 717             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 718             if (s->mb_layout == 1)
 719                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
 720             else
 721                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 722             AV_WN32A( s->intra4x4_pred_mode_left, modes);
 723         }
 724
 725         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 726         mb->ref_frame = VP56_FRAME_CURRENT;
 727     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 728         // inter MB, 16.2
 729         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 730             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 731                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 732         else
 733             mb->ref_frame = VP56_FRAME_PREVIOUS;
 734         s->ref_count[mb->ref_frame-1]++;
 735
 736         // motion vectors, 16.3
 737         decode_mvs(s, mb, mb_x, mb_y, layout);
 738     } else {
 739         // intra MB, 16.1
 740         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 741
 742         if (mb->mode == MODE_I4x4)
 743             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
 744
 745         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 746         mb->ref_frame = VP56_FRAME_CURRENT;
 747         mb->partitioning = VP8_SPLITMVMODE_NONE;
 748         AV_ZERO32(&mb->bmv[0]);
 749     }
 750 }
 751
  752 #ifndef decode_block_coeffs_internal
  753 /**
       * Decode the coefficient tokens of one 4x4 block, starting just after
       * the first end-of-block check (which decode_block_coeffs() performs
       * before calling this function). Each decoded coefficient is given its
       * sign, multiplied by the DC or AC factor and stored at its zigzag
       * position. This definition is skipped when a macro named
       * decode_block_coeffs_internal already exists.
       *
  754  * @param r arithmetic bitstream reader context
  755  * @param block destination for block coefficients
  756  * @param probs probabilities to use when reading trees from the bitstream
  757  * @param i initial coeff index, 0 unless a separate DC block is coded
       * @param token_prob probabilities for the first token, i.e. the entry
       *                   of probs the caller already used for its
       *                   end-of-block check
  758  * @param qmul array holding the dc/ac dequant factor at position 0/1
  759  * @return 0 if no coeffs were decoded
  760  *         otherwise, the index of the last coeff decoded plus one
  761  */
  762 static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
  763                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
  764                                         int i, uint8_t *token_prob, int16_t qmul[2])
  765 {
          /* work on a local copy of the range coder; it is written back to
           * *r before returning */
  766     VP56RangeCoder c = *r;
          /* enter the loop past the EOB test: the caller has done it for
           * the first token */
  767     goto skip_eob;
  768     do {
  769         int coeff;
  770         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
  771             break;
  772
  773 skip_eob:
  774         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
  775             if (++i == 16)
  776                 break; // invalid input; blocks should end with EOB
                  /* after a zero the next token uses context 0 and has no
                   * EOB test, hence the jump back to skip_eob */
  777             token_prob = probs[i][0];
  778             goto skip_eob;
  779         }
  780
              /* the next token's context is 1 after a coefficient of
               * magnitude 1 and 2 after any larger magnitude */
  781         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
  782             coeff = 1;
  783             token_prob = probs[i+1][1];
  784         } else {
  785             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
  786                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
  787                 if (coeff)
  788                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
  789                 coeff += 2;
  790             } else {
  791                 // DCT_CAT*
  792                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
  793                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
  794                         coeff  = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
  795                     } else {                                    // DCT_CAT2
  796                         coeff  = 7;
  797                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
  798                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
  799                     }
  800                 } else {    // DCT_CAT3 and up
                          /* two bits select the category (0..3); its base
                           * value is 3 + (8 << cat), to which the extra
                           * bits read by vp8_rac_get_coeff() are added */
  801                     int a = vp56_rac_get_prob(&c, token_prob[8]);
  802                     int b = vp56_rac_get_prob(&c, token_prob[9+a]);
  803                     int cat = (a<<1) + b;
  804                     coeff  = 3 + (8<<cat);
  805                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
  806                 }
  807             }
  808             token_prob = probs[i+1][2];
  809         }
              /* one more bit gives the sign; qmul[0] is applied at index 0
               * and qmul[1] at every other index */
  810         block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
  811     } while (++i < 16);
  812
  813     *r = c;
  814     return i;
  815 }
  816 #endif
 817
 818 /**
 819  * @param c arithmetic bitstream reader context
 820  * @param block destination for block coefficients
 821  * @param probs probabilities to use when reading trees from the bitstream
 822  * @param i initial coeff index, 0 unless a separate DC block is coded
 823  * @param zero_nhood the initial prediction context for number of surrounding
 824  *                   all-zero blocks (only left/top, so 0-2)
 825  * @param qmul array holding the dc/ac dequant factor at position 0/1
 826  * @return 0 if no coeffs were decoded
 827  *         otherwise, the index of the last coeff decoded plus one
 828  */
 829 static av_always_inline
 830 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
 831                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 832                         int i, int zero_nhood, int16_t qmul[2])
 833 {
 834     uint8_t *token_prob = probs[i][zero_nhood];
 835     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 836         return 0;
 837     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 838 }
 839
 840 static av_always_inline
 841 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
 842                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 843 {
 844     int i, x, y, luma_start = 0, luma_ctx = 3;
 845     int nnz_pred, nnz, nnz_total = 0;
 846     int segment = mb->segment;
 847     int block_dc = 0;
 848
 849     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 850         nnz_pred = t_nnz[8] + l_nnz[8];
 851
 852         // decode DC values and do hadamard
 853         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
 854                                   s->qmat[segment].luma_dc_qmul);
 855         l_nnz[8] = t_nnz[8] = !!nnz;
 856         if (nnz) {
 857             nnz_total += nnz;
 858             block_dc = 1;
 859             if (nnz == 1)
 860                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
 861             else
 862                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
 863         }
 864         luma_start = 1;
 865         luma_ctx = 0;
 866     }
 867
 868     // luma blocks
 869     for (y = 0; y < 4; y++)
 870         for (x = 0; x < 4; x++) {
 871             nnz_pred = l_nnz[y] + t_nnz[x];
 872             nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
 873                                       nnz_pred, s->qmat[segment].luma_qmul);
 874             // nnz+block_dc may be one more than the actual last index, but we don't care
 875             td->non_zero_count_cache[y][x] = nnz + block_dc;
 876             t_nnz[x] = l_nnz[y] = !!nnz;
 877             nnz_total += nnz;
 878         }
 879
 880     // chroma blocks
 881     // TODO: what to do about dimensions? 2nd dim for luma is x,
 882     // but for chroma it's (y<<1)|x
 883     for (i = 4; i < 6; i++)
 884         for (y = 0; y < 2; y++)
 885             for (x = 0; x < 2; x++) {
 886                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 887                 nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
 888                                           nnz_pred, s->qmat[segment].chroma_qmul);
 889                 td->non_zero_count_cache[i][(y<<1)+x] = nnz;
 890                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 891                 nnz_total += nnz;
 892             }
 893
 894     // if there were no coded coeffs despite the macroblock not being marked skip,
 895     // we MUST not do the inner loop filter and should not do IDCT
 896     // Since skip isn't used for bitstream prediction, just manually set it.
 897     if (!nnz_total)
 898         mb->skip = 1;
 899 }
 900
 901 static av_always_inline
 902 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 903                       int linesize, int uvlinesize, int simple)
 904 {
 905     AV_COPY128(top_border, src_y + 15*linesize);
 906     if (!simple) {
 907         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 908         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 909     }
 910 }
 911
 912 static av_always_inline
 913 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 914                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 915                     int simple, int xchg)
 916 {
 917     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 918     src_y  -=   linesize;
 919     src_cb -= uvlinesize;
 920     src_cr -= uvlinesize;
 921
 922 #define XCHG(a,b,xchg) do {                     \
 923         if (xchg) AV_SWAP64(b,a);               \
 924         else      AV_COPY64(b,a);               \
 925     } while (0)
 926
 927     XCHG(top_border_m1+8, src_y-8, xchg);
 928     XCHG(top_border,      src_y,   xchg);
 929     XCHG(top_border+8,    src_y+8, 1);
 930     if (mb_x < mb_width-1)
 931         XCHG(top_border+32, src_y+16, 1);
 932
 933     // only copy chroma for normal loop filter
 934     // or to initialize the top row to 127
 935     if (!simple || !mb_y) {
 936         XCHG(top_border_m1+16, src_cb-8, xchg);
 937         XCHG(top_border_m1+24, src_cr-8, xchg);
 938         XCHG(top_border+16,    src_cb, 1);
 939         XCHG(top_border+24,    src_cr, 1);
 940     }
 941 }
 942
 943 static av_always_inline
 944 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 945 {
 946     if (!mb_x) {
 947         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 948     } else {
 949         return mb_y ? mode : LEFT_DC_PRED8x8;
 950     }
 951 }
 952
 953 static av_always_inline
 954 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 955 {
 956     if (!mb_x) {
 957         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 958     } else {
 959         return mb_y ? mode : HOR_PRED8x8;
 960     }
 961 }
 962
 963 static av_always_inline
 964 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 965 {
 966     if (mode == DC_PRED8x8) {
 967         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 968     } else {
 969         return mode;
 970     }
 971 }
 972
 973 static av_always_inline
 974 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 975 {
 976     switch (mode) {
 977     case DC_PRED8x8:
 978         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 979     case VERT_PRED8x8:
 980         return !mb_y ? DC_127_PRED8x8 : mode;
 981     case HOR_PRED8x8:
 982         return !mb_x ? DC_129_PRED8x8 : mode;
 983     case PLANE_PRED8x8 /*TM*/:
 984         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 985     }
 986     return mode;
 987 }
 988
 989 static av_always_inline
 990 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 991 {
 992     if (!mb_x) {
 993         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 994     } else {
 995         return mb_y ? mode : HOR_VP8_PRED;
 996     }
 997 }
 998
 999 static av_always_inline
1000 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1001 {
1002     switch (mode) {
1003     case VERT_PRED:
1004         if (!mb_x && mb_y) {
1005             *copy_buf = 1;
1006             return mode;
1007         }
1008         /* fall-through */
1009     case DIAG_DOWN_LEFT_PRED:
1010     case VERT_LEFT_PRED:
1011         return !mb_y ? DC_127_PRED : mode;
1012     case HOR_PRED:
1013         if (!mb_y) {
1014             *copy_buf = 1;
1015             return mode;
1016         }
1017         /* fall-through */
1018     case HOR_UP_PRED:
1019         return !mb_x ? DC_129_PRED : mode;
1020     case TM_VP8_PRED:
1021         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1022     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1023     case DIAG_DOWN_RIGHT_PRED:
1024     case VERT_RIGHT_PRED:
1025     case HOR_DOWN_PRED:
1026         if (!mb_y || !mb_x)
1027             *copy_buf = 1;
1028         return mode;
1029     }
1030     return mode;
1031 }
1032
1033 static av_always_inline
1034 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1035                    VP8Macroblock *mb, int mb_x, int mb_y)
1036 {
1037     AVCodecContext *avctx = s->avctx;
1038     int x, y, mode, nnz;
1039     uint32_t tr;
1040
1041     // for the first row, we need to run xchg_mb_border to init the top edge to 127
1042     // otherwise, skip it if we aren't going to deblock
1043     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1044         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1045                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1046                        s->filter.simple, 1);
1047
1048     if (mb->mode < MODE_I4x4) {
1049         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
1050             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
1051         } else {
1052             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1053         }
1054         s->hpc.pred16x16[mode](dst[0], s->linesize);
1055     } else {
1056         uint8_t *ptr = dst[0];
1057         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1058         uint8_t tr_top[4] = { 127, 127, 127, 127 };
1059
1060         // all blocks on the right edge of the macroblock use bottom edge
1061         // the top macroblock for their topright edge
1062         uint8_t *tr_right = ptr - s->linesize + 16;
1063
1064         // if we're on the right edge of the frame, said edge is extended
1065         // from the top macroblock
1066         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1067             mb_x == s->mb_width-1) {
1068             tr = tr_right[-1]*0x01010101u;
1069             tr_right = (uint8_t *)&tr;
1070         }
1071
1072         if (mb->skip)
1073             AV_ZERO128(td->non_zero_count_cache);
1074
1075         for (y = 0; y < 4; y++) {
1076             uint8_t *topright = ptr + 4 - s->linesize;
1077             for (x = 0; x < 4; x++) {
1078                 int copy = 0, linesize = s->linesize;
1079                 uint8_t *dst = ptr+4*x;
1080                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1081
1082                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1083                     topright = tr_top;
1084                 } else if (x == 3)
1085                     topright = tr_right;
1086
1087                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1088                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1089                     if (copy) {
1090                         dst = copy_dst + 12;
1091                         linesize = 8;
1092                         if (!(mb_y + y)) {
1093                             copy_dst[3] = 127U;
1094                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1095                         } else {
1096                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1097                             if (!(mb_x + x)) {
1098                                 copy_dst[3] = 129U;
1099                             } else {
1100                                 copy_dst[3] = ptr[4*x-s->linesize-1];
1101                             }
1102                         }
1103                         if (!(mb_x + x)) {
1104                             copy_dst[11] =
1105                             copy_dst[19] =
1106                             copy_dst[27] =
1107                             copy_dst[35] = 129U;
1108                         } else {
1109                             copy_dst[11] = ptr[4*x              -1];
1110                             copy_dst[19] = ptr[4*x+s->linesize  -1];
1111                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
1112                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
1113                         }
1114                     }
1115                 } else {
1116                     mode = intra4x4[x];
1117                 }
1118                 s->hpc.pred4x4[mode](dst, topright, linesize);
1119                 if (copy) {
1120                     AV_COPY32(ptr+4*x              , copy_dst+12);
1121                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1122                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1123                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1124                 }
1125
1126                 nnz = td->non_zero_count_cache[y][x];
1127                 if (nnz) {
1128                     if (nnz == 1)
1129                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
1130                     else
1131                         s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
1132                 }
1133                 topright += 4;
1134             }
1135
1136             ptr   += 4*s->linesize;
1137             intra4x4 += 4;
1138         }
1139     }
1140
1141     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1142         mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
1143     } else {
1144         mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
1145     }
1146     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1147     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1148
1149     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1150         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1151                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1152                        s->filter.simple, 0);
1153 }
1154
/* Per-subpel-position lookup, indexed as subpel_idx[kind][fraction & 7]. */
static const uint8_t subpel_idx[3][8] = {
    /* [0] nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1] nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2] nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1161
1162 /**
1163  * luma MC function
1164  *
1165  * @param s VP8 decoding context
1166  * @param dst target buffer for block data at block position
1167  * @param ref reference picture buffer at origin (0, 0)
1168  * @param mv motion vector (relative to block position) to get pixel data from
1169  * @param x_off horizontal position of block from origin (0, 0)
1170  * @param y_off vertical position of block from origin (0, 0)
1171  * @param block_w width of block (16, 8 or 4)
1172  * @param block_h height of block (always same as block_w)
1173  * @param width width of src/dst plane data
1174  * @param height height of src/dst plane data
1175  * @param linesize size of a single line of plane data, including padding
1176  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1177  */
1178 static av_always_inline
1179 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1180                  ThreadFrame *ref, const VP56mv *mv,
1181                  int x_off, int y_off, int block_w, int block_h,
1182                  int width, int height, int linesize,
1183                  vp8_mc_func mc_func[3][3])
1184 {
1185     uint8_t *src = ref->f->data[0];
1186
1187     if (AV_RN32A(mv)) {
1188
1189         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1190         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1191
1192         x_off += mv->x >> 2;
1193         y_off += mv->y >> 2;
1194
1195         // edge emulation
1196         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1197         src += y_off * linesize + x_off;
1198         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1199             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1200             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1201                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1202                                      x_off - mx_idx, y_off - my_idx, width, height);
1203             src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1204         }
1205         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1206     } else {
1207         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1208         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1209     }
1210 }
1211
1212 /**
1213  * chroma MC function
1214  *
1215  * @param s VP8 decoding context
1216  * @param dst1 target buffer for block data at block position (U plane)
1217  * @param dst2 target buffer for block data at block position (V plane)
1218  * @param ref reference picture buffer at origin (0, 0)
1219  * @param mv motion vector (relative to block position) to get pixel data from
1220  * @param x_off horizontal position of block from origin (0, 0)
1221  * @param y_off vertical position of block from origin (0, 0)
1222  * @param block_w width of block (16, 8 or 4)
1223  * @param block_h height of block (always same as block_w)
1224  * @param width width of src/dst plane data
1225  * @param height height of src/dst plane data
1226  * @param linesize size of a single line of plane data, including padding
1227  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1228  */
1229 static av_always_inline
1230 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
1231                    ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
1232                    int block_w, int block_h, int width, int height, int linesize,
1233                    vp8_mc_func mc_func[3][3])
1234 {
1235     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1236
1237     if (AV_RN32A(mv)) {
1238         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1239         int my = mv->y&7, my_idx = subpel_idx[0][my];
1240
1241         x_off += mv->x >> 3;
1242         y_off += mv->y >> 3;
1243
1244         // edge emulation
1245         src1 += y_off * linesize + x_off;
1246         src2 += y_off * linesize + x_off;
1247         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1248         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1249             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1250             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1251                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1252                                      x_off - mx_idx, y_off - my_idx, width, height);
1253             src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1254             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1255
1256             s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1257                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1258                                      x_off - mx_idx, y_off - my_idx, width, height);
1259             src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
1260             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1261         } else {
1262             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1263             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1264         }
1265     } else {
1266         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1267         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1268         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1269     }
1270 }
1271
1272 static av_always_inline
1273 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1274                  ThreadFrame *ref_frame, int x_off, int y_off,
1275                  int bx_off, int by_off,
1276                  int block_w, int block_h,
1277                  int width, int height, VP56mv *mv)
1278 {
1279     VP56mv uvmv = *mv;
1280
1281     /* Y */
1282     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1283                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1284                 block_w, block_h, width, height, s->linesize,
1285                 s->put_pixels_tab[block_w == 8]);
1286
1287     /* U/V */
1288     if (s->profile == 3) {
1289         uvmv.x &= ~7;
1290         uvmv.y &= ~7;
1291     }
1292     x_off   >>= 1; y_off   >>= 1;
1293     bx_off  >>= 1; by_off  >>= 1;
1294     width   >>= 1; height  >>= 1;
1295     block_w >>= 1; block_h >>= 1;
1296     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1297                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1298                   &uvmv, x_off + bx_off, y_off + by_off,
1299                   block_w, block_h, width, height, s->uvlinesize,
1300                   s->put_pixels_tab[1 + (block_w == 4)]);
1301 }
1302
1303 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1304  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1305 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1306 {
1307     /* Don't prefetch refs that haven't been used very often this frame. */
1308     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1309         int x_off = mb_x << 4, y_off = mb_y << 4;
1310         int mx = (mb->mv.x>>2) + x_off + 8;
1311         int my = (mb->mv.y>>2) + y_off;
1312         uint8_t **src= s->framep[ref]->tf.f->data;
1313         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1314         /* For threading, a ff_thread_await_progress here might be useful, but
1315          * it actually slows down the decoder. Since a bad prefetch doesn't
1316          * generate bad decoder output, we don't run it here. */
1317         s->vdsp.prefetch(src[0]+off, s->linesize, 4);
1318         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1319         s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
1320     }
1321 }
1322
1323 /**
1324  * Apply motion vectors to prediction buffer, chapter 18.
1325  */
1326 static av_always_inline
1327 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1328                    VP8Macroblock *mb, int mb_x, int mb_y)
1329 {
1330     int x_off = mb_x << 4, y_off = mb_y << 4;
1331     int width = 16*s->mb_width, height = 16*s->mb_height;
1332     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1333     VP56mv *bmv = mb->bmv;
1334
1335     switch (mb->partitioning) {
1336     case VP8_SPLITMVMODE_NONE:
1337         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1338                     0, 0, 16, 16, width, height, &mb->mv);
1339         break;
1340     case VP8_SPLITMVMODE_4x4: {
1341         int x, y;
1342         VP56mv uvmv;
1343
1344         /* Y */
1345         for (y = 0; y < 4; y++) {
1346             for (x = 0; x < 4; x++) {
1347                 vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
1348                             ref, &bmv[4*y + x],
1349                             4*x + x_off, 4*y + y_off, 4, 4,
1350                             width, height, s->linesize,
1351                             s->put_pixels_tab[2]);
1352             }
1353         }
1354
1355         /* U/V */
1356         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1357         for (y = 0; y < 2; y++) {
1358             for (x = 0; x < 2; x++) {
1359                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1360                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1361                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1362                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1363                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1364                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1365                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1366                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1367                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1368                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1369                 if (s->profile == 3) {
1370                     uvmv.x &= ~7;
1371                     uvmv.y &= ~7;
1372                 }
1373                 vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
1374                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1375                               4*x + x_off, 4*y + y_off, 4, 4,
1376                               width, height, s->uvlinesize,
1377                               s->put_pixels_tab[2]);
1378             }
1379         }
1380         break;
1381     }
1382     case VP8_SPLITMVMODE_16x8:
1383         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1384                     0, 0, 16, 8, width, height, &bmv[0]);
1385         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1386                     0, 8, 16, 8, width, height, &bmv[1]);
1387         break;
1388     case VP8_SPLITMVMODE_8x16:
1389         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1390                     0, 0, 8, 16, width, height, &bmv[0]);
1391         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1392                     8, 0, 8, 16, width, height, &bmv[1]);
1393         break;
1394     case VP8_SPLITMVMODE_8x8:
1395         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1396                     0, 0, 8, 8, width, height, &bmv[0]);
1397         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1398                     8, 0, 8, 8, width, height, &bmv[1]);
1399         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1400                     0, 8, 8, 8, width, height, &bmv[2]);
1401         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1402                     8, 8, 8, 8, width, height, &bmv[3]);
1403         break;
1404     }
1405 }
1406
1407 static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
1408                                      uint8_t *dst[3], VP8Macroblock *mb)
1409 {
1410     int x, y, ch;
1411
1412     if (mb->mode != MODE_I4x4) {
1413         uint8_t *y_dst = dst[0];
1414         for (y = 0; y < 4; y++) {
1415             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1416             if (nnz4) {
1417                 if (nnz4&~0x01010101) {
1418                     for (x = 0; x < 4; x++) {
1419                         if ((uint8_t)nnz4 == 1)
1420                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
1421                         else if((uint8_t)nnz4 > 1)
1422                             s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
1423                         nnz4 >>= 8;
1424                         if (!nnz4)
1425                             break;
1426                     }
1427                 } else {
1428                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1429                 }
1430             }
1431             y_dst += 4*s->linesize;
1432         }
1433     }
1434
1435     for (ch = 0; ch < 2; ch++) {
1436         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
1437         if (nnz4) {
1438             uint8_t *ch_dst = dst[1+ch];
1439             if (nnz4&~0x01010101) {
1440                 for (y = 0; y < 2; y++) {
1441                     for (x = 0; x < 2; x++) {
1442                         if ((uint8_t)nnz4 == 1)
1443                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1444                         else if((uint8_t)nnz4 > 1)
1445                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
1446                         nnz4 >>= 8;
1447                         if (!nnz4)
1448                             goto chroma_idct_end;
1449                     }
1450                     ch_dst += 4*s->uvlinesize;
1451                 }
1452             } else {
1453                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
1454             }
1455         }
1456 chroma_idct_end: ;
1457     }
1458 }
1459
1460 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1461 {
1462     int interior_limit, filter_level;
1463
1464     if (s->segmentation.enabled) {
1465         filter_level = s->segmentation.filter_level[mb->segment];
1466         if (!s->segmentation.absolute_vals)
1467             filter_level += s->filter.level;
1468     } else
1469         filter_level = s->filter.level;
1470
1471     if (s->lf_delta.enabled) {
1472         filter_level += s->lf_delta.ref[mb->ref_frame];
1473         filter_level += s->lf_delta.mode[mb->mode];
1474     }
1475
1476     filter_level = av_clip_uintp2(filter_level, 6);
1477
1478     interior_limit = filter_level;
1479     if (s->filter.sharpness) {
1480         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1481         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1482     }
1483     interior_limit = FFMAX(interior_limit, 1);
1484
1485     f->filter_level = filter_level;
1486     f->inner_limit = interior_limit;
1487     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1488 }
1489
1490 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1491 {
1492     int mbedge_lim, bedge_lim, hev_thresh;
1493     int filter_level = f->filter_level;
1494     int inner_limit = f->inner_limit;
1495     int inner_filter = f->inner_filter;
1496     int linesize = s->linesize;
1497     int uvlinesize = s->uvlinesize;
1498     static const uint8_t hev_thresh_lut[2][64] = {
1499         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1500           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1501           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1502           3, 3, 3, 3 },
1503         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1504           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1505           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1506           2, 2, 2, 2 }
1507     };
1508
1509     if (!filter_level)
1510         return;
1511
1512      bedge_lim = 2*filter_level + inner_limit;
1513     mbedge_lim = bedge_lim + 4;
1514
1515     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1516
1517     if (mb_x) {
1518         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1519                                        mbedge_lim, inner_limit, hev_thresh);
1520         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1521                                        mbedge_lim, inner_limit, hev_thresh);
1522     }
1523
1524     if (inner_filter) {
1525         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1526                                              inner_limit, hev_thresh);
1527         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1528                                              inner_limit, hev_thresh);
1529         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1530                                              inner_limit, hev_thresh);
1531         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1532                                              uvlinesize,  bedge_lim,
1533                                              inner_limit, hev_thresh);
1534     }
1535
1536     if (mb_y) {
1537         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1538                                        mbedge_lim, inner_limit, hev_thresh);
1539         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1540                                        mbedge_lim, inner_limit, hev_thresh);
1541     }
1542
1543     if (inner_filter) {
1544         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1545                                              linesize,    bedge_lim,
1546                                              inner_limit, hev_thresh);
1547         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1548                                              linesize,    bedge_lim,
1549                                              inner_limit, hev_thresh);
1550         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1551                                              linesize,    bedge_lim,
1552                                              inner_limit, hev_thresh);
1553         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1554                                              dst[2] + 4 * uvlinesize,
1555                                              uvlinesize,  bedge_lim,
1556                                              inner_limit, hev_thresh);
1557     }
1558 }
1559
     /**
      * Run the "simple" loop filter over one macroblock of a single plane.
      *
      * Returns immediately when f->filter_level is 0. Otherwise:
      *  - the left macroblock edge is filtered when mb_x is non-zero,
      *  - the top macroblock edge is filtered when mb_y is non-zero,
      *  - the three interior edges at offsets 4, 8 and 12 (first as columns,
      *    then as rows) are filtered when f->inner_filter is set.
      *
      * @param s    decoder context; supplies s->linesize and the vp8dsp callbacks
      * @param dst  pointer to the first sample of the macroblock in the plane
      * @param f    per-macroblock filter parameters (level, inner limit, inner flag)
      * @param mb_x macroblock column; 0 suppresses the left-edge filter
      * @param mb_y macroblock row; 0 suppresses the top-edge filter
      */
1560 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1561 {
1562     int mbedge_lim, bedge_lim;
1563     int filter_level = f->filter_level;
1564     int inner_limit = f->inner_limit;
1565     int inner_filter = f->inner_filter;
1566     int linesize = s->linesize;
1567
1568     if (!filter_level)
1569         return;
1570
         // Interior edges use bedge_lim; macroblock edges use bedge_lim + 4.
1571      bedge_lim = 2*filter_level + inner_limit;
1572     mbedge_lim = bedge_lim + 4;
1573
         // Vertical edges (horizontal filtering): macroblock edge, then interior.
1574     if (mb_x)
1575         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1576     if (inner_filter) {
1577         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1578         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1579         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1580     }
1581
         // Horizontal edges (vertical filtering): macroblock edge, then interior.
1582     if (mb_y)
1583         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1584     if (inner_filter) {
1585         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1586         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1587         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1588     }
1589 }
1590
     /*
      * MARGIN is the slack added on each side of the motion-vector clamping
      * window (s->mv_min / s->mv_max). Its value is 64.
      * NOTE(review): presumably 16 pixels expressed in quarter-pel units -- confirm.
      */
1591 #define MARGIN (16 << 2)
     /**
      * Decode the mode information of every macroblock of the frame in one pass.
      *
      * Macroblocks are visited in raster order and decode_mb_mode() is called for
      * each of them with its last argument set to 1. While walking, the clamping
      * window s->mv_min / s->mv_max is kept relative to the current macroblock:
      * it starts at -MARGIN / ((count - 1) << 6) + MARGIN and both bounds drop by
      * 64 for every macroblock (x) and every row (y).
      *
      * @param avctx      codec context; priv_data is the VP8Context
      * @param curframe   frame being decoded; its seg_map is passed to decode_mb_mode()
      * @param prev_frame previous frame, may be NULL; its seg_map (if any) is passed
      *                   to decode_mb_mode() as the reference segmentation map
      */
1592 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
1593                                    VP8Frame *prev_frame)
1594 {
1595     VP8Context *s = avctx->priv_data;
1596     int mb_x, mb_y;
1597
1598     s->mv_min.y = -MARGIN;
1599     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1600     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
             // Rows are mb_width+1 entries apart in macroblocks_base; the "+ 1"
             // terms skip one extra row and one extra leading entry.
1601         VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1602         int mb_xy = mb_y*s->mb_width;
1603
             // Every row starts with the left intra 4x4 predictors set to DC_PRED.
1604         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1605
1606         s->mv_min.x = -MARGIN;
1607         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1608         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
                 // For the first row, initialise the top predictors stored in the
                 // entry one row above (mb - (mb_width + 1)) to DC_PRED.
1609             if (mb_y == 0)
1610                 AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
1611             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1612                            prev_frame && prev_frame->seg_map ?
1613                            prev_frame->seg_map->data + mb_xy : NULL, 1);
1614             s->mv_min.x -= 64;
1615             s->mv_max.x -= 64;
1616         }
1617         s->mv_min.y -= 64;
1618         s->mv_max.y -= 64;
1619     }
1620 }
1621
1622 #if HAVE_THREADS
1623 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
1624     do {\
1625         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);\
1626         if (otd->thread_mb_pos < tmp) {\
1627             pthread_mutex_lock(&otd->lock);\
1628             td->wait_mb_pos = tmp;\
1629             do {\
1630                 if (otd->thread_mb_pos >= tmp)\
1631                     break;\
1632                 pthread_cond_wait(&otd->cond, &otd->lock);\
1633             } while (1);\
1634             td->wait_mb_pos = INT_MAX;\
1635             pthread_mutex_unlock(&otd->lock);\
1636         }\
1637     } while(0);
1638
1639 #define update_pos(td, mb_y, mb_x)\
1640     do {\
1641     int pos              = (mb_y << 16) | (mb_x & 0xFFFF);\
1642     int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
1643     int is_null          = (next_td == NULL) || (prev_td == NULL);\
1644     int pos_check        = (is_null) ? 1 :\
1645                             (next_td != td && pos >= next_td->wait_mb_pos) ||\
1646                             (prev_td != td && pos >= prev_td->wait_mb_pos);\
1647     td->thread_mb_pos = pos;\
1648     if (sliced_threading && pos_check) {\
1649         pthread_mutex_lock(&td->lock);\
1650         pthread_cond_broadcast(&td->cond);\
1651         pthread_mutex_unlock(&td->lock);\
1652     }\
1653     } while(0);
1654 #else
1655 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)
1656 #define update_pos(td, mb_y, mb_x)
1657 #endif
1658
     /**
      * Decode and reconstruct one macroblock row, without loop filtering.
      *
      * The row index is read from the upper bits of td->thread_mb_pos
      * (thread_mb_pos >> 16), where td is s->thread_data[threadnr]. For every
      * macroblock of the row the function
      *  - waits (check_thread_pos) for the job handling the row above,
      *  - decodes the macroblock mode when s->mb_layout is 0,
      *  - decodes the coefficients unless mb->skip is set,
      *  - runs intra or inter prediction,
      *  - adds the residual through idct_mb(), or clears the non-zero contexts
      *    for skipped macroblocks,
      *  - stores the loop-filter strength in td->filter_strength[mb_x] when
      *    s->deblock_filter is set,
      *  - publishes its progress with update_pos().
      *
      * @param avctx    codec context; priv_data is the VP8Context
      * @param tdata    not referenced by this function
      * @param jobnr    job index, used to locate the neighbouring jobs' thread data
      * @param threadnr index into s->thread_data for this call's own state
      */
1659 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1660                                         int jobnr, int threadnr)
1661 {
1662     VP8Context *s = avctx->priv_data;
1663     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1664     int mb_y = td->thread_mb_pos>>16;
1665     int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1666     int num_jobs = s->num_jobs;
1667     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
         // NOTE(review): the mask assumes num_coeff_partitions is a power of two -- confirm.
1668     VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1669     VP8Macroblock *mb;
         // Start of this macroblock row in each of the three planes
         // (16 lines per row in plane 0, 8 lines per row in planes 1 and 2).
1670     uint8_t *dst[3] = {
1671         curframe->tf.f->data[0] + 16*mb_y*s->linesize,
1672         curframe->tf.f->data[1] +  8*mb_y*s->uvlinesize,
1673         curframe->tf.f->data[2] +  8*mb_y*s->uvlinesize
1674     };
         // The first/last row has no previous/next neighbour; pointing prev_td /
         // next_td at td itself marks that case (tested as "prev_td != td" below).
1675     if (mb_y == 0) prev_td = td;
1676     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1677     if (mb_y == s->mb_height-1) next_td = td;
1678     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1679     if (s->mb_layout == 1)
1680         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1681     else {
1682         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1683         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1684         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1685     }
1686
1687     memset(td->left_nnz, 0, sizeof(td->left_nnz));
1688     // left edge of 129 for intra prediction
1689     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
             // 16 lines for plane 0, 8 lines for planes 1 and 2 (16>>!!i).
1690         for (i = 0; i < 3; i++)
1691             for (y = 0; y < 16>>!!i; y++)
1692                 dst[i][y*curframe->tf.f->linesize[i]-1] = 129;
1693         if (mb_y == 1) {
1694             s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1695         }
1696     }
1697
1698     s->mv_min.x = -MARGIN;
1699     s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1700
1701     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1702         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1703         if (prev_td != td) {
                 // For threadnr 0 the awaited column is offset by s->mb_width+3,
                 // which is the column range published by vp8_filter_mb_row().
1704             if (threadnr != 0) {
1705                 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1706             } else {
1707                 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1708             }
1709         }
1710
1711         s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1712         s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1713
             // With mb_layout == 1 the modes were already decoded for the whole
             // frame by vp8_decode_mv_mb_modes().
1714         if (!s->mb_layout)
1715             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
1716                            prev_frame && prev_frame->seg_map ?
1717                            prev_frame->seg_map->data + mb_xy : NULL, 0);
1718
1719         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1720
1721         if (!mb->skip)
1722             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1723
1724         if (mb->mode <= MODE_I4x4)
1725             intra_predict(s, td, dst, mb, mb_x, mb_y);
1726         else
1727             inter_predict(s, td, dst, mb, mb_x, mb_y);
1728
1729         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1730
1731         if (!mb->skip) {
1732             idct_mb(s, td, dst, mb);
1733         } else {
1734             AV_ZERO64(td->left_nnz);
1735             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1736
1737             // Reset DC block predictors if they would exist if the mb had coefficients
1738             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1739                 td->left_nnz[8]     = 0;
1740                 s->top_nnz[mb_x][8] = 0;
1741             }
1742         }
1743
1744         if (s->deblock_filter)
1745             filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1746
             // With several jobs, only the call with threadnr == num_jobs-1 saves
             // the macroblock border into s->top_border here.
1747         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1748             if (s->filter.simple)
1749                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1750             else
1751                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1752         }
1753
1754         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1755
             // Advance to the next macroblock: 16 samples in plane 0, 8 in planes 1/2.
1756         dst[0] += 16;
1757         dst[1] += 8;
1758         dst[2] += 8;
1759         s->mv_min.x -= 64;
1760         s->mv_max.x -= 64;
1761
             // NOTE(review): the loop bound is mb_x < s->mb_width, so the first
             // branch looks unreachable and the else branch is what runs -- confirm.
1762         if (mb_x == s->mb_width+1) {
1763             update_pos(td, mb_y, s->mb_width+3);
1764         } else {
1765             update_pos(td, mb_y, mb_x);
1766         }
1767     }
1768 }
1769
     /**
      * Apply the loop filter to one already reconstructed macroblock row.
      *
      * The row index is read from td->thread_mb_pos >> 16, where td is
      * s->thread_data[threadnr]. Filter parameters for each macroblock come from
      * td->filter_strength[mb_x]. Before filtering a macroblock the function
      * waits for the neighbouring jobs (rows mb_y-1 and mb_y+1) to have advanced
      * far enough, and after filtering it publishes its progress with the column
      * offset by s->mb_width+3.
      *
      * @param avctx    codec context; priv_data is the VP8Context
      * @param tdata    not referenced by this function
      * @param jobnr    job index, used to locate the neighbouring jobs' thread data
      * @param threadnr index into s->thread_data for this call's own state
      */
1770 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1771                               int jobnr, int threadnr)
1772 {
1773     VP8Context *s = avctx->priv_data;
1774     VP8ThreadData *td = &s->thread_data[threadnr];
1775     int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1776     AVFrame *curframe = s->curframe->tf.f;
1777     VP8Macroblock *mb;
1778     VP8ThreadData *prev_td, *next_td;
         // Start of this macroblock row in each of the three planes.
1779     uint8_t *dst[3] = {
1780         curframe->data[0] + 16*mb_y*s->linesize,
1781         curframe->data[1] +  8*mb_y*s->uvlinesize,
1782         curframe->data[2] +  8*mb_y*s->uvlinesize
1783     };
1784
         // NOTE(review): mb is initialised and advanced by the loop below but is
         // not otherwise read in this function.
1785     if (s->mb_layout == 1)
1786         mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1787     else
1788         mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1789
         // prev_td / next_td equal to td mean "no neighbour on that side".
1790     if (mb_y == 0) prev_td = td;
1791     else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1792     if (mb_y == s->mb_height-1) next_td = td;
1793     else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1794
1795     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1796         VP8FilterStrength *f = &td->filter_strength[mb_x];
             // Wait until the row above has published the offset (filter-stage)
             // position for column mb_x+1.
1797         if (prev_td != td) {
1798             check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1799         }
             // Wait until the row below has published column mb_x+1, except when
             // that row is handled by thread_data[0].
1800         if (next_td != td)
1801             if (next_td != &s->thread_data[0]) {
1802                 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1803             }
1804
             // With a single job the border is saved here, before filtering;
             // with several jobs vp8_decode_mb_row_no_filter() does it instead.
1805         if (num_jobs == 1) {
1806             if (s->filter.simple)
1807                 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1808             else
1809                 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1810         }
1811
             // The simple filter only receives plane 0; the normal filter gets all three.
1812         if (s->filter.simple)
1813             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1814         else
1815             filter_mb(s, dst, f, mb_x, mb_y);
1816         dst[0] += 16;
1817         dst[1] += 8;
1818         dst[2] += 8;
1819
1820         update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1821     }
1822 }
1823
     /**
      * Job callback passed to avctx->execute2(): decode (and optionally filter)
      * every num_jobs-th macroblock row, starting with row jobnr.
      *
      * For each row, the row index is stored in td->thread_mb_pos, the row is
      * reconstructed by vp8_decode_mb_row_no_filter(), filtered by
      * vp8_filter_mb_row() when s->deblock_filter is set, and finally marked as
      * complete with update_pos(). In frame-threading mode per-row progress is
      * additionally reported on the current frame.
      *
      * @param avctx    codec context; priv_data is the VP8Context
      * @param tdata    passed through unchanged to the row functions
      * @param jobnr    job index; selects the first row and s->thread_data[jobnr]
      * @param threadnr worker thread index, stored in td->thread_nr
      * @return always 0
      */
1824 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1825                                     int jobnr, int threadnr)
1826 {
1827     VP8Context *s = avctx->priv_data;
1828     VP8ThreadData *td = &s->thread_data[jobnr];
         // NULL neighbours: the update_pos() below then takes its "always
         // broadcast" path (when built with thread support).
1829     VP8ThreadData *next_td = NULL, *prev_td = NULL;
1830     VP8Frame *curframe = s->curframe;
1831     int mb_y, num_jobs = s->num_jobs;
1832     td->thread_nr = threadnr;
1833     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
             // NOTE(review): this repeats the loop condition just evaluated above.
1834         if (mb_y >= s->mb_height) break;
1835         td->thread_mb_pos = mb_y<<16;
1836         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1837         if (s->deblock_filter)
1838             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
             // Publish the largest possible column (0xFFFF) for this row.
1839         update_pos(td, mb_y, INT_MAX & 0xFFFF);
1840
1841         s->mv_min.y -= 64;
1842         s->mv_max.y -= 64;
1843
1844         if (avctx->active_thread_type == FF_THREAD_FRAME)
1845             ff_thread_report_progress(&curframe->tf, mb_y, 0);
1846     }
1847
1848     return 0;
1849 }
1850
     /**
      * Decode one packet into (at most) one output frame.
      *
      * Steps, in order: parse the frame header; decide whether the frame can be
      * skipped according to avctx->skip_frame; release frames that are no longer
      * referenced and pick a free one; allocate its buffers; compute the
      * reference set for the next frame (s->next_framep); reset per-frame
      * context; run the row jobs through avctx->execute2(); commit next_framep
      * into framep; and finally hand a reference to the decoded frame to the
      * caller unless the frame is invisible.
      *
      * @param avctx     codec context; priv_data is the VP8Context
      * @param data      output AVFrame, filled through av_frame_ref()
      * @param got_frame set to 1 when a frame was written to data
      * @param avpkt     input packet
      * @return avpkt->size on success; otherwise the error value produced by
      *         decode_frame_header(), vp8_alloc_frame() or av_frame_ref(), or
      *         AVERROR_INVALIDDATA for an inter frame lacking reference frames
      */
1851 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
1852                             AVPacket *avpkt)
1853 {
1854     VP8Context *s = avctx->priv_data;
1855     int ret, i, referenced, num_jobs;
1856     enum AVDiscard skip_thresh;
         // curframe stays unassigned on the skip_decode path; that path sets
         // s->invisible, so curframe is not read at the end of the function.
1857     VP8Frame *av_uninit(curframe), *prev_frame;
1858
1859     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1860         goto err;
1861
1862     prev_frame = s->framep[VP56_FRAME_CURRENT];
1863
         // The frame is "referenced" if it updates the last, golden or altref slot.
1864     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1865                                 || s->update_altref == VP56_FRAME_CURRENT;
1866
1867     skip_thresh = !referenced ? AVDISCARD_NONREF :
1868                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1869
1870     if (avctx->skip_frame >= skip_thresh) {
1871         s->invisible = 1;
             // Keep the reference set unchanged for the next frame.
1872         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1873         goto skip_decode;
1874     }
1875     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1876
         // NOTE(review): the literal 5 in the two loops below presumably matches
         // the size of s->frames -- confirm against the VP8Context declaration.
1877     // release no longer referenced frames
1878     for (i = 0; i < 5; i++)
1879         if (s->frames[i].tf.f->data[0] &&
1880             &s->frames[i] != prev_frame &&
1881             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1882             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1883             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1884             vp8_release_frame(s, &s->frames[i]);
1885
1886     // find a free buffer
1887     for (i = 0; i < 5; i++)
1888         if (&s->frames[i] != prev_frame &&
1889             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1890             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1891             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1892             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1893             break;
1894         }
1895     if (i == 5) {
1896         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1897         abort();
1898     }
1899     if (curframe->tf.f->data[0])
1900         vp8_release_frame(s, curframe);
1901
1902     // Given that arithmetic probabilities are updated every frame, it's quite likely
1903     // that the values we have on a random interframe are complete junk if we didn't
1904     // start decode on a keyframe. So just don't display anything rather than junk.
1905     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1906                          !s->framep[VP56_FRAME_GOLDEN] ||
1907                          !s->framep[VP56_FRAME_GOLDEN2])) {
1908         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1909         ret = AVERROR_INVALIDDATA;
1910         goto err;
1911     }
1912
1913     curframe->tf.f->key_frame = s->keyframe;
1914     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1915     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
1916         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1917         goto err;
1918     }
1919
1920     // check if golden and altref are swapped
1921     if (s->update_altref != VP56_FRAME_NONE) {
1922         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1923     } else {
1924         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1925     }
1926     if (s->update_golden != VP56_FRAME_NONE) {
1927         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1928     } else {
1929         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1930     }
1931     if (s->update_last) {
1932         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1933     } else {
1934         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1935     }
1936     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1937
1938     ff_thread_finish_setup(avctx);
1939
1940     s->linesize   = curframe->tf.f->linesize[0];
1941     s->uvlinesize = curframe->tf.f->linesize[1];
1942
         // Allocated once, on the first frame that finds thread 0 without a buffer.
         // NOTE(review): the av_malloc() results are not checked here.
1943     if (!s->thread_data[0].edge_emu_buffer)
1944         for (i = 0; i < MAX_THREADS; i++)
1945             s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);
1946
1947     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1948     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1949     if (!s->mb_layout)
1950         memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1951     if (!s->mb_layout && s->keyframe)
1952         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1953
1954     // top edge of 127 for intra prediction
1955     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1956         s->top_border[0][15] = s->top_border[0][23] = 127;
1957         s->top_border[0][31] = 127;
1958         memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1959     }
1960     memset(s->ref_count, 0, sizeof(s->ref_count));
1961
1962
1963     // Make sure the previous frame has read its segmentation map,
1964     // if we re-use the same map.
1965     if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1966         ff_thread_await_progress(&prev_frame->tf, 1, 0);
1967
1968     if (s->mb_layout == 1)
1969         vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1970
         // Frame threading uses a single job per frame; otherwise the job count
         // is limited by both the partition count and the thread count.
1971     if (avctx->active_thread_type == FF_THREAD_FRAME)
1972         num_jobs = 1;
1973     else
1974         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1975     s->num_jobs   = num_jobs;
1976     s->curframe   = curframe;
1977     s->prev_frame = prev_frame;
1978     s->mv_min.y   = -MARGIN;
1979     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
         // Reset the published / awaited positions of every thread slot.
1980     for (i = 0; i < MAX_THREADS; i++) {
1981         s->thread_data[i].thread_mb_pos = 0;
1982         s->thread_data[i].wait_mb_pos = INT_MAX;
1983     }
1984     avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1985
1986     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
         // Commit the reference set computed above.
1987     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1988
1989 skip_decode:
1990     // if future frames don't use the updated probabilities,
1991     // reset them to the values we saved
1992     if (!s->update_probabilities)
1993         s->prob[0] = s->prob[1];
1994
1995     if (!s->invisible) {
1996         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
1997             return ret;
1998         *got_frame      = 1;
1999     }
2000
2001     return avpkt->size;
2002 err:
         // On failure the next frame keeps using the current reference set.
2003     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2004     return ret;
2005 }
2006
     /**
      * Codec close callback: flush the decoder state through
      * vp8_decode_flush_impl(avctx, 1) and free the AVFrame of every entry of
      * s->frames.
      *
      * @param avctx codec context; priv_data is the VP8Context
      * @return always 0
      */
2007 static av_cold int vp8_decode_free(AVCodecContext *avctx)
2008 {
2009     VP8Context *s = avctx->priv_data;
2010     int i;
2011
2012     vp8_decode_flush_impl(avctx, 1);
2013     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2014         av_frame_free(&s->frames[i].tf.f);
2015
2016     return 0;
2017 }
2018
     /**
      * Allocate an AVFrame for every entry of s->frames.
      *
      * Stops at the first failed allocation; frames allocated before the failure
      * are left in place for the caller to release.
      *
      * @param s decoder context
      * @return 0 on success, AVERROR(ENOMEM) if an allocation fails
      */
2019 static av_cold int vp8_init_frames(VP8Context *s)
2020 {
2021     int i;
2022     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2023         s->frames[i].tf.f = av_frame_alloc();
2024         if (!s->frames[i].tf.f)
2025             return AVERROR(ENOMEM);
2026     }
2027     return 0;
2028 }
2029
     /**
      * Codec init callback: set the output pixel format to YUV 4:2:0, enable
      * progress allocation, initialise the DSP function tables and allocate the
      * frame pool.
      *
      * @param avctx codec context; priv_data is the VP8Context
      * @return 0 on success, or the negative error from vp8_init_frames() after
      *         the decoder has been torn down again with vp8_decode_free()
      */
2030 static av_cold int vp8_decode_init(AVCodecContext *avctx)
2031 {
2032     VP8Context *s = avctx->priv_data;
2033     int ret;
2034
2035     s->avctx = avctx;
2036     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2037     avctx->internal->allocate_progress = 1;
2038
2039     ff_videodsp_init(&s->vdsp, 8);
2040     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2041     ff_vp8dsp_init(&s->vp8dsp);
2042
2043     if ((ret = vp8_init_frames(s)) < 0) {
2044         vp8_decode_free(avctx);
2045         return ret;
2046     }
2047
2048     return 0;
2049 }
2050
     /**
      * init_thread_copy callback: bind the context to its own avctx and
      * allocate its frame pool. Unlike vp8_decode_init() it does not touch the
      * pixel format or the DSP tables.
      *
      * @param avctx codec context of the thread copy; priv_data is the VP8Context
      * @return 0 on success, or the negative error from vp8_init_frames() after
      *         vp8_decode_free() has been called
      */
2051 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2052 {
2053     VP8Context *s = avctx->priv_data;
2054     int ret;
2055
2056     s->avctx = avctx;
2057
2058     if ((ret = vp8_init_frames(s)) < 0) {
2059         vp8_decode_free(avctx);
2060         return ret;
2061     }
2062
2063     return 0;
2064 }
2065
     /*
      * REBASE(pic): translate a pointer into s_src->frames[] into the pointer to
      * the entry with the same index in s->frames[]; NULL stays NULL. The macro
      * reads s and s_src from the scope in which it is expanded, and its
      * expansion is not parenthesised, so it is only suitable as a complete
      * right-hand side as used below.
      */
2066 #define REBASE(pic) \
2067     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2068
     /**
      * update_thread_context callback: copy the state a following frame needs
      * from the source context into the destination context.
      *
      * Copies the macroblock dimensions (freeing the destination buffers first
      * when they are allocated and the size differs), the probability set, the
      * segmentation and loop-filter delta settings and the sign biases; takes a
      * reference on every source frame that holds data; and sets the
      * destination's framep[] from the source's next_framep[].
      *
      * @param dst destination codec context
      * @param src source codec context
      * @return 0 on success, or the negative error returned by vp8_ref_frame()
      */
2069 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2070 {
2071     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2072     int i;
2073
2074     if (s->macroblocks_base &&
2075         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2076         free_buffers(s);
2077         s->mb_width  = s_src->mb_width;
2078         s->mb_height = s_src->mb_height;
2079     }
2080
         // Take prob[0] when the source keeps its updated probabilities,
         // prob[1] otherwise.
2081     s->prob[0] = s_src->prob[!s_src->update_probabilities];
2082     s->segmentation = s_src->segmentation;
2083     s->lf_delta = s_src->lf_delta;
2084     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2085
2086     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2087         if (s_src->frames[i].tf.f->data[0]) {
2088             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2089             if (ret < 0)
2090                 return ret;
2091         }
2092     }
2093
         // The destination starts from the reference set the source prepared
         // for its successor.
2094     s->framep[0] = REBASE(s_src->next_framep[0]);
2095     s->framep[1] = REBASE(s_src->next_framep[1]);
2096     s->framep[2] = REBASE(s_src->next_framep[2]);
2097     s->framep[3] = REBASE(s_src->next_framep[3]);
2098
2099     return 0;
2100 }
2101
     /**
      * Codec descriptor for the VP8 video decoder. It wires the callbacks defined
      * in this file into the AVCodec structure and advertises direct rendering,
      * frame threading and slice threading. The two thread-related callbacks are
      * wrapped in ONLY_IF_THREADS_ENABLED().
      */
2102 AVCodec ff_vp8_decoder = {
2103     .name                  = "vp8",
2104     .type                  = AVMEDIA_TYPE_VIDEO,
2105     .id                    = AV_CODEC_ID_VP8,
2106     .priv_data_size        = sizeof(VP8Context),
2107     .init                  = vp8_decode_init,
2108     .close                 = vp8_decode_free,
2109     .decode                = vp8_decode_frame,
2110     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2111     .flush                 = vp8_decode_flush,
2112     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2113     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2114     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2115 };