git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of Libav.
  11  *
  12  * Libav is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * Libav is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with Libav; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
  40 static void free_buffers(VP8Context *s)
  41 {
  42     int i;
  43     if (s->thread_data)
  44         for (i = 0; i < MAX_THREADS; i++) {
  45 #if HAVE_THREADS
  46             pthread_cond_destroy(&s->thread_data[i].cond);
  47             pthread_mutex_destroy(&s->thread_data[i].lock);
  48 #endif
  49             av_freep(&s->thread_data[i].filter_strength);
  50         }
  51     av_freep(&s->thread_data);
  52     av_freep(&s->macroblocks_base);
  53     av_freep(&s->intra4x4_pred_mode_top);
  54     av_freep(&s->top_nnz);
  55     av_freep(&s->top_border);
  56
  57     s->macroblocks = NULL;
  58 }
  59
  60 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  61 {
  62     int ret;
  63     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  64                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  65         return ret;
  66     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  67         ff_thread_release_buffer(s->avctx, &f->tf);
  68         return AVERROR(ENOMEM);
  69     }
  70     return 0;
  71 }
  72
  73 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  74 {
  75     av_buffer_unref(&f->seg_map);
  76     ff_thread_release_buffer(s->avctx, &f->tf);
  77 }
  78
  79 #if CONFIG_VP8_DECODER
  80 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  81 {
  82     int ret;
  83
  84     vp8_release_frame(s, dst);
  85
  86     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  87         return ret;
  88     if (src->seg_map &&
  89         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  90         vp8_release_frame(s, dst);
  91         return AVERROR(ENOMEM);
  92     }
  93
  94     return 0;
  95 }
  96 #endif /* CONFIG_VP8_DECODER */
  97
  98 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  99 {
 100     VP8Context *s = avctx->priv_data;
 101     int i;
 102
 103     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 104         vp8_release_frame(s, &s->frames[i]);
 105     memset(s->framep, 0, sizeof(s->framep));
 106
 107     if (free_mem)
 108         free_buffers(s);
 109 }
 110
 111 static void vp8_decode_flush(AVCodecContext *avctx)
 112 {
 113     vp8_decode_flush_impl(avctx, 0);
 114 }
 115
 116 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 117 {
 118     VP8Frame *frame = NULL;
 119     int i;
 120
 121     // find a free buffer
 122     for (i = 0; i < 5; i++)
 123         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 124             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 125             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 126             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 127             frame = &s->frames[i];
 128             break;
 129         }
 130     if (i == 5) {
 131         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 132         abort();
 133     }
 134     if (frame->tf.f->data[0])
 135         vp8_release_frame(s, frame);
 136
 137     return frame;
 138 }
 139
 140 static av_always_inline
 141 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 142 {
 143     AVCodecContext *avctx = s->avctx;
 144     int i, ret;
 145
 146     if (width  != s->avctx->width ||
 147         height != s->avctx->height) {
 148         vp8_decode_flush_impl(s->avctx, 1);
 149
 150         ret = ff_set_dimensions(s->avctx, width, height);
 151         if (ret < 0)
 152             return ret;
 153     }
 154
 155     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 156     s->mb_height = (s->avctx->coded_height + 15) / 16;
 157
 158     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 159                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 160     if (!s->mb_layout) { // Frame threading and one thread
 161         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 162                                                sizeof(*s->macroblocks));
 163         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 164     } else // Sliced threading
 165         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 166                                          sizeof(*s->macroblocks));
 167     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 168     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 169     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 170
 171     for (i = 0; i < MAX_THREADS; i++) {
 172         s->thread_data[i].filter_strength =
 173             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 174 #if HAVE_THREADS
 175         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 176         pthread_cond_init(&s->thread_data[i].cond, NULL);
 177 #endif
 178     }
 179
 180     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 181         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 182         return AVERROR(ENOMEM);
 183
 184     s->macroblocks = s->macroblocks_base + 1;
 185
 186     return 0;
 187 }
 188
 189 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 190 {
 191     return update_dimensions(s, width, height, IS_VP7);
 192 }
 193
 194 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 195 {
 196     return update_dimensions(s, width, height, IS_VP8);
 197 }
 198
 199 static void parse_segment_info(VP8Context *s)
 200 {
 201     VP56RangeCoder *c = &s->c;
 202     int i;
 203
 204     s->segmentation.update_map = vp8_rac_get(c);
 205
 206     if (vp8_rac_get(c)) { // update segment feature data
 207         s->segmentation.absolute_vals = vp8_rac_get(c);
 208
 209         for (i = 0; i < 4; i++)
 210             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 211
 212         for (i = 0; i < 4; i++)
 213             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 214     }
 215     if (s->segmentation.update_map)
 216         for (i = 0; i < 3; i++)
 217             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 218 }
 219
 220 static void update_lf_deltas(VP8Context *s)
 221 {
 222     VP56RangeCoder *c = &s->c;
 223     int i;
 224
 225     for (i = 0; i < 4; i++) {
 226         if (vp8_rac_get(c)) {
 227             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 228
 229             if (vp8_rac_get(c))
 230                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 231         }
 232     }
 233
 234     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 235         if (vp8_rac_get(c)) {
 236             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 237
 238             if (vp8_rac_get(c))
 239                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 240         }
 241     }
 242 }
 243
 244 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 245 {
 246     const uint8_t *sizes = buf;
 247     int i;
 248
 249     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 250
 251     buf      += 3 * (s->num_coeff_partitions - 1);
 252     buf_size -= 3 * (s->num_coeff_partitions - 1);
 253     if (buf_size < 0)
 254         return -1;
 255
 256     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 257         int size = AV_RL24(sizes + 3 * i);
 258         if (buf_size - size < 0)
 259             return -1;
 260
 261         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 262         buf      += size;
 263         buf_size -= size;
 264     }
 265     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 266
 267     return 0;
 268 }
 269
 270 static void vp7_get_quants(VP8Context *s)
 271 {
 272     VP56RangeCoder *c = &s->c;
 273
 274     int yac_qi  = vp8_rac_get_uint(c, 7);
 275     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 276     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 277     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 278     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 279     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 280
 281     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 282     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 283     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 284     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 285     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 286     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 287 }
 288
 289 static void get_quants(VP8Context *s)
 290 {
 291     VP56RangeCoder *c = &s->c;
 292     int i, base_qi;
 293
 294     int yac_qi     = vp8_rac_get_uint(c, 7);
 295     int ydc_delta  = vp8_rac_get_sint(c, 4);
 296     int y2dc_delta = vp8_rac_get_sint(c, 4);
 297     int y2ac_delta = vp8_rac_get_sint(c, 4);
 298     int uvdc_delta = vp8_rac_get_sint(c, 4);
 299     int uvac_delta = vp8_rac_get_sint(c, 4);
 300
 301     for (i = 0; i < 4; i++) {
 302         if (s->segmentation.enabled) {
 303             base_qi = s->segmentation.base_quant[i];
 304             if (!s->segmentation.absolute_vals)
 305                 base_qi += yac_qi;
 306         } else
 307             base_qi = yac_qi;
 308
 309         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 310         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 311         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 312         /* 101581>>16 is equivalent to 155/100 */
 313         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 314         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 315         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 316
 317         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 318         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 319     }
 320 }
 321
 322 /**
 323  * Determine which buffers golden and altref should be updated with after this frame.
 324  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 325  *
 326  * Intra frames update all 3 references
 327  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 328  * If the update (golden|altref) flag is set, it's updated with the current frame
 329  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 330  * If the flag is not set, the number read means:
 331  *      0: no update
 332  *      1: VP56_FRAME_PREVIOUS
 333  *      2: update golden with altref, or update altref with golden
 334  */
 335 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 336 {
 337     VP56RangeCoder *c = &s->c;
 338
 339     if (update)
 340         return VP56_FRAME_CURRENT;
 341
 342     switch (vp8_rac_get_uint(c, 2)) {
 343     case 1:
 344         return VP56_FRAME_PREVIOUS;
 345     case 2:
 346         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 347     }
 348     return VP56_FRAME_NONE;
 349 }
 350
 351 static void vp78_reset_probability_tables(VP8Context *s)
 352 {
 353     int i, j;
 354     for (i = 0; i < 4; i++)
 355         for (j = 0; j < 16; j++)
 356             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 357                    sizeof(s->prob->token[i][j]));
 358 }
 359
 360 static void vp78_update_probability_tables(VP8Context *s)
 361 {
 362     VP56RangeCoder *c = &s->c;
 363     int i, j, k, l, m;
 364
 365     for (i = 0; i < 4; i++)
 366         for (j = 0; j < 8; j++)
 367             for (k = 0; k < 3; k++)
 368                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 369                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 370                         int prob = vp8_rac_get_uint(c, 8);
 371                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 372                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 373                     }
 374 }
 375
 376 #define VP7_MVC_SIZE 17
 377 #define VP8_MVC_SIZE 19
 378
 379 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 380                                                             int mvc_size)
 381 {
 382     VP56RangeCoder *c = &s->c;
 383     int i, j;
 384
 385     if (vp8_rac_get(c))
 386         for (i = 0; i < 4; i++)
 387             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 388     if (vp8_rac_get(c))
 389         for (i = 0; i < 3; i++)
 390             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 391
 392     // 17.2 MV probability update
 393     for (i = 0; i < 2; i++)
 394         for (j = 0; j < mvc_size; j++)
 395             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 396                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 397 }
 398
 399 static void update_refs(VP8Context *s)
 400 {
 401     VP56RangeCoder *c = &s->c;
 402
 403     int update_golden = vp8_rac_get(c);
 404     int update_altref = vp8_rac_get(c);
 405
 406     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 407     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 408 }
 409
 410 static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
 411 {
 412     int i, j;
 413
 414     for (j = 1; j < 3; j++) {
 415         for (i = 0; i < height / 2; i++)
 416             memcpy(dst->data[j] + i * dst->linesize[j],
 417                    src->data[j] + i * src->linesize[j], width / 2);
 418     }
 419 }
 420
 421 static void fade(uint8_t *dst, uint8_t *src,
 422                  int width, int height, int linesize,
 423                  int alpha, int beta)
 424 {
 425     int i, j;
 426
 427     for (j = 0; j < height; j++) {
 428         for (i = 0; i < width; i++) {
 429             uint8_t y = src[j * linesize + i];
 430             dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 431         }
 432     }
 433 }
 434
 435 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 436 {
 437     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 438     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 439     int ret;
 440
 441     if (!s->keyframe && (alpha || beta)) {
 442         int width  = s->mb_width * 16;
 443         int height = s->mb_height * 16;
 444         AVFrame *src, *dst;
 445
 446         if (!s->framep[VP56_FRAME_PREVIOUS])
 447             return AVERROR_INVALIDDATA;
 448
 449         dst =
 450         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 451
 452         /* preserve the golden frame, write a new previous frame */
 453         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 454             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 455             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 456                return ret;
 457
 458             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 459
 460             copy_luma(dst, src, width, height);
 461         }
 462
 463         fade(dst->data[0], src->data[0],
 464              width, height, dst->linesize[0], alpha, beta);
 465     }
 466
 467     return 0;
 468 }
 469
 470 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 471 {
 472     VP56RangeCoder *c = &s->c;
 473     int part1_size, hscale, vscale, i, j, ret;
 474     int width  = s->avctx->width;
 475     int height = s->avctx->height;
 476
 477     s->profile = (buf[0] >> 1) & 7;
 478     if (s->profile > 1) {
 479         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 480         return AVERROR_INVALIDDATA;
 481     }
 482
 483     s->keyframe  = !(buf[0] & 1);
 484     s->invisible = 0;
 485     part1_size   = AV_RL24(buf) >> 4;
 486
 487     buf      += 4 - s->profile;
 488     buf_size -= 4 - s->profile;
 489
 490     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 491
 492     ff_vp56_init_range_decoder(c, buf, part1_size);
 493     buf      += part1_size;
 494     buf_size -= part1_size;
 495
 496     /* A. Dimension information (keyframes only) */
 497     if (s->keyframe) {
 498         width  = vp8_rac_get_uint(c, 12);
 499         height = vp8_rac_get_uint(c, 12);
 500         hscale = vp8_rac_get_uint(c, 2);
 501         vscale = vp8_rac_get_uint(c, 2);
 502         if (hscale || vscale)
 503             avpriv_request_sample(s->avctx, "Upscaling");
 504
 505         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 506         vp78_reset_probability_tables(s);
 507         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 508                sizeof(s->prob->pred16x16));
 509         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 510                sizeof(s->prob->pred8x8c));
 511         for (i = 0; i < 2; i++)
 512             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 513                    sizeof(vp7_mv_default_prob[i]));
 514         memset(&s->segmentation, 0, sizeof(s->segmentation));
 515         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 516         memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
 517     }
 518
 519     if (s->keyframe || s->profile > 0)
 520         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 521
 522     /* B. Decoding information for all four macroblock-level features */
 523     for (i = 0; i < 4; i++) {
 524         s->feature_enabled[i] = vp8_rac_get(c);
 525         if (s->feature_enabled[i]) {
 526              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 527
 528              for (j = 0; j < 3; j++)
 529                  s->feature_index_prob[i][j] =
 530                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 531
 532              if (vp7_feature_value_size[i])
 533                  for (j = 0; j < 4; j++)
 534                      s->feature_value[i][j] =
 535                          vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 536         }
 537     }
 538
 539     s->segmentation.enabled    = 0;
 540     s->segmentation.update_map = 0;
 541     s->lf_delta.enabled        = 0;
 542
 543     s->num_coeff_partitions = 1;
 544     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 545
 546     if (!s->macroblocks_base || /* first frame */
 547         width != s->avctx->width || height != s->avctx->height ||
 548         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 549         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 550             return ret;
 551     }
 552
 553     /* C. Dequantization indices */
 554     vp7_get_quants(s);
 555
 556     /* D. Golden frame update flag (a Flag) for interframes only */
 557     if (!s->keyframe) {
 558         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 559         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 560     }
 561
 562     s->update_last          = 1;
 563     s->update_probabilities = 1;
 564     s->fade_present         = 1;
 565
 566     if (s->profile > 0) {
 567         s->update_probabilities = vp8_rac_get(c);
 568         if (!s->update_probabilities)
 569             s->prob[1] = s->prob[0];
 570
 571         if (!s->keyframe)
 572             s->fade_present = vp8_rac_get(c);
 573     }
 574
 575     /* E. Fading information for previous frame */
 576     if (s->fade_present && vp8_rac_get(c)) {
 577         if ((ret = vp7_fade_frame(s ,c)) < 0)
 578             return ret;
 579     }
 580
 581     /* F. Loop filter type */
 582     if (!s->profile)
 583         s->filter.simple = vp8_rac_get(c);
 584
 585     /* G. DCT coefficient ordering specification */
 586     if (vp8_rac_get(c))
 587         for (i = 1; i < 16; i++)
 588             s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];
 589
 590     /* H. Loop filter levels  */
 591     if (s->profile > 0)
 592         s->filter.simple = vp8_rac_get(c);
 593     s->filter.level     = vp8_rac_get_uint(c, 6);
 594     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 595
 596     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 597     vp78_update_probability_tables(s);
 598
 599     s->mbskip_enabled = 0;
 600
 601     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 602     if (!s->keyframe) {
 603         s->prob->intra  = vp8_rac_get_uint(c, 8);
 604         s->prob->last   = vp8_rac_get_uint(c, 8);
 605         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 606     }
 607
 608     return 0;
 609 }
 610
 611 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 612 {
 613     VP56RangeCoder *c = &s->c;
 614     int header_size, hscale, vscale, ret;
 615     int width  = s->avctx->width;
 616     int height = s->avctx->height;
 617
 618     s->keyframe  = !(buf[0] & 1);
 619     s->profile   =  (buf[0]>>1) & 7;
 620     s->invisible = !(buf[0] & 0x10);
 621     header_size  = AV_RL24(buf) >> 5;
 622     buf      += 3;
 623     buf_size -= 3;
 624
 625     if (s->profile > 3)
 626         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 627
 628     if (!s->profile)
 629         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 630                sizeof(s->put_pixels_tab));
 631     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 632         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 633                sizeof(s->put_pixels_tab));
 634
 635     if (header_size > buf_size - 7 * s->keyframe) {
 636         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 637         return AVERROR_INVALIDDATA;
 638     }
 639
 640     if (s->keyframe) {
 641         if (AV_RL24(buf) != 0x2a019d) {
 642             av_log(s->avctx, AV_LOG_ERROR,
 643                    "Invalid start code 0x%x\n", AV_RL24(buf));
 644             return AVERROR_INVALIDDATA;
 645         }
 646         width     = AV_RL16(buf + 3) & 0x3fff;
 647         height    = AV_RL16(buf + 5) & 0x3fff;
 648         hscale    = buf[4] >> 6;
 649         vscale    = buf[6] >> 6;
 650         buf      += 7;
 651         buf_size -= 7;
 652
 653         if (hscale || vscale)
 654             avpriv_request_sample(s->avctx, "Upscaling");
 655
 656         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 657         vp78_reset_probability_tables(s);
 658         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 659                sizeof(s->prob->pred16x16));
 660         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 661                sizeof(s->prob->pred8x8c));
 662         memcpy(s->prob->mvc, vp8_mv_default_prob,
 663                sizeof(s->prob->mvc));
 664         memset(&s->segmentation, 0, sizeof(s->segmentation));
 665         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 666     }
 667
 668     ff_vp56_init_range_decoder(c, buf, header_size);
 669     buf      += header_size;
 670     buf_size -= header_size;
 671
 672     if (s->keyframe) {
 673         if (vp8_rac_get(c))
 674             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 675         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 676     }
 677
 678     if ((s->segmentation.enabled = vp8_rac_get(c)))
 679         parse_segment_info(s);
 680     else
 681         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 682
 683     s->filter.simple    = vp8_rac_get(c);
 684     s->filter.level     = vp8_rac_get_uint(c, 6);
 685     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 686
 687     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 688         if (vp8_rac_get(c))
 689             update_lf_deltas(s);
 690
 691     if (setup_partitions(s, buf, buf_size)) {
 692         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 693         return AVERROR_INVALIDDATA;
 694     }
 695
 696     if (!s->macroblocks_base || /* first frame */
 697         width != s->avctx->width || height != s->avctx->height)
 698         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 699             return ret;
 700
 701     get_quants(s);
 702
 703     if (!s->keyframe) {
 704         update_refs(s);
 705         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 706         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 707     }
 708
 709     // if we aren't saving this frame's probabilities for future frames,
 710     // make a copy of the current probabilities
 711     if (!(s->update_probabilities = vp8_rac_get(c)))
 712         s->prob[1] = s->prob[0];
 713
 714     s->update_last = s->keyframe || vp8_rac_get(c);
 715
 716     vp78_update_probability_tables(s);
 717
 718     if ((s->mbskip_enabled = vp8_rac_get(c)))
 719         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 720
 721     if (!s->keyframe) {
 722         s->prob->intra  = vp8_rac_get_uint(c, 8);
 723         s->prob->last   = vp8_rac_get_uint(c, 8);
 724         s->prob->golden = vp8_rac_get_uint(c, 8);
 725         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 726     }
 727
 728     return 0;
 729 }
 730
 731 static av_always_inline
 732 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 733 {
 734     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 735     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 736 }
 737
 738 /**
 739  * Motion vector coding, 17.1.
 740  */
 741 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 742 {
 743     int bit, x = 0;
 744
 745     if (vp56_rac_get_prob_branchy(c, p[0])) {
 746         int i;
 747
 748         for (i = 0; i < 3; i++)
 749             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 750         for (i = (vp7 ? 7 : 9); i > 3; i--)
 751             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 752         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 753             x += 8;
 754     } else {
 755         // small_mvtree
 756         const uint8_t *ps = p + 2;
 757         bit = vp56_rac_get_prob(c, *ps);
 758         ps += 1 + 3 * bit;
 759         x  += 4 * bit;
 760         bit = vp56_rac_get_prob(c, *ps);
 761         ps += 1 + bit;
 762         x  += 2 * bit;
 763         x  += vp56_rac_get_prob(c, *ps);
 764     }
 765
 766     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 767 }
 768
 769 static av_always_inline
 770 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 771 {
 772     if (is_vp7)
 773         return vp7_submv_prob;
 774
 775     if (left == top)
 776         return vp8_submv_prob[4 - !!left];
 777     if (!top)
 778         return vp8_submv_prob[2];
 779     return vp8_submv_prob[1 - !!left];
 780 }
 781
 782 /**
 783  * Split motion vector prediction, 16.4.
 784  * @returns the number of motion vectors parsed (2, 4 or 16)
 785  */
 786 static av_always_inline
 787 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 788                     int layout, int is_vp7)
 789 {
 790     int part_idx;
 791     int n, num;
 792     VP8Macroblock *top_mb;
 793     VP8Macroblock *left_mb = &mb[-1];
 794     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 795     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 796     VP56mv *top_mv;
 797     VP56mv *left_mv = left_mb->bmv;
 798     VP56mv *cur_mv  = mb->bmv;
 799
 800     if (!layout) // layout is inlined, s->mb_layout is not
 801         top_mb = &mb[2];
 802     else
 803         top_mb = &mb[-s->mb_width - 1];
 804     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 805     top_mv       = top_mb->bmv;
 806
 807     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 808         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 809             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 810         else
 811             part_idx = VP8_SPLITMVMODE_8x8;
 812     } else {
 813         part_idx = VP8_SPLITMVMODE_4x4;
 814     }
 815
 816     num              = vp8_mbsplit_count[part_idx];
 817     mbsplits_cur     = vp8_mbsplits[part_idx],
 818     firstidx         = vp8_mbfirstidx[part_idx];
 819     mb->partitioning = part_idx;
 820
 821     for (n = 0; n < num; n++) {
 822         int k = firstidx[n];
 823         uint32_t left, above;
 824         const uint8_t *submv_prob;
 825
 826         if (!(k & 3))
 827             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 828         else
 829             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 830         if (k <= 3)
 831             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 832         else
 833             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 834
 835         submv_prob = get_submv_prob(left, above, is_vp7);
 836
 837         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 838             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 839                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 840                     mb->bmv[n].y = mb->mv.y +
 841                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 842                     mb->bmv[n].x = mb->mv.x +
 843                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 844                 } else {
 845                     AV_ZERO32(&mb->bmv[n]);
 846                 }
 847             } else {
 848                 AV_WN32A(&mb->bmv[n], above);
 849             }
 850         } else {
 851             AV_WN32A(&mb->bmv[n], left);
 852         }
 853     }
 854
 855     return num;
 856 }
 857
 858 /**
 859  * The vp7 reference decoder uses a padding macroblock column (added to right
 860  * edge of the frame) to guard against illegal macroblock offsets. The
 861  * algorithm has bugs that permit offsets to straddle the padding column.
 862  * This function replicates those bugs.
 863  *
 864  * @param[out] edge_x macroblock x address
 865  * @param[out] edge_y macroblock y address
 866  *
 867  * @return macroblock offset legal (boolean)
 868  */
 869 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 870                                    int xoffset, int yoffset, int boundary,
 871                                    int *edge_x, int *edge_y)
 872 {
 873     int vwidth = mb_width + 1;
 874     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 875     if (new < boundary || new % vwidth == vwidth - 1)
 876         return 0;
 877     *edge_y = new / vwidth;
 878     *edge_x = new % vwidth;
 879     return 1;
 880 }
 881
 882 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 883 {
 884     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 885 }
 886
 887 static av_always_inline
 888 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 889                     int mb_x, int mb_y, int layout)
 890 {
 891     VP8Macroblock *mb_edge[12];
 892     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 893     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 894     int idx = CNT_ZERO;
 895     VP56mv near_mv[3];
 896     uint8_t cnt[3] = { 0 };
 897     VP56RangeCoder *c = &s->c;
 898     int i;
 899
 900     AV_ZERO32(&near_mv[0]);
 901     AV_ZERO32(&near_mv[1]);
 902     AV_ZERO32(&near_mv[2]);
 903
 904     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 905         const VP7MVPred * pred = &vp7_mv_pred[i];
 906         int edge_x, edge_y;
 907
 908         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 909                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 910             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 911                                              ? s->macroblocks_base + 1 + edge_x +
 912                                                (s->mb_width + 1) * (edge_y + 1)
 913                                              : s->macroblocks + edge_x +
 914                                                (s->mb_height - edge_y - 1) * 2;
 915             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 916             if (mv) {
 917                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 918                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 919                         idx = CNT_NEAREST;
 920                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 921                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 922                             continue;
 923                         idx = CNT_NEAR;
 924                     } else {
 925                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 926                         idx = CNT_NEAR;
 927                     }
 928                 } else {
 929                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 930                     idx = CNT_NEAREST;
 931                 }
 932             } else {
 933                 idx = CNT_ZERO;
 934             }
 935         } else {
 936             idx = CNT_ZERO;
 937         }
 938         cnt[idx] += vp7_mv_pred[i].score;
 939     }
 940
 941     mb->partitioning = VP8_SPLITMVMODE_NONE;
 942
 943     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 944         mb->mode = VP8_MVMODE_MV;
 945
 946         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 947
 948             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 949
 950                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 951                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 952                 else
 953                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 954
 955                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 956                     mb->mode = VP8_MVMODE_SPLIT;
 957                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
 958                 } else {
 959                     mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
 960                     mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
 961                     mb->bmv[0] = mb->mv;
 962                 }
 963             } else {
 964                 mb->mv = near_mv[CNT_NEAR];
 965                 mb->bmv[0] = mb->mv;
 966             }
 967         } else {
 968             mb->mv = near_mv[CNT_NEAREST];
 969             mb->bmv[0] = mb->mv;
 970         }
 971     } else {
 972         mb->mode = VP8_MVMODE_ZERO;
 973         AV_ZERO32(&mb->mv);
 974         mb->bmv[0] = mb->mv;
 975     }
 976 }
 977
 978 static av_always_inline
 979 void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 980                     int mb_x, int mb_y, int layout)
 981 {
 982     VP8Macroblock *mb_edge[3] = { 0      /* top */,
 983                                   mb - 1 /* left */,
 984                                   0      /* top-left */ };
 985     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 986     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 987     int idx = CNT_ZERO;
 988     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 989     int8_t *sign_bias = s->sign_bias;
 990     VP56mv near_mv[4];
 991     uint8_t cnt[4] = { 0 };
 992     VP56RangeCoder *c = &s->c;
 993
 994     if (!layout) { // layout is inlined (s->mb_layout is not)
 995         mb_edge[0] = mb + 2;
 996         mb_edge[2] = mb + 1;
 997     } else {
 998         mb_edge[0] = mb - s->mb_width - 1;
 999         mb_edge[2] = mb - s->mb_width - 2;
1000     }
1001
1002     AV_ZERO32(&near_mv[0]);
1003     AV_ZERO32(&near_mv[1]);
1004     AV_ZERO32(&near_mv[2]);
1005
1006     /* Process MB on top, left and top-left */
1007 #define MV_EDGE_CHECK(n)                                                      \
1008     {                                                                         \
1009         VP8Macroblock *edge = mb_edge[n];                                     \
1010         int edge_ref = edge->ref_frame;                                       \
1011         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1012             uint32_t mv = AV_RN32A(&edge->mv);                                \
1013             if (mv) {                                                         \
1014                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1015                     /* SWAR negate of the values in mv. */                    \
1016                     mv = ~mv;                                                 \
1017                     mv = ((mv & 0x7fff7fff) +                                 \
1018                           0x00010001) ^ (mv & 0x80008000);                    \
1019                 }                                                             \
1020                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1021                     AV_WN32A(&near_mv[++idx], mv);                            \
1022                 cnt[idx] += 1 + (n != 2);                                     \
1023             } else                                                            \
1024                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1025         }                                                                     \
1026     }
1027
1028     MV_EDGE_CHECK(0)
1029     MV_EDGE_CHECK(1)
1030     MV_EDGE_CHECK(2)
1031
1032     mb->partitioning = VP8_SPLITMVMODE_NONE;
1033     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1034         mb->mode = VP8_MVMODE_MV;
1035
1036         /* If we have three distinct MVs, merge first and last if they're the same */
1037         if (cnt[CNT_SPLITMV] &&
1038             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1039             cnt[CNT_NEAREST] += 1;
1040
1041         /* Swap near and nearest if necessary */
1042         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1043             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1044             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1045         }
1046
1047         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1048             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1049                 /* Choose the best mv out of 0,0 and the nearest mv */
1050                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1051                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1052                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1053                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1054
1055                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1056                     mb->mode = VP8_MVMODE_SPLIT;
1057                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1058                 } else {
1059                     mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
1060                     mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
1061                     mb->bmv[0] = mb->mv;
1062                 }
1063             } else {
1064                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
1065                 mb->bmv[0] = mb->mv;
1066             }
1067         } else {
1068             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
1069             mb->bmv[0] = mb->mv;
1070         }
1071     } else {
1072         mb->mode = VP8_MVMODE_ZERO;
1073         AV_ZERO32(&mb->mv);
1074         mb->bmv[0] = mb->mv;
1075     }
1076 }
1077
1078 static av_always_inline
1079 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1080                            int mb_x, int keyframe, int layout)
1081 {
1082     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1083
1084     if (layout == 1) {
1085         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1086         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1087     }
1088     if (keyframe) {
1089         int x, y;
1090         uint8_t *top;
1091         uint8_t *const left = s->intra4x4_pred_mode_left;
1092         if (layout == 1)
1093             top = mb->intra4x4_pred_mode_top;
1094         else
1095             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1096         for (y = 0; y < 4; y++) {
1097             for (x = 0; x < 4; x++) {
1098                 const uint8_t *ctx;
1099                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1100                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1101                 left[y]   = top[x] = *intra4x4;
1102                 intra4x4++;
1103             }
1104         }
1105     } else {
1106         int i;
1107         for (i = 0; i < 16; i++)
1108             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1109                                            vp8_pred4x4_prob_inter);
1110     }
1111 }
1112
1113 static av_always_inline
1114 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1115                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1116 {
1117     VP56RangeCoder *c = &s->c;
1118     const char *vp7_feature_name[] = { "q-index",
1119                                        "lf-delta",
1120                                        "partial-golden-update",
1121                                        "blit-pitch" };
1122     if (is_vp7) {
1123         int i;
1124         *segment = 0;
1125         for (i = 0; i < 4; i++) {
1126             if (s->feature_enabled[i]) {
1127                 if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
1128                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1129                                                    s->feature_index_prob[i]);
1130                       av_log(s->avctx, AV_LOG_WARNING,
1131                              "Feature %s present in macroblock (value 0x%x)\n",
1132                              vp7_feature_name[i], s->feature_value[i][index]);
1133                 }
1134            }
1135         }
1136     } else if (s->segmentation.update_map)
1137         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
1138     else if (s->segmentation.enabled)
1139         *segment = ref ? *ref : *segment;
1140     mb->segment = *segment;
1141
1142     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1143
1144     if (s->keyframe) {
1145         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1146                                     vp8_pred16x16_prob_intra);
1147
1148         if (mb->mode == MODE_I4x4) {
1149             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1150         } else {
1151             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1152                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1153             if (s->mb_layout == 1)
1154                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1155             else
1156                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1157             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1158         }
1159
1160         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1161                                                 vp8_pred8x8c_prob_intra);
1162         mb->ref_frame        = VP56_FRAME_CURRENT;
1163     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1164         // inter MB, 16.2
1165         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1166             mb->ref_frame =
1167                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1168                                                                    : VP56_FRAME_GOLDEN;
1169         else
1170             mb->ref_frame = VP56_FRAME_PREVIOUS;
1171         s->ref_count[mb->ref_frame - 1]++;
1172
1173         // motion vectors, 16.3
1174         if (is_vp7)
1175             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1176         else
1177             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1178     } else {
1179         // intra MB, 16.1
1180         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1181
1182         if (mb->mode == MODE_I4x4)
1183             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1184
1185         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1186                                                 s->prob->pred8x8c);
1187         mb->ref_frame        = VP56_FRAME_CURRENT;
1188         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1189         AV_ZERO32(&mb->bmv[0]);
1190     }
1191 }
1192
1193 /**
1194  * @param r     arithmetic bitstream reader context
1195  * @param block destination for block coefficients
1196  * @param probs probabilities to use when reading trees from the bitstream
1197  * @param i     initial coeff index, 0 unless a separate DC block is coded
1198  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1199  *
1200  * @return 0 if no coeffs were decoded
1201  *         otherwise, the index of the last coeff decoded plus one
1202  */
1203 static av_always_inline
1204 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1205                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1206                                  int i, uint8_t *token_prob, int16_t qmul[2],
1207                                  const uint8_t scan[16], int vp7)
1208 {
1209     VP56RangeCoder c = *r;
1210     goto skip_eob;
1211     do {
1212         int coeff;
1213 restart:
1214         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1215             break;
1216
1217 skip_eob:
1218         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1219             if (++i == 16)
1220                 break; // invalid input; blocks should end with EOB
1221             token_prob = probs[i][0];
1222             if (vp7)
1223                 goto restart;
1224             goto skip_eob;
1225         }
1226
1227         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1228             coeff = 1;
1229             token_prob = probs[i + 1][1];
1230         } else {
1231             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1232                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1233                 if (coeff)
1234                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1235                 coeff += 2;
1236             } else {
1237                 // DCT_CAT*
1238                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1239                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1240                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1241                     } else {                                    // DCT_CAT2
1242                         coeff  = 7;
1243                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1244                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1245                     }
1246                 } else {    // DCT_CAT3 and up
1247                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1248                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1249                     int cat = (a << 1) + b;
1250                     coeff  = 3 + (8 << cat);
1251                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1252                 }
1253             }
1254             token_prob = probs[i + 1][2];
1255         }
1256         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1257     } while (++i < 16);
1258
1259     *r = c;
1260     return i;
1261 }
1262
/**
 * Apply and update the inter-frame DC predictor.
 *
 * pred[0] holds the last DC value and pred[1] counts how often it repeated.
 * Once the counter exceeds 3 the stored DC is added to block[0]. The counter
 * is reset when the previous DC or the new DC is zero or when their signs
 * differ; it is incremented when both are equal; otherwise it is left alone.
 * The resulting DC is always stored to both block[0] and pred[0].
 *
 * @param block block whose element 0 is the DC coefficient
 * @param pred  predictor state: { last DC, repeat counter }
 *
 * @return 1 if the prediction was added to block[0], 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int16_t prev   = pred[0];
    const int     use_it = pred[1] > 3;
    int16_t dc = block[0];

    if (use_it)
        dc += prev;

    if (!prev || !dc || (prev < 0) != (dc < 0))
        pred[1] = 0;
    else if (prev == dc)
        pred[1]++;

    block[0] = pred[0] = dc;

    return use_it;
}
1285
1286 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1287                                             int16_t block[16],
1288                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1289                                             int i, uint8_t *token_prob,
1290                                             int16_t qmul[2],
1291                                             const uint8_t scan[16])
1292 {
1293     return decode_block_coeffs_internal(r, block, probs, i,
1294                                         token_prob, qmul, scan, IS_VP7);
1295 }
1296
1297 #ifndef vp8_decode_block_coeffs_internal
1298 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1299                                             int16_t block[16],
1300                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1301                                             int i, uint8_t *token_prob,
1302                                             int16_t qmul[2])
1303 {
1304     return decode_block_coeffs_internal(r, block, probs, i,
1305                                         token_prob, qmul, zigzag_scan, IS_VP8);
1306 }
1307 #endif
1308
1309 /**
1310  * @param c          arithmetic bitstream reader context
1311  * @param block      destination for block coefficients
1312  * @param probs      probabilities to use when reading trees from the bitstream
1313  * @param i          initial coeff index, 0 unless a separate DC block is coded
1314  * @param zero_nhood the initial prediction context for number of surrounding
1315  *                   all-zero blocks (only left/top, so 0-2)
1316  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1317  *
1318  * @return 0 if no coeffs were decoded
1319  *         otherwise, the index of the last coeff decoded plus one
1320  */
1321 static av_always_inline
1322 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1323                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1324                         int i, int zero_nhood, int16_t qmul[2],
1325                         const uint8_t scan[16], int vp7)
1326 {
1327     uint8_t *token_prob = probs[i][zero_nhood];
1328     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1329         return 0;
1330     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1331                                                   token_prob, qmul, scan)
1332                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1333                                                   token_prob, qmul);
1334 }
1335
1336 static av_always_inline
1337 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1338                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1339                       int is_vp7)
1340 {
1341     int i, x, y, luma_start = 0, luma_ctx = 3;
1342     int nnz_pred, nnz, nnz_total = 0;
1343     int segment = mb->segment;
1344     int block_dc = 0;
1345
1346     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1347         nnz_pred = t_nnz[8] + l_nnz[8];
1348
1349         // decode DC values and do hadamard
1350         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1351                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1352                                   zigzag_scan, is_vp7);
1353         l_nnz[8] = t_nnz[8] = !!nnz;
1354
1355         if (is_vp7 && mb->mode > MODE_I4x4) {
1356             nnz |=  inter_predict_dc(td->block_dc,
1357                                      s->inter_dc_pred[mb->ref_frame - 1]);
1358         }
1359
1360         if (nnz) {
1361             nnz_total += nnz;
1362             block_dc   = 1;
1363             if (nnz == 1)
1364                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1365             else
1366                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1367         }
1368         luma_start = 1;
1369         luma_ctx   = 0;
1370     }
1371
1372     // luma blocks
1373     for (y = 0; y < 4; y++)
1374         for (x = 0; x < 4; x++) {
1375             nnz_pred = l_nnz[y] + t_nnz[x];
1376             nnz = decode_block_coeffs(c, td->block[y][x],
1377                                       s->prob->token[luma_ctx],
1378                                       luma_start, nnz_pred,
1379                                       s->qmat[segment].luma_qmul,
1380                                       s->prob[0].scan, is_vp7);
1381             /* nnz+block_dc may be one more than the actual last index,
1382              * but we don't care */
1383             td->non_zero_count_cache[y][x] = nnz + block_dc;
1384             t_nnz[x] = l_nnz[y] = !!nnz;
1385             nnz_total += nnz;
1386         }
1387
1388     // chroma blocks
1389     // TODO: what to do about dimensions? 2nd dim for luma is x,
1390     // but for chroma it's (y<<1)|x
1391     for (i = 4; i < 6; i++)
1392         for (y = 0; y < 2; y++)
1393             for (x = 0; x < 2; x++) {
1394                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1395                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1396                                           s->prob->token[2], 0, nnz_pred,
1397                                           s->qmat[segment].chroma_qmul,
1398                                           s->prob[0].scan, is_vp7);
1399                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1400                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1401                 nnz_total += nnz;
1402             }
1403
1404     // if there were no coded coeffs despite the macroblock not being marked skip,
1405     // we MUST not do the inner loop filter and should not do IDCT
1406     // Since skip isn't used for bitstream prediction, just manually set it.
1407     if (!nnz_total)
1408         mb->skip = 1;
1409 }
1410
1411 static av_always_inline
1412 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1413                       uint8_t *src_cb, uint8_t *src_cr,
1414                       int linesize, int uvlinesize, int simple)
1415 {
1416     AV_COPY128(top_border, src_y + 15 * linesize);
1417     if (!simple) {
1418         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1419         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1420     }
1421 }
1422
1423 static av_always_inline
1424 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1425                     uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
1426                     int mb_y, int mb_width, int simple, int xchg)
1427 {
1428     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1429     src_y  -= linesize;
1430     src_cb -= uvlinesize;
1431     src_cr -= uvlinesize;
1432
1433 #define XCHG(a, b, xchg)                                                      \
1434     do {                                                                      \
1435         if (xchg)                                                             \
1436             AV_SWAP64(b, a);                                                  \
1437         else                                                                  \
1438             AV_COPY64(b, a);                                                  \
1439     } while (0)
1440
1441     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1442     XCHG(top_border, src_y, xchg);
1443     XCHG(top_border + 8, src_y + 8, 1);
1444     if (mb_x < mb_width - 1)
1445         XCHG(top_border + 32, src_y + 16, 1);
1446
1447     // only copy chroma for normal loop filter
1448     // or to initialize the top row to 127
1449     if (!simple || !mb_y) {
1450         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1451         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1452         XCHG(top_border + 16, src_cb, 1);
1453         XCHG(top_border + 24, src_cr, 1);
1454     }
1455 }
1456
1457 static av_always_inline
1458 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1459 {
1460     if (!mb_x)
1461         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1462     else
1463         return mb_y ? mode : LEFT_DC_PRED8x8;
1464 }
1465
1466 static av_always_inline
1467 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1468 {
1469     if (!mb_x)
1470         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1471     else
1472         return mb_y ? mode : HOR_PRED8x8;
1473 }
1474
1475 static av_always_inline
1476 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1477 {
1478     switch (mode) {
1479     case DC_PRED8x8:
1480         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1481     case VERT_PRED8x8:
1482         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1483     case HOR_PRED8x8:
1484         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1485     case PLANE_PRED8x8: /* TM */
1486         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1487     }
1488     return mode;
1489 }
1490
1491 static av_always_inline
1492 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1493 {
1494     if (!mb_x) {
1495         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1496     } else {
1497         return mb_y ? mode : HOR_VP8_PRED;
1498     }
1499 }
1500
1501 static av_always_inline
1502 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1503                                      int *copy_buf, int vp7)
1504 {
1505     switch (mode) {
1506     case VERT_PRED:
1507         if (!mb_x && mb_y) {
1508             *copy_buf = 1;
1509             return mode;
1510         }
1511         /* fall-through */
1512     case DIAG_DOWN_LEFT_PRED:
1513     case VERT_LEFT_PRED:
1514         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1515     case HOR_PRED:
1516         if (!mb_y) {
1517             *copy_buf = 1;
1518             return mode;
1519         }
1520         /* fall-through */
1521     case HOR_UP_PRED:
1522         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1523     case TM_VP8_PRED:
1524         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1525     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1526                    * as 16x16/8x8 DC */
1527     case DIAG_DOWN_RIGHT_PRED:
1528     case VERT_RIGHT_PRED:
1529     case HOR_DOWN_PRED:
1530         if (!mb_y || !mb_x)
1531             *copy_buf = 1;
1532         return mode;
1533     }
1534     return mode;
1535 }
1536
1537 static av_always_inline
1538 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1539                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1540 {
1541     int x, y, mode, nnz;
1542     uint32_t tr;
1543
1544     /* for the first row, we need to run xchg_mb_border to init the top edge
1545      * to 127 otherwise, skip it if we aren't going to deblock */
1546     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1547         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1548                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1549                        s->filter.simple, 1);
1550
1551     if (mb->mode < MODE_I4x4) {
1552         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1553         s->hpc.pred16x16[mode](dst[0], s->linesize);
1554     } else {
1555         uint8_t *ptr = dst[0];
1556         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1557         const uint8_t lo = is_vp7 ? 128 : 127;
1558         const uint8_t hi = is_vp7 ? 128 : 129;
1559         uint8_t tr_top[4] = { lo, lo, lo, lo };
1560
1561         // all blocks on the right edge of the macroblock use bottom edge
1562         // the top macroblock for their topright edge
1563         uint8_t *tr_right = ptr - s->linesize + 16;
1564
1565         // if we're on the right edge of the frame, said edge is extended
1566         // from the top macroblock
1567         if (mb_y && mb_x == s->mb_width - 1) {
1568             tr       = tr_right[-1] * 0x01010101u;
1569             tr_right = (uint8_t *) &tr;
1570         }
1571
1572         if (mb->skip)
1573             AV_ZERO128(td->non_zero_count_cache);
1574
1575         for (y = 0; y < 4; y++) {
1576             uint8_t *topright = ptr + 4 - s->linesize;
1577             for (x = 0; x < 4; x++) {
1578                 int copy = 0, linesize = s->linesize;
1579                 uint8_t *dst = ptr + 4 * x;
1580                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1581
1582                 if ((y == 0 || x == 3) && mb_y == 0) {
1583                     topright = tr_top;
1584                 } else if (x == 3)
1585                     topright = tr_right;
1586
1587                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1588                                                         mb_y + y, &copy, is_vp7);
1589                 if (copy) {
1590                     dst      = copy_dst + 12;
1591                     linesize = 8;
1592                     if (!(mb_y + y)) {
1593                         copy_dst[3] = lo;
1594                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1595                     } else {
1596                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1597                         if (!(mb_x + x)) {
1598                             copy_dst[3] = hi;
1599                         } else {
1600                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1601                         }
1602                     }
1603                     if (!(mb_x + x)) {
1604                         copy_dst[11] =
1605                         copy_dst[19] =
1606                         copy_dst[27] =
1607                         copy_dst[35] = hi;
1608                     } else {
1609                         copy_dst[11] = ptr[4 * x                   - 1];
1610                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1611                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1612                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1613                     }
1614                 }
1615                 s->hpc.pred4x4[mode](dst, topright, linesize);
1616                 if (copy) {
1617                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1618                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1619                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1620                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1621                 }
1622
1623                 nnz = td->non_zero_count_cache[y][x];
1624                 if (nnz) {
1625                     if (nnz == 1)
1626                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1627                                                   td->block[y][x], s->linesize);
1628                     else
1629                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1630                                                td->block[y][x], s->linesize);
1631                 }
1632                 topright += 4;
1633             }
1634
1635             ptr      += 4 * s->linesize;
1636             intra4x4 += 4;
1637         }
1638     }
1639
1640     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1641                                             mb_x, mb_y, is_vp7);
1642     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1643     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1644
1645     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1646         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1647                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1648                        s->filter.simple, 0);
1649 }
1650
/*
 * Edge requirements per subpel phase, indexed as subpel_idx[row][phase]:
 *   row 0: nr. of left extra pixels (also used as the function pointer index)
 *   row 1: nr. of extra pixels required
 *   row 2: nr. of right extra pixels
 */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1657
1658 /**
1659  * luma MC function
1660  *
1661  * @param s        VP8 decoding context
1662  * @param dst      target buffer for block data at block position
1663  * @param ref      reference picture buffer at origin (0, 0)
1664  * @param mv       motion vector (relative to block position) to get pixel data from
1665  * @param x_off    horizontal position of block from origin (0, 0)
1666  * @param y_off    vertical position of block from origin (0, 0)
1667  * @param block_w  width of block (16, 8 or 4)
1668  * @param block_h  height of block (always same as block_w)
1669  * @param width    width of src/dst plane data
1670  * @param height   height of src/dst plane data
1671  * @param linesize size of a single line of plane data, including padding
1672  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1673  */
1674 static av_always_inline
1675 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1676                  ThreadFrame *ref, const VP56mv *mv,
1677                  int x_off, int y_off, int block_w, int block_h,
1678                  int width, int height, ptrdiff_t linesize,
1679                  vp8_mc_func mc_func[3][3])
1680 {
1681     uint8_t *src = ref->f->data[0];
1682
1683     if (AV_RN32A(mv)) {
1684         int src_linesize = linesize;
1685
1686         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1687         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1688
1689         x_off += mv->x >> 2;
1690         y_off += mv->y >> 2;
1691
1692         // edge emulation
1693         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1694         src += y_off * linesize + x_off;
1695         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1696             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1697             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1698                                      src - my_idx * linesize - mx_idx,
1699                                      EDGE_EMU_LINESIZE, linesize,
1700                                      block_w + subpel_idx[1][mx],
1701                                      block_h + subpel_idx[1][my],
1702                                      x_off - mx_idx, y_off - my_idx,
1703                                      width, height);
1704             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1705             src_linesize = EDGE_EMU_LINESIZE;
1706         }
1707         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1708     } else {
1709         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1710         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1711                       linesize, block_h, 0, 0);
1712     }
1713 }
1714
1715 /**
1716  * chroma MC function
1717  *
1718  * @param s        VP8 decoding context
1719  * @param dst1     target buffer for block data at block position (U plane)
1720  * @param dst2     target buffer for block data at block position (V plane)
1721  * @param ref      reference picture buffer at origin (0, 0)
1722  * @param mv       motion vector (relative to block position) to get pixel data from
1723  * @param x_off    horizontal position of block from origin (0, 0)
1724  * @param y_off    vertical position of block from origin (0, 0)
1725  * @param block_w  width of block (16, 8 or 4)
1726  * @param block_h  height of block (always same as block_w)
1727  * @param width    width of src/dst plane data
1728  * @param height   height of src/dst plane data
1729  * @param linesize size of a single line of plane data, including padding
1730  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1731  */
1732 static av_always_inline
1733 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1734                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1735                    int x_off, int y_off, int block_w, int block_h,
1736                    int width, int height, ptrdiff_t linesize,
1737                    vp8_mc_func mc_func[3][3])
1738 {
1739     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1740
1741     if (AV_RN32A(mv)) {
1742         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1743         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1744
1745         x_off += mv->x >> 3;
1746         y_off += mv->y >> 3;
1747
1748         // edge emulation
1749         src1 += y_off * linesize + x_off;
1750         src2 += y_off * linesize + x_off;
1751         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1752         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1753             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1754             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1755                                      src1 - my_idx * linesize - mx_idx,
1756                                      EDGE_EMU_LINESIZE, linesize,
1757                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1758                                      x_off - mx_idx, y_off - my_idx, width, height);
1759             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1760             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1761
1762             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1763                                      src2 - my_idx * linesize - mx_idx,
1764                                      EDGE_EMU_LINESIZE, linesize,
1765                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1766                                      x_off - mx_idx, y_off - my_idx, width, height);
1767             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1768             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1769         } else {
1770             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1771             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1772         }
1773     } else {
1774         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1775         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1776         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1777     }
1778 }
1779
1780 static av_always_inline
1781 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1782                  ThreadFrame *ref_frame, int x_off, int y_off,
1783                  int bx_off, int by_off, int block_w, int block_h,
1784                  int width, int height, VP56mv *mv)
1785 {
1786     VP56mv uvmv = *mv;
1787
1788     /* Y */
1789     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1790                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1791                 block_w, block_h, width, height, s->linesize,
1792                 s->put_pixels_tab[block_w == 8]);
1793
1794     /* U/V */
1795     if (s->profile == 3) {
1796         /* this block only applies VP8; it is safe to check
1797          * only the profile, as VP7 profile <= 1 */
1798         uvmv.x &= ~7;
1799         uvmv.y &= ~7;
1800     }
1801     x_off   >>= 1;
1802     y_off   >>= 1;
1803     bx_off  >>= 1;
1804     by_off  >>= 1;
1805     width   >>= 1;
1806     height  >>= 1;
1807     block_w >>= 1;
1808     block_h >>= 1;
1809     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1810                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1811                   &uvmv, x_off + bx_off, y_off + by_off,
1812                   block_w, block_h, width, height, s->uvlinesize,
1813                   s->put_pixels_tab[1 + (block_w == 4)]);
1814 }
1815
1816 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1817  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1818 static av_always_inline
1819 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1820                      int mb_xy, int ref)
1821 {
1822     /* Don't prefetch refs that haven't been used very often this frame. */
1823     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1824         int x_off = mb_x << 4, y_off = mb_y << 4;
1825         int mx = (mb->mv.x >> 2) + x_off + 8;
1826         int my = (mb->mv.y >> 2) + y_off;
1827         uint8_t **src = s->framep[ref]->tf.f->data;
1828         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1829         /* For threading, a ff_thread_await_progress here might be useful, but
1830          * it actually slows down the decoder. Since a bad prefetch doesn't
1831          * generate bad decoder output, we don't run it here. */
1832         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1833         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1834         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1835     }
1836 }
1837
1838 /**
1839  * Apply motion vectors to prediction buffer, chapter 18.
1840  */
1841 static av_always_inline
1842 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1843                    VP8Macroblock *mb, int mb_x, int mb_y)
1844 {
1845     int x_off = mb_x << 4, y_off = mb_y << 4;
1846     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1847     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1848     VP56mv *bmv = mb->bmv;
1849
1850     switch (mb->partitioning) {
1851     case VP8_SPLITMVMODE_NONE:
1852         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1853                     0, 0, 16, 16, width, height, &mb->mv);
1854         break;
1855     case VP8_SPLITMVMODE_4x4: {
1856         int x, y;
1857         VP56mv uvmv;
1858
1859         /* Y */
1860         for (y = 0; y < 4; y++) {
1861             for (x = 0; x < 4; x++) {
1862                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1863                             ref, &bmv[4 * y + x],
1864                             4 * x + x_off, 4 * y + y_off, 4, 4,
1865                             width, height, s->linesize,
1866                             s->put_pixels_tab[2]);
1867             }
1868         }
1869
1870         /* U/V */
1871         x_off  >>= 1;
1872         y_off  >>= 1;
1873         width  >>= 1;
1874         height >>= 1;
1875         for (y = 0; y < 2; y++) {
1876             for (x = 0; x < 2; x++) {
1877                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1878                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1879                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1880                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1881                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1882                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1883                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1884                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1885                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT - 1))) >> 2;
1886                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT - 1))) >> 2;
1887                 if (s->profile == 3) {
1888                     uvmv.x &= ~7;
1889                     uvmv.y &= ~7;
1890                 }
1891                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1892                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1893                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1894                               width, height, s->uvlinesize,
1895                               s->put_pixels_tab[2]);
1896             }
1897         }
1898         break;
1899     }
1900     case VP8_SPLITMVMODE_16x8:
1901         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1902                     0, 0, 16, 8, width, height, &bmv[0]);
1903         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1904                     0, 8, 16, 8, width, height, &bmv[1]);
1905         break;
1906     case VP8_SPLITMVMODE_8x16:
1907         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1908                     0, 0, 8, 16, width, height, &bmv[0]);
1909         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1910                     8, 0, 8, 16, width, height, &bmv[1]);
1911         break;
1912     case VP8_SPLITMVMODE_8x8:
1913         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1914                     0, 0, 8, 8, width, height, &bmv[0]);
1915         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1916                     8, 0, 8, 8, width, height, &bmv[1]);
1917         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1918                     0, 8, 8, 8, width, height, &bmv[2]);
1919         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1920                     8, 8, 8, 8, width, height, &bmv[3]);
1921         break;
1922     }
1923 }
1924
1925 static av_always_inline
1926 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1927 {
1928     int x, y, ch;
1929
1930     if (mb->mode != MODE_I4x4) {
1931         uint8_t *y_dst = dst[0];
1932         for (y = 0; y < 4; y++) {
1933             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1934             if (nnz4) {
1935                 if (nnz4 & ~0x01010101) {
1936                     for (x = 0; x < 4; x++) {
1937                         if ((uint8_t) nnz4 == 1)
1938                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1939                                                       td->block[y][x],
1940                                                       s->linesize);
1941                         else if ((uint8_t) nnz4 > 1)
1942                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1943                                                    td->block[y][x],
1944                                                    s->linesize);
1945                         nnz4 >>= 8;
1946                         if (!nnz4)
1947                             break;
1948                     }
1949                 } else {
1950                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1951                 }
1952             }
1953             y_dst += 4 * s->linesize;
1954         }
1955     }
1956
1957     for (ch = 0; ch < 2; ch++) {
1958         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1959         if (nnz4) {
1960             uint8_t *ch_dst = dst[1 + ch];
1961             if (nnz4 & ~0x01010101) {
1962                 for (y = 0; y < 2; y++) {
1963                     for (x = 0; x < 2; x++) {
1964                         if ((uint8_t) nnz4 == 1)
1965                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
1966                                                       td->block[4 + ch][(y << 1) + x],
1967                                                       s->uvlinesize);
1968                         else if ((uint8_t) nnz4 > 1)
1969                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
1970                                                    td->block[4 + ch][(y << 1) + x],
1971                                                    s->uvlinesize);
1972                         nnz4 >>= 8;
1973                         if (!nnz4)
1974                             goto chroma_idct_end;
1975                     }
1976                     ch_dst += 4 * s->uvlinesize;
1977                 }
1978             } else {
1979                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
1980             }
1981         }
1982 chroma_idct_end:
1983         ;
1984     }
1985 }
1986
1987 static av_always_inline
1988 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
1989                          VP8FilterStrength *f, int is_vp7)
1990 {
1991     int interior_limit, filter_level;
1992
1993     if (s->segmentation.enabled) {
1994         filter_level = s->segmentation.filter_level[mb->segment];
1995         if (!s->segmentation.absolute_vals)
1996             filter_level += s->filter.level;
1997     } else
1998         filter_level = s->filter.level;
1999
2000     if (s->lf_delta.enabled) {
2001         filter_level += s->lf_delta.ref[mb->ref_frame];
2002         filter_level += s->lf_delta.mode[mb->mode];
2003     }
2004
2005     filter_level = av_clip_uintp2(filter_level, 6);
2006
2007     interior_limit = filter_level;
2008     if (s->filter.sharpness) {
2009         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2010         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2011     }
2012     interior_limit = FFMAX(interior_limit, 1);
2013
2014     f->filter_level = filter_level;
2015     f->inner_limit = interior_limit;
2016     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2017                       mb->mode == VP8_MVMODE_SPLIT;
2018 }
2019
2020 static av_always_inline
2021 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2022                int mb_x, int mb_y, int is_vp7)
2023 {
2024     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2025     int filter_level = f->filter_level;
2026     int inner_limit = f->inner_limit;
2027     int inner_filter = f->inner_filter;
2028     int linesize = s->linesize;
2029     int uvlinesize = s->uvlinesize;
2030     static const uint8_t hev_thresh_lut[2][64] = {
2031         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2032           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2033           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2034           3, 3, 3, 3 },
2035         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2036           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2037           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2038           2, 2, 2, 2 }
2039     };
2040
2041     if (!filter_level)
2042         return;
2043
2044     if (is_vp7) {
2045         bedge_lim_y  = filter_level;
2046         bedge_lim_uv = filter_level * 2;
2047         mbedge_lim   = filter_level + 2;
2048     } else {
2049         bedge_lim_y  =
2050         bedge_lim_uv = filter_level * 2 + inner_limit;
2051         mbedge_lim   = bedge_lim_y + 4;
2052     }
2053
2054     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2055
2056     if (mb_x) {
2057         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2058                                        mbedge_lim, inner_limit, hev_thresh);
2059         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2060                                        mbedge_lim, inner_limit, hev_thresh);
2061     }
2062
2063 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2064     if (cond && inner_filter) {                                               \
2065         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2066                                              bedge_lim_y, inner_limit,        \
2067                                              hev_thresh);                     \
2068         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2069                                              bedge_lim_y, inner_limit,        \
2070                                              hev_thresh);                     \
2071         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2072                                              bedge_lim_y, inner_limit,        \
2073                                              hev_thresh);                     \
2074         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2075                                              uvlinesize,  bedge_lim_uv,       \
2076                                              inner_limit, hev_thresh);        \
2077     }
2078
2079     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2080
2081     if (mb_y) {
2082         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2083                                        mbedge_lim, inner_limit, hev_thresh);
2084         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2085                                        mbedge_lim, inner_limit, hev_thresh);
2086     }
2087
2088     if (inner_filter) {
2089         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2090                                              linesize, bedge_lim_y,
2091                                              inner_limit, hev_thresh);
2092         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2093                                              linesize, bedge_lim_y,
2094                                              inner_limit, hev_thresh);
2095         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2096                                              linesize, bedge_lim_y,
2097                                              inner_limit, hev_thresh);
2098         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2099                                              dst[2] +  4 * uvlinesize,
2100                                              uvlinesize, bedge_lim_uv,
2101                                              inner_limit, hev_thresh);
2102     }
2103
2104     H_LOOP_FILTER_16Y_INNER(is_vp7)
2105 }
2106
2107 static av_always_inline
2108 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2109                       int mb_x, int mb_y)
2110 {
2111     int mbedge_lim, bedge_lim;
2112     int filter_level = f->filter_level;
2113     int inner_limit  = f->inner_limit;
2114     int inner_filter = f->inner_filter;
2115     int linesize     = s->linesize;
2116
2117     if (!filter_level)
2118         return;
2119
2120     bedge_lim  = 2 * filter_level + inner_limit;
2121     mbedge_lim = bedge_lim + 4;
2122
2123     if (mb_x)
2124         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2125     if (inner_filter) {
2126         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2127         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2128         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2129     }
2130
2131     if (mb_y)
2132         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2133     if (inner_filter) {
2134         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2135         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2136         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2137     }
2138 }
2139
2140 #define MARGIN (16 << 2)
2141 static av_always_inline
2142 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2143                                     VP8Frame *prev_frame, int is_vp7)
2144 {
2145     VP8Context *s = avctx->priv_data;
2146     int mb_x, mb_y;
2147
2148     s->mv_min.y = -MARGIN;
2149     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2150     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2151         VP8Macroblock *mb = s->macroblocks_base +
2152                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2153         int mb_xy = mb_y * s->mb_width;
2154
2155         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2156
2157         s->mv_min.x = -MARGIN;
2158         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2159         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2160             if (mb_y == 0)
2161                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2162                          DC_PRED * 0x01010101);
2163             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2164                            prev_frame && prev_frame->seg_map ?
2165                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2166             s->mv_min.x -= 64;
2167             s->mv_max.x -= 64;
2168         }
2169         s->mv_min.y -= 64;
2170         s->mv_max.y -= 64;
2171     }
2172 }
2173
2174 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2175                                    VP8Frame *prev_frame)
2176 {
2177     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2178 }
2179
2180 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2181                                    VP8Frame *prev_frame)
2182 {
2183     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2184 }
2185
#if HAVE_THREADS
/*
 * Positions are packed as (mb_y << 16) | (mb_x & 0xFFFF).
 *
 * check_thread_pos(td, otd, mb_x_check, mb_y_check):
 *   Wait until otd->thread_mb_pos has reached the given position. The
 *   position is compared once before taking otd->lock; if it has not been
 *   reached, td->wait_mb_pos is set to the target and the caller sleeps on
 *   otd->cond until it is. td->wait_mb_pos is reset to INT_MAX afterwards.
 *
 * update_pos(td, mb_y, mb_x):
 *   Publish a new position in td->thread_mb_pos and, with sliced threading
 *   and more than one job, broadcast td->cond when a neighbouring thread is
 *   waiting for a position that has now been reached. Besides its
 *   arguments this macro uses avctx, num_jobs, next_td and prev_td from
 *   the calling scope.
 *
 * All macro parameters are parenthesised so that arbitrary expressions can
 * be passed. The trailing ';' after "while (0)" is kept on purpose, since
 * existing call sites may depend on it.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0);

#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0);
#else
/* Without thread support both macros expand to nothing. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
2225
/**
 * Decode one macroblock row without running the loop filter.
 *
 * The row index is taken from the upper 16 bits of td->thread_mb_pos.
 * For every macroblock in the row this optionally decodes the mode
 * (when s->mb_layout is 0), decodes the coefficients unless the macroblock
 * is skipped, runs intra or inter prediction, adds the residual and, when
 * deblocking is enabled, records the filter strength for the later
 * filtering pass.
 *
 * @param avctx    codec context; avctx->priv_data is the VP8Context
 * @param tdata    not referenced directly in this function
 * @param jobnr    job index, used to locate the neighbouring rows' thread data
 * @param threadnr index of this worker's entry in s->thread_data
 * @param is_vp7   non-zero when decoding VP7, zero for VP8
 */
2226 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2227                                         int jobnr, int threadnr, int is_vp7)
2228 {
2229     VP8Context *s = avctx->priv_data;
2230     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2231     int mb_y = td->thread_mb_pos >> 16;
2232     int mb_x, mb_xy = mb_y * s->mb_width;
2233     int num_jobs = s->num_jobs;
2234     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
         // Coefficient partition for this row; the mask assumes
         // num_coeff_partitions is a power of two -- TODO confirm.
2235     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2236     VP8Macroblock *mb;
         // Start of this row in the Y, U and V planes
         // (16 luma lines / 8 chroma lines per macroblock row).
2237     uint8_t *dst[3] = {
2238         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2239         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2240         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2241     };
         // The first/last row has no row above/below it, so the corresponding
         // neighbour pointer is set to our own thread data.
         // NOTE(review): next_td is not referenced directly below; it may be
         // used by the update_pos() macro, whose definition is not fully
         // visible here -- confirm before removing it.
2242     if (mb_y == 0)
2243         prev_td = td;
2244     else
2245         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2246     if (mb_y == s->mb_height - 1)
2247         next_td = td;
2248     else
2249         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2250     if (s->mb_layout == 1)
             // Full-frame layout: rows are (mb_width + 1) entries wide and
             // offset by one row and one column from macroblocks_base.
2251         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2252     else {
2253         // Make sure the previous frame has read its segmentation map,
2254         // if we re-use the same map.
2255         if (prev_frame && s->segmentation.enabled &&
2256             !s->segmentation.update_map)
2257             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2258         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2259         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2260         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2261     }
2262
         // VP8 clears the left non-zero context at the start of every row;
         // VP7 only does so for the first row.
2263     if (!is_vp7 || mb_y == 0)
2264         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2265
         // Horizontal motion-vector limits for the first macroblock of the
         // row; both are shifted by 64 after each macroblock below.
2266     s->mv_min.x = -MARGIN;
2267     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2268
2269     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2270         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2271         if (prev_td != td) {
2272             if (threadnr != 0) {
2273                 check_thread_pos(td, prev_td,
2274                                  mb_x + (is_vp7 ? 2 : 1),
2275                                  mb_y - (is_vp7 ? 2 : 1));
2276             } else {
                     // Thread 0 waits on a position offset by mb_width + 3,
                     // the same offset the filter pass adds in update_pos().
2277                 check_thread_pos(td, prev_td,
2278                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2279                                  mb_y - (is_vp7 ? 2 : 1));
2280             }
2281         }
2282
2283         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2284                          s->linesize, 4);
2285         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2286                          dst[2] - dst[1], 2);
2287
             // With mb_layout == 0 the modes are decoded here, row by row;
             // the previous frame's segmentation map is passed when present.
2288         if (!s->mb_layout)
2289             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2290                            prev_frame && prev_frame->seg_map ?
2291                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2292
2293         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2294
2295         if (!mb->skip)
2296             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2297
2298         if (mb->mode <= MODE_I4x4)
2299             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2300         else
2301             inter_predict(s, td, dst, mb, mb_x, mb_y);
2302
2303         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2304
2305         if (!mb->skip) {
2306             idct_mb(s, td, dst, mb);
2307         } else {
                 // Skipped macroblock: clear the first eight left/top
                 // non-zero flags.
2308             AV_ZERO64(td->left_nnz);
2309             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2310
2311             /* Reset DC block predictors if they would exist
2312              * if the mb had coefficients */
2313             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2314                 td->left_nnz[8]     = 0;
2315                 s->top_nnz[mb_x][8] = 0;
2316             }
2317         }
2318
             // Record the per-macroblock filter strength for the filter pass.
2319         if (s->deblock_filter)
2320             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2321
             // With several jobs, only the last thread saves the macroblock
             // border into top_border here; the single-job case does the
             // backup in vp8_filter_mb_row() instead.
2322         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2323             if (s->filter.simple)
2324                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2325                                  NULL, NULL, s->linesize, 0, 1);
2326             else
2327                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2328                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2329         }
2330
2331         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2332
             // Advance to the next macroblock: 16 luma / 8 chroma pixels, and
             // move the motion-vector window accordingly.
2333         dst[0]      += 16;
2334         dst[1]      += 8;
2335         dst[2]      += 8;
2336         s->mv_min.x -= 64;
2337         s->mv_max.x -= 64;
2338
             // NOTE(review): the loop condition guarantees mb_x < s->mb_width,
             // so the first branch can never be taken and the else branch
             // always publishes (mb_y, mb_x).
2339         if (mb_x == s->mb_width + 1) {
2340             update_pos(td, mb_y, s->mb_width + 3);
2341         } else {
2342             update_pos(td, mb_y, mb_x);
2343         }
2344     }
2345 }
2346
/**
 * Apply the loop filter to one macroblock row.
 *
 * The row index is taken from the upper 16 bits of td->thread_mb_pos and
 * the per-macroblock filter strengths are read from td->filter_strength.
 * Progress is published with positions offset by (s->mb_width + 3).
 *
 * @param avctx    codec context; avctx->priv_data is the VP8Context
 * @param tdata    not referenced directly in this function
 * @param jobnr    job index, used to locate the neighbouring rows' thread data
 * @param threadnr index of this worker's entry in s->thread_data
 * @param is_vp7   non-zero when decoding VP7, zero for VP8
 */
2347 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2348                               int jobnr, int threadnr, int is_vp7)
2349 {
2350     VP8Context *s = avctx->priv_data;
2351     VP8ThreadData *td = &s->thread_data[threadnr];
2352     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2353     AVFrame *curframe = s->curframe->tf.f;
2354     VP8Macroblock *mb;
2355     VP8ThreadData *prev_td, *next_td;
         // Start of this row in the Y, U and V planes.
2356     uint8_t *dst[3] = {
2357         curframe->data[0] + 16 * mb_y * s->linesize,
2358         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2359         curframe->data[2] +  8 * mb_y * s->uvlinesize
2360     };
2361
         // NOTE(review): mb is set here and advanced by the loop but is not
         // read directly anywhere in this function -- confirm whether it is
         // still needed.
2362     if (s->mb_layout == 1)
2363         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2364     else
2365         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2366
         // The first/last row has no row above/below it, so the corresponding
         // neighbour pointer is set to our own thread data.
2367     if (mb_y == 0)
2368         prev_td = td;
2369     else
2370         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2371     if (mb_y == s->mb_height - 1)
2372         next_td = td;
2373     else
2374         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2375
2376     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2377         VP8FilterStrength *f = &td->filter_strength[mb_x];
             // Wait until the row above has reached the offset position for
             // mb_x + 1 (the offset matches the one used by update_pos() at
             // the end of this loop).
2378         if (prev_td != td)
2379             check_thread_pos(td, prev_td,
2380                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
             // Wait until the row below has reached mb_x + 1 (un-offset
             // position); no wait is done when that row belongs to thread 0.
2381         if (next_td != td)
2382             if (next_td != &s->thread_data[0])
2383                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2384
             // Single-job case: save the macroblock border into top_border
             // here, before filtering; with several jobs the backup is done
             // in vp8_decode_mb_row_no_filter().
2385         if (num_jobs == 1) {
2386             if (s->filter.simple)
2387                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2388                                  NULL, NULL, s->linesize, 0, 1);
2389             else
2390                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2391                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2392         }
2393
             // The simple filter is given the luma plane only; the normal
             // filter is given all three planes.
2394         if (s->filter.simple)
2395             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2396         else
2397             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2398         dst[0] += 16;
2399         dst[1] += 8;
2400         dst[2] += 8;
2401
             // Publish filter progress, offset by mb_width + 3.
2402         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2403     }
2404 }
2405
/**
 * Worker body shared by the VP7 and VP8 execute2() callbacks.
 *
 * Handles rows jobnr, jobnr + num_jobs, jobnr + 2 * num_jobs, ...: each row
 * is decoded, loop-filtered when s->deblock_filter is set, and then marked
 * as complete. In frame-threading mode the finished row is also reported
 * through ff_thread_report_progress().
 *
 * @param avctx    codec context; avctx->priv_data is the VP8Context
 * @param tdata    passed through unchanged to the row functions
 * @param jobnr    job index; also the first row handled by this call
 * @param threadnr worker thread index
 * @param is_vp7   non-zero when decoding VP7, zero for VP8
 * @return always 0
 */
2406 static av_always_inline
2407 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2408                               int threadnr, int is_vp7)
2409 {
2410     VP8Context *s = avctx->priv_data;
         // NOTE(review): thread data is indexed by jobnr here, while the row
         // functions called below index s->thread_data by threadnr -- confirm
         // that jobnr == threadnr always holds for these jobs.
2411     VP8ThreadData *td = &s->thread_data[jobnr];
         // NOTE(review): next_td/prev_td are not referenced directly below;
         // they may be used by the update_pos() macro, whose definition is
         // not fully visible here -- confirm before removing them.
2412     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2413     VP8Frame *curframe = s->curframe;
2414     int mb_y, num_jobs = s->num_jobs;
2415
2416     td->thread_nr = threadnr;
2417     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
             // Same test as the loop condition, so this never triggers.
2418         if (mb_y >= s->mb_height)
2419             break;
             // The row functions recover mb_y from the upper 16 bits.
2420         td->thread_mb_pos = mb_y << 16;
2421         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
2422         if (s->deblock_filter)
2423             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
             // Mark the whole row as done: the column field is set to 0xFFFF.
2424         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2425
             // Move the vertical motion-vector window down by one row.
2426         s->mv_min.y -= 64;
2427         s->mv_max.y -= 64;
2428
2429         if (avctx->active_thread_type == FF_THREAD_FRAME)
2430             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2431     }
2432
2433     return 0;
2434 }
2435
2436 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2437                                     int jobnr, int threadnr)
2438 {
2439     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2440 }
2441
2442 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2443                                     int jobnr, int threadnr)
2444 {
2445     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2446 }
2447
2448
/**
 * Decode one VP7/VP8 packet.
 *
 * Parses the frame header, decides whether the frame is skipped, picks and
 * allocates the output buffer, computes the reference set for the next
 * frame (s->next_framep), decodes all macroblock rows through
 * avctx->execute2() and finally hands out a reference to the decoded frame
 * unless the frame is invisible.
 *
 * @param avctx     codec context; avctx->priv_data is the VP8Context
 * @param data      output frame, filled with av_frame_ref() when a frame
 *                  is produced
 * @param got_frame set to 1 when a frame is returned in data; otherwise
 *                  left untouched
 * @param avpkt     input packet
 * @param is_vp7    non-zero when decoding VP7, zero for VP8
 * @return avpkt->size on success, otherwise the error code of the failing
 *         step (header parsing, frame allocation, av_frame_ref()) or
 *         AVERROR_INVALIDDATA for an inter frame without references
 */
2449 static av_always_inline
2450 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2451                       AVPacket *avpkt, int is_vp7)
2452 {
2453     VP8Context *s = avctx->priv_data;
2454     int ret, i, referenced, num_jobs;
2455     enum AVDiscard skip_thresh;
2456     VP8Frame *av_uninit(curframe), *prev_frame;
2457
2458     if (is_vp7)
2459         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2460     else
2461         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2462
2463     if (ret < 0)
2464         goto err;
2465
2466     prev_frame = s->framep[VP56_FRAME_CURRENT];
2467
         // The new frame is "referenced" when it replaces the last, golden
         // or altref reference.
2468     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2469                  s->update_altref == VP56_FRAME_CURRENT;
2470
         // Lowest skip_frame setting at which this frame is dropped:
         // unreferenced -> NONREF, referenced inter -> NONKEY, key -> ALL.
2471     skip_thresh = !referenced ? AVDISCARD_NONREF
2472                               : !s->keyframe ? AVDISCARD_NONKEY
2473                                              : AVDISCARD_ALL;
2474
2475     if (avctx->skip_frame >= skip_thresh) {
             // Skipped frame: output nothing and keep the reference set.
2476         s->invisible = 1;
2477         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2478         goto skip_decode;
2479     }
2480     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2481
2482     // release no longer referenced frames
         // (the bound 5 presumably equals the size of s->frames -- TODO confirm)
2483     for (i = 0; i < 5; i++)
2484         if (s->frames[i].tf.f->data[0] &&
2485             &s->frames[i] != prev_frame &&
2486             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2487             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2488             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2489             vp8_release_frame(s, &s->frames[i]);
2490
         // NOTE(review): framep[VP56_FRAME_CURRENT] is overwritten here and
         // is not restored on the err paths below, which only copy framep
         // into next_framep -- confirm that this is intended.
2491     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2492
2493     /* Given that arithmetic probabilities are updated every frame, it's quite
2494      * likely that the values we have on a random interframe are complete
2495      * junk if we didn't start decode on a keyframe. So just don't display
2496      * anything rather than junk. */
2497     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2498                          !s->framep[VP56_FRAME_GOLDEN]   ||
2499                          !s->framep[VP56_FRAME_GOLDEN2])) {
2500         av_log(avctx, AV_LOG_WARNING,
2501                "Discarding interframe without a prior keyframe!\n");
2502         ret = AVERROR_INVALIDDATA;
2503         goto err;
2504     }
2505
2506     curframe->tf.f->key_frame = s->keyframe;
2507     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2508                                             : AV_PICTURE_TYPE_P;
         // Any non-zero return from vp8_alloc_frame() is treated as failure.
2509     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
2510         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
2511         goto err;
2512     }
2513
         // Build the reference set that applies after this frame; each slot
         // either takes the frame named by the header or keeps its old value.
2514     // check if golden and altref are swapped
2515     if (s->update_altref != VP56_FRAME_NONE)
2516         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2517     else
2518         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2519
2520     if (s->update_golden != VP56_FRAME_NONE)
2521         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2522     else
2523         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2524
2525     if (s->update_last)
2526         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2527     else
2528         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2529
2530     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2531
         // Called only after next_framep is complete, since
         // vp8_decode_update_thread_context() reads next_framep.
2532     ff_thread_finish_setup(avctx);
2533
2534     s->linesize   = curframe->tf.f->linesize[0];
2535     s->uvlinesize = curframe->tf.f->linesize[1];
2536
2537     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2538     /* Zero macroblock structures for top/top-left prediction
2539      * from outside the frame. */
2540     if (!s->mb_layout)
2541         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2542                (s->mb_width + 1) * sizeof(*s->macroblocks));
2543     if (!s->mb_layout && s->keyframe)
2544         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2545
2546     memset(s->ref_count, 0, sizeof(s->ref_count));
2547
         // With mb_layout == 1 all macroblock modes and motion vectors are
         // decoded up front, before any row is reconstructed.
2548     if (s->mb_layout == 1) {
2549         // Make sure the previous frame has read its segmentation map,
2550         // if we re-use the same map.
2551         if (prev_frame && s->segmentation.enabled &&
2552             !s->segmentation.update_map)
2553             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2554         if (is_vp7)
2555             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2556         else
2557             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2558     }
2559
         // Frame threading uses a single job per frame; otherwise the job
         // count is limited by both the coefficient partitions and the
         // thread count.
2560     if (avctx->active_thread_type == FF_THREAD_FRAME)
2561         num_jobs = 1;
2562     else
2563         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2564     s->num_jobs   = num_jobs;
2565     s->curframe   = curframe;
2566     s->prev_frame = prev_frame;
         // Vertical motion-vector limits for the first row; each worker
         // shifts them by 64 per decoded row.
2567     s->mv_min.y   = -MARGIN;
2568     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
         // Reset per-thread progress: nothing decoded yet, nobody waiting.
2569     for (i = 0; i < MAX_THREADS; i++) {
2570         s->thread_data[i].thread_mb_pos = 0;
2571         s->thread_data[i].wait_mb_pos   = INT_MAX;
2572     }
         // The return value of execute2() is not checked.
2573     if (is_vp7)
2574         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2575                         num_jobs);
2576     else
2577         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2578                         num_jobs);
2579
         // Report the whole frame as finished and switch to the new
         // reference set.
2580     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2581     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2582
2583 skip_decode:
2584     // if future frames don't use the updated probabilities,
2585     // reset them to the values we saved
2586     if (!s->update_probabilities)
2587         s->prob[0] = s->prob[1];
2588
         // curframe is only read here when the frame is visible; the skip
         // path above sets s->invisible before jumping to skip_decode.
2589     if (!s->invisible) {
2590         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2591             return ret;
2592         *got_frame = 1;
2593     }
2594
2595     return avpkt->size;
2596 err:
         // Make next_framep mirror the current reference set again.
2597     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2598     return ret;
2599 }
2600
2601 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2602                         AVPacket *avpkt)
2603 {
2604     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2605 }
2606
#if CONFIG_VP7_DECODER
/**
 * VP7 decode callback: forwards the packet to the shared VP7/VP8 frame
 * decoder with the VP7 flag set and returns its result.
 */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    const int is_vp7 = IS_VP7;

    return vp78_decode_frame(avctx, data, got_frame, avpkt, is_vp7);
}
#endif /* CONFIG_VP7_DECODER */
2614
2615 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2616 {
2617     VP8Context *s = avctx->priv_data;
2618     int i;
2619
2620     vp8_decode_flush_impl(avctx, 1);
2621     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2622         av_frame_free(&s->frames[i].tf.f);
2623
2624     return 0;
2625 }
2626
2627 static av_cold int vp8_init_frames(VP8Context *s)
2628 {
2629     int i;
2630     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2631         s->frames[i].tf.f = av_frame_alloc();
2632         if (!s->frames[i].tf.f)
2633             return AVERROR(ENOMEM);
2634     }
2635     return 0;
2636 }
2637
2638 static av_always_inline
2639 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2640 {
2641     VP8Context *s = avctx->priv_data;
2642     int ret;
2643
2644     s->avctx = avctx;
2645     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2646     avctx->internal->allocate_progress = 1;
2647
2648     ff_videodsp_init(&s->vdsp, 8);
2649
2650     ff_vp78dsp_init(&s->vp8dsp);
2651     if (CONFIG_VP7_DECODER && is_vp7) {
2652         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2653         ff_vp7dsp_init(&s->vp8dsp);
2654     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2655         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2656         ff_vp8dsp_init(&s->vp8dsp);
2657     }
2658
2659     /* does not change for VP8 */
2660     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2661
2662     if ((ret = vp8_init_frames(s)) < 0) {
2663         ff_vp8_decode_free(avctx);
2664         return ret;
2665     }
2666
2667     return 0;
2668 }
2669
#if CONFIG_VP7_DECODER
/**
 * Init callback of the VP7 decoder: runs the shared VP7/VP8 initialisation
 * with the VP7 flag set.
 *
 * Marked av_cold like ff_vp8_decode_init(), vp8_decode_init_thread_copy()
 * and vp8_init_frames() in this file.
 *
 * @return the result of vp78_decode_init()
 */
static av_cold int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2676
2677 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2678 {
2679     return vp78_decode_init(avctx, IS_VP8);
2680 }
2681
2682 #if CONFIG_VP8_DECODER
2683 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2684 {
2685     VP8Context *s = avctx->priv_data;
2686     int ret;
2687
2688     s->avctx = avctx;
2689
2690     if ((ret = vp8_init_frames(s)) < 0) {
2691         ff_vp8_decode_free(avctx);
2692         return ret;
2693     }
2694
2695     return 0;
2696 }
2697
2698 #define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2699
2700 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2701                                             const AVCodecContext *src)
2702 {
2703     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2704     int i;
2705
2706     if (s->macroblocks_base &&
2707         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2708         free_buffers(s);
2709         s->mb_width  = s_src->mb_width;
2710         s->mb_height = s_src->mb_height;
2711     }
2712
2713     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2714     s->segmentation = s_src->segmentation;
2715     s->lf_delta     = s_src->lf_delta;
2716     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2717
2718     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2719         if (s_src->frames[i].tf.f->data[0]) {
2720             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2721             if (ret < 0)
2722                 return ret;
2723         }
2724     }
2725
2726     s->framep[0] = REBASE(s_src->next_framep[0]);
2727     s->framep[1] = REBASE(s_src->next_framep[1]);
2728     s->framep[2] = REBASE(s_src->next_framep[2]);
2729     s->framep[3] = REBASE(s_src->next_framep[3]);
2730
2731     return 0;
2732 }
2733 #endif /* CONFIG_VP8_DECODER */
2734
#if CONFIG_VP7_DECODER
/* Codec descriptor for the VP7 decoder. It shares the context type, close
 * and flush callbacks with VP8 but has its own init and decode entry
 * points; no threading capabilities are advertised. */
AVCodec ff_vp7_decoder = {
    /* identification */
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    .capabilities   = CODEC_CAP_DR1,
    /* private context */
    .priv_data_size = sizeof(VP8Context),
    /* callbacks */
    .init           = vp7_decode_init,
    .decode         = vp7_decode_frame,
    .flush          = vp8_decode_flush,
    .close          = ff_vp8_decode_free,
};
#endif /* CONFIG_VP7_DECODER */
2749
#if CONFIG_VP8_DECODER
/* Codec descriptor for the VP8 decoder. Advertises both frame and slice
 * threading; the two thread-context callbacks are only installed when
 * threading support is compiled in. */
AVCodec ff_vp8_decoder = {
    /* identification */
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
    /* private context */
    .priv_data_size        = sizeof(VP8Context),
    /* callbacks */
    .init                  = ff_vp8_decode_init,
    .decode                = ff_vp8_decode_frame,
    .flush                 = vp8_decode_flush,
    .close                 = ff_vp8_decode_free,
    /* frame-threading support */
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};
#endif /* CONFIG_VP8_DECODER */