git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of Libav.
  11  *
  12  * Libav is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * Libav is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with Libav; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
  40 static void free_buffers(VP8Context *s)
  41 {
  42     int i;
  43     if (s->thread_data)
  44         for (i = 0; i < MAX_THREADS; i++) {
  45 #if HAVE_THREADS
  46             pthread_cond_destroy(&s->thread_data[i].cond);
  47             pthread_mutex_destroy(&s->thread_data[i].lock);
  48 #endif
  49             av_freep(&s->thread_data[i].filter_strength);
  50         }
  51     av_freep(&s->thread_data);
  52     av_freep(&s->macroblocks_base);
  53     av_freep(&s->intra4x4_pred_mode_top);
  54     av_freep(&s->top_nnz);
  55     av_freep(&s->top_border);
  56
  57     s->macroblocks = NULL;
  58 }
  59
  60 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  61 {
  62     int ret;
  63     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  64                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  65         return ret;
  66     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  67         ff_thread_release_buffer(s->avctx, &f->tf);
  68         return AVERROR(ENOMEM);
  69     }
  70     return 0;
  71 }
  72
  73 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  74 {
  75     av_buffer_unref(&f->seg_map);
  76     ff_thread_release_buffer(s->avctx, &f->tf);
  77 }
  78
  79 #if CONFIG_VP8_DECODER
  80 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  81 {
  82     int ret;
  83
  84     vp8_release_frame(s, dst);
  85
  86     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  87         return ret;
  88     if (src->seg_map &&
  89         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  90         vp8_release_frame(s, dst);
  91         return AVERROR(ENOMEM);
  92     }
  93
  94     return 0;
  95 }
  96 #endif /* CONFIG_VP8_DECODER */
  97
  98 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
  99 {
 100     VP8Context *s = avctx->priv_data;
 101     int i;
 102
 103     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 104         vp8_release_frame(s, &s->frames[i]);
 105     memset(s->framep, 0, sizeof(s->framep));
 106
 107     if (free_mem)
 108         free_buffers(s);
 109 }
 110
 111 static void vp8_decode_flush(AVCodecContext *avctx)
 112 {
 113     vp8_decode_flush_impl(avctx, 0);
 114 }
 115
 116 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 117 {
 118     VP8Frame *frame = NULL;
 119     int i;
 120
 121     // find a free buffer
 122     for (i = 0; i < 5; i++)
 123         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 124             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 125             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 126             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 127             frame = &s->frames[i];
 128             break;
 129         }
 130     if (i == 5) {
 131         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 132         abort();
 133     }
 134     if (frame->tf.f->data[0])
 135         vp8_release_frame(s, frame);
 136
 137     return frame;
 138 }
 139
 140 static av_always_inline
 141 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 142 {
 143     AVCodecContext *avctx = s->avctx;
 144     int i, ret;
 145
 146     if (width  != s->avctx->width ||
 147         height != s->avctx->height) {
 148         vp8_decode_flush_impl(s->avctx, 1);
 149
 150         ret = ff_set_dimensions(s->avctx, width, height);
 151         if (ret < 0)
 152             return ret;
 153     }
 154
 155     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 156     s->mb_height = (s->avctx->coded_height + 15) / 16;
 157
 158     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 159                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 160     if (!s->mb_layout) { // Frame threading and one thread
 161         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 162                                                sizeof(*s->macroblocks));
 163         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 164     } else // Sliced threading
 165         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 166                                          sizeof(*s->macroblocks));
 167     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 168     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 169     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 170
 171     for (i = 0; i < MAX_THREADS; i++) {
 172         s->thread_data[i].filter_strength =
 173             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 174 #if HAVE_THREADS
 175         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 176         pthread_cond_init(&s->thread_data[i].cond, NULL);
 177 #endif
 178     }
 179
 180     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 181         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 182         return AVERROR(ENOMEM);
 183
 184     s->macroblocks = s->macroblocks_base + 1;
 185
 186     return 0;
 187 }
 188
 189 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 190 {
 191     return update_dimensions(s, width, height, IS_VP7);
 192 }
 193
 194 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 195 {
 196     return update_dimensions(s, width, height, IS_VP8);
 197 }
 198
 199 static void parse_segment_info(VP8Context *s)
 200 {
 201     VP56RangeCoder *c = &s->c;
 202     int i;
 203
 204     s->segmentation.update_map = vp8_rac_get(c);
 205
 206     if (vp8_rac_get(c)) { // update segment feature data
 207         s->segmentation.absolute_vals = vp8_rac_get(c);
 208
 209         for (i = 0; i < 4; i++)
 210             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 211
 212         for (i = 0; i < 4; i++)
 213             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 214     }
 215     if (s->segmentation.update_map)
 216         for (i = 0; i < 3; i++)
 217             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 218 }
 219
 220 static void update_lf_deltas(VP8Context *s)
 221 {
 222     VP56RangeCoder *c = &s->c;
 223     int i;
 224
 225     for (i = 0; i < 4; i++) {
 226         if (vp8_rac_get(c)) {
 227             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 228
 229             if (vp8_rac_get(c))
 230                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 231         }
 232     }
 233
 234     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 235         if (vp8_rac_get(c)) {
 236             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 237
 238             if (vp8_rac_get(c))
 239                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 240         }
 241     }
 242 }
 243
 244 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 245 {
 246     const uint8_t *sizes = buf;
 247     int i;
 248
 249     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 250
 251     buf      += 3 * (s->num_coeff_partitions - 1);
 252     buf_size -= 3 * (s->num_coeff_partitions - 1);
 253     if (buf_size < 0)
 254         return -1;
 255
 256     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 257         int size = AV_RL24(sizes + 3 * i);
 258         if (buf_size - size < 0)
 259             return -1;
 260
 261         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 262         buf      += size;
 263         buf_size -= size;
 264     }
 265     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 266
 267     return 0;
 268 }
 269
 270 static void vp7_get_quants(VP8Context *s)
 271 {
 272     VP56RangeCoder *c = &s->c;
 273
 274     int yac_qi  = vp8_rac_get_uint(c, 7);
 275     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 276     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 277     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 278     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 279     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 280
 281     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 282     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 283     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 284     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 285     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 286     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 287 }
 288
 289 static void get_quants(VP8Context *s)
 290 {
 291     VP56RangeCoder *c = &s->c;
 292     int i, base_qi;
 293
 294     int yac_qi     = vp8_rac_get_uint(c, 7);
 295     int ydc_delta  = vp8_rac_get_sint(c, 4);
 296     int y2dc_delta = vp8_rac_get_sint(c, 4);
 297     int y2ac_delta = vp8_rac_get_sint(c, 4);
 298     int uvdc_delta = vp8_rac_get_sint(c, 4);
 299     int uvac_delta = vp8_rac_get_sint(c, 4);
 300
 301     for (i = 0; i < 4; i++) {
 302         if (s->segmentation.enabled) {
 303             base_qi = s->segmentation.base_quant[i];
 304             if (!s->segmentation.absolute_vals)
 305                 base_qi += yac_qi;
 306         } else
 307             base_qi = yac_qi;
 308
 309         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 310         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 311         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 312         /* 101581>>16 is equivalent to 155/100 */
 313         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 314         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 315         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 316
 317         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 318         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 319     }
 320 }
 321
 322 /**
 323  * Determine which buffers golden and altref should be updated with after this frame.
 324  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 325  *
 326  * Intra frames update all 3 references
 327  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 328  * If the update (golden|altref) flag is set, it's updated with the current frame
 329  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 330  * If the flag is not set, the number read means:
 331  *      0: no update
 332  *      1: VP56_FRAME_PREVIOUS
 333  *      2: update golden with altref, or update altref with golden
 334  */
 335 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 336 {
 337     VP56RangeCoder *c = &s->c;
 338
 339     if (update)
 340         return VP56_FRAME_CURRENT;
 341
 342     switch (vp8_rac_get_uint(c, 2)) {
 343     case 1:
 344         return VP56_FRAME_PREVIOUS;
 345     case 2:
 346         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 347     }
 348     return VP56_FRAME_NONE;
 349 }
 350
 351 static void vp78_reset_probability_tables(VP8Context *s)
 352 {
 353     int i, j;
 354     for (i = 0; i < 4; i++)
 355         for (j = 0; j < 16; j++)
 356             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 357                    sizeof(s->prob->token[i][j]));
 358 }
 359
 360 static void vp78_update_probability_tables(VP8Context *s)
 361 {
 362     VP56RangeCoder *c = &s->c;
 363     int i, j, k, l, m;
 364
 365     for (i = 0; i < 4; i++)
 366         for (j = 0; j < 8; j++)
 367             for (k = 0; k < 3; k++)
 368                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 369                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 370                         int prob = vp8_rac_get_uint(c, 8);
 371                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 372                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 373                     }
 374 }
 375
 376 #define VP7_MVC_SIZE 17
 377 #define VP8_MVC_SIZE 19
 378
 379 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 380                                                             int mvc_size)
 381 {
 382     VP56RangeCoder *c = &s->c;
 383     int i, j;
 384
 385     if (vp8_rac_get(c))
 386         for (i = 0; i < 4; i++)
 387             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 388     if (vp8_rac_get(c))
 389         for (i = 0; i < 3; i++)
 390             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 391
 392     // 17.2 MV probability update
 393     for (i = 0; i < 2; i++)
 394         for (j = 0; j < mvc_size; j++)
 395             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 396                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 397 }
 398
 399 static void update_refs(VP8Context *s)
 400 {
 401     VP56RangeCoder *c = &s->c;
 402
 403     int update_golden = vp8_rac_get(c);
 404     int update_altref = vp8_rac_get(c);
 405
 406     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 407     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 408 }
 409
 410 static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
 411 {
 412     int i, j;
 413
 414     for (j = 1; j < 3; j++) {
 415         for (i = 0; i < height / 2; i++)
 416             memcpy(dst->data[j] + i * dst->linesize[j],
 417                    src->data[j] + i * src->linesize[j], width / 2);
 418     }
 419 }
 420
 421 static void fade(uint8_t *dst, uint8_t *src,
 422                  int width, int height, int linesize,
 423                  int alpha, int beta)
 424 {
 425     int i, j;
 426
 427     for (j = 0; j < height; j++) {
 428         for (i = 0; i < width; i++) {
 429             uint8_t y = src[j * linesize + i];
 430             dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 431         }
 432     }
 433 }
 434
 435 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 436 {
 437     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 438     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 439     int ret;
 440
 441     if (!s->keyframe && (alpha || beta)) {
 442         int width  = s->mb_width * 16;
 443         int height = s->mb_height * 16;
 444         AVFrame *src, *dst;
 445
 446         if (!s->framep[VP56_FRAME_PREVIOUS])
 447             return AVERROR_INVALIDDATA;
 448
 449         dst =
 450         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 451
 452         /* preserve the golden frame, write a new previous frame */
 453         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 454             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 455             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 456                return ret;
 457
 458             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 459
 460             copy_luma(dst, src, width, height);
 461         }
 462
 463         fade(dst->data[0], src->data[0],
 464              width, height, dst->linesize[0], alpha, beta);
 465     }
 466
 467     return 0;
 468 }
 469
 470 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 471 {
 472     VP56RangeCoder *c = &s->c;
 473     int part1_size, hscale, vscale, i, j, ret;
 474     int width  = s->avctx->width;
 475     int height = s->avctx->height;
 476
 477     s->profile = (buf[0] >> 1) & 7;
 478     if (s->profile > 1) {
 479         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 480         return AVERROR_INVALIDDATA;
 481     }
 482
 483     s->keyframe  = !(buf[0] & 1);
 484     s->invisible = 0;
 485     part1_size   = AV_RL24(buf) >> 4;
 486
 487     buf      += 4 - s->profile;
 488     buf_size -= 4 - s->profile;
 489
 490     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 491
 492     ff_vp56_init_range_decoder(c, buf, part1_size);
 493     buf      += part1_size;
 494     buf_size -= part1_size;
 495
 496     /* A. Dimension information (keyframes only) */
 497     if (s->keyframe) {
 498         width  = vp8_rac_get_uint(c, 12);
 499         height = vp8_rac_get_uint(c, 12);
 500         hscale = vp8_rac_get_uint(c, 2);
 501         vscale = vp8_rac_get_uint(c, 2);
 502         if (hscale || vscale)
 503             avpriv_request_sample(s->avctx, "Upscaling");
 504
 505         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 506         vp78_reset_probability_tables(s);
 507         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 508                sizeof(s->prob->pred16x16));
 509         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 510                sizeof(s->prob->pred8x8c));
 511         for (i = 0; i < 2; i++)
 512             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 513                    sizeof(vp7_mv_default_prob[i]));
 514         memset(&s->segmentation, 0, sizeof(s->segmentation));
 515         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 516         memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
 517     }
 518
 519     if (s->keyframe || s->profile > 0)
 520         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 521
 522     /* B. Decoding information for all four macroblock-level features */
 523     for (i = 0; i < 4; i++) {
 524         s->feature_enabled[i] = vp8_rac_get(c);
 525         if (s->feature_enabled[i]) {
 526              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 527
 528              for (j = 0; j < 3; j++)
 529                  s->feature_index_prob[i][j] =
 530                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 531
 532              if (vp7_feature_value_size[s->profile][i])
 533                  for (j = 0; j < 4; j++)
 534                      s->feature_value[i][j] =
 535                          vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 536         }
 537     }
 538
 539     s->segmentation.enabled    = 0;
 540     s->segmentation.update_map = 0;
 541     s->lf_delta.enabled        = 0;
 542
 543     s->num_coeff_partitions = 1;
 544     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 545
 546     if (!s->macroblocks_base || /* first frame */
 547         width != s->avctx->width || height != s->avctx->height ||
 548         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 549         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 550             return ret;
 551     }
 552
 553     /* C. Dequantization indices */
 554     vp7_get_quants(s);
 555
 556     /* D. Golden frame update flag (a Flag) for interframes only */
 557     if (!s->keyframe) {
 558         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 559         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 560     }
 561
 562     s->update_last          = 1;
 563     s->update_probabilities = 1;
 564     s->fade_present         = 1;
 565
 566     if (s->profile > 0) {
 567         s->update_probabilities = vp8_rac_get(c);
 568         if (!s->update_probabilities)
 569             s->prob[1] = s->prob[0];
 570
 571         if (!s->keyframe)
 572             s->fade_present = vp8_rac_get(c);
 573     }
 574
 575     /* E. Fading information for previous frame */
 576     if (s->fade_present && vp8_rac_get(c)) {
 577         if ((ret = vp7_fade_frame(s ,c)) < 0)
 578             return ret;
 579     }
 580
 581     /* F. Loop filter type */
 582     if (!s->profile)
 583         s->filter.simple = vp8_rac_get(c);
 584
 585     /* G. DCT coefficient ordering specification */
 586     if (vp8_rac_get(c))
 587         for (i = 1; i < 16; i++)
 588             s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];
 589
 590     /* H. Loop filter levels  */
 591     if (s->profile > 0)
 592         s->filter.simple = vp8_rac_get(c);
 593     s->filter.level     = vp8_rac_get_uint(c, 6);
 594     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 595
 596     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 597     vp78_update_probability_tables(s);
 598
 599     s->mbskip_enabled = 0;
 600
 601     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 602     if (!s->keyframe) {
 603         s->prob->intra  = vp8_rac_get_uint(c, 8);
 604         s->prob->last   = vp8_rac_get_uint(c, 8);
 605         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 606     }
 607
 608     return 0;
 609 }
 610
 611 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 612 {
 613     VP56RangeCoder *c = &s->c;
 614     int header_size, hscale, vscale, ret;
 615     int width  = s->avctx->width;
 616     int height = s->avctx->height;
 617
 618     s->keyframe  = !(buf[0] & 1);
 619     s->profile   =  (buf[0]>>1) & 7;
 620     s->invisible = !(buf[0] & 0x10);
 621     header_size  = AV_RL24(buf) >> 5;
 622     buf      += 3;
 623     buf_size -= 3;
 624
 625     if (s->profile > 3)
 626         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 627
 628     if (!s->profile)
 629         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 630                sizeof(s->put_pixels_tab));
 631     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 632         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 633                sizeof(s->put_pixels_tab));
 634
 635     if (header_size > buf_size - 7 * s->keyframe) {
 636         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 637         return AVERROR_INVALIDDATA;
 638     }
 639
 640     if (s->keyframe) {
 641         if (AV_RL24(buf) != 0x2a019d) {
 642             av_log(s->avctx, AV_LOG_ERROR,
 643                    "Invalid start code 0x%x\n", AV_RL24(buf));
 644             return AVERROR_INVALIDDATA;
 645         }
 646         width     = AV_RL16(buf + 3) & 0x3fff;
 647         height    = AV_RL16(buf + 5) & 0x3fff;
 648         hscale    = buf[4] >> 6;
 649         vscale    = buf[6] >> 6;
 650         buf      += 7;
 651         buf_size -= 7;
 652
 653         if (hscale || vscale)
 654             avpriv_request_sample(s->avctx, "Upscaling");
 655
 656         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 657         vp78_reset_probability_tables(s);
 658         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 659                sizeof(s->prob->pred16x16));
 660         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 661                sizeof(s->prob->pred8x8c));
 662         memcpy(s->prob->mvc, vp8_mv_default_prob,
 663                sizeof(s->prob->mvc));
 664         memset(&s->segmentation, 0, sizeof(s->segmentation));
 665         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 666     }
 667
 668     ff_vp56_init_range_decoder(c, buf, header_size);
 669     buf      += header_size;
 670     buf_size -= header_size;
 671
 672     if (s->keyframe) {
 673         s->colorspace = vp8_rac_get(c);
 674         if (s->colorspace)
 675             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 676         s->fullrange = vp8_rac_get(c);
 677     }
 678
 679     if ((s->segmentation.enabled = vp8_rac_get(c)))
 680         parse_segment_info(s);
 681     else
 682         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 683
 684     s->filter.simple    = vp8_rac_get(c);
 685     s->filter.level     = vp8_rac_get_uint(c, 6);
 686     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 687
 688     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 689         if (vp8_rac_get(c))
 690             update_lf_deltas(s);
 691
 692     if (setup_partitions(s, buf, buf_size)) {
 693         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 694         return AVERROR_INVALIDDATA;
 695     }
 696
 697     if (!s->macroblocks_base || /* first frame */
 698         width != s->avctx->width || height != s->avctx->height)
 699         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 700             return ret;
 701
 702     get_quants(s);
 703
 704     if (!s->keyframe) {
 705         update_refs(s);
 706         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 707         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 708     }
 709
 710     // if we aren't saving this frame's probabilities for future frames,
 711     // make a copy of the current probabilities
 712     if (!(s->update_probabilities = vp8_rac_get(c)))
 713         s->prob[1] = s->prob[0];
 714
 715     s->update_last = s->keyframe || vp8_rac_get(c);
 716
 717     vp78_update_probability_tables(s);
 718
 719     if ((s->mbskip_enabled = vp8_rac_get(c)))
 720         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 721
 722     if (!s->keyframe) {
 723         s->prob->intra  = vp8_rac_get_uint(c, 8);
 724         s->prob->last   = vp8_rac_get_uint(c, 8);
 725         s->prob->golden = vp8_rac_get_uint(c, 8);
 726         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 727     }
 728
 729     return 0;
 730 }
 731
 732 static av_always_inline
 733 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 734 {
 735     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 736     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 737 }
 738
 739 /**
 740  * Motion vector coding, 17.1.
 741  */
 742 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 743 {
 744     int bit, x = 0;
 745
 746     if (vp56_rac_get_prob_branchy(c, p[0])) {
 747         int i;
 748
 749         for (i = 0; i < 3; i++)
 750             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 751         for (i = (vp7 ? 7 : 9); i > 3; i--)
 752             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 753         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 754             x += 8;
 755     } else {
 756         // small_mvtree
 757         const uint8_t *ps = p + 2;
 758         bit = vp56_rac_get_prob(c, *ps);
 759         ps += 1 + 3 * bit;
 760         x  += 4 * bit;
 761         bit = vp56_rac_get_prob(c, *ps);
 762         ps += 1 + bit;
 763         x  += 2 * bit;
 764         x  += vp56_rac_get_prob(c, *ps);
 765     }
 766
 767     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 768 }
 769
 770 static av_always_inline
 771 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 772 {
 773     if (is_vp7)
 774         return vp7_submv_prob;
 775
 776     if (left == top)
 777         return vp8_submv_prob[4 - !!left];
 778     if (!top)
 779         return vp8_submv_prob[2];
 780     return vp8_submv_prob[1 - !!left];
 781 }
 782
 783 /**
 784  * Split motion vector prediction, 16.4.
 785  * @returns the number of motion vectors parsed (2, 4 or 16)
 786  */
 787 static av_always_inline
 788 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 789                     int layout, int is_vp7)
 790 {
 791     int part_idx;
 792     int n, num;
 793     VP8Macroblock *top_mb;
 794     VP8Macroblock *left_mb = &mb[-1];
 795     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 796     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 797     VP56mv *top_mv;
 798     VP56mv *left_mv = left_mb->bmv;
 799     VP56mv *cur_mv  = mb->bmv;
 800
 801     if (!layout) // layout is inlined, s->mb_layout is not
 802         top_mb = &mb[2];
 803     else
 804         top_mb = &mb[-s->mb_width - 1];
 805     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 806     top_mv       = top_mb->bmv;
 807
 808     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 809         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 810             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 811         else
 812             part_idx = VP8_SPLITMVMODE_8x8;
 813     } else {
 814         part_idx = VP8_SPLITMVMODE_4x4;
 815     }
 816
 817     num              = vp8_mbsplit_count[part_idx];
 818     mbsplits_cur     = vp8_mbsplits[part_idx],
 819     firstidx         = vp8_mbfirstidx[part_idx];
 820     mb->partitioning = part_idx;
 821
 822     for (n = 0; n < num; n++) {
 823         int k = firstidx[n];
 824         uint32_t left, above;
 825         const uint8_t *submv_prob;
 826
 827         if (!(k & 3))
 828             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 829         else
 830             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 831         if (k <= 3)
 832             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 833         else
 834             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 835
 836         submv_prob = get_submv_prob(left, above, is_vp7);
 837
 838         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 839             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 840                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 841                     mb->bmv[n].y = mb->mv.y +
 842                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 843                     mb->bmv[n].x = mb->mv.x +
 844                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 845                 } else {
 846                     AV_ZERO32(&mb->bmv[n]);
 847                 }
 848             } else {
 849                 AV_WN32A(&mb->bmv[n], above);
 850             }
 851         } else {
 852             AV_WN32A(&mb->bmv[n], left);
 853         }
 854     }
 855
 856     return num;
 857 }
 858
 /**
  * Resolve a neighbouring macroblock position the way the VP7 reference
  * decoder does.
  *
  * The reference decoder appends one padding macroblock column to the right
  * edge of the frame to guard against illegal macroblock offsets. Its
  * algorithm has bugs that let an offset straddle the padding column (and
  * thereby land on a different row); those bugs are replicated here on
  * purpose.
  *
  * @param mb_x     x address of the current macroblock
  * @param mb_y     y address of the current macroblock
  * @param mb_width frame width in macroblocks (without the padding column)
  * @param xoffset  horizontal offset of the wanted neighbour
  * @param yoffset  vertical offset of the wanted neighbour
  * @param boundary smallest linear address (in the padded grid) that is
  *                 accepted; anything below it is rejected
  * @param[out] edge_x macroblock x address, written only on success
  * @param[out] edge_y macroblock y address, written only on success
  *
  * @return macroblock offset legal (boolean)
  */
 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
                                    int xoffset, int yoffset, int boundary,
                                    int *edge_x, int *edge_y)
 {
     /* row length of the padded grid: real columns plus the guard column */
     const int stride = mb_width + 1;
     /* linear address of the neighbour inside the padded grid */
     const int addr   = (mb_y + yoffset) * stride + mb_x + xoffset;
     int column;

     if (addr < boundary)
         return 0;

     column = addr % stride;
     if (column == stride - 1)   /* landed on the padding column */
         return 0;

     *edge_x = column;
     *edge_y = addr / stride;
     return 1;
 }
 882
 883 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 884 {
 885     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 886 }
 887
 888 static av_always_inline
 889 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 890                     int mb_x, int mb_y, int layout)
 891 {
 892     VP8Macroblock *mb_edge[12];
 893     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 894     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 895     int idx = CNT_ZERO;
 896     VP56mv near_mv[3];
 897     uint8_t cnt[3] = { 0 };
 898     VP56RangeCoder *c = &s->c;
 899     int i;
 900
 901     AV_ZERO32(&near_mv[0]);
 902     AV_ZERO32(&near_mv[1]);
 903     AV_ZERO32(&near_mv[2]);
 904
 905     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 906         const VP7MVPred * pred = &vp7_mv_pred[i];
 907         int edge_x, edge_y;
 908
 909         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 910                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 911             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 912                                              ? s->macroblocks_base + 1 + edge_x +
 913                                                (s->mb_width + 1) * (edge_y + 1)
 914                                              : s->macroblocks + edge_x +
 915                                                (s->mb_height - edge_y - 1) * 2;
 916             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 917             if (mv) {
 918                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 919                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 920                         idx = CNT_NEAREST;
 921                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 922                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 923                             continue;
 924                         idx = CNT_NEAR;
 925                     } else {
 926                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 927                         idx = CNT_NEAR;
 928                     }
 929                 } else {
 930                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 931                     idx = CNT_NEAREST;
 932                 }
 933             } else {
 934                 idx = CNT_ZERO;
 935             }
 936         } else {
 937             idx = CNT_ZERO;
 938         }
 939         cnt[idx] += vp7_mv_pred[i].score;
 940     }
 941
 942     mb->partitioning = VP8_SPLITMVMODE_NONE;
 943
 944     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 945         mb->mode = VP8_MVMODE_MV;
 946
 947         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 948
 949             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 950
 951                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 952                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 953                 else
 954                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 955
 956                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 957                     mb->mode = VP8_MVMODE_SPLIT;
 958                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
 959                 } else {
 960                     mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
 961                     mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
 962                     mb->bmv[0] = mb->mv;
 963                 }
 964             } else {
 965                 mb->mv = near_mv[CNT_NEAR];
 966                 mb->bmv[0] = mb->mv;
 967             }
 968         } else {
 969             mb->mv = near_mv[CNT_NEAREST];
 970             mb->bmv[0] = mb->mv;
 971         }
 972     } else {
 973         mb->mode = VP8_MVMODE_ZERO;
 974         AV_ZERO32(&mb->mv);
 975         mb->bmv[0] = mb->mv;
 976     }
 977 }
 978
 979 static av_always_inline
 980 void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 981                     int mb_x, int mb_y, int layout)
 982 {
 983     VP8Macroblock *mb_edge[3] = { 0      /* top */,
 984                                   mb - 1 /* left */,
 985                                   0      /* top-left */ };
 986     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 987     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 988     int idx = CNT_ZERO;
 989     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 990     int8_t *sign_bias = s->sign_bias;
 991     VP56mv near_mv[4];
 992     uint8_t cnt[4] = { 0 };
 993     VP56RangeCoder *c = &s->c;
 994
 995     if (!layout) { // layout is inlined (s->mb_layout is not)
 996         mb_edge[0] = mb + 2;
 997         mb_edge[2] = mb + 1;
 998     } else {
 999         mb_edge[0] = mb - s->mb_width - 1;
1000         mb_edge[2] = mb - s->mb_width - 2;
1001     }
1002
1003     AV_ZERO32(&near_mv[0]);
1004     AV_ZERO32(&near_mv[1]);
1005     AV_ZERO32(&near_mv[2]);
1006
1007     /* Process MB on top, left and top-left */
1008 #define MV_EDGE_CHECK(n)                                                      \
1009     {                                                                         \
1010         VP8Macroblock *edge = mb_edge[n];                                     \
1011         int edge_ref = edge->ref_frame;                                       \
1012         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1013             uint32_t mv = AV_RN32A(&edge->mv);                                \
1014             if (mv) {                                                         \
1015                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1016                     /* SWAR negate of the values in mv. */                    \
1017                     mv = ~mv;                                                 \
1018                     mv = ((mv & 0x7fff7fff) +                                 \
1019                           0x00010001) ^ (mv & 0x80008000);                    \
1020                 }                                                             \
1021                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1022                     AV_WN32A(&near_mv[++idx], mv);                            \
1023                 cnt[idx] += 1 + (n != 2);                                     \
1024             } else                                                            \
1025                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1026         }                                                                     \
1027     }
1028
1029     MV_EDGE_CHECK(0)
1030     MV_EDGE_CHECK(1)
1031     MV_EDGE_CHECK(2)
1032
1033     mb->partitioning = VP8_SPLITMVMODE_NONE;
1034     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1035         mb->mode = VP8_MVMODE_MV;
1036
1037         /* If we have three distinct MVs, merge first and last if they're the same */
1038         if (cnt[CNT_SPLITMV] &&
1039             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1040             cnt[CNT_NEAREST] += 1;
1041
1042         /* Swap near and nearest if necessary */
1043         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1044             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1045             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1046         }
1047
1048         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1049             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1050                 /* Choose the best mv out of 0,0 and the nearest mv */
1051                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1052                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1053                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1054                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1055
1056                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1057                     mb->mode = VP8_MVMODE_SPLIT;
1058                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1059                 } else {
1060                     mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
1061                     mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
1062                     mb->bmv[0] = mb->mv;
1063                 }
1064             } else {
1065                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
1066                 mb->bmv[0] = mb->mv;
1067             }
1068         } else {
1069             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
1070             mb->bmv[0] = mb->mv;
1071         }
1072     } else {
1073         mb->mode = VP8_MVMODE_ZERO;
1074         AV_ZERO32(&mb->mv);
1075         mb->bmv[0] = mb->mv;
1076     }
1077 }
1078
1079 static av_always_inline
1080 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1081                            int mb_x, int keyframe, int layout)
1082 {
1083     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1084
1085     if (layout == 1) {
1086         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1087         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1088     }
1089     if (keyframe) {
1090         int x, y;
1091         uint8_t *top;
1092         uint8_t *const left = s->intra4x4_pred_mode_left;
1093         if (layout == 1)
1094             top = mb->intra4x4_pred_mode_top;
1095         else
1096             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1097         for (y = 0; y < 4; y++) {
1098             for (x = 0; x < 4; x++) {
1099                 const uint8_t *ctx;
1100                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1101                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1102                 left[y]   = top[x] = *intra4x4;
1103                 intra4x4++;
1104             }
1105         }
1106     } else {
1107         int i;
1108         for (i = 0; i < 16; i++)
1109             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1110                                            vp8_pred4x4_prob_inter);
1111     }
1112 }
1113
1114 static av_always_inline
1115 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1116                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1117 {
1118     VP56RangeCoder *c = &s->c;
1119     const char *vp7_feature_name[] = { "q-index",
1120                                        "lf-delta",
1121                                        "partial-golden-update",
1122                                        "blit-pitch" };
1123     if (is_vp7) {
1124         int i;
1125         *segment = 0;
1126         for (i = 0; i < 4; i++) {
1127             if (s->feature_enabled[i]) {
1128                 if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
1129                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1130                                                    s->feature_index_prob[i]);
1131                       av_log(s->avctx, AV_LOG_WARNING,
1132                              "Feature %s present in macroblock (value 0x%x)\n",
1133                              vp7_feature_name[i], s->feature_value[i][index]);
1134                 }
1135            }
1136         }
1137     } else if (s->segmentation.update_map)
1138         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
1139     else if (s->segmentation.enabled)
1140         *segment = ref ? *ref : *segment;
1141     mb->segment = *segment;
1142
1143     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1144
1145     if (s->keyframe) {
1146         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1147                                     vp8_pred16x16_prob_intra);
1148
1149         if (mb->mode == MODE_I4x4) {
1150             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1151         } else {
1152             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1153                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1154             if (s->mb_layout == 1)
1155                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1156             else
1157                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1158             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1159         }
1160
1161         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1162                                                 vp8_pred8x8c_prob_intra);
1163         mb->ref_frame        = VP56_FRAME_CURRENT;
1164     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1165         // inter MB, 16.2
1166         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1167             mb->ref_frame =
1168                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1169                                                                    : VP56_FRAME_GOLDEN;
1170         else
1171             mb->ref_frame = VP56_FRAME_PREVIOUS;
1172         s->ref_count[mb->ref_frame - 1]++;
1173
1174         // motion vectors, 16.3
1175         if (is_vp7)
1176             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1177         else
1178             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1179     } else {
1180         // intra MB, 16.1
1181         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1182
1183         if (mb->mode == MODE_I4x4)
1184             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1185
1186         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1187                                                 s->prob->pred8x8c);
1188         mb->ref_frame        = VP56_FRAME_CURRENT;
1189         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1190         AV_ZERO32(&mb->bmv[0]);
1191     }
1192 }
1193
1194 /**
1195  * @param r     arithmetic bitstream reader context
1196  * @param block destination for block coefficients
1197  * @param probs probabilities to use when reading trees from the bitstream
1198  * @param i     initial coeff index, 0 unless a separate DC block is coded
1199  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1200  *
1201  * @return 0 if no coeffs were decoded
1202  *         otherwise, the index of the last coeff decoded plus one
1203  */
1204 static av_always_inline
1205 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1206                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1207                                  int i, uint8_t *token_prob, int16_t qmul[2],
1208                                  const uint8_t scan[16], int vp7)
1209 {
1210     VP56RangeCoder c = *r;
1211     goto skip_eob;
1212     do {
1213         int coeff;
1214 restart:
1215         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1216             break;
1217
1218 skip_eob:
1219         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1220             if (++i == 16)
1221                 break; // invalid input; blocks should end with EOB
1222             token_prob = probs[i][0];
1223             if (vp7)
1224                 goto restart;
1225             goto skip_eob;
1226         }
1227
1228         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1229             coeff = 1;
1230             token_prob = probs[i + 1][1];
1231         } else {
1232             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1233                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1234                 if (coeff)
1235                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1236                 coeff += 2;
1237             } else {
1238                 // DCT_CAT*
1239                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1240                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1241                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1242                     } else {                                    // DCT_CAT2
1243                         coeff  = 7;
1244                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1245                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1246                     }
1247                 } else {    // DCT_CAT3 and up
1248                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1249                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1250                     int cat = (a << 1) + b;
1251                     coeff  = 3 + (8 << cat);
1252                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1253                 }
1254             }
1255             token_prob = probs[i + 1][2];
1256         }
1257         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1258     } while (++i < 16);
1259
1260     *r = c;
1261     return i;
1262 }
1263
/**
 * Apply and update the running inter DC prediction.
 *
 * pred[0] holds the last DC value and pred[1] a repeat counter. When the
 * counter exceeds 3 the stored DC is added to block[0]. Afterwards the new
 * DC is stored in both block[0] and pred[0]; the counter is reset when the
 * old or new DC is zero or their signs differ, and incremented when the new
 * DC equals the old one.
 *
 * @param block coefficient block whose element 0 is the DC value
 * @param pred  prediction state: { last DC, repeat counter }
 *
 * @return 1 if the prediction was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int use_pred = pred[1] > 3;
    int16_t dc         = block[0];

    if (use_pred)
        dc += pred[0];

    if (!pred[0] || !dc || ((int32_t)pred[0] ^ (int32_t)dc) < 0) {
        /* zero involved or sign change: restart the run */
        pred[1] = 0;
    } else if (pred[0] == dc) {
        pred[1]++;
    }
    block[0] = pred[0] = dc;

    return use_pred;
}
1286
1287 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1288                                             int16_t block[16],
1289                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1290                                             int i, uint8_t *token_prob,
1291                                             int16_t qmul[2],
1292                                             const uint8_t scan[16])
1293 {
1294     return decode_block_coeffs_internal(r, block, probs, i,
1295                                         token_prob, qmul, scan, IS_VP7);
1296 }
1297
1298 #ifndef vp8_decode_block_coeffs_internal
1299 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1300                                             int16_t block[16],
1301                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1302                                             int i, uint8_t *token_prob,
1303                                             int16_t qmul[2])
1304 {
1305     return decode_block_coeffs_internal(r, block, probs, i,
1306                                         token_prob, qmul, zigzag_scan, IS_VP8);
1307 }
1308 #endif
1309
1310 /**
1311  * @param c          arithmetic bitstream reader context
1312  * @param block      destination for block coefficients
1313  * @param probs      probabilities to use when reading trees from the bitstream
1314  * @param i          initial coeff index, 0 unless a separate DC block is coded
1315  * @param zero_nhood the initial prediction context for number of surrounding
1316  *                   all-zero blocks (only left/top, so 0-2)
1317  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1318  *
1319  * @return 0 if no coeffs were decoded
1320  *         otherwise, the index of the last coeff decoded plus one
1321  */
1322 static av_always_inline
1323 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1324                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1325                         int i, int zero_nhood, int16_t qmul[2],
1326                         const uint8_t scan[16], int vp7)
1327 {
1328     uint8_t *token_prob = probs[i][zero_nhood];
1329     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1330         return 0;
1331     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1332                                                   token_prob, qmul, scan)
1333                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1334                                                   token_prob, qmul);
1335 }
1336
1337 static av_always_inline
1338 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1339                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1340                       int is_vp7)
1341 {
1342     int i, x, y, luma_start = 0, luma_ctx = 3;
1343     int nnz_pred, nnz, nnz_total = 0;
1344     int segment = mb->segment;
1345     int block_dc = 0;
1346
1347     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1348         nnz_pred = t_nnz[8] + l_nnz[8];
1349
1350         // decode DC values and do hadamard
1351         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1352                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1353                                   zigzag_scan, is_vp7);
1354         l_nnz[8] = t_nnz[8] = !!nnz;
1355
1356         if (is_vp7 && mb->mode > MODE_I4x4) {
1357             nnz |=  inter_predict_dc(td->block_dc,
1358                                      s->inter_dc_pred[mb->ref_frame - 1]);
1359         }
1360
1361         if (nnz) {
1362             nnz_total += nnz;
1363             block_dc   = 1;
1364             if (nnz == 1)
1365                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1366             else
1367                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1368         }
1369         luma_start = 1;
1370         luma_ctx   = 0;
1371     }
1372
1373     // luma blocks
1374     for (y = 0; y < 4; y++)
1375         for (x = 0; x < 4; x++) {
1376             nnz_pred = l_nnz[y] + t_nnz[x];
1377             nnz = decode_block_coeffs(c, td->block[y][x],
1378                                       s->prob->token[luma_ctx],
1379                                       luma_start, nnz_pred,
1380                                       s->qmat[segment].luma_qmul,
1381                                       s->prob[0].scan, is_vp7);
1382             /* nnz+block_dc may be one more than the actual last index,
1383              * but we don't care */
1384             td->non_zero_count_cache[y][x] = nnz + block_dc;
1385             t_nnz[x] = l_nnz[y] = !!nnz;
1386             nnz_total += nnz;
1387         }
1388
1389     // chroma blocks
1390     // TODO: what to do about dimensions? 2nd dim for luma is x,
1391     // but for chroma it's (y<<1)|x
1392     for (i = 4; i < 6; i++)
1393         for (y = 0; y < 2; y++)
1394             for (x = 0; x < 2; x++) {
1395                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1396                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1397                                           s->prob->token[2], 0, nnz_pred,
1398                                           s->qmat[segment].chroma_qmul,
1399                                           s->prob[0].scan, is_vp7);
1400                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1401                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1402                 nnz_total += nnz;
1403             }
1404
1405     // if there were no coded coeffs despite the macroblock not being marked skip,
1406     // we MUST not do the inner loop filter and should not do IDCT
1407     // Since skip isn't used for bitstream prediction, just manually set it.
1408     if (!nnz_total)
1409         mb->skip = 1;
1410 }
1411
1412 static av_always_inline
1413 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1414                       uint8_t *src_cb, uint8_t *src_cr,
1415                       int linesize, int uvlinesize, int simple)
1416 {
1417     AV_COPY128(top_border, src_y + 15 * linesize);
1418     if (!simple) {
1419         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1420         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1421     }
1422 }
1423
1424 static av_always_inline
1425 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1426                     uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
1427                     int mb_y, int mb_width, int simple, int xchg)
1428 {
1429     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1430     src_y  -= linesize;
1431     src_cb -= uvlinesize;
1432     src_cr -= uvlinesize;
1433
1434 #define XCHG(a, b, xchg)                                                      \
1435     do {                                                                      \
1436         if (xchg)                                                             \
1437             AV_SWAP64(b, a);                                                  \
1438         else                                                                  \
1439             AV_COPY64(b, a);                                                  \
1440     } while (0)
1441
1442     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1443     XCHG(top_border, src_y, xchg);
1444     XCHG(top_border + 8, src_y + 8, 1);
1445     if (mb_x < mb_width - 1)
1446         XCHG(top_border + 32, src_y + 16, 1);
1447
1448     // only copy chroma for normal loop filter
1449     // or to initialize the top row to 127
1450     if (!simple || !mb_y) {
1451         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1452         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1453         XCHG(top_border + 16, src_cb, 1);
1454         XCHG(top_border + 24, src_cr, 1);
1455     }
1456 }
1457
1458 static av_always_inline
1459 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1460 {
1461     if (!mb_x)
1462         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1463     else
1464         return mb_y ? mode : LEFT_DC_PRED8x8;
1465 }
1466
1467 static av_always_inline
1468 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1469 {
1470     if (!mb_x)
1471         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1472     else
1473         return mb_y ? mode : HOR_PRED8x8;
1474 }
1475
1476 static av_always_inline
1477 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1478 {
1479     switch (mode) {
1480     case DC_PRED8x8:
1481         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1482     case VERT_PRED8x8:
1483         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1484     case HOR_PRED8x8:
1485         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1486     case PLANE_PRED8x8: /* TM */
1487         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1488     }
1489     return mode;
1490 }
1491
1492 static av_always_inline
1493 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1494 {
1495     if (!mb_x) {
1496         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1497     } else {
1498         return mb_y ? mode : HOR_VP8_PRED;
1499     }
1500 }
1501
1502 static av_always_inline
1503 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1504                                      int *copy_buf, int vp7)
1505 {
1506     switch (mode) {
1507     case VERT_PRED:
1508         if (!mb_x && mb_y) {
1509             *copy_buf = 1;
1510             return mode;
1511         }
1512         /* fall-through */
1513     case DIAG_DOWN_LEFT_PRED:
1514     case VERT_LEFT_PRED:
1515         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1516     case HOR_PRED:
1517         if (!mb_y) {
1518             *copy_buf = 1;
1519             return mode;
1520         }
1521         /* fall-through */
1522     case HOR_UP_PRED:
1523         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1524     case TM_VP8_PRED:
1525         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1526     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1527                    * as 16x16/8x8 DC */
1528     case DIAG_DOWN_RIGHT_PRED:
1529     case VERT_RIGHT_PRED:
1530     case HOR_DOWN_PRED:
1531         if (!mb_y || !mb_x)
1532             *copy_buf = 1;
1533         return mode;
1534     }
1535     return mode;
1536 }
1537
1538 static av_always_inline
1539 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1540                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1541 {
1542     int x, y, mode, nnz;
1543     uint32_t tr;
1544
1545     /* for the first row, we need to run xchg_mb_border to init the top edge
1546      * to 127 otherwise, skip it if we aren't going to deblock */
1547     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1548         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1549                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1550                        s->filter.simple, 1);
1551
1552     if (mb->mode < MODE_I4x4) {
1553         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1554         s->hpc.pred16x16[mode](dst[0], s->linesize);
1555     } else {
1556         uint8_t *ptr = dst[0];
1557         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1558         const uint8_t lo = is_vp7 ? 128 : 127;
1559         const uint8_t hi = is_vp7 ? 128 : 129;
1560         uint8_t tr_top[4] = { lo, lo, lo, lo };
1561
1562         // all blocks on the right edge of the macroblock use bottom edge
1563         // the top macroblock for their topright edge
1564         uint8_t *tr_right = ptr - s->linesize + 16;
1565
1566         // if we're on the right edge of the frame, said edge is extended
1567         // from the top macroblock
1568         if (mb_y && mb_x == s->mb_width - 1) {
1569             tr       = tr_right[-1] * 0x01010101u;
1570             tr_right = (uint8_t *) &tr;
1571         }
1572
1573         if (mb->skip)
1574             AV_ZERO128(td->non_zero_count_cache);
1575
1576         for (y = 0; y < 4; y++) {
1577             uint8_t *topright = ptr + 4 - s->linesize;
1578             for (x = 0; x < 4; x++) {
1579                 int copy = 0, linesize = s->linesize;
1580                 uint8_t *dst = ptr + 4 * x;
1581                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1582
1583                 if ((y == 0 || x == 3) && mb_y == 0) {
1584                     topright = tr_top;
1585                 } else if (x == 3)
1586                     topright = tr_right;
1587
1588                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1589                                                         mb_y + y, &copy, is_vp7);
1590                 if (copy) {
1591                     dst      = copy_dst + 12;
1592                     linesize = 8;
1593                     if (!(mb_y + y)) {
1594                         copy_dst[3] = lo;
1595                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1596                     } else {
1597                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1598                         if (!(mb_x + x)) {
1599                             copy_dst[3] = hi;
1600                         } else {
1601                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1602                         }
1603                     }
1604                     if (!(mb_x + x)) {
1605                         copy_dst[11] =
1606                         copy_dst[19] =
1607                         copy_dst[27] =
1608                         copy_dst[35] = hi;
1609                     } else {
1610                         copy_dst[11] = ptr[4 * x                   - 1];
1611                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1612                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1613                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1614                     }
1615                 }
1616                 s->hpc.pred4x4[mode](dst, topright, linesize);
1617                 if (copy) {
1618                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1619                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1620                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1621                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1622                 }
1623
1624                 nnz = td->non_zero_count_cache[y][x];
1625                 if (nnz) {
1626                     if (nnz == 1)
1627                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1628                                                   td->block[y][x], s->linesize);
1629                     else
1630                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1631                                                td->block[y][x], s->linesize);
1632                 }
1633                 topright += 4;
1634             }
1635
1636             ptr      += 4 * s->linesize;
1637             intra4x4 += 4;
1638         }
1639     }
1640
1641     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1642                                             mb_x, mb_y, is_vp7);
1643     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1644     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1645
1646     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1647         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1648                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1649                        s->filter.simple, 0);
1650 }
1651
/**
 * Lookup tables for the motion compensation routines, indexed in the second
 * dimension by the 3-bit fractional position (the mx / my values).
 */
static const uint8_t subpel_idx[3][8] = {
    /* nr. of left extra pixels, also function pointer index */
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* nr. of extra pixels required */
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* nr. of right extra pixels */
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1658
1659 /**
1660  * luma MC function
1661  *
1662  * @param s        VP8 decoding context
1663  * @param dst      target buffer for block data at block position
1664  * @param ref      reference picture buffer at origin (0, 0)
1665  * @param mv       motion vector (relative to block position) to get pixel data from
1666  * @param x_off    horizontal position of block from origin (0, 0)
1667  * @param y_off    vertical position of block from origin (0, 0)
1668  * @param block_w  width of block (16, 8 or 4)
1669  * @param block_h  height of block (always same as block_w)
1670  * @param width    width of src/dst plane data
1671  * @param height   height of src/dst plane data
1672  * @param linesize size of a single line of plane data, including padding
1673  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1674  */
1675 static av_always_inline
1676 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1677                  ThreadFrame *ref, const VP56mv *mv,
1678                  int x_off, int y_off, int block_w, int block_h,
1679                  int width, int height, ptrdiff_t linesize,
1680                  vp8_mc_func mc_func[3][3])
1681 {
1682     uint8_t *src = ref->f->data[0];
1683
1684     if (AV_RN32A(mv)) {
1685         int src_linesize = linesize;
1686
1687         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1688         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1689
1690         x_off += mv->x >> 2;
1691         y_off += mv->y >> 2;
1692
1693         // edge emulation
1694         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1695         src += y_off * linesize + x_off;
1696         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1697             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1698             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1699                                      src - my_idx * linesize - mx_idx,
1700                                      EDGE_EMU_LINESIZE, linesize,
1701                                      block_w + subpel_idx[1][mx],
1702                                      block_h + subpel_idx[1][my],
1703                                      x_off - mx_idx, y_off - my_idx,
1704                                      width, height);
1705             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1706             src_linesize = EDGE_EMU_LINESIZE;
1707         }
1708         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1709     } else {
1710         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1711         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1712                       linesize, block_h, 0, 0);
1713     }
1714 }
1715
1716 /**
1717  * chroma MC function
1718  *
1719  * @param s        VP8 decoding context
1720  * @param dst1     target buffer for block data at block position (U plane)
1721  * @param dst2     target buffer for block data at block position (V plane)
1722  * @param ref      reference picture buffer at origin (0, 0)
1723  * @param mv       motion vector (relative to block position) to get pixel data from
1724  * @param x_off    horizontal position of block from origin (0, 0)
1725  * @param y_off    vertical position of block from origin (0, 0)
1726  * @param block_w  width of block (16, 8 or 4)
1727  * @param block_h  height of block (always same as block_w)
1728  * @param width    width of src/dst plane data
1729  * @param height   height of src/dst plane data
1730  * @param linesize size of a single line of plane data, including padding
1731  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1732  */
1733 static av_always_inline
1734 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1735                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1736                    int x_off, int y_off, int block_w, int block_h,
1737                    int width, int height, ptrdiff_t linesize,
1738                    vp8_mc_func mc_func[3][3])
1739 {
1740     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1741
1742     if (AV_RN32A(mv)) {
1743         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1744         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1745
1746         x_off += mv->x >> 3;
1747         y_off += mv->y >> 3;
1748
1749         // edge emulation
1750         src1 += y_off * linesize + x_off;
1751         src2 += y_off * linesize + x_off;
1752         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1753         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1754             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1755             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1756                                      src1 - my_idx * linesize - mx_idx,
1757                                      EDGE_EMU_LINESIZE, linesize,
1758                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1759                                      x_off - mx_idx, y_off - my_idx, width, height);
1760             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1761             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1762
1763             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1764                                      src2 - my_idx * linesize - mx_idx,
1765                                      EDGE_EMU_LINESIZE, linesize,
1766                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1767                                      x_off - mx_idx, y_off - my_idx, width, height);
1768             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1769             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1770         } else {
1771             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1772             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1773         }
1774     } else {
1775         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1776         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1777         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1778     }
1779 }
1780
1781 static av_always_inline
1782 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1783                  ThreadFrame *ref_frame, int x_off, int y_off,
1784                  int bx_off, int by_off, int block_w, int block_h,
1785                  int width, int height, VP56mv *mv)
1786 {
1787     VP56mv uvmv = *mv;
1788
1789     /* Y */
1790     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1791                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1792                 block_w, block_h, width, height, s->linesize,
1793                 s->put_pixels_tab[block_w == 8]);
1794
1795     /* U/V */
1796     if (s->profile == 3) {
1797         /* this block only applies VP8; it is safe to check
1798          * only the profile, as VP7 profile <= 1 */
1799         uvmv.x &= ~7;
1800         uvmv.y &= ~7;
1801     }
1802     x_off   >>= 1;
1803     y_off   >>= 1;
1804     bx_off  >>= 1;
1805     by_off  >>= 1;
1806     width   >>= 1;
1807     height  >>= 1;
1808     block_w >>= 1;
1809     block_h >>= 1;
1810     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1811                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1812                   &uvmv, x_off + bx_off, y_off + by_off,
1813                   block_w, block_h, width, height, s->uvlinesize,
1814                   s->put_pixels_tab[1 + (block_w == 4)]);
1815 }
1816
1817 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1818  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1819 static av_always_inline
1820 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1821                      int mb_xy, int ref)
1822 {
1823     /* Don't prefetch refs that haven't been used very often this frame. */
1824     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1825         int x_off = mb_x << 4, y_off = mb_y << 4;
1826         int mx = (mb->mv.x >> 2) + x_off + 8;
1827         int my = (mb->mv.y >> 2) + y_off;
1828         uint8_t **src = s->framep[ref]->tf.f->data;
1829         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1830         /* For threading, a ff_thread_await_progress here might be useful, but
1831          * it actually slows down the decoder. Since a bad prefetch doesn't
1832          * generate bad decoder output, we don't run it here. */
1833         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1834         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1835         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1836     }
1837 }
1838
1839 /**
1840  * Apply motion vectors to prediction buffer, chapter 18.
1841  */
1842 static av_always_inline
1843 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1844                    VP8Macroblock *mb, int mb_x, int mb_y)
1845 {
1846     int x_off = mb_x << 4, y_off = mb_y << 4;
1847     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1848     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1849     VP56mv *bmv = mb->bmv;
1850
1851     switch (mb->partitioning) {
1852     case VP8_SPLITMVMODE_NONE:
1853         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1854                     0, 0, 16, 16, width, height, &mb->mv);
1855         break;
1856     case VP8_SPLITMVMODE_4x4: {
1857         int x, y;
1858         VP56mv uvmv;
1859
1860         /* Y */
1861         for (y = 0; y < 4; y++) {
1862             for (x = 0; x < 4; x++) {
1863                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1864                             ref, &bmv[4 * y + x],
1865                             4 * x + x_off, 4 * y + y_off, 4, 4,
1866                             width, height, s->linesize,
1867                             s->put_pixels_tab[2]);
1868             }
1869         }
1870
1871         /* U/V */
1872         x_off  >>= 1;
1873         y_off  >>= 1;
1874         width  >>= 1;
1875         height >>= 1;
1876         for (y = 0; y < 2; y++) {
1877             for (x = 0; x < 2; x++) {
1878                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1879                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1880                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1881                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1882                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1883                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1884                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1885                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1886                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1887                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1888                 if (s->profile == 3) {
1889                     uvmv.x &= ~7;
1890                     uvmv.y &= ~7;
1891                 }
1892                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1893                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1894                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1895                               width, height, s->uvlinesize,
1896                               s->put_pixels_tab[2]);
1897             }
1898         }
1899         break;
1900     }
1901     case VP8_SPLITMVMODE_16x8:
1902         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1903                     0, 0, 16, 8, width, height, &bmv[0]);
1904         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1905                     0, 8, 16, 8, width, height, &bmv[1]);
1906         break;
1907     case VP8_SPLITMVMODE_8x16:
1908         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1909                     0, 0, 8, 16, width, height, &bmv[0]);
1910         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1911                     8, 0, 8, 16, width, height, &bmv[1]);
1912         break;
1913     case VP8_SPLITMVMODE_8x8:
1914         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1915                     0, 0, 8, 8, width, height, &bmv[0]);
1916         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1917                     8, 0, 8, 8, width, height, &bmv[1]);
1918         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1919                     0, 8, 8, 8, width, height, &bmv[2]);
1920         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1921                     8, 8, 8, 8, width, height, &bmv[3]);
1922         break;
1923     }
1924 }
1925
1926 static av_always_inline
1927 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1928 {
1929     int x, y, ch;
1930
1931     if (mb->mode != MODE_I4x4) {
1932         uint8_t *y_dst = dst[0];
1933         for (y = 0; y < 4; y++) {
1934             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1935             if (nnz4) {
1936                 if (nnz4 & ~0x01010101) {
1937                     for (x = 0; x < 4; x++) {
1938                         if ((uint8_t) nnz4 == 1)
1939                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1940                                                       td->block[y][x],
1941                                                       s->linesize);
1942                         else if ((uint8_t) nnz4 > 1)
1943                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1944                                                    td->block[y][x],
1945                                                    s->linesize);
1946                         nnz4 >>= 8;
1947                         if (!nnz4)
1948                             break;
1949                     }
1950                 } else {
1951                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1952                 }
1953             }
1954             y_dst += 4 * s->linesize;
1955         }
1956     }
1957
1958     for (ch = 0; ch < 2; ch++) {
1959         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1960         if (nnz4) {
1961             uint8_t *ch_dst = dst[1 + ch];
1962             if (nnz4 & ~0x01010101) {
1963                 for (y = 0; y < 2; y++) {
1964                     for (x = 0; x < 2; x++) {
1965                         if ((uint8_t) nnz4 == 1)
1966                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
1967                                                       td->block[4 + ch][(y << 1) + x],
1968                                                       s->uvlinesize);
1969                         else if ((uint8_t) nnz4 > 1)
1970                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
1971                                                    td->block[4 + ch][(y << 1) + x],
1972                                                    s->uvlinesize);
1973                         nnz4 >>= 8;
1974                         if (!nnz4)
1975                             goto chroma_idct_end;
1976                     }
1977                     ch_dst += 4 * s->uvlinesize;
1978                 }
1979             } else {
1980                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
1981             }
1982         }
1983 chroma_idct_end:
1984         ;
1985     }
1986 }
1987
1988 static av_always_inline
1989 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
1990                          VP8FilterStrength *f, int is_vp7)
1991 {
1992     int interior_limit, filter_level;
1993
1994     if (s->segmentation.enabled) {
1995         filter_level = s->segmentation.filter_level[mb->segment];
1996         if (!s->segmentation.absolute_vals)
1997             filter_level += s->filter.level;
1998     } else
1999         filter_level = s->filter.level;
2000
2001     if (s->lf_delta.enabled) {
2002         filter_level += s->lf_delta.ref[mb->ref_frame];
2003         filter_level += s->lf_delta.mode[mb->mode];
2004     }
2005
2006     filter_level = av_clip_uintp2(filter_level, 6);
2007
2008     interior_limit = filter_level;
2009     if (s->filter.sharpness) {
2010         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2011         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2012     }
2013     interior_limit = FFMAX(interior_limit, 1);
2014
2015     f->filter_level = filter_level;
2016     f->inner_limit = interior_limit;
2017     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2018                       mb->mode == VP8_MVMODE_SPLIT;
2019 }
2020
2021 static av_always_inline
2022 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2023                int mb_x, int mb_y, int is_vp7)
2024 {
2025     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2026     int filter_level = f->filter_level;
2027     int inner_limit = f->inner_limit;
2028     int inner_filter = f->inner_filter;
2029     int linesize = s->linesize;
2030     int uvlinesize = s->uvlinesize;
2031     static const uint8_t hev_thresh_lut[2][64] = {
2032         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2033           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2034           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2035           3, 3, 3, 3 },
2036         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2037           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2038           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2039           2, 2, 2, 2 }
2040     };
2041
2042     if (!filter_level)
2043         return;
2044
2045     if (is_vp7) {
2046         bedge_lim_y  = filter_level;
2047         bedge_lim_uv = filter_level * 2;
2048         mbedge_lim   = filter_level + 2;
2049     } else {
2050         bedge_lim_y  =
2051         bedge_lim_uv = filter_level * 2 + inner_limit;
2052         mbedge_lim   = bedge_lim_y + 4;
2053     }
2054
2055     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2056
2057     if (mb_x) {
2058         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2059                                        mbedge_lim, inner_limit, hev_thresh);
2060         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2061                                        mbedge_lim, inner_limit, hev_thresh);
2062     }
2063
2064 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2065     if (cond && inner_filter) {                                               \
2066         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2067                                              bedge_lim_y, inner_limit,        \
2068                                              hev_thresh);                     \
2069         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2070                                              bedge_lim_y, inner_limit,        \
2071                                              hev_thresh);                     \
2072         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2073                                              bedge_lim_y, inner_limit,        \
2074                                              hev_thresh);                     \
2075         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2076                                              uvlinesize,  bedge_lim_uv,       \
2077                                              inner_limit, hev_thresh);        \
2078     }
2079
2080     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2081
2082     if (mb_y) {
2083         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2084                                        mbedge_lim, inner_limit, hev_thresh);
2085         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2086                                        mbedge_lim, inner_limit, hev_thresh);
2087     }
2088
2089     if (inner_filter) {
2090         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2091                                              linesize, bedge_lim_y,
2092                                              inner_limit, hev_thresh);
2093         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2094                                              linesize, bedge_lim_y,
2095                                              inner_limit, hev_thresh);
2096         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2097                                              linesize, bedge_lim_y,
2098                                              inner_limit, hev_thresh);
2099         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2100                                              dst[2] +  4 * uvlinesize,
2101                                              uvlinesize, bedge_lim_uv,
2102                                              inner_limit, hev_thresh);
2103     }
2104
2105     H_LOOP_FILTER_16Y_INNER(is_vp7)
2106 }
2107
2108 static av_always_inline
2109 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2110                       int mb_x, int mb_y)
2111 {
2112     int mbedge_lim, bedge_lim;
2113     int filter_level = f->filter_level;
2114     int inner_limit  = f->inner_limit;
2115     int inner_filter = f->inner_filter;
2116     int linesize     = s->linesize;
2117
2118     if (!filter_level)
2119         return;
2120
2121     bedge_lim  = 2 * filter_level + inner_limit;
2122     mbedge_lim = bedge_lim + 4;
2123
2124     if (mb_x)
2125         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2126     if (inner_filter) {
2127         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2128         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2129         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2130     }
2131
2132     if (mb_y)
2133         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2134     if (inner_filter) {
2135         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2136         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2137         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2138     }
2139 }
2140
2141 #define MARGIN (16 << 2)
2142 static av_always_inline
2143 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2144                                     VP8Frame *prev_frame, int is_vp7)
2145 {
2146     VP8Context *s = avctx->priv_data;
2147     int mb_x, mb_y;
2148
2149     s->mv_min.y = -MARGIN;
2150     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2151     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2152         VP8Macroblock *mb = s->macroblocks_base +
2153                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2154         int mb_xy = mb_y * s->mb_width;
2155
2156         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2157
2158         s->mv_min.x = -MARGIN;
2159         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2160         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2161             if (mb_y == 0)
2162                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2163                          DC_PRED * 0x01010101);
2164             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2165                            prev_frame && prev_frame->seg_map ?
2166                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2167             s->mv_min.x -= 64;
2168             s->mv_max.x -= 64;
2169         }
2170         s->mv_min.y -= 64;
2171         s->mv_max.y -= 64;
2172     }
2173 }
2174
2175 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2176                                    VP8Frame *prev_frame)
2177 {
2178     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2179 }
2180
2181 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2182                                    VP8Frame *prev_frame)
2183 {
2184     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2185 }
2186
#if HAVE_THREADS
/*
 * Positions are packed into one int: row in the upper bits, column in the
 * low 16 bits. The row is scaled by multiplication rather than "<< 16"
 * because callers may pass a negative row (e.g. mb_y - 2 with mb_y == 1),
 * and left-shifting a negative value is undefined behaviour. A negative
 * packed position compares below any non-negative thread_mb_pos, so no
 * wait happens in that case.
 *
 * check_thread_pos(td, otd, mb_x_check, mb_y_check):
 *   If otd->thread_mb_pos is below the requested position, publish the
 *   position in td->wait_mb_pos and wait on otd->cond (holding otd->lock)
 *   until it has been reached, then reset td->wait_mb_pos to INT_MAX.
 *
 * update_pos(td, mb_y, mb_x):
 *   Store the new position in td->thread_mb_pos and, when slice threading
 *   with more than one job is active and a neighbouring thread may be
 *   waiting for it, broadcast td->cond. Uses avctx, num_jobs, prev_td and
 *   next_td from the calling scope.
 *
 * NOTE(review): both macros keep the historical trailing ';' after
 * "while (0)" because not every call site is visible from here; drop it
 * once all callers are confirmed to supply their own semicolon.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) * (1 << 16)) | ((mb_x_check) & 0xFFFF);       \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0);

#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) * (1 << 16)) | ((mb_x) & 0xFFFF);      \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0);
#else
/* Without thread support both macros expand to nothing. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
2226
2227 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2228                                         int jobnr, int threadnr, int is_vp7)
2229 {
2230     VP8Context *s = avctx->priv_data;
2231     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2232     int mb_y = td->thread_mb_pos >> 16;
2233     int mb_x, mb_xy = mb_y * s->mb_width;
2234     int num_jobs = s->num_jobs;
2235     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2236     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2237     VP8Macroblock *mb;
2238     uint8_t *dst[3] = {
2239         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2240         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2241         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2242     };
2243     if (mb_y == 0)
2244         prev_td = td;
2245     else
2246         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2247     if (mb_y == s->mb_height - 1)
2248         next_td = td;
2249     else
2250         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2251     if (s->mb_layout == 1)
2252         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2253     else {
2254         // Make sure the previous frame has read its segmentation map,
2255         // if we re-use the same map.
2256         if (prev_frame && s->segmentation.enabled &&
2257             !s->segmentation.update_map)
2258             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2259         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2260         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2261         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2262     }
2263
2264     if (!is_vp7 || mb_y == 0)
2265         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2266
2267     s->mv_min.x = -MARGIN;
2268     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2269
2270     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2271         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2272         if (prev_td != td) {
2273             if (threadnr != 0) {
2274                 check_thread_pos(td, prev_td,
2275                                  mb_x + (is_vp7 ? 2 : 1),
2276                                  mb_y - (is_vp7 ? 2 : 1));
2277             } else {
2278                 check_thread_pos(td, prev_td,
2279                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2280                                  mb_y - (is_vp7 ? 2 : 1));
2281             }
2282         }
2283
2284         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2285                          s->linesize, 4);
2286         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2287                          dst[2] - dst[1], 2);
2288
2289         if (!s->mb_layout)
2290             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2291                            prev_frame && prev_frame->seg_map ?
2292                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2293
2294         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2295
2296         if (!mb->skip)
2297             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2298
2299         if (mb->mode <= MODE_I4x4)
2300             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2301         else
2302             inter_predict(s, td, dst, mb, mb_x, mb_y);
2303
2304         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2305
2306         if (!mb->skip) {
2307             idct_mb(s, td, dst, mb);
2308         } else {
2309             AV_ZERO64(td->left_nnz);
2310             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2311
2312             /* Reset DC block predictors if they would exist
2313              * if the mb had coefficients */
2314             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2315                 td->left_nnz[8]     = 0;
2316                 s->top_nnz[mb_x][8] = 0;
2317             }
2318         }
2319
2320         if (s->deblock_filter)
2321             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2322
2323         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2324             if (s->filter.simple)
2325                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2326                                  NULL, NULL, s->linesize, 0, 1);
2327             else
2328                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2329                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2330         }
2331
2332         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2333
2334         dst[0]      += 16;
2335         dst[1]      += 8;
2336         dst[2]      += 8;
2337         s->mv_min.x -= 64;
2338         s->mv_max.x -= 64;
2339
2340         if (mb_x == s->mb_width + 1) {
2341             update_pos(td, mb_y, s->mb_width + 3);
2342         } else {
2343             update_pos(td, mb_y, mb_x);
2344         }
2345     }
2346 }
2347
2348 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2349                               int jobnr, int threadnr, int is_vp7)
2350 {
2351     VP8Context *s = avctx->priv_data;
2352     VP8ThreadData *td = &s->thread_data[threadnr];
2353     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2354     AVFrame *curframe = s->curframe->tf.f;
2355     VP8Macroblock *mb;
2356     VP8ThreadData *prev_td, *next_td;
2357     uint8_t *dst[3] = {
2358         curframe->data[0] + 16 * mb_y * s->linesize,
2359         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2360         curframe->data[2] +  8 * mb_y * s->uvlinesize
2361     };
2362
2363     if (s->mb_layout == 1)
2364         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2365     else
2366         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2367
2368     if (mb_y == 0)
2369         prev_td = td;
2370     else
2371         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2372     if (mb_y == s->mb_height - 1)
2373         next_td = td;
2374     else
2375         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2376
2377     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2378         VP8FilterStrength *f = &td->filter_strength[mb_x];
2379         if (prev_td != td)
2380             check_thread_pos(td, prev_td,
2381                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2382         if (next_td != td)
2383             if (next_td != &s->thread_data[0])
2384                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2385
2386         if (num_jobs == 1) {
2387             if (s->filter.simple)
2388                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2389                                  NULL, NULL, s->linesize, 0, 1);
2390             else
2391                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2392                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2393         }
2394
2395         if (s->filter.simple)
2396             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2397         else
2398             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2399         dst[0] += 16;
2400         dst[1] += 8;
2401         dst[2] += 8;
2402
2403         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2404     }
2405 }
2406
2407 static av_always_inline
2408 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2409                               int threadnr, int is_vp7)
2410 {
2411     VP8Context *s = avctx->priv_data;
2412     VP8ThreadData *td = &s->thread_data[jobnr];
2413     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2414     VP8Frame *curframe = s->curframe;
2415     int mb_y, num_jobs = s->num_jobs;
2416
2417     td->thread_nr = threadnr;
2418     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2419         if (mb_y >= s->mb_height)
2420             break;
2421         td->thread_mb_pos = mb_y << 16;
2422         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
2423         if (s->deblock_filter)
2424             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
2425         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2426
2427         s->mv_min.y -= 64;
2428         s->mv_max.y -= 64;
2429
2430         if (avctx->active_thread_type == FF_THREAD_FRAME)
2431             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2432     }
2433
2434     return 0;
2435 }
2436
2437 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2438                                     int jobnr, int threadnr)
2439 {
2440     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2441 }
2442
2443 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2444                                     int jobnr, int threadnr)
2445 {
2446     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2447 }
2448
2449
2450 static av_always_inline
2451 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2452                       AVPacket *avpkt, int is_vp7)
2453 {
2454     VP8Context *s = avctx->priv_data;
2455     int ret, i, referenced, num_jobs;
2456     enum AVDiscard skip_thresh;
2457     VP8Frame *av_uninit(curframe), *prev_frame;
2458
2459     if (is_vp7)
2460         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2461     else
2462         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2463
2464     if (ret < 0)
2465         goto err;
2466
2467     prev_frame = s->framep[VP56_FRAME_CURRENT];
2468
2469     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2470                  s->update_altref == VP56_FRAME_CURRENT;
2471
2472     skip_thresh = !referenced ? AVDISCARD_NONREF
2473                               : !s->keyframe ? AVDISCARD_NONKEY
2474                                              : AVDISCARD_ALL;
2475
2476     if (avctx->skip_frame >= skip_thresh) {
2477         s->invisible = 1;
2478         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2479         goto skip_decode;
2480     }
2481     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2482
2483     // release no longer referenced frames
2484     for (i = 0; i < 5; i++)
2485         if (s->frames[i].tf.f->data[0] &&
2486             &s->frames[i] != prev_frame &&
2487             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2488             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2489             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2490             vp8_release_frame(s, &s->frames[i]);
2491
2492     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2493
2494     if (!s->colorspace)
2495         avctx->colorspace = AVCOL_SPC_BT470BG;
2496     if (s->fullrange)
2497         avctx->color_range = AVCOL_RANGE_JPEG;
2498     else
2499         avctx->color_range = AVCOL_RANGE_MPEG;
2500
2501     /* Given that arithmetic probabilities are updated every frame, it's quite
2502      * likely that the values we have on a random interframe are complete
2503      * junk if we didn't start decode on a keyframe. So just don't display
2504      * anything rather than junk. */
2505     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2506                          !s->framep[VP56_FRAME_GOLDEN]   ||
2507                          !s->framep[VP56_FRAME_GOLDEN2])) {
2508         av_log(avctx, AV_LOG_WARNING,
2509                "Discarding interframe without a prior keyframe!\n");
2510         ret = AVERROR_INVALIDDATA;
2511         goto err;
2512     }
2513
2514     curframe->tf.f->key_frame = s->keyframe;
2515     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2516                                             : AV_PICTURE_TYPE_P;
2517     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
2518         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
2519         goto err;
2520     }
2521
2522     // check if golden and altref are swapped
2523     if (s->update_altref != VP56_FRAME_NONE)
2524         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2525     else
2526         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2527
2528     if (s->update_golden != VP56_FRAME_NONE)
2529         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2530     else
2531         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2532
2533     if (s->update_last)
2534         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2535     else
2536         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2537
2538     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2539
2540     ff_thread_finish_setup(avctx);
2541
2542     s->linesize   = curframe->tf.f->linesize[0];
2543     s->uvlinesize = curframe->tf.f->linesize[1];
2544
2545     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2546     /* Zero macroblock structures for top/top-left prediction
2547      * from outside the frame. */
2548     if (!s->mb_layout)
2549         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2550                (s->mb_width + 1) * sizeof(*s->macroblocks));
2551     if (!s->mb_layout && s->keyframe)
2552         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2553
2554     memset(s->ref_count, 0, sizeof(s->ref_count));
2555
2556     if (s->mb_layout == 1) {
2557         // Make sure the previous frame has read its segmentation map,
2558         // if we re-use the same map.
2559         if (prev_frame && s->segmentation.enabled &&
2560             !s->segmentation.update_map)
2561             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2562         if (is_vp7)
2563             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2564         else
2565             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2566     }
2567
2568     if (avctx->active_thread_type == FF_THREAD_FRAME)
2569         num_jobs = 1;
2570     else
2571         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2572     s->num_jobs   = num_jobs;
2573     s->curframe   = curframe;
2574     s->prev_frame = prev_frame;
2575     s->mv_min.y   = -MARGIN;
2576     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2577     for (i = 0; i < MAX_THREADS; i++) {
2578         s->thread_data[i].thread_mb_pos = 0;
2579         s->thread_data[i].wait_mb_pos   = INT_MAX;
2580     }
2581     if (is_vp7)
2582         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2583                         num_jobs);
2584     else
2585         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2586                         num_jobs);
2587
2588     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2589     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2590
2591 skip_decode:
2592     // if future frames don't use the updated probabilities,
2593     // reset them to the values we saved
2594     if (!s->update_probabilities)
2595         s->prob[0] = s->prob[1];
2596
2597     if (!s->invisible) {
2598         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2599             return ret;
2600         *got_frame = 1;
2601     }
2602
2603     return avpkt->size;
2604 err:
2605     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2606     return ret;
2607 }
2608
2609 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2610                         AVPacket *avpkt)
2611 {
2612     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2613 }
2614
2615 #if CONFIG_VP7_DECODER
2616 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2617                             AVPacket *avpkt)
2618 {
2619     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2620 }
2621 #endif /* CONFIG_VP7_DECODER */
2622
2623 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2624 {
2625     VP8Context *s = avctx->priv_data;
2626     int i;
2627
2628     vp8_decode_flush_impl(avctx, 1);
2629     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2630         av_frame_free(&s->frames[i].tf.f);
2631
2632     return 0;
2633 }
2634
2635 static av_cold int vp8_init_frames(VP8Context *s)
2636 {
2637     int i;
2638     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2639         s->frames[i].tf.f = av_frame_alloc();
2640         if (!s->frames[i].tf.f)
2641             return AVERROR(ENOMEM);
2642     }
2643     return 0;
2644 }
2645
2646 static av_always_inline
2647 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2648 {
2649     VP8Context *s = avctx->priv_data;
2650     int ret;
2651
2652     s->avctx = avctx;
2653     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2654     avctx->internal->allocate_progress = 1;
2655
2656     ff_videodsp_init(&s->vdsp, 8);
2657
2658     ff_vp78dsp_init(&s->vp8dsp);
2659     if (CONFIG_VP7_DECODER && is_vp7) {
2660         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2661         ff_vp7dsp_init(&s->vp8dsp);
2662     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2663         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2664         ff_vp8dsp_init(&s->vp8dsp);
2665     }
2666
2667     /* does not change for VP8 */
2668     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2669
2670     if ((ret = vp8_init_frames(s)) < 0) {
2671         ff_vp8_decode_free(avctx);
2672         return ret;
2673     }
2674
2675     return 0;
2676 }
2677
2678 #if CONFIG_VP7_DECODER
2679 static int vp7_decode_init(AVCodecContext *avctx)
2680 {
2681     return vp78_decode_init(avctx, IS_VP7);
2682 }
2683 #endif /* CONFIG_VP7_DECODER */
2684
2685 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2686 {
2687     return vp78_decode_init(avctx, IS_VP8);
2688 }
2689
2690 #if CONFIG_VP8_DECODER
2691 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2692 {
2693     VP8Context *s = avctx->priv_data;
2694     int ret;
2695
2696     s->avctx = avctx;
2697
2698     if ((ret = vp8_init_frames(s)) < 0) {
2699         ff_vp8_decode_free(avctx);
2700         return ret;
2701     }
2702
2703     return 0;
2704 }
2705
2706 #define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2707
2708 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2709                                             const AVCodecContext *src)
2710 {
2711     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2712     int i;
2713
2714     if (s->macroblocks_base &&
2715         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2716         free_buffers(s);
2717         s->mb_width  = s_src->mb_width;
2718         s->mb_height = s_src->mb_height;
2719     }
2720
2721     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2722     s->segmentation = s_src->segmentation;
2723     s->lf_delta     = s_src->lf_delta;
2724     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2725
2726     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2727         if (s_src->frames[i].tf.f->data[0]) {
2728             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2729             if (ret < 0)
2730                 return ret;
2731         }
2732     }
2733
2734     s->framep[0] = REBASE(s_src->next_framep[0]);
2735     s->framep[1] = REBASE(s_src->next_framep[1]);
2736     s->framep[2] = REBASE(s_src->next_framep[2]);
2737     s->framep[3] = REBASE(s_src->next_framep[3]);
2738
2739     return 0;
2740 }
2741 #endif /* CONFIG_VP8_DECODER */
2742
2743 #if CONFIG_VP7_DECODER
2744 AVCodec ff_vp7_decoder = {
2745     .name                  = "vp7",
2746     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2747     .type                  = AVMEDIA_TYPE_VIDEO,
2748     .id                    = AV_CODEC_ID_VP7,
2749     .priv_data_size        = sizeof(VP8Context),
2750     .init                  = vp7_decode_init,
2751     .close                 = ff_vp8_decode_free,
2752     .decode                = vp7_decode_frame,
2753     .capabilities          = CODEC_CAP_DR1,
2754     .flush                 = vp8_decode_flush,
2755 };
2756 #endif /* CONFIG_VP7_DECODER */
2757
2758 #if CONFIG_VP8_DECODER
2759 AVCodec ff_vp8_decoder = {
2760     .name                  = "vp8",
2761     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2762     .type                  = AVMEDIA_TYPE_VIDEO,
2763     .id                    = AV_CODEC_ID_VP8,
2764     .priv_data_size        = sizeof(VP8Context),
2765     .init                  = ff_vp8_decode_init,
2766     .close                 = ff_vp8_decode_free,
2767     .decode                = ff_vp8_decode_frame,
2768     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2769     .flush                 = vp8_decode_flush,
2770     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2771     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2772 };
#endif /* CONFIG_VP8_DECODER */