git.sesse.net Git - ffmpeg/blob - libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "hwaccel.h"
  31 #include "internal.h"
  32 #include "mathops.h"
  33 #include "rectangle.h"
  34 #include "thread.h"
  35 #include "vp8.h"
  36 #include "vp8data.h"
  37
  38 #if ARCH_ARM
  39 #   include "arm/vp8.h"
  40 #endif
  41
  42 #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  43 #define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
  44 #elif CONFIG_VP7_DECODER
  45 #define VPX(vp7, f) vp7_ ## f
  46 #else // CONFIG_VP8_DECODER
  47 #define VPX(vp7, f) vp8_ ## f
  48 #endif
  49
  50 static void free_buffers(VP8Context *s)
  51 {
  52     int i;
  53     if (s->thread_data)
  54         for (i = 0; i < MAX_THREADS; i++) {
  55 #if HAVE_THREADS
  56             pthread_cond_destroy(&s->thread_data[i].cond);
  57             pthread_mutex_destroy(&s->thread_data[i].lock);
  58 #endif
  59             av_freep(&s->thread_data[i].filter_strength);
  60         }
  61     av_freep(&s->thread_data);
  62     av_freep(&s->macroblocks_base);
  63     av_freep(&s->intra4x4_pred_mode_top);
  64     av_freep(&s->top_nnz);
  65     av_freep(&s->top_border);
  66
  67     s->macroblocks = NULL;
  68 }
  69
  70 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  71 {
  72     int ret;
  73     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  74                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  75         return ret;
  76     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  77         goto fail;
  78     if (s->avctx->hwaccel) {
  79         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  80         if (hwaccel->frame_priv_data_size) {
  81             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  82             if (!f->hwaccel_priv_buf)
  83                 goto fail;
  84             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  85         }
  86     }
  87     return 0;
  88
  89 fail:
  90     av_buffer_unref(&f->seg_map);
  91     ff_thread_release_buffer(s->avctx, &f->tf);
  92     return AVERROR(ENOMEM);
  93 }
  94
  95 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  96 {
  97     av_buffer_unref(&f->seg_map);
  98     av_buffer_unref(&f->hwaccel_priv_buf);
  99     f->hwaccel_picture_private = NULL;
 100     ff_thread_release_buffer(s->avctx, &f->tf);
 101 }
 102
 103 #if CONFIG_VP8_DECODER
 104 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 105 {
 106     int ret;
 107
 108     vp8_release_frame(s, dst);
 109
 110     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 111         return ret;
 112     if (src->seg_map &&
 113         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 114         vp8_release_frame(s, dst);
 115         return AVERROR(ENOMEM);
 116     }
 117     if (src->hwaccel_picture_private) {
 118         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 119         if (!dst->hwaccel_priv_buf)
 120             return AVERROR(ENOMEM);
 121         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 122     }
 123
 124     return 0;
 125 }
 126 #endif /* CONFIG_VP8_DECODER */
 127
 128 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 129 {
 130     VP8Context *s = avctx->priv_data;
 131     int i;
 132
 133     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 134         vp8_release_frame(s, &s->frames[i]);
 135     memset(s->framep, 0, sizeof(s->framep));
 136
 137     if (free_mem)
 138         free_buffers(s);
 139 }
 140
 141 static void vp8_decode_flush(AVCodecContext *avctx)
 142 {
 143     vp8_decode_flush_impl(avctx, 0);
 144 }
 145
 146 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 147 {
 148     VP8Frame *frame = NULL;
 149     int i;
 150
 151     // find a free buffer
 152     for (i = 0; i < 5; i++)
 153         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 154             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 155             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 156             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 157             frame = &s->frames[i];
 158             break;
 159         }
 160     if (i == 5) {
 161         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 162         abort();
 163     }
 164     if (frame->tf.f->buf[0])
 165         vp8_release_frame(s, frame);
 166
 167     return frame;
 168 }
 169
 170 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 171 {
 172     enum AVPixelFormat pix_fmts[] = {
 173 #if CONFIG_VP8_VAAPI_HWACCEL
 174         AV_PIX_FMT_VAAPI,
 175 #endif
 176 #if CONFIG_VP8_NVDEC_HWACCEL
 177         AV_PIX_FMT_CUDA,
 178 #endif
 179         AV_PIX_FMT_YUV420P,
 180         AV_PIX_FMT_NONE,
 181     };
 182
 183     return ff_get_format(s->avctx, pix_fmts);
 184 }
 185
 186 static av_always_inline
 187 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 188 {
 189     AVCodecContext *avctx = s->avctx;
 190     int i, ret;
 191
 192     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 193         height != s->avctx->height) {
 194         vp8_decode_flush_impl(s->avctx, 1);
 195
 196         ret = ff_set_dimensions(s->avctx, width, height);
 197         if (ret < 0)
 198             return ret;
 199     }
 200
 201     if (!s->actually_webp && !is_vp7) {
 202         s->pix_fmt = get_pixel_format(s);
 203         if (s->pix_fmt < 0)
 204             return AVERROR(EINVAL);
 205         avctx->pix_fmt = s->pix_fmt;
 206     }
 207
 208     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 209     s->mb_height = (s->avctx->coded_height + 15) / 16;
 210
 211     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 212                    avctx->thread_count > 1;
 213     if (!s->mb_layout) { // Frame threading and one thread
 214         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 215                                                sizeof(*s->macroblocks));
 216         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 217     } else // Sliced threading
 218         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 219                                          sizeof(*s->macroblocks));
 220     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 221     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 222     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 223
 224     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 225         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 226         free_buffers(s);
 227         return AVERROR(ENOMEM);
 228     }
 229
 230     for (i = 0; i < MAX_THREADS; i++) {
 231         s->thread_data[i].filter_strength =
 232             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 233         if (!s->thread_data[i].filter_strength) {
 234             free_buffers(s);
 235             return AVERROR(ENOMEM);
 236         }
 237 #if HAVE_THREADS
 238         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 239         pthread_cond_init(&s->thread_data[i].cond, NULL);
 240 #endif
 241     }
 242
 243     s->macroblocks = s->macroblocks_base + 1;
 244
 245     return 0;
 246 }
 247
 248 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 249 {
 250     return update_dimensions(s, width, height, IS_VP7);
 251 }
 252
 253 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 254 {
 255     return update_dimensions(s, width, height, IS_VP8);
 256 }
 257
 258
 259 static void parse_segment_info(VP8Context *s)
 260 {
 261     VP56RangeCoder *c = &s->c;
 262     int i;
 263
 264     s->segmentation.update_map = vp8_rac_get(c);
 265     s->segmentation.update_feature_data = vp8_rac_get(c);
 266
 267     if (s->segmentation.update_feature_data) {
 268         s->segmentation.absolute_vals = vp8_rac_get(c);
 269
 270         for (i = 0; i < 4; i++)
 271             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 272
 273         for (i = 0; i < 4; i++)
 274             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 275     }
 276     if (s->segmentation.update_map)
 277         for (i = 0; i < 3; i++)
 278             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 279 }
 280
 281 static void update_lf_deltas(VP8Context *s)
 282 {
 283     VP56RangeCoder *c = &s->c;
 284     int i;
 285
 286     for (i = 0; i < 4; i++) {
 287         if (vp8_rac_get(c)) {
 288             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 289
 290             if (vp8_rac_get(c))
 291                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 292         }
 293     }
 294
 295     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 296         if (vp8_rac_get(c)) {
 297             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 298
 299             if (vp8_rac_get(c))
 300                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 301         }
 302     }
 303 }
 304
 305 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 306 {
 307     const uint8_t *sizes = buf;
 308     int i;
 309     int ret;
 310
 311     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 312
 313     buf      += 3 * (s->num_coeff_partitions - 1);
 314     buf_size -= 3 * (s->num_coeff_partitions - 1);
 315     if (buf_size < 0)
 316         return -1;
 317
 318     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 319         int size = AV_RL24(sizes + 3 * i);
 320         if (buf_size - size < 0)
 321             return -1;
 322         s->coeff_partition_size[i] = size;
 323
 324         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 325         if (ret < 0)
 326             return ret;
 327         buf      += size;
 328         buf_size -= size;
 329     }
 330
 331     s->coeff_partition_size[i] = buf_size;
 332     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 333
 334     return 0;
 335 }
 336
 337 static void vp7_get_quants(VP8Context *s)
 338 {
 339     VP56RangeCoder *c = &s->c;
 340
 341     int yac_qi  = vp8_rac_get_uint(c, 7);
 342     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 343     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 344     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 345     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 346     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 347
 348     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 349     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 350     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 351     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 352     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 353     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 354 }
 355
 356 static void vp8_get_quants(VP8Context *s)
 357 {
 358     VP56RangeCoder *c = &s->c;
 359     int i, base_qi;
 360
 361     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 362     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 363     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 364     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 365     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 366     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 367
 368     for (i = 0; i < 4; i++) {
 369         if (s->segmentation.enabled) {
 370             base_qi = s->segmentation.base_quant[i];
 371             if (!s->segmentation.absolute_vals)
 372                 base_qi += s->quant.yac_qi;
 373         } else
 374             base_qi = s->quant.yac_qi;
 375
 376         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 377         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 378         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 379         /* 101581>>16 is equivalent to 155/100 */
 380         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 381         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 382         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 383
 384         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 385         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 386     }
 387 }
 388
 389 /**
 390  * Determine which buffers golden and altref should be updated with after this frame.
 391  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 392  *
 393  * Intra frames update all 3 references
 394  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 395  * If the update (golden|altref) flag is set, it's updated with the current frame
 396  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 397  * If the flag is not set, the number read means:
 398  *      0: no update
 399  *      1: VP56_FRAME_PREVIOUS
 400  *      2: update golden with altref, or update altref with golden
 401  */
 402 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 403 {
 404     VP56RangeCoder *c = &s->c;
 405
 406     if (update)
 407         return VP56_FRAME_CURRENT;
 408
 409     switch (vp8_rac_get_uint(c, 2)) {
 410     case 1:
 411         return VP56_FRAME_PREVIOUS;
 412     case 2:
 413         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 414     }
 415     return VP56_FRAME_NONE;
 416 }
 417
 418 static void vp78_reset_probability_tables(VP8Context *s)
 419 {
 420     int i, j;
 421     for (i = 0; i < 4; i++)
 422         for (j = 0; j < 16; j++)
 423             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 424                    sizeof(s->prob->token[i][j]));
 425 }
 426
 427 static void vp78_update_probability_tables(VP8Context *s)
 428 {
 429     VP56RangeCoder *c = &s->c;
 430     int i, j, k, l, m;
 431
 432     for (i = 0; i < 4; i++)
 433         for (j = 0; j < 8; j++)
 434             for (k = 0; k < 3; k++)
 435                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 436                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 437                         int prob = vp8_rac_get_uint(c, 8);
 438                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 439                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 440                     }
 441 }
 442
 443 #define VP7_MVC_SIZE 17
 444 #define VP8_MVC_SIZE 19
 445
 446 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 447                                                             int mvc_size)
 448 {
 449     VP56RangeCoder *c = &s->c;
 450     int i, j;
 451
 452     if (vp8_rac_get(c))
 453         for (i = 0; i < 4; i++)
 454             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 455     if (vp8_rac_get(c))
 456         for (i = 0; i < 3; i++)
 457             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 458
 459     // 17.2 MV probability update
 460     for (i = 0; i < 2; i++)
 461         for (j = 0; j < mvc_size; j++)
 462             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 463                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 464 }
 465
 466 static void update_refs(VP8Context *s)
 467 {
 468     VP56RangeCoder *c = &s->c;
 469
 470     int update_golden = vp8_rac_get(c);
 471     int update_altref = vp8_rac_get(c);
 472
 473     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 474     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 475 }
 476
 477 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 478 {
 479     int i, j;
 480
 481     for (j = 1; j < 3; j++) {
 482         for (i = 0; i < height / 2; i++)
 483             memcpy(dst->data[j] + i * dst->linesize[j],
 484                    src->data[j] + i * src->linesize[j], width / 2);
 485     }
 486 }
 487
 488 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 489                  const uint8_t *src, ptrdiff_t src_linesize,
 490                  int width, int height,
 491                  int alpha, int beta)
 492 {
 493     int i, j;
 494     for (j = 0; j < height; j++) {
 495         for (i = 0; i < width; i++) {
 496             uint8_t y = src[j * src_linesize + i];
 497             dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 498         }
 499     }
 500 }
 501
 502 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 503 {
 504     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 505     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 506     int ret;
 507
 508     if (!s->keyframe && (alpha || beta)) {
 509         int width  = s->mb_width * 16;
 510         int height = s->mb_height * 16;
 511         AVFrame *src, *dst;
 512
 513         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 514             !s->framep[VP56_FRAME_GOLDEN]) {
 515             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 516             return AVERROR_INVALIDDATA;
 517         }
 518
 519         dst =
 520         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 521
 522         /* preserve the golden frame, write a new previous frame */
 523         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 524             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 525             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 526                 return ret;
 527
 528             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 529
 530             copy_chroma(dst, src, width, height);
 531         }
 532
 533         fade(dst->data[0], dst->linesize[0],
 534              src->data[0], src->linesize[0],
 535              width, height, alpha, beta);
 536     }
 537
 538     return 0;
 539 }
 540
 541 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 542 {
 543     VP56RangeCoder *c = &s->c;
 544     int part1_size, hscale, vscale, i, j, ret;
 545     int width  = s->avctx->width;
 546     int height = s->avctx->height;
 547
 548     if (buf_size < 4) {
 549         return AVERROR_INVALIDDATA;
 550     }
 551
 552     s->profile = (buf[0] >> 1) & 7;
 553     if (s->profile > 1) {
 554         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 555         return AVERROR_INVALIDDATA;
 556     }
 557
 558     s->keyframe  = !(buf[0] & 1);
 559     s->invisible = 0;
 560     part1_size   = AV_RL24(buf) >> 4;
 561
 562     if (buf_size < 4 - s->profile + part1_size) {
 563         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 564         return AVERROR_INVALIDDATA;
 565     }
 566
 567     buf      += 4 - s->profile;
 568     buf_size -= 4 - s->profile;
 569
 570     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 571
 572     ret = ff_vp56_init_range_decoder(c, buf, part1_size);
 573     if (ret < 0)
 574         return ret;
 575     buf      += part1_size;
 576     buf_size -= part1_size;
 577
 578     /* A. Dimension information (keyframes only) */
 579     if (s->keyframe) {
 580         width  = vp8_rac_get_uint(c, 12);
 581         height = vp8_rac_get_uint(c, 12);
 582         hscale = vp8_rac_get_uint(c, 2);
 583         vscale = vp8_rac_get_uint(c, 2);
 584         if (hscale || vscale)
 585             avpriv_request_sample(s->avctx, "Upscaling");
 586
 587         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 588         vp78_reset_probability_tables(s);
 589         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 590                sizeof(s->prob->pred16x16));
 591         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 592                sizeof(s->prob->pred8x8c));
 593         for (i = 0; i < 2; i++)
 594             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 595                    sizeof(vp7_mv_default_prob[i]));
 596         memset(&s->segmentation, 0, sizeof(s->segmentation));
 597         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 598         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 599     }
 600
 601     if (s->keyframe || s->profile > 0)
 602         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 603
 604     /* B. Decoding information for all four macroblock-level features */
 605     for (i = 0; i < 4; i++) {
 606         s->feature_enabled[i] = vp8_rac_get(c);
 607         if (s->feature_enabled[i]) {
 608              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 609
 610              for (j = 0; j < 3; j++)
 611                  s->feature_index_prob[i][j] =
 612                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 613
 614              if (vp7_feature_value_size[s->profile][i])
 615                  for (j = 0; j < 4; j++)
 616                      s->feature_value[i][j] =
 617                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 618         }
 619     }
 620
 621     s->segmentation.enabled    = 0;
 622     s->segmentation.update_map = 0;
 623     s->lf_delta.enabled        = 0;
 624
 625     s->num_coeff_partitions = 1;
 626     ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 627     if (ret < 0)
 628         return ret;
 629
 630     if (!s->macroblocks_base || /* first frame */
 631         width != s->avctx->width || height != s->avctx->height ||
 632         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 633         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 634             return ret;
 635     }
 636
 637     /* C. Dequantization indices */
 638     vp7_get_quants(s);
 639
 640     /* D. Golden frame update flag (a Flag) for interframes only */
 641     if (!s->keyframe) {
 642         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 643         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 644     }
 645
 646     s->update_last          = 1;
 647     s->update_probabilities = 1;
 648     s->fade_present         = 1;
 649
 650     if (s->profile > 0) {
 651         s->update_probabilities = vp8_rac_get(c);
 652         if (!s->update_probabilities)
 653             s->prob[1] = s->prob[0];
 654
 655         if (!s->keyframe)
 656             s->fade_present = vp8_rac_get(c);
 657     }
 658
 659     if (c->end <= c->buffer && c->bits >= 0)
 660         return AVERROR_INVALIDDATA;
 661     /* E. Fading information for previous frame */
 662     if (s->fade_present && vp8_rac_get(c)) {
 663         if ((ret = vp7_fade_frame(s ,c)) < 0)
 664             return ret;
 665     }
 666
 667     /* F. Loop filter type */
 668     if (!s->profile)
 669         s->filter.simple = vp8_rac_get(c);
 670
 671     /* G. DCT coefficient ordering specification */
 672     if (vp8_rac_get(c))
 673         for (i = 1; i < 16; i++)
 674             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 675
 676     /* H. Loop filter levels  */
 677     if (s->profile > 0)
 678         s->filter.simple = vp8_rac_get(c);
 679     s->filter.level     = vp8_rac_get_uint(c, 6);
 680     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 681
 682     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 683     vp78_update_probability_tables(s);
 684
 685     s->mbskip_enabled = 0;
 686
 687     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 688     if (!s->keyframe) {
 689         s->prob->intra  = vp8_rac_get_uint(c, 8);
 690         s->prob->last   = vp8_rac_get_uint(c, 8);
 691         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 692     }
 693
 694     return 0;
 695 }
 696
 697 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 698 {
 699     VP56RangeCoder *c = &s->c;
 700     int header_size, hscale, vscale, ret;
 701     int width  = s->avctx->width;
 702     int height = s->avctx->height;
 703
 704     if (buf_size < 3) {
 705         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 706         return AVERROR_INVALIDDATA;
 707     }
 708
 709     s->keyframe  = !(buf[0] & 1);
 710     s->profile   =  (buf[0]>>1) & 7;
 711     s->invisible = !(buf[0] & 0x10);
 712     header_size  = AV_RL24(buf) >> 5;
 713     buf      += 3;
 714     buf_size -= 3;
 715
 716     s->header_partition_size = header_size;
 717
 718     if (s->profile > 3)
 719         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 720
 721     if (!s->profile)
 722         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 723                sizeof(s->put_pixels_tab));
 724     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 725         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 726                sizeof(s->put_pixels_tab));
 727
 728     if (header_size > buf_size - 7 * s->keyframe) {
 729         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 730         return AVERROR_INVALIDDATA;
 731     }
 732
 733     if (s->keyframe) {
 734         if (AV_RL24(buf) != 0x2a019d) {
 735             av_log(s->avctx, AV_LOG_ERROR,
 736                    "Invalid start code 0x%x\n", AV_RL24(buf));
 737             return AVERROR_INVALIDDATA;
 738         }
 739         width     = AV_RL16(buf + 3) & 0x3fff;
 740         height    = AV_RL16(buf + 5) & 0x3fff;
 741         hscale    = buf[4] >> 6;
 742         vscale    = buf[6] >> 6;
 743         buf      += 7;
 744         buf_size -= 7;
 745
 746         if (hscale || vscale)
 747             avpriv_request_sample(s->avctx, "Upscaling");
 748
 749         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 750         vp78_reset_probability_tables(s);
 751         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 752                sizeof(s->prob->pred16x16));
 753         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 754                sizeof(s->prob->pred8x8c));
 755         memcpy(s->prob->mvc, vp8_mv_default_prob,
 756                sizeof(s->prob->mvc));
 757         memset(&s->segmentation, 0, sizeof(s->segmentation));
 758         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 759     }
 760
 761     ret = ff_vp56_init_range_decoder(c, buf, header_size);
 762     if (ret < 0)
 763         return ret;
 764     buf      += header_size;
 765     buf_size -= header_size;
 766
 767     if (s->keyframe) {
 768         s->colorspace = vp8_rac_get(c);
 769         if (s->colorspace)
 770             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 771         s->fullrange = vp8_rac_get(c);
 772     }
 773
 774     if ((s->segmentation.enabled = vp8_rac_get(c)))
 775         parse_segment_info(s);
 776     else
 777         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 778
 779     s->filter.simple    = vp8_rac_get(c);
 780     s->filter.level     = vp8_rac_get_uint(c, 6);
 781     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 782
 783     if ((s->lf_delta.enabled = vp8_rac_get(c))) {
 784         s->lf_delta.update = vp8_rac_get(c);
 785         if (s->lf_delta.update)
 786             update_lf_deltas(s);
 787     }
 788
 789     if (setup_partitions(s, buf, buf_size)) {
 790         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 791         return AVERROR_INVALIDDATA;
 792     }
 793
 794     if (!s->macroblocks_base || /* first frame */
 795         width != s->avctx->width || height != s->avctx->height ||
 796         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 797         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 798             return ret;
 799
 800     vp8_get_quants(s);
 801
 802     if (!s->keyframe) {
 803         update_refs(s);
 804         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 805         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 806     }
 807
 808     // if we aren't saving this frame's probabilities for future frames,
 809     // make a copy of the current probabilities
 810     if (!(s->update_probabilities = vp8_rac_get(c)))
 811         s->prob[1] = s->prob[0];
 812
 813     s->update_last = s->keyframe || vp8_rac_get(c);
 814
 815     vp78_update_probability_tables(s);
 816
 817     if ((s->mbskip_enabled = vp8_rac_get(c)))
 818         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 819
 820     if (!s->keyframe) {
 821         s->prob->intra  = vp8_rac_get_uint(c, 8);
 822         s->prob->last   = vp8_rac_get_uint(c, 8);
 823         s->prob->golden = vp8_rac_get_uint(c, 8);
 824         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 825     }
 826
 827     // Record the entropy coder state here so that hwaccels can use it.
 828     s->c.code_word = vp56_rac_renorm(&s->c);
 829     s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
 830     s->coder_state_at_header_end.range     = s->c.high;
 831     s->coder_state_at_header_end.value     = s->c.code_word >> 16;
 832     s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
 833
 834     return 0;
 835 }
 836
 837 static av_always_inline
 838 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 839 {
 840     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 841                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 842     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 843                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 844 }
 845
 846 /**
 847  * Motion vector coding, 17.1.
 848  */
 849 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 850 {
 851     int bit, x = 0;
 852
 853     if (vp56_rac_get_prob_branchy(c, p[0])) {
 854         int i;
 855
 856         for (i = 0; i < 3; i++)
 857             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 858         for (i = (vp7 ? 7 : 9); i > 3; i--)
 859             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 860         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 861             x += 8;
 862     } else {
 863         // small_mvtree
 864         const uint8_t *ps = p + 2;
 865         bit = vp56_rac_get_prob(c, *ps);
 866         ps += 1 + 3 * bit;
 867         x  += 4 * bit;
 868         bit = vp56_rac_get_prob(c, *ps);
 869         ps += 1 + bit;
 870         x  += 2 * bit;
 871         x  += vp56_rac_get_prob(c, *ps);
 872     }
 873
 874     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 875 }
 876
 877 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 878 {
 879     return read_mv_component(c, p, 1);
 880 }
 881
 882 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 883 {
 884     return read_mv_component(c, p, 0);
 885 }
 886
 887 static av_always_inline
 888 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 889 {
 890     if (is_vp7)
 891         return vp7_submv_prob;
 892
 893     if (left == top)
 894         return vp8_submv_prob[4 - !!left];
 895     if (!top)
 896         return vp8_submv_prob[2];
 897     return vp8_submv_prob[1 - !!left];
 898 }
 899
 900 /**
 901  * Split motion vector prediction, 16.4.
 902  * @returns the number of motion vectors parsed (2, 4 or 16)
 903  */
 904 static av_always_inline
 905 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 906                     int layout, int is_vp7)
 907 {
 908     int part_idx;
 909     int n, num;
 910     VP8Macroblock *top_mb;
 911     VP8Macroblock *left_mb = &mb[-1];
 912     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 913     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 914     VP56mv *top_mv;
 915     VP56mv *left_mv = left_mb->bmv;
 916     VP56mv *cur_mv  = mb->bmv;
 917
 918     if (!layout) // layout is inlined, s->mb_layout is not
 919         top_mb = &mb[2];
 920     else
 921         top_mb = &mb[-s->mb_width - 1];
 922     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 923     top_mv       = top_mb->bmv;
 924
 925     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 926         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 927             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 928         else
 929             part_idx = VP8_SPLITMVMODE_8x8;
 930     } else {
 931         part_idx = VP8_SPLITMVMODE_4x4;
 932     }
 933
 934     num              = vp8_mbsplit_count[part_idx];
 935     mbsplits_cur     = vp8_mbsplits[part_idx],
 936     firstidx         = vp8_mbfirstidx[part_idx];
 937     mb->partitioning = part_idx;
 938
 939     for (n = 0; n < num; n++) {
 940         int k = firstidx[n];
 941         uint32_t left, above;
 942         const uint8_t *submv_prob;
 943
 944         if (!(k & 3))
 945             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 946         else
 947             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 948         if (k <= 3)
 949             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 950         else
 951             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 952
 953         submv_prob = get_submv_prob(left, above, is_vp7);
 954
 955         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 956             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 957                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 958                     mb->bmv[n].y = mb->mv.y +
 959                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 960                     mb->bmv[n].x = mb->mv.x +
 961                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 962                 } else {
 963                     AV_ZERO32(&mb->bmv[n]);
 964                 }
 965             } else {
 966                 AV_WN32A(&mb->bmv[n], above);
 967             }
 968         } else {
 969             AV_WN32A(&mb->bmv[n], left);
 970         }
 971     }
 972
 973     return num;
 974 }
 975
 976 /**
 977  * The vp7 reference decoder uses a padding macroblock column (added to right
 978  * edge of the frame) to guard against illegal macroblock offsets. The
 979  * algorithm has bugs that permit offsets to straddle the padding column.
 980  * This function replicates those bugs.
 981  *
 982  * @param[out] edge_x macroblock x address
 983  * @param[out] edge_y macroblock y address
 984  *
 985  * @return macroblock offset legal (boolean)
 986  */
 987 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 988                                    int xoffset, int yoffset, int boundary,
 989                                    int *edge_x, int *edge_y)
 990 {
 991     int vwidth = mb_width + 1;
 992     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 993     if (new < boundary || new % vwidth == vwidth - 1)
 994         return 0;
 995     *edge_y = new / vwidth;
 996     *edge_x = new % vwidth;
 997     return 1;
 998 }
 999
1000 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1001 {
1002     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1003 }
1004
1005 static av_always_inline
1006 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1007                     int mb_x, int mb_y, int layout)
1008 {
1009     VP8Macroblock *mb_edge[12];
1010     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1011     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1012     int idx = CNT_ZERO;
1013     VP56mv near_mv[3];
1014     uint8_t cnt[3] = { 0 };
1015     VP56RangeCoder *c = &s->c;
1016     int i;
1017
1018     AV_ZERO32(&near_mv[0]);
1019     AV_ZERO32(&near_mv[1]);
1020     AV_ZERO32(&near_mv[2]);
1021
1022     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1023         const VP7MVPred * pred = &vp7_mv_pred[i];
1024         int edge_x, edge_y;
1025
1026         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1027                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1028             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1029                                              ? s->macroblocks_base + 1 + edge_x +
1030                                                (s->mb_width + 1) * (edge_y + 1)
1031                                              : s->macroblocks + edge_x +
1032                                                (s->mb_height - edge_y - 1) * 2;
1033             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1034             if (mv) {
1035                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1036                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1037                         idx = CNT_NEAREST;
1038                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1039                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1040                             continue;
1041                         idx = CNT_NEAR;
1042                     } else {
1043                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1044                         idx = CNT_NEAR;
1045                     }
1046                 } else {
1047                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1048                     idx = CNT_NEAREST;
1049                 }
1050             } else {
1051                 idx = CNT_ZERO;
1052             }
1053         } else {
1054             idx = CNT_ZERO;
1055         }
1056         cnt[idx] += vp7_mv_pred[i].score;
1057     }
1058
1059     mb->partitioning = VP8_SPLITMVMODE_NONE;
1060
1061     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1062         mb->mode = VP8_MVMODE_MV;
1063
1064         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1065
1066             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1067
1068                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1069                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1070                 else
1071                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1072
1073                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1074                     mb->mode = VP8_MVMODE_SPLIT;
1075                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1076                 } else {
1077                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1078                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1079                     mb->bmv[0] = mb->mv;
1080                 }
1081             } else {
1082                 mb->mv = near_mv[CNT_NEAR];
1083                 mb->bmv[0] = mb->mv;
1084             }
1085         } else {
1086             mb->mv = near_mv[CNT_NEAREST];
1087             mb->bmv[0] = mb->mv;
1088         }
1089     } else {
1090         mb->mode = VP8_MVMODE_ZERO;
1091         AV_ZERO32(&mb->mv);
1092         mb->bmv[0] = mb->mv;
1093     }
1094 }
1095
1096 static av_always_inline
1097 void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
1098                     int mb_x, int mb_y, int layout)
1099 {
1100     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1101                                   mb - 1 /* left */,
1102                                   0      /* top-left */ };
1103     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1104     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1105     int idx = CNT_ZERO;
1106     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1107     int8_t *sign_bias = s->sign_bias;
1108     VP56mv near_mv[4];
1109     uint8_t cnt[4] = { 0 };
1110     VP56RangeCoder *c = &s->c;
1111
1112     if (!layout) { // layout is inlined (s->mb_layout is not)
1113         mb_edge[0] = mb + 2;
1114         mb_edge[2] = mb + 1;
1115     } else {
1116         mb_edge[0] = mb - s->mb_width - 1;
1117         mb_edge[2] = mb - s->mb_width - 2;
1118     }
1119
1120     AV_ZERO32(&near_mv[0]);
1121     AV_ZERO32(&near_mv[1]);
1122     AV_ZERO32(&near_mv[2]);
1123
1124     /* Process MB on top, left and top-left */
1125 #define MV_EDGE_CHECK(n)                                                      \
1126     {                                                                         \
1127         VP8Macroblock *edge = mb_edge[n];                                     \
1128         int edge_ref = edge->ref_frame;                                       \
1129         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1130             uint32_t mv = AV_RN32A(&edge->mv);                                \
1131             if (mv) {                                                         \
1132                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1133                     /* SWAR negate of the values in mv. */                    \
1134                     mv = ~mv;                                                 \
1135                     mv = ((mv & 0x7fff7fff) +                                 \
1136                           0x00010001) ^ (mv & 0x80008000);                    \
1137                 }                                                             \
1138                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1139                     AV_WN32A(&near_mv[++idx], mv);                            \
1140                 cnt[idx] += 1 + (n != 2);                                     \
1141             } else                                                            \
1142                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1143         }                                                                     \
1144     }
1145
1146     MV_EDGE_CHECK(0)
1147     MV_EDGE_CHECK(1)
1148     MV_EDGE_CHECK(2)
1149
1150     mb->partitioning = VP8_SPLITMVMODE_NONE;
1151     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1152         mb->mode = VP8_MVMODE_MV;
1153
1154         /* If we have three distinct MVs, merge first and last if they're the same */
1155         if (cnt[CNT_SPLITMV] &&
1156             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1157             cnt[CNT_NEAREST] += 1;
1158
1159         /* Swap near and nearest if necessary */
1160         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1161             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1162             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1163         }
1164
1165         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1166             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1167                 /* Choose the best mv out of 0,0 and the nearest mv */
1168                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1169                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1170                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1171                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1172
1173                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1174                     mb->mode = VP8_MVMODE_SPLIT;
1175                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1176                 } else {
1177                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1178                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1179                     mb->bmv[0] = mb->mv;
1180                 }
1181             } else {
1182                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
1183                 mb->bmv[0] = mb->mv;
1184             }
1185         } else {
1186             clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
1187             mb->bmv[0] = mb->mv;
1188         }
1189     } else {
1190         mb->mode = VP8_MVMODE_ZERO;
1191         AV_ZERO32(&mb->mv);
1192         mb->bmv[0] = mb->mv;
1193     }
1194 }
1195
1196 static av_always_inline
1197 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1198                            int mb_x, int keyframe, int layout)
1199 {
1200     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1201
1202     if (layout) {
1203         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1204         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1205     }
1206     if (keyframe) {
1207         int x, y;
1208         uint8_t *top;
1209         uint8_t *const left = s->intra4x4_pred_mode_left;
1210         if (layout)
1211             top = mb->intra4x4_pred_mode_top;
1212         else
1213             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1214         for (y = 0; y < 4; y++) {
1215             for (x = 0; x < 4; x++) {
1216                 const uint8_t *ctx;
1217                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1218                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1219                 left[y]   = top[x] = *intra4x4;
1220                 intra4x4++;
1221             }
1222         }
1223     } else {
1224         int i;
1225         for (i = 0; i < 16; i++)
1226             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1227                                            vp8_pred4x4_prob_inter);
1228     }
1229 }
1230
1231 static av_always_inline
1232 void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
1233                     VP8Macroblock *mb, int mb_x, int mb_y,
1234                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1235 {
1236     VP56RangeCoder *c = &s->c;
1237     static const char * const vp7_feature_name[] = { "q-index",
1238                                                      "lf-delta",
1239                                                      "partial-golden-update",
1240                                                      "blit-pitch" };
1241     if (is_vp7) {
1242         int i;
1243         *segment = 0;
1244         for (i = 0; i < 4; i++) {
1245             if (s->feature_enabled[i]) {
1246                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1247                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1248                                                    s->feature_index_prob[i]);
1249                       av_log(s->avctx, AV_LOG_WARNING,
1250                              "Feature %s present in macroblock (value 0x%x)\n",
1251                              vp7_feature_name[i], s->feature_value[i][index]);
1252                 }
1253            }
1254         }
1255     } else if (s->segmentation.update_map) {
1256         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1257         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1258     } else if (s->segmentation.enabled)
1259         *segment = ref ? *ref : *segment;
1260     mb->segment = *segment;
1261
1262     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1263
1264     if (s->keyframe) {
1265         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1266                                     vp8_pred16x16_prob_intra);
1267
1268         if (mb->mode == MODE_I4x4) {
1269             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1270         } else {
1271             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1272                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1273             if (s->mb_layout)
1274                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1275             else
1276                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1277             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1278         }
1279
1280         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1281                                                 vp8_pred8x8c_prob_intra);
1282         mb->ref_frame        = VP56_FRAME_CURRENT;
1283     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1284         // inter MB, 16.2
1285         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1286             mb->ref_frame =
1287                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1288                                                                    : VP56_FRAME_GOLDEN;
1289         else
1290             mb->ref_frame = VP56_FRAME_PREVIOUS;
1291         s->ref_count[mb->ref_frame - 1]++;
1292
1293         // motion vectors, 16.3
1294         if (is_vp7)
1295             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1296         else
1297             vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
1298     } else {
1299         // intra MB, 16.1
1300         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1301
1302         if (mb->mode == MODE_I4x4)
1303             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1304
1305         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1306                                                 s->prob->pred8x8c);
1307         mb->ref_frame        = VP56_FRAME_CURRENT;
1308         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1309         AV_ZERO32(&mb->bmv[0]);
1310     }
1311 }
1312
1313 /**
1314  * @param r     arithmetic bitstream reader context
1315  * @param block destination for block coefficients
1316  * @param probs probabilities to use when reading trees from the bitstream
1317  * @param i     initial coeff index, 0 unless a separate DC block is coded
1318  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1319  *
1320  * @return 0 if no coeffs were decoded
1321  *         otherwise, the index of the last coeff decoded plus one
1322  */
1323 static av_always_inline
1324 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1325                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1326                                  int i, uint8_t *token_prob, int16_t qmul[2],
1327                                  const uint8_t scan[16], int vp7)
1328 {
1329     VP56RangeCoder c = *r;
1330     goto skip_eob;
1331     do {
1332         int coeff;
1333 restart:
1334         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1335             break;
1336
1337 skip_eob:
1338         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1339             if (++i == 16)
1340                 break; // invalid input; blocks should end with EOB
1341             token_prob = probs[i][0];
1342             if (vp7)
1343                 goto restart;
1344             goto skip_eob;
1345         }
1346
1347         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1348             coeff = 1;
1349             token_prob = probs[i + 1][1];
1350         } else {
1351             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1352                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1353                 if (coeff)
1354                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1355                 coeff += 2;
1356             } else {
1357                 // DCT_CAT*
1358                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1359                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1360                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1361                     } else {                                    // DCT_CAT2
1362                         coeff  = 7;
1363                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1364                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1365                     }
1366                 } else {    // DCT_CAT3 and up
1367                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1368                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1369                     int cat = (a << 1) + b;
1370                     coeff  = 3 + (8 << cat);
1371                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1372                 }
1373             }
1374             token_prob = probs[i + 1][2];
1375         }
1376         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1377     } while (++i < 16);
1378
1379     *r = c;
1380     return i;
1381 }
1382
1383 static av_always_inline
1384 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1385 {
1386     int16_t dc = block[0];
1387     int ret = 0;
1388
1389     if (pred[1] > 3) {
1390         dc += pred[0];
1391         ret = 1;
1392     }
1393
1394     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1395         block[0] = pred[0] = dc;
1396         pred[1] = 0;
1397     } else {
1398         if (pred[0] == dc)
1399             pred[1]++;
1400         block[0] = pred[0] = dc;
1401     }
1402
1403     return ret;
1404 }
1405
1406 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1407                                             int16_t block[16],
1408                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1409                                             int i, uint8_t *token_prob,
1410                                             int16_t qmul[2],
1411                                             const uint8_t scan[16])
1412 {
1413     return decode_block_coeffs_internal(r, block, probs, i,
1414                                         token_prob, qmul, scan, IS_VP7);
1415 }
1416
1417 #ifndef vp8_decode_block_coeffs_internal
1418 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1419                                             int16_t block[16],
1420                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1421                                             int i, uint8_t *token_prob,
1422                                             int16_t qmul[2])
1423 {
1424     return decode_block_coeffs_internal(r, block, probs, i,
1425                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1426 }
1427 #endif
1428
1429 /**
1430  * @param c          arithmetic bitstream reader context
1431  * @param block      destination for block coefficients
1432  * @param probs      probabilities to use when reading trees from the bitstream
1433  * @param i          initial coeff index, 0 unless a separate DC block is coded
1434  * @param zero_nhood the initial prediction context for number of surrounding
1435  *                   all-zero blocks (only left/top, so 0-2)
1436  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1437  * @param scan       scan pattern (VP7 only)
1438  *
1439  * @return 0 if no coeffs were decoded
1440  *         otherwise, the index of the last coeff decoded plus one
1441  */
1442 static av_always_inline
1443 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1444                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1445                         int i, int zero_nhood, int16_t qmul[2],
1446                         const uint8_t scan[16], int vp7)
1447 {
1448     uint8_t *token_prob = probs[i][zero_nhood];
1449     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1450         return 0;
1451     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1452                                                   token_prob, qmul, scan)
1453                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1454                                                   token_prob, qmul);
1455 }
1456
1457 static av_always_inline
1458 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1459                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1460                       int is_vp7)
1461 {
1462     int i, x, y, luma_start = 0, luma_ctx = 3;
1463     int nnz_pred, nnz, nnz_total = 0;
1464     int segment = mb->segment;
1465     int block_dc = 0;
1466
1467     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1468         nnz_pred = t_nnz[8] + l_nnz[8];
1469
1470         // decode DC values and do hadamard
1471         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1472                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1473                                   ff_zigzag_scan, is_vp7);
1474         l_nnz[8] = t_nnz[8] = !!nnz;
1475
1476         if (is_vp7 && mb->mode > MODE_I4x4) {
1477             nnz |=  inter_predict_dc(td->block_dc,
1478                                      s->inter_dc_pred[mb->ref_frame - 1]);
1479         }
1480
1481         if (nnz) {
1482             nnz_total += nnz;
1483             block_dc   = 1;
1484             if (nnz == 1)
1485                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1486             else
1487                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1488         }
1489         luma_start = 1;
1490         luma_ctx   = 0;
1491     }
1492
1493     // luma blocks
1494     for (y = 0; y < 4; y++)
1495         for (x = 0; x < 4; x++) {
1496             nnz_pred = l_nnz[y] + t_nnz[x];
1497             nnz = decode_block_coeffs(c, td->block[y][x],
1498                                       s->prob->token[luma_ctx],
1499                                       luma_start, nnz_pred,
1500                                       s->qmat[segment].luma_qmul,
1501                                       s->prob[0].scan, is_vp7);
1502             /* nnz+block_dc may be one more than the actual last index,
1503              * but we don't care */
1504             td->non_zero_count_cache[y][x] = nnz + block_dc;
1505             t_nnz[x] = l_nnz[y] = !!nnz;
1506             nnz_total += nnz;
1507         }
1508
1509     // chroma blocks
1510     // TODO: what to do about dimensions? 2nd dim for luma is x,
1511     // but for chroma it's (y<<1)|x
1512     for (i = 4; i < 6; i++)
1513         for (y = 0; y < 2; y++)
1514             for (x = 0; x < 2; x++) {
1515                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1516                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1517                                           s->prob->token[2], 0, nnz_pred,
1518                                           s->qmat[segment].chroma_qmul,
1519                                           s->prob[0].scan, is_vp7);
1520                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1521                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1522                 nnz_total += nnz;
1523             }
1524
1525     // if there were no coded coeffs despite the macroblock not being marked skip,
1526     // we MUST not do the inner loop filter and should not do IDCT
1527     // Since skip isn't used for bitstream prediction, just manually set it.
1528     if (!nnz_total)
1529         mb->skip = 1;
1530 }
1531
1532 static av_always_inline
1533 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1534                       uint8_t *src_cb, uint8_t *src_cr,
1535                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1536 {
1537     AV_COPY128(top_border, src_y + 15 * linesize);
1538     if (!simple) {
1539         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1540         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1541     }
1542 }
1543
1544 static av_always_inline
1545 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1546                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1547                     int mb_y, int mb_width, int simple, int xchg)
1548 {
1549     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1550     src_y  -= linesize;
1551     src_cb -= uvlinesize;
1552     src_cr -= uvlinesize;
1553
1554 #define XCHG(a, b, xchg)                                                      \
1555     do {                                                                      \
1556         if (xchg)                                                             \
1557             AV_SWAP64(b, a);                                                  \
1558         else                                                                  \
1559             AV_COPY64(b, a);                                                  \
1560     } while (0)
1561
1562     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1563     XCHG(top_border, src_y, xchg);
1564     XCHG(top_border + 8, src_y + 8, 1);
1565     if (mb_x < mb_width - 1)
1566         XCHG(top_border + 32, src_y + 16, 1);
1567
1568     // only copy chroma for normal loop filter
1569     // or to initialize the top row to 127
1570     if (!simple || !mb_y) {
1571         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1572         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1573         XCHG(top_border + 16, src_cb, 1);
1574         XCHG(top_border + 24, src_cr, 1);
1575     }
1576 }
1577
1578 static av_always_inline
1579 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1580 {
1581     if (!mb_x)
1582         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1583     else
1584         return mb_y ? mode : LEFT_DC_PRED8x8;
1585 }
1586
1587 static av_always_inline
1588 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1589 {
1590     if (!mb_x)
1591         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1592     else
1593         return mb_y ? mode : HOR_PRED8x8;
1594 }
1595
1596 static av_always_inline
1597 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1598 {
1599     switch (mode) {
1600     case DC_PRED8x8:
1601         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1602     case VERT_PRED8x8:
1603         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1604     case HOR_PRED8x8:
1605         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1606     case PLANE_PRED8x8: /* TM */
1607         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1608     }
1609     return mode;
1610 }
1611
1612 static av_always_inline
1613 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1614 {
1615     if (!mb_x) {
1616         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1617     } else {
1618         return mb_y ? mode : HOR_VP8_PRED;
1619     }
1620 }
1621
1622 static av_always_inline
1623 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1624                                      int *copy_buf, int vp7)
1625 {
1626     switch (mode) {
1627     case VERT_PRED:
1628         if (!mb_x && mb_y) {
1629             *copy_buf = 1;
1630             return mode;
1631         }
1632         /* fall-through */
1633     case DIAG_DOWN_LEFT_PRED:
1634     case VERT_LEFT_PRED:
1635         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1636     case HOR_PRED:
1637         if (!mb_y) {
1638             *copy_buf = 1;
1639             return mode;
1640         }
1641         /* fall-through */
1642     case HOR_UP_PRED:
1643         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1644     case TM_VP8_PRED:
1645         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1646     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1647                    * as 16x16/8x8 DC */
1648     case DIAG_DOWN_RIGHT_PRED:
1649     case VERT_RIGHT_PRED:
1650     case HOR_DOWN_PRED:
1651         if (!mb_y || !mb_x)
1652             *copy_buf = 1;
1653         return mode;
1654     }
1655     return mode;
1656 }
1657
1658 static av_always_inline
1659 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1660                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1661 {
1662     int x, y, mode, nnz;
1663     uint32_t tr;
1664
1665     /* for the first row, we need to run xchg_mb_border to init the top edge
1666      * to 127 otherwise, skip it if we aren't going to deblock */
1667     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1668         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1669                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1670                        s->filter.simple, 1);
1671
1672     if (mb->mode < MODE_I4x4) {
1673         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1674         s->hpc.pred16x16[mode](dst[0], s->linesize);
1675     } else {
1676         uint8_t *ptr = dst[0];
1677         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1678         const uint8_t lo = is_vp7 ? 128 : 127;
1679         const uint8_t hi = is_vp7 ? 128 : 129;
1680         uint8_t tr_top[4] = { lo, lo, lo, lo };
1681
1682         // all blocks on the right edge of the macroblock use bottom edge
1683         // the top macroblock for their topright edge
1684         uint8_t *tr_right = ptr - s->linesize + 16;
1685
1686         // if we're on the right edge of the frame, said edge is extended
1687         // from the top macroblock
1688         if (mb_y && mb_x == s->mb_width - 1) {
1689             tr       = tr_right[-1] * 0x01010101u;
1690             tr_right = (uint8_t *) &tr;
1691         }
1692
1693         if (mb->skip)
1694             AV_ZERO128(td->non_zero_count_cache);
1695
1696         for (y = 0; y < 4; y++) {
1697             uint8_t *topright = ptr + 4 - s->linesize;
1698             for (x = 0; x < 4; x++) {
1699                 int copy = 0;
1700                 ptrdiff_t linesize = s->linesize;
1701                 uint8_t *dst = ptr + 4 * x;
1702                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1703
1704                 if ((y == 0 || x == 3) && mb_y == 0) {
1705                     topright = tr_top;
1706                 } else if (x == 3)
1707                     topright = tr_right;
1708
1709                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1710                                                         mb_y + y, &copy, is_vp7);
1711                 if (copy) {
1712                     dst      = copy_dst + 12;
1713                     linesize = 8;
1714                     if (!(mb_y + y)) {
1715                         copy_dst[3] = lo;
1716                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1717                     } else {
1718                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1719                         if (!(mb_x + x)) {
1720                             copy_dst[3] = hi;
1721                         } else {
1722                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1723                         }
1724                     }
1725                     if (!(mb_x + x)) {
1726                         copy_dst[11] =
1727                         copy_dst[19] =
1728                         copy_dst[27] =
1729                         copy_dst[35] = hi;
1730                     } else {
1731                         copy_dst[11] = ptr[4 * x                   - 1];
1732                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1733                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1734                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1735                     }
1736                 }
1737                 s->hpc.pred4x4[mode](dst, topright, linesize);
1738                 if (copy) {
1739                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1740                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1741                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1742                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1743                 }
1744
1745                 nnz = td->non_zero_count_cache[y][x];
1746                 if (nnz) {
1747                     if (nnz == 1)
1748                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1749                                                   td->block[y][x], s->linesize);
1750                     else
1751                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1752                                                td->block[y][x], s->linesize);
1753                 }
1754                 topright += 4;
1755             }
1756
1757             ptr      += 4 * s->linesize;
1758             intra4x4 += 4;
1759         }
1760     }
1761
1762     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1763                                             mb_x, mb_y, is_vp7);
1764     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1765     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1766
1767     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1768         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1769                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1770                        s->filter.simple, 0);
1771 }
1772
1773 static const uint8_t subpel_idx[3][8] = {
1774     { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
1775                                 // also function pointer index
1776     { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
1777     { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
1778 };
1779
1780 /**
1781  * luma MC function
1782  *
1783  * @param s        VP8 decoding context
1784  * @param dst      target buffer for block data at block position
1785  * @param ref      reference picture buffer at origin (0, 0)
1786  * @param mv       motion vector (relative to block position) to get pixel data from
1787  * @param x_off    horizontal position of block from origin (0, 0)
1788  * @param y_off    vertical position of block from origin (0, 0)
1789  * @param block_w  width of block (16, 8 or 4)
1790  * @param block_h  height of block (always same as block_w)
1791  * @param width    width of src/dst plane data
1792  * @param height   height of src/dst plane data
1793  * @param linesize size of a single line of plane data, including padding
1794  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1795  */
1796 static av_always_inline
1797 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1798                  ThreadFrame *ref, const VP56mv *mv,
1799                  int x_off, int y_off, int block_w, int block_h,
1800                  int width, int height, ptrdiff_t linesize,
1801                  vp8_mc_func mc_func[3][3])
1802 {
1803     uint8_t *src = ref->f->data[0];
1804
1805     if (AV_RN32A(mv)) {
1806         ptrdiff_t src_linesize = linesize;
1807
1808         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1809         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1810
1811         x_off += mv->x >> 2;
1812         y_off += mv->y >> 2;
1813
1814         // edge emulation
1815         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1816         src += y_off * linesize + x_off;
1817         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1818             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1819             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1820                                      src - my_idx * linesize - mx_idx,
1821                                      EDGE_EMU_LINESIZE, linesize,
1822                                      block_w + subpel_idx[1][mx],
1823                                      block_h + subpel_idx[1][my],
1824                                      x_off - mx_idx, y_off - my_idx,
1825                                      width, height);
1826             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1827             src_linesize = EDGE_EMU_LINESIZE;
1828         }
1829         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1830     } else {
1831         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1832         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1833                       linesize, block_h, 0, 0);
1834     }
1835 }
1836
1837 /**
1838  * chroma MC function
1839  *
1840  * @param s        VP8 decoding context
1841  * @param dst1     target buffer for block data at block position (U plane)
1842  * @param dst2     target buffer for block data at block position (V plane)
1843  * @param ref      reference picture buffer at origin (0, 0)
1844  * @param mv       motion vector (relative to block position) to get pixel data from
1845  * @param x_off    horizontal position of block from origin (0, 0)
1846  * @param y_off    vertical position of block from origin (0, 0)
1847  * @param block_w  width of block (16, 8 or 4)
1848  * @param block_h  height of block (always same as block_w)
1849  * @param width    width of src/dst plane data
1850  * @param height   height of src/dst plane data
1851  * @param linesize size of a single line of plane data, including padding
1852  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1853  */
1854 static av_always_inline
1855 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1856                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1857                    int x_off, int y_off, int block_w, int block_h,
1858                    int width, int height, ptrdiff_t linesize,
1859                    vp8_mc_func mc_func[3][3])
1860 {
1861     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1862
1863     if (AV_RN32A(mv)) {
1864         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1865         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1866
1867         x_off += mv->x >> 3;
1868         y_off += mv->y >> 3;
1869
1870         // edge emulation
1871         src1 += y_off * linesize + x_off;
1872         src2 += y_off * linesize + x_off;
1873         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1874         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1875             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1876             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1877                                      src1 - my_idx * linesize - mx_idx,
1878                                      EDGE_EMU_LINESIZE, linesize,
1879                                      block_w + subpel_idx[1][mx],
1880                                      block_h + subpel_idx[1][my],
1881                                      x_off - mx_idx, y_off - my_idx, width, height);
1882             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1883             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1884
1885             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1886                                      src2 - my_idx * linesize - mx_idx,
1887                                      EDGE_EMU_LINESIZE, linesize,
1888                                      block_w + subpel_idx[1][mx],
1889                                      block_h + subpel_idx[1][my],
1890                                      x_off - mx_idx, y_off - my_idx, width, height);
1891             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1892             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1893         } else {
1894             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1895             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1896         }
1897     } else {
1898         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1899         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1900         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1901     }
1902 }
1903
1904 static av_always_inline
1905 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1906                  ThreadFrame *ref_frame, int x_off, int y_off,
1907                  int bx_off, int by_off, int block_w, int block_h,
1908                  int width, int height, VP56mv *mv)
1909 {
1910     VP56mv uvmv = *mv;
1911
1912     /* Y */
1913     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1914                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1915                 block_w, block_h, width, height, s->linesize,
1916                 s->put_pixels_tab[block_w == 8]);
1917
1918     /* U/V */
1919     if (s->profile == 3) {
1920         /* this block only applies VP8; it is safe to check
1921          * only the profile, as VP7 profile <= 1 */
1922         uvmv.x &= ~7;
1923         uvmv.y &= ~7;
1924     }
1925     x_off   >>= 1;
1926     y_off   >>= 1;
1927     bx_off  >>= 1;
1928     by_off  >>= 1;
1929     width   >>= 1;
1930     height  >>= 1;
1931     block_w >>= 1;
1932     block_h >>= 1;
1933     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1934                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1935                   &uvmv, x_off + bx_off, y_off + by_off,
1936                   block_w, block_h, width, height, s->uvlinesize,
1937                   s->put_pixels_tab[1 + (block_w == 4)]);
1938 }
1939
1940 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1941  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1942 static av_always_inline
1943 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1944                      int mb_xy, int ref)
1945 {
1946     /* Don't prefetch refs that haven't been used very often this frame. */
1947     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1948         int x_off = mb_x << 4, y_off = mb_y << 4;
1949         int mx = (mb->mv.x >> 2) + x_off + 8;
1950         int my = (mb->mv.y >> 2) + y_off;
1951         uint8_t **src = s->framep[ref]->tf.f->data;
1952         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1953         /* For threading, a ff_thread_await_progress here might be useful, but
1954          * it actually slows down the decoder. Since a bad prefetch doesn't
1955          * generate bad decoder output, we don't run it here. */
1956         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1957         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1958         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1959     }
1960 }
1961
1962 /**
1963  * Apply motion vectors to prediction buffer, chapter 18.
1964  */
1965 static av_always_inline
1966 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1967                    VP8Macroblock *mb, int mb_x, int mb_y)
1968 {
1969     int x_off = mb_x << 4, y_off = mb_y << 4;
1970     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1971     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1972     VP56mv *bmv = mb->bmv;
1973
1974     switch (mb->partitioning) {
1975     case VP8_SPLITMVMODE_NONE:
1976         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1977                     0, 0, 16, 16, width, height, &mb->mv);
1978         break;
1979     case VP8_SPLITMVMODE_4x4: {
1980         int x, y;
1981         VP56mv uvmv;
1982
1983         /* Y */
1984         for (y = 0; y < 4; y++) {
1985             for (x = 0; x < 4; x++) {
1986                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1987                             ref, &bmv[4 * y + x],
1988                             4 * x + x_off, 4 * y + y_off, 4, 4,
1989                             width, height, s->linesize,
1990                             s->put_pixels_tab[2]);
1991             }
1992         }
1993
1994         /* U/V */
1995         x_off  >>= 1;
1996         y_off  >>= 1;
1997         width  >>= 1;
1998         height >>= 1;
1999         for (y = 0; y < 2; y++) {
2000             for (x = 0; x < 2; x++) {
2001                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2002                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2003                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2004                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2005                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2006                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2007                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2008                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2009                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2010                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2011                 if (s->profile == 3) {
2012                     uvmv.x &= ~7;
2013                     uvmv.y &= ~7;
2014                 }
2015                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2016                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2017                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2018                               width, height, s->uvlinesize,
2019                               s->put_pixels_tab[2]);
2020             }
2021         }
2022         break;
2023     }
2024     case VP8_SPLITMVMODE_16x8:
2025         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2026                     0, 0, 16, 8, width, height, &bmv[0]);
2027         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2028                     0, 8, 16, 8, width, height, &bmv[1]);
2029         break;
2030     case VP8_SPLITMVMODE_8x16:
2031         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2032                     0, 0, 8, 16, width, height, &bmv[0]);
2033         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2034                     8, 0, 8, 16, width, height, &bmv[1]);
2035         break;
2036     case VP8_SPLITMVMODE_8x8:
2037         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2038                     0, 0, 8, 8, width, height, &bmv[0]);
2039         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2040                     8, 0, 8, 8, width, height, &bmv[1]);
2041         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2042                     0, 8, 8, 8, width, height, &bmv[2]);
2043         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2044                     8, 8, 8, 8, width, height, &bmv[3]);
2045         break;
2046     }
2047 }
2048
2049 static av_always_inline
2050 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2051 {
2052     int x, y, ch;
2053
2054     if (mb->mode != MODE_I4x4) {
2055         uint8_t *y_dst = dst[0];
2056         for (y = 0; y < 4; y++) {
2057             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2058             if (nnz4) {
2059                 if (nnz4 & ~0x01010101) {
2060                     for (x = 0; x < 4; x++) {
2061                         if ((uint8_t) nnz4 == 1)
2062                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2063                                                       td->block[y][x],
2064                                                       s->linesize);
2065                         else if ((uint8_t) nnz4 > 1)
2066                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2067                                                    td->block[y][x],
2068                                                    s->linesize);
2069                         nnz4 >>= 8;
2070                         if (!nnz4)
2071                             break;
2072                     }
2073                 } else {
2074                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2075                 }
2076             }
2077             y_dst += 4 * s->linesize;
2078         }
2079     }
2080
2081     for (ch = 0; ch < 2; ch++) {
2082         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2083         if (nnz4) {
2084             uint8_t *ch_dst = dst[1 + ch];
2085             if (nnz4 & ~0x01010101) {
2086                 for (y = 0; y < 2; y++) {
2087                     for (x = 0; x < 2; x++) {
2088                         if ((uint8_t) nnz4 == 1)
2089                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2090                                                       td->block[4 + ch][(y << 1) + x],
2091                                                       s->uvlinesize);
2092                         else if ((uint8_t) nnz4 > 1)
2093                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2094                                                    td->block[4 + ch][(y << 1) + x],
2095                                                    s->uvlinesize);
2096                         nnz4 >>= 8;
2097                         if (!nnz4)
2098                             goto chroma_idct_end;
2099                     }
2100                     ch_dst += 4 * s->uvlinesize;
2101                 }
2102             } else {
2103                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2104             }
2105         }
2106 chroma_idct_end:
2107         ;
2108     }
2109 }
2110
2111 static av_always_inline
2112 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2113                          VP8FilterStrength *f, int is_vp7)
2114 {
2115     int interior_limit, filter_level;
2116
2117     if (s->segmentation.enabled) {
2118         filter_level = s->segmentation.filter_level[mb->segment];
2119         if (!s->segmentation.absolute_vals)
2120             filter_level += s->filter.level;
2121     } else
2122         filter_level = s->filter.level;
2123
2124     if (s->lf_delta.enabled) {
2125         filter_level += s->lf_delta.ref[mb->ref_frame];
2126         filter_level += s->lf_delta.mode[mb->mode];
2127     }
2128
2129     filter_level = av_clip_uintp2(filter_level, 6);
2130
2131     interior_limit = filter_level;
2132     if (s->filter.sharpness) {
2133         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2134         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2135     }
2136     interior_limit = FFMAX(interior_limit, 1);
2137
2138     f->filter_level = filter_level;
2139     f->inner_limit = interior_limit;
2140     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2141                       mb->mode == VP8_MVMODE_SPLIT;
2142 }
2143
2144 static av_always_inline
2145 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2146                int mb_x, int mb_y, int is_vp7)
2147 {
2148     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2149     int filter_level = f->filter_level;
2150     int inner_limit = f->inner_limit;
2151     int inner_filter = f->inner_filter;
2152     ptrdiff_t linesize   = s->linesize;
2153     ptrdiff_t uvlinesize = s->uvlinesize;
2154     static const uint8_t hev_thresh_lut[2][64] = {
2155         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2156           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2157           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2158           3, 3, 3, 3 },
2159         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2160           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2161           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2162           2, 2, 2, 2 }
2163     };
2164
2165     if (!filter_level)
2166         return;
2167
2168     if (is_vp7) {
2169         bedge_lim_y  = filter_level;
2170         bedge_lim_uv = filter_level * 2;
2171         mbedge_lim   = filter_level + 2;
2172     } else {
2173         bedge_lim_y  =
2174         bedge_lim_uv = filter_level * 2 + inner_limit;
2175         mbedge_lim   = bedge_lim_y + 4;
2176     }
2177
2178     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2179
2180     if (mb_x) {
2181         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2182                                        mbedge_lim, inner_limit, hev_thresh);
2183         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2184                                        mbedge_lim, inner_limit, hev_thresh);
2185     }
2186
2187 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2188     if (cond && inner_filter) {                                               \
2189         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2190                                              bedge_lim_y, inner_limit,        \
2191                                              hev_thresh);                     \
2192         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2193                                              bedge_lim_y, inner_limit,        \
2194                                              hev_thresh);                     \
2195         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2196                                              bedge_lim_y, inner_limit,        \
2197                                              hev_thresh);                     \
2198         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2199                                              uvlinesize,  bedge_lim_uv,       \
2200                                              inner_limit, hev_thresh);        \
2201     }
2202
2203     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2204
2205     if (mb_y) {
2206         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2207                                        mbedge_lim, inner_limit, hev_thresh);
2208         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2209                                        mbedge_lim, inner_limit, hev_thresh);
2210     }
2211
2212     if (inner_filter) {
2213         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2214                                              linesize, bedge_lim_y,
2215                                              inner_limit, hev_thresh);
2216         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2217                                              linesize, bedge_lim_y,
2218                                              inner_limit, hev_thresh);
2219         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2220                                              linesize, bedge_lim_y,
2221                                              inner_limit, hev_thresh);
2222         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2223                                              dst[2] +  4 * uvlinesize,
2224                                              uvlinesize, bedge_lim_uv,
2225                                              inner_limit, hev_thresh);
2226     }
2227
2228     H_LOOP_FILTER_16Y_INNER(is_vp7)
2229 }
2230
2231 static av_always_inline
2232 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2233                       int mb_x, int mb_y)
2234 {
2235     int mbedge_lim, bedge_lim;
2236     int filter_level = f->filter_level;
2237     int inner_limit  = f->inner_limit;
2238     int inner_filter = f->inner_filter;
2239     ptrdiff_t linesize = s->linesize;
2240
2241     if (!filter_level)
2242         return;
2243
2244     bedge_lim  = 2 * filter_level + inner_limit;
2245     mbedge_lim = bedge_lim + 4;
2246
2247     if (mb_x)
2248         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2249     if (inner_filter) {
2250         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2251         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2252         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2253     }
2254
2255     if (mb_y)
2256         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2257     if (inner_filter) {
2258         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2259         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2260         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2261     }
2262 }
2263
2264 #define MARGIN (16 << 2)
2265 static av_always_inline
2266 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2267                                     VP8Frame *prev_frame, int is_vp7)
2268 {
2269     VP8Context *s = avctx->priv_data;
2270     int mb_x, mb_y;
2271
2272     s->mv_bounds.mv_min.y = -MARGIN;
2273     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2274     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2275         VP8Macroblock *mb = s->macroblocks_base +
2276                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2277         int mb_xy = mb_y * s->mb_width;
2278
2279         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2280
2281         s->mv_bounds.mv_min.x = -MARGIN;
2282         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2283         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2284             if (mb_y == 0)
2285                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2286                          DC_PRED * 0x01010101);
2287             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2288                            prev_frame && prev_frame->seg_map ?
2289                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2290             s->mv_bounds.mv_min.x -= 64;
2291             s->mv_bounds.mv_max.x -= 64;
2292         }
2293         s->mv_bounds.mv_min.y -= 64;
2294         s->mv_bounds.mv_max.y -= 64;
2295     }
2296 }
2297
2298 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2299                                    VP8Frame *prev_frame)
2300 {
2301     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2302 }
2303
2304 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2305                                    VP8Frame *prev_frame)
2306 {
2307     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2308 }
2309
2310 #if HAVE_THREADS
2311 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
2312     do {                                                                      \
2313         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
2314         if (atomic_load(&otd->thread_mb_pos) < tmp) {                         \
2315             pthread_mutex_lock(&otd->lock);                                   \
2316             atomic_store(&td->wait_mb_pos, tmp);                              \
2317             do {                                                              \
2318                 if (atomic_load(&otd->thread_mb_pos) >= tmp)                  \
2319                     break;                                                    \
2320                 pthread_cond_wait(&otd->cond, &otd->lock);                    \
2321             } while (1);                                                      \
2322             atomic_store(&td->wait_mb_pos, INT_MAX);                          \
2323             pthread_mutex_unlock(&otd->lock);                                 \
2324         }                                                                     \
2325     } while (0)
2326
2327 #define update_pos(td, mb_y, mb_x)                                            \
2328     do {                                                                      \
2329         int pos              = (mb_y << 16) | (mb_x & 0xFFFF);                \
2330         int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
2331                                (num_jobs > 1);                                \
2332         int is_null          = !next_td || !prev_td;                          \
2333         int pos_check        = (is_null) ? 1 :                                \
2334             (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) ||   \
2335             (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos));     \
2336         atomic_store(&td->thread_mb_pos, pos);                                \
2337         if (sliced_threading && pos_check) {                                  \
2338             pthread_mutex_lock(&td->lock);                                    \
2339             pthread_cond_broadcast(&td->cond);                                \
2340             pthread_mutex_unlock(&td->lock);                                  \
2341         }                                                                     \
2342     } while (0)
2343 #else
2344 #define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
2345 #define update_pos(td, mb_y, mb_x) while(0)
2346 #endif
2347
2348 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2349                                         int jobnr, int threadnr, int is_vp7)
2350 {
2351     VP8Context *s = avctx->priv_data;
2352     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2353     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2354     int mb_x, mb_xy = mb_y * s->mb_width;
2355     int num_jobs = s->num_jobs;
2356     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2357     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2358     VP8Macroblock *mb;
2359     uint8_t *dst[3] = {
2360         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2361         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2362         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2363     };
2364
2365     if (c->end <= c->buffer && c->bits >= 0)
2366          return AVERROR_INVALIDDATA;
2367
2368     if (mb_y == 0)
2369         prev_td = td;
2370     else
2371         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2372     if (mb_y == s->mb_height - 1)
2373         next_td = td;
2374     else
2375         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2376     if (s->mb_layout == 1)
2377         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2378     else {
2379         // Make sure the previous frame has read its segmentation map,
2380         // if we re-use the same map.
2381         if (prev_frame && s->segmentation.enabled &&
2382             !s->segmentation.update_map)
2383             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2384         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2385         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2386         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2387     }
2388
2389     if (!is_vp7 || mb_y == 0)
2390         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2391
2392     td->mv_bounds.mv_min.x = -MARGIN;
2393     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2394
2395     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2396         if (c->end <= c->buffer && c->bits >= 0)
2397             return AVERROR_INVALIDDATA;
2398         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2399         if (prev_td != td) {
2400             if (threadnr != 0) {
2401                 check_thread_pos(td, prev_td,
2402                                  mb_x + (is_vp7 ? 2 : 1),
2403                                  mb_y - (is_vp7 ? 2 : 1));
2404             } else {
2405                 check_thread_pos(td, prev_td,
2406                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2407                                  mb_y - (is_vp7 ? 2 : 1));
2408             }
2409         }
2410
2411         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2412                          s->linesize, 4);
2413         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2414                          dst[2] - dst[1], 2);
2415
2416         if (!s->mb_layout)
2417             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2418                            prev_frame && prev_frame->seg_map ?
2419                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2420
2421         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2422
2423         if (!mb->skip)
2424             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2425
2426         if (mb->mode <= MODE_I4x4)
2427             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2428         else
2429             inter_predict(s, td, dst, mb, mb_x, mb_y);
2430
2431         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2432
2433         if (!mb->skip) {
2434             idct_mb(s, td, dst, mb);
2435         } else {
2436             AV_ZERO64(td->left_nnz);
2437             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2438
2439             /* Reset DC block predictors if they would exist
2440              * if the mb had coefficients */
2441             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2442                 td->left_nnz[8]     = 0;
2443                 s->top_nnz[mb_x][8] = 0;
2444             }
2445         }
2446
2447         if (s->deblock_filter)
2448             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2449
2450         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2451             if (s->filter.simple)
2452                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2453                                  NULL, NULL, s->linesize, 0, 1);
2454             else
2455                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2456                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2457         }
2458
2459         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2460
2461         dst[0]      += 16;
2462         dst[1]      += 8;
2463         dst[2]      += 8;
2464         td->mv_bounds.mv_min.x -= 64;
2465         td->mv_bounds.mv_max.x -= 64;
2466
2467         if (mb_x == s->mb_width + 1) {
2468             update_pos(td, mb_y, s->mb_width + 3);
2469         } else {
2470             update_pos(td, mb_y, mb_x);
2471         }
2472     }
2473     return 0;
2474 }
2475
2476 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2477                                         int jobnr, int threadnr)
2478 {
2479     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2480 }
2481
2482 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2483                                         int jobnr, int threadnr)
2484 {
2485     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2486 }
2487
2488 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2489                               int jobnr, int threadnr, int is_vp7)
2490 {
2491     VP8Context *s = avctx->priv_data;
2492     VP8ThreadData *td = &s->thread_data[threadnr];
2493     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2494     AVFrame *curframe = s->curframe->tf.f;
2495     VP8Macroblock *mb;
2496     VP8ThreadData *prev_td, *next_td;
2497     uint8_t *dst[3] = {
2498         curframe->data[0] + 16 * mb_y * s->linesize,
2499         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2500         curframe->data[2] +  8 * mb_y * s->uvlinesize
2501     };
2502
2503     if (s->mb_layout == 1)
2504         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2505     else
2506         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2507
2508     if (mb_y == 0)
2509         prev_td = td;
2510     else
2511         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2512     if (mb_y == s->mb_height - 1)
2513         next_td = td;
2514     else
2515         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2516
2517     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2518         VP8FilterStrength *f = &td->filter_strength[mb_x];
2519         if (prev_td != td)
2520             check_thread_pos(td, prev_td,
2521                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2522         if (next_td != td)
2523             if (next_td != &s->thread_data[0])
2524                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2525
2526         if (num_jobs == 1) {
2527             if (s->filter.simple)
2528                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2529                                  NULL, NULL, s->linesize, 0, 1);
2530             else
2531                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2532                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2533         }
2534
2535         if (s->filter.simple)
2536             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2537         else
2538             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2539         dst[0] += 16;
2540         dst[1] += 8;
2541         dst[2] += 8;
2542
2543         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2544     }
2545 }
2546
2547 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2548                               int jobnr, int threadnr)
2549 {
2550     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2551 }
2552
2553 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2554                               int jobnr, int threadnr)
2555 {
2556     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2557 }
2558
2559 static av_always_inline
2560 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2561                               int threadnr, int is_vp7)
2562 {
2563     VP8Context *s = avctx->priv_data;
2564     VP8ThreadData *td = &s->thread_data[jobnr];
2565     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2566     VP8Frame *curframe = s->curframe;
2567     int mb_y, num_jobs = s->num_jobs;
2568     int ret;
2569
2570     td->thread_nr = threadnr;
2571     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2572     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2573     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2574         atomic_store(&td->thread_mb_pos, mb_y << 16);
2575         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2576         if (ret < 0) {
2577             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2578             return ret;
2579         }
2580         if (s->deblock_filter)
2581             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2582         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2583
2584         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2585         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2586
2587         if (avctx->active_thread_type == FF_THREAD_FRAME)
2588             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2589     }
2590
2591     return 0;
2592 }
2593
2594 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2595                                     int jobnr, int threadnr)
2596 {
2597     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2598 }
2599
2600 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2601                                     int jobnr, int threadnr)
2602 {
2603     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2604 }
2605
2606 static av_always_inline
2607 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2608                       AVPacket *avpkt, int is_vp7)
2609 {
2610     VP8Context *s = avctx->priv_data;
2611     int ret, i, referenced, num_jobs;
2612     enum AVDiscard skip_thresh;
2613     VP8Frame *av_uninit(curframe), *prev_frame;
2614
2615     if (is_vp7)
2616         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2617     else
2618         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2619
2620     if (ret < 0)
2621         goto err;
2622
2623     if (s->actually_webp) {
2624         // avctx->pix_fmt already set in caller.
2625     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2626         s->pix_fmt = get_pixel_format(s);
2627         if (s->pix_fmt < 0) {
2628             ret = AVERROR(EINVAL);
2629             goto err;
2630         }
2631         avctx->pix_fmt = s->pix_fmt;
2632     }
2633
2634     prev_frame = s->framep[VP56_FRAME_CURRENT];
2635
2636     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2637                  s->update_altref == VP56_FRAME_CURRENT;
2638
2639     skip_thresh = !referenced ? AVDISCARD_NONREF
2640                               : !s->keyframe ? AVDISCARD_NONKEY
2641                                              : AVDISCARD_ALL;
2642
2643     if (avctx->skip_frame >= skip_thresh) {
2644         s->invisible = 1;
2645         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2646         goto skip_decode;
2647     }
2648     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2649
2650     // release no longer referenced frames
2651     for (i = 0; i < 5; i++)
2652         if (s->frames[i].tf.f->buf[0] &&
2653             &s->frames[i] != prev_frame &&
2654             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2655             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2656             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2657             vp8_release_frame(s, &s->frames[i]);
2658
2659     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2660
2661     if (!s->colorspace)
2662         avctx->colorspace = AVCOL_SPC_BT470BG;
2663     if (s->fullrange)
2664         avctx->color_range = AVCOL_RANGE_JPEG;
2665     else
2666         avctx->color_range = AVCOL_RANGE_MPEG;
2667
2668     /* Given that arithmetic probabilities are updated every frame, it's quite
2669      * likely that the values we have on a random interframe are complete
2670      * junk if we didn't start decode on a keyframe. So just don't display
2671      * anything rather than junk. */
2672     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2673                          !s->framep[VP56_FRAME_GOLDEN]   ||
2674                          !s->framep[VP56_FRAME_GOLDEN2])) {
2675         av_log(avctx, AV_LOG_WARNING,
2676                "Discarding interframe without a prior keyframe!\n");
2677         ret = AVERROR_INVALIDDATA;
2678         goto err;
2679     }
2680
2681     curframe->tf.f->key_frame = s->keyframe;
2682     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2683                                             : AV_PICTURE_TYPE_P;
2684     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2685         goto err;
2686
2687     // check if golden and altref are swapped
2688     if (s->update_altref != VP56_FRAME_NONE)
2689         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2690     else
2691         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2692
2693     if (s->update_golden != VP56_FRAME_NONE)
2694         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2695     else
2696         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2697
2698     if (s->update_last)
2699         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2700     else
2701         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2702
2703     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2704
2705     ff_thread_finish_setup(avctx);
2706
2707     if (avctx->hwaccel) {
2708         ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2709         if (ret < 0)
2710             goto err;
2711
2712         ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2713         if (ret < 0)
2714             goto err;
2715
2716         ret = avctx->hwaccel->end_frame(avctx);
2717         if (ret < 0)
2718             goto err;
2719
2720     } else {
2721         s->linesize   = curframe->tf.f->linesize[0];
2722         s->uvlinesize = curframe->tf.f->linesize[1];
2723
2724         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2725         /* Zero macroblock structures for top/top-left prediction
2726          * from outside the frame. */
2727         if (!s->mb_layout)
2728             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2729                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2730         if (!s->mb_layout && s->keyframe)
2731             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2732
2733         memset(s->ref_count, 0, sizeof(s->ref_count));
2734
2735         if (s->mb_layout == 1) {
2736             // Make sure the previous frame has read its segmentation map,
2737             // if we re-use the same map.
2738             if (prev_frame && s->segmentation.enabled &&
2739                 !s->segmentation.update_map)
2740                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2741             if (is_vp7)
2742                 vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2743             else
2744                 vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2745         }
2746
2747         if (avctx->active_thread_type == FF_THREAD_FRAME)
2748             num_jobs = 1;
2749         else
2750             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2751         s->num_jobs   = num_jobs;
2752         s->curframe   = curframe;
2753         s->prev_frame = prev_frame;
2754         s->mv_bounds.mv_min.y   = -MARGIN;
2755         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2756         for (i = 0; i < MAX_THREADS; i++) {
2757             VP8ThreadData *td = &s->thread_data[i];
2758             atomic_init(&td->thread_mb_pos, 0);
2759             atomic_init(&td->wait_mb_pos, INT_MAX);
2760         }
2761         if (is_vp7)
2762             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2763                             num_jobs);
2764         else
2765             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2766                             num_jobs);
2767     }
2768
2769     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2770     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2771
2772 skip_decode:
2773     // if future frames don't use the updated probabilities,
2774     // reset them to the values we saved
2775     if (!s->update_probabilities)
2776         s->prob[0] = s->prob[1];
2777
2778     if (!s->invisible) {
2779         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2780             return ret;
2781         *got_frame = 1;
2782     }
2783
2784     return avpkt->size;
2785 err:
2786     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2787     return ret;
2788 }
2789
2790 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2791                         AVPacket *avpkt)
2792 {
2793     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2794 }
2795
2796 #if CONFIG_VP7_DECODER
2797 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2798                             AVPacket *avpkt)
2799 {
2800     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2801 }
2802 #endif /* CONFIG_VP7_DECODER */
2803
2804 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2805 {
2806     VP8Context *s = avctx->priv_data;
2807     int i;
2808
2809     if (!s)
2810         return 0;
2811
2812     vp8_decode_flush_impl(avctx, 1);
2813     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2814         av_frame_free(&s->frames[i].tf.f);
2815
2816     return 0;
2817 }
2818
2819 static av_cold int vp8_init_frames(VP8Context *s)
2820 {
2821     int i;
2822     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2823         s->frames[i].tf.f = av_frame_alloc();
2824         if (!s->frames[i].tf.f)
2825             return AVERROR(ENOMEM);
2826     }
2827     return 0;
2828 }
2829
2830 static av_always_inline
2831 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2832 {
2833     VP8Context *s = avctx->priv_data;
2834     int ret;
2835
2836     s->avctx = avctx;
2837     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2838     s->pix_fmt = AV_PIX_FMT_NONE;
2839     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2840     avctx->internal->allocate_progress = 1;
2841
2842     ff_videodsp_init(&s->vdsp, 8);
2843
2844     ff_vp78dsp_init(&s->vp8dsp);
2845     if (CONFIG_VP7_DECODER && is_vp7) {
2846         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2847         ff_vp7dsp_init(&s->vp8dsp);
2848         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2849         s->filter_mb_row           = vp7_filter_mb_row;
2850     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2851         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2852         ff_vp8dsp_init(&s->vp8dsp);
2853         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2854         s->filter_mb_row           = vp8_filter_mb_row;
2855     }
2856
2857     /* does not change for VP8 */
2858     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2859
2860     if ((ret = vp8_init_frames(s)) < 0) {
2861         ff_vp8_decode_free(avctx);
2862         return ret;
2863     }
2864
2865     return 0;
2866 }
2867
2868 #if CONFIG_VP7_DECODER
2869 static int vp7_decode_init(AVCodecContext *avctx)
2870 {
2871     return vp78_decode_init(avctx, IS_VP7);
2872 }
2873 #endif /* CONFIG_VP7_DECODER */
2874
2875 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2876 {
2877     return vp78_decode_init(avctx, IS_VP8);
2878 }
2879
2880 #if CONFIG_VP8_DECODER
2881 #if HAVE_THREADS
2882 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2883 {
2884     VP8Context *s = avctx->priv_data;
2885     int ret;
2886
2887     s->avctx = avctx;
2888
2889     if ((ret = vp8_init_frames(s)) < 0) {
2890         ff_vp8_decode_free(avctx);
2891         return ret;
2892     }
2893
2894     return 0;
2895 }
2896
2897 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
2898
2899 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2900                                             const AVCodecContext *src)
2901 {
2902     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2903     int i;
2904
2905     if (s->macroblocks_base &&
2906         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2907         free_buffers(s);
2908         s->mb_width  = s_src->mb_width;
2909         s->mb_height = s_src->mb_height;
2910     }
2911
2912     s->pix_fmt      = s_src->pix_fmt;
2913     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2914     s->segmentation = s_src->segmentation;
2915     s->lf_delta     = s_src->lf_delta;
2916     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2917
2918     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2919         if (s_src->frames[i].tf.f->buf[0]) {
2920             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2921             if (ret < 0)
2922                 return ret;
2923         }
2924     }
2925
2926     s->framep[0] = REBASE(s_src->next_framep[0]);
2927     s->framep[1] = REBASE(s_src->next_framep[1]);
2928     s->framep[2] = REBASE(s_src->next_framep[2]);
2929     s->framep[3] = REBASE(s_src->next_framep[3]);
2930
2931     return 0;
2932 }
2933 #endif /* HAVE_THREADS */
2934 #endif /* CONFIG_VP8_DECODER */
2935
2936 #if CONFIG_VP7_DECODER
2937 AVCodec ff_vp7_decoder = {
2938     .name                  = "vp7",
2939     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2940     .type                  = AVMEDIA_TYPE_VIDEO,
2941     .id                    = AV_CODEC_ID_VP7,
2942     .priv_data_size        = sizeof(VP8Context),
2943     .init                  = vp7_decode_init,
2944     .close                 = ff_vp8_decode_free,
2945     .decode                = vp7_decode_frame,
2946     .capabilities          = AV_CODEC_CAP_DR1,
2947     .flush                 = vp8_decode_flush,
2948 };
2949 #endif /* CONFIG_VP7_DECODER */
2950
2951 #if CONFIG_VP8_DECODER
2952 AVCodec ff_vp8_decoder = {
2953     .name                  = "vp8",
2954     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2955     .type                  = AVMEDIA_TYPE_VIDEO,
2956     .id                    = AV_CODEC_ID_VP8,
2957     .priv_data_size        = sizeof(VP8Context),
2958     .init                  = ff_vp8_decode_init,
2959     .close                 = ff_vp8_decode_free,
2960     .decode                = ff_vp8_decode_frame,
2961     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
2962                              AV_CODEC_CAP_SLICE_THREADS,
2963     .flush                 = vp8_decode_flush,
2964     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2965     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2966     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
2967 #if CONFIG_VP8_VAAPI_HWACCEL
2968                                HWACCEL_VAAPI(vp8),
2969 #endif
2970 #if CONFIG_VP8_NVDEC_HWACCEL
2971                                HWACCEL_NVDEC(vp8),
2972 #endif
2973                                NULL
2974                            },
2975 };
2976 #endif /* CONFIG_VP7_DECODER */