git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/pixdesc.h"
  35
// 24-bit sync code; decode_frame_header() compares a 24-bit read against it
// for keyframes and intra-only frames and rejects the frame on mismatch.
#define VP9_SYNCCODE 0x498342
  37
  38 enum CompPredMode {
  39     PRED_SINGLEREF,
  40     PRED_COMPREF,
  41     PRED_SWITCHABLE,
  42 };
  43
  44 enum BlockLevel {
  45     BL_64X64,
  46     BL_32X32,
  47     BL_16X16,
  48     BL_8X8,
  49 };
  50
  51 enum BlockSize {
  52     BS_64x64,
  53     BS_64x32,
  54     BS_32x64,
  55     BS_32x32,
  56     BS_32x16,
  57     BS_16x32,
  58     BS_16x16,
  59     BS_16x8,
  60     BS_8x16,
  61     BS_8x8,
  62     BS_8x4,
  63     BS_4x8,
  64     BS_4x4,
  65     N_BS_SIZES,
  66 };
  67
// Two motion vectors together with one reference index per vector.
// An array of these is stored per frame (see VP9Frame.mv).
struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};
  72
// A decoded frame plus the per-frame side data the decoder keeps with it.
typedef struct VP9Frame {
    ThreadFrame tf;
    // single buffer backing both segmentation_map and mv (see vp9_alloc_frame)
    AVBufferRef *extradata;
    // points at the start of extradata->data
    uint8_t *segmentation_map;
    // points into extradata->data, directly after the segmentation map
    struct VP9mvrefPair *mv;
    // read by update_block_buffers() to pick the block-buffer layout
    int uses_2pass;
} VP9Frame;
  80
// Filter level and edge-mask storage; update_size() allocates one of these
// per superblock column (s->lflvl).
// NOTE(review): presumably loop-filter data for one 64x64 superblock -
// confirm against the code that fills it in.
struct VP9Filter {
    uint8_t level[8 * 8];
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};
  86
  87 typedef struct VP9Block {
  88     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
  89     enum FilterMode filter;
  90     VP56mv mv[4 /* b_idx */][2 /* ref */];
  91     enum BlockSize bs;
  92     enum TxfmMode tx, uvtx;
  93     enum BlockLevel bl;
  94     enum BlockPartition bp;
  95 } VP9Block;
  96
// Decoder private context (reached through AVCodecContext.priv_data).
// Holds the parsed frame-header state, probability tables and counters,
// left/above context caches and the scratch buffers used while decoding.
typedef struct VP9Context {
    VP9DSPContext dsp;
    VideoDSPContext vdsp;
    GetBitContext gb;
    VP56RangeCoder c;
    VP56RangeCoder *c_b;
    unsigned c_b_size;
    // b_base is (re)allocated by update_block_buffers(): a single VP9Block,
    // or cols * rows of them when the current frame uses 2-pass decoding
    VP9Block *b_base, *b;
    int pass;
    int row, row7, col, col7;
    uint8_t *dst[3];
    ptrdiff_t y_stride, uv_stride;

    // bitstream header
    uint8_t keyframe, last_keyframe;
    // bpp_index is 0:8, 1:10, 2:12 bits; last_bpp is the bpp the DSP
    // contexts were last initialised for (see update_size)
    uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;
    uint8_t ss_h, ss_v;
    uint8_t intraonly;
    uint8_t resetctx;
    uint8_t refreshrefmask;
    uint8_t highprecisionmvs;
    enum FilterMode filtermode;
    uint8_t allowcompinter;
    uint8_t fixcompref;
    uint8_t refreshctx;
    uint8_t parallelmode;
    uint8_t framectxid;
    uint8_t refidx[3];
    uint8_t signbias[3];
    uint8_t varcompref[2];
    ThreadFrame refs[8], next_refs[8];
    // indices into frames[]
#define CUR_FRAME 0
#define REF_FRAME_MVPAIR 1
#define REF_FRAME_SEGMAP 2
    VP9Frame frames[3];

    struct {
        uint8_t level;
        int8_t sharpness;
        uint8_t lim_lut[64];
        uint8_t mblim_lut[64];
    } filter;
    struct {
        uint8_t enabled;
        int8_t mode[2];
        int8_t ref[4];
    } lf_delta;
    uint8_t yac_qi;
    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
    // set when yac_qi and all three q deltas are zero
    uint8_t lossless;
#define MAX_SEGMENT 8
    struct {
        uint8_t enabled;
        uint8_t temporal;
        uint8_t absolute_vals;
        uint8_t update_map;
        struct {
            uint8_t q_enabled;
            uint8_t lf_enabled;
            uint8_t ref_enabled;
            uint8_t skip_enabled;
            uint8_t ref_val;
            int16_t q_val;
            int8_t lf_val;
            int16_t qmul[2][2];
            uint8_t lflvl[4][2];
        } feat[MAX_SEGMENT];
    } segmentation;
    struct {
        unsigned log2_tile_cols, log2_tile_rows;
        unsigned tile_cols, tile_rows;
        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    } tiling;
    // set by update_size(): sb_* = (dimension + 63) >> 6,
    // rows/cols = (dimension + 7) >> 3
    unsigned sb_cols, sb_rows, rows, cols;
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][11];
        uint8_t seg[7];
        uint8_t segpred[3];
    } prob;
    struct {
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;
    enum TxfmMode txfmmode;
    enum CompPredMode comppredmode;

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    // the above_* pointers below are not separate allocations: update_size()
    // carves them out of the single buffer that starts at intra_pred_data[0]
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-frame cache
    // intra_pred_data[0] is the base of the allocation made in update_size()
    // and is the pointer that gets freed; [1], [2] and lflvl point into it
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];

    // block reconstruction intermediates
    // uses_2pass value the block buffers below were last allocated for
    int block_alloc_using_2pass;
    // block_base is one allocation (see update_block_buffers); the other
    // *_base pointers are offsets into it
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    // mvscale is 0 for a reference with the same size as the current frame,
    // otherwise (ref dimension << 14) / frame dimension (decode_frame_header)
    uint16_t mvscale[3][2];
    uint8_t mvstep[3][2];
} VP9Context;
 257
 258 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
 259     {
 260         { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
 261         { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
 262     }, {
 263         { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
 264         { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
 265     }
 266 };
 267
 268 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 269 {
 270     VP9Context *s = ctx->priv_data;
 271     int ret, sz;
 272
 273     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 274         return ret;
 275     sz = 64 * s->sb_cols * s->sb_rows;
 276     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 277         ff_thread_release_buffer(ctx, &f->tf);
 278         return AVERROR(ENOMEM);
 279     }
 280
 281     f->segmentation_map = f->extradata->data;
 282     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 283
 284     return 0;
 285 }
 286
 287 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 288 {
 289     ff_thread_release_buffer(ctx, &f->tf);
 290     av_buffer_unref(&f->extradata);
 291     f->segmentation_map = NULL;
 292 }
 293
 294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 295 {
 296     int res;
 297
 298     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 299         return res;
 300     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 301         vp9_unref_frame(ctx, dst);
 302         return AVERROR(ENOMEM);
 303     }
 304
 305     dst->segmentation_map = src->segmentation_map;
 306     dst->mv = src->mv;
 307     dst->uses_2pass = src->uses_2pass;
 308
 309     return 0;
 310 }
 311
 312 static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
 313 {
 314     VP9Context *s = ctx->priv_data;
 315     uint8_t *p;
 316     int bytesperpixel = s->bytesperpixel;
 317
 318     av_assert0(w > 0 && h > 0);
 319
 320     if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
 321         return 0;
 322
 323     ctx->width   = w;
 324     ctx->height  = h;
 325     ctx->pix_fmt = fmt;
 326     s->sb_cols   = (w + 63) >> 6;
 327     s->sb_rows   = (h + 63) >> 6;
 328     s->cols      = (w + 7) >> 3;
 329     s->rows      = (h + 7) >> 3;
 330
 331 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
 332     av_freep(&s->intra_pred_data[0]);
 333     // FIXME we slightly over-allocate here for subsampled chroma, but a little
 334     // bit of padding shouldn't affect performance...
 335     p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
 336                                 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 337     if (!p)
 338         return AVERROR(ENOMEM);
 339     assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
 340     assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
 341     assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
 342     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 343     assign(s->above_mode_ctx,      uint8_t *,             16);
 344     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 345     assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
 346     assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
 347     assign(s->above_partition_ctx, uint8_t *,              8);
 348     assign(s->above_skip_ctx,      uint8_t *,              8);
 349     assign(s->above_txfm_ctx,      uint8_t *,              8);
 350     assign(s->above_segpred_ctx,   uint8_t *,              8);
 351     assign(s->above_intra_ctx,     uint8_t *,              8);
 352     assign(s->above_comp_ctx,      uint8_t *,              8);
 353     assign(s->above_ref_ctx,       uint8_t *,              8);
 354     assign(s->above_filter_ctx,    uint8_t *,              8);
 355     assign(s->lflvl,               struct VP9Filter *,     1);
 356 #undef assign
 357
 358     // these will be re-allocated a little later
 359     av_freep(&s->b_base);
 360     av_freep(&s->block_base);
 361
 362     if (s->bpp != s->last_bpp) {
 363         ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
 364         ff_videodsp_init(&s->vdsp, s->bpp);
 365         s->last_bpp = s->bpp;
 366     }
 367
 368     return 0;
 369 }
 370
 371 static int update_block_buffers(AVCodecContext *ctx)
 372 {
 373     VP9Context *s = ctx->priv_data;
 374     int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
 375
 376     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
 377         return 0;
 378
 379     av_free(s->b_base);
 380     av_free(s->block_base);
 381     chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
 382     chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
 383     if (s->frames[CUR_FRAME].uses_2pass) {
 384         int sbs = s->sb_cols * s->sb_rows;
 385
 386         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
 387         s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
 388                                     16 * 16 + 2 * chroma_eobs) * sbs);
 389         if (!s->b_base || !s->block_base)
 390             return AVERROR(ENOMEM);
 391         s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
 392         s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
 393         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
 394         s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
 395         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
 396     } else {
 397         s->b_base = av_malloc(sizeof(VP9Block));
 398         s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
 399                                    16 * 16 + 2 * chroma_eobs);
 400         if (!s->b_base || !s->block_base)
 401             return AVERROR(ENOMEM);
 402         s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
 403         s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
 404         s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
 405         s->uveob_base[0] = s->eob_base + 16 * 16;
 406         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
 407     }
 408     s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
 409
 410     return 0;
 411 }
 412
 413 // for some reason the sign bit is at the end, not the start, of a bit sequence
 414 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 415 {
 416     int v = get_bits(gb, n);
 417     return get_bits1(gb) ? -v : v;
 418 }
 419
 420 static av_always_inline int inv_recenter_nonneg(int v, int m)
 421 {
 422     return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 423 }
 424
 425 // differential forward probability updates
 426 static int update_prob(VP56RangeCoder *c, int p)
 427 {
 428     static const int inv_map_table[255] = {
 429           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 430         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 431          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 432          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 433          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 434          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 435          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 436          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 437         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 438         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 439         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 440         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 441         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 442         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 443         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 444         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 445         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 446         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 447         252, 253, 253,
 448     };
 449     int d;
 450
 451     /* This code is trying to do a differential probability update. For a
 452      * current probability A in the range [1, 255], the difference to a new
 453      * probability of any value can be expressed differentially as 1-A,255-A
 454      * where some part of this (absolute range) exists both in positive as
 455      * well as the negative part, whereas another part only exists in one
 456      * half. We're trying to code this shared part differentially, i.e.
 457      * times two where the value of the lowest bit specifies the sign, and
 458      * the single part is then coded on top of this. This absolute difference
 459      * then again has a value of [0,254], but a bigger value in this range
 460      * indicates that we're further away from the original value A, so we
 461      * can code this as a VLC code, since higher values are increasingly
 462      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 463      * updates vs. the 'fine, exact' updates further down the range, which
 464      * adds one extra dimension to this differential update model. */
 465
 466     if (!vp8_rac_get(c)) {
 467         d = vp8_rac_get_uint(c, 4) + 0;
 468     } else if (!vp8_rac_get(c)) {
 469         d = vp8_rac_get_uint(c, 4) + 16;
 470     } else if (!vp8_rac_get(c)) {
 471         d = vp8_rac_get_uint(c, 5) + 32;
 472     } else {
 473         d = vp8_rac_get_uint(c, 7);
 474         if (d >= 65)
 475             d = (d << 1) - 65 + vp8_rac_get(c);
 476         d += 64;
 477         av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
 478     }
 479
 480     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 481                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 482 }
 483
 484 static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
 485 {
 486     static const enum AVColorSpace colorspaces[8] = {
 487         AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
 488         AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
 489     };
 490     VP9Context *s = ctx->priv_data;
 491     enum AVPixelFormat res;
 492     int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
 493
 494     s->bpp_index = bits;
 495     s->bpp = 8 + bits * 2;
 496     s->bytesperpixel = (7 + s->bpp) >> 3;
 497     ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
 498     if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
 499         static const enum AVPixelFormat pix_fmt_rgb[3] = {
 500             AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
 501         };
 502         if (ctx->profile & 1) {
 503             s->ss_h = s->ss_v = 0;
 504             res = pix_fmt_rgb[bits];
 505             ctx->color_range = AVCOL_RANGE_JPEG;
 506             if (get_bits1(&s->gb)) {
 507                 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
 508                 return AVERROR_INVALIDDATA;
 509             }
 510         } else {
 511             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
 512                    ctx->profile);
 513             return AVERROR_INVALIDDATA;
 514         }
 515     } else {
 516         static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
 517             { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
 518               { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
 519             { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
 520               { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
 521             { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
 522               { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
 523         };
 524         ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
 525         if (ctx->profile & 1) {
 526             s->ss_h = get_bits1(&s->gb);
 527             s->ss_v = get_bits1(&s->gb);
 528             if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
 529                 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
 530                        ctx->profile);
 531                 return AVERROR_INVALIDDATA;
 532             } else if (get_bits1(&s->gb)) {
 533                 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
 534                        ctx->profile);
 535                 return AVERROR_INVALIDDATA;
 536             }
 537         } else {
 538             s->ss_h = s->ss_v = 1;
 539             res = pix_fmt_for_ss[bits][1][1];
 540         }
 541     }
 542
 543     return res;
 544 }
 545
 546 static int decode_frame_header(AVCodecContext *ctx,
 547                                const uint8_t *data, int size, int *ref)
 548 {
 549     VP9Context *s = ctx->priv_data;
 550     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 551     enum AVPixelFormat fmt = ctx->pix_fmt;
 552     int last_invisible;
 553     const uint8_t *data2;
 554
 555     /* general header */
 556     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 557         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 558         return res;
 559     }
 560     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 561         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 562         return AVERROR_INVALIDDATA;
 563     }
 564     ctx->profile  = get_bits1(&s->gb);
 565     ctx->profile |= get_bits1(&s->gb) << 1;
 566     if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
 567     if (ctx->profile > 3) {
 568         av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
 569         return AVERROR_INVALIDDATA;
 570     }
 571     if (get_bits1(&s->gb)) {
 572         *ref = get_bits(&s->gb, 3);
 573         return 0;
 574     }
 575     s->last_keyframe  = s->keyframe;
 576     s->keyframe       = !get_bits1(&s->gb);
 577     last_invisible    = s->invisible;
 578     s->invisible      = !get_bits1(&s->gb);
 579     s->errorres       = get_bits1(&s->gb);
 580     s->use_last_frame_mvs = !s->errorres && !last_invisible;
 581     if (s->keyframe) {
 582         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 583             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 584             return AVERROR_INVALIDDATA;
 585         }
 586         if ((fmt = read_colorspace_details(ctx)) < 0)
 587             return fmt;
 588         // for profile 1, here follows the subsampling bits
 589         s->refreshrefmask = 0xff;
 590         w = get_bits(&s->gb, 16) + 1;
 591         h = get_bits(&s->gb, 16) + 1;
 592         if (get_bits1(&s->gb)) // display size
 593             skip_bits(&s->gb, 32);
 594     } else {
 595         s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
 596         s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
 597         if (s->intraonly) {
 598             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 599                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 600                 return AVERROR_INVALIDDATA;
 601             }
 602             if (ctx->profile >= 1) {
 603                 if ((fmt = read_colorspace_details(ctx)) < 0)
 604                     return fmt;
 605             } else {
 606                 s->ss_h = s->ss_v = 1;
 607                 s->bpp = 8;
 608                 s->bpp_index = 0;
 609                 s->bytesperpixel = 1;
 610                 fmt = AV_PIX_FMT_YUV420P;
 611                 ctx->colorspace = AVCOL_SPC_BT470BG;
 612                 ctx->color_range = AVCOL_RANGE_JPEG;
 613             }
 614             s->refreshrefmask = get_bits(&s->gb, 8);
 615             w = get_bits(&s->gb, 16) + 1;
 616             h = get_bits(&s->gb, 16) + 1;
 617             if (get_bits1(&s->gb)) // display size
 618                 skip_bits(&s->gb, 32);
 619         } else {
 620             s->refreshrefmask = get_bits(&s->gb, 8);
 621             s->refidx[0]      = get_bits(&s->gb, 3);
 622             s->signbias[0]    = get_bits1(&s->gb) && !s->errorres;
 623             s->refidx[1]      = get_bits(&s->gb, 3);
 624             s->signbias[1]    = get_bits1(&s->gb) && !s->errorres;
 625             s->refidx[2]      = get_bits(&s->gb, 3);
 626             s->signbias[2]    = get_bits1(&s->gb) && !s->errorres;
 627             if (!s->refs[s->refidx[0]].f->data[0] ||
 628                 !s->refs[s->refidx[1]].f->data[0] ||
 629                 !s->refs[s->refidx[2]].f->data[0]) {
 630                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 631                 return AVERROR_INVALIDDATA;
 632             }
 633             if (get_bits1(&s->gb)) {
 634                 w = s->refs[s->refidx[0]].f->width;
 635                 h = s->refs[s->refidx[0]].f->height;
 636             } else if (get_bits1(&s->gb)) {
 637                 w = s->refs[s->refidx[1]].f->width;
 638                 h = s->refs[s->refidx[1]].f->height;
 639             } else if (get_bits1(&s->gb)) {
 640                 w = s->refs[s->refidx[2]].f->width;
 641                 h = s->refs[s->refidx[2]].f->height;
 642             } else {
 643                 w = get_bits(&s->gb, 16) + 1;
 644                 h = get_bits(&s->gb, 16) + 1;
 645             }
 646             // Note that in this code, "CUR_FRAME" is actually before we
 647             // have formally allocated a frame, and thus actually represents
 648             // the _last_ frame
 649             s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
 650                                      s->frames[CUR_FRAME].tf.f->height == h;
 651             if (get_bits1(&s->gb)) // display size
 652                 skip_bits(&s->gb, 32);
 653             s->highprecisionmvs = get_bits1(&s->gb);
 654             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 655                                                 get_bits(&s->gb, 2);
 656             s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
 657                                  s->signbias[0] != s->signbias[2]);
 658             if (s->allowcompinter) {
 659                 if (s->signbias[0] == s->signbias[1]) {
 660                     s->fixcompref    = 2;
 661                     s->varcompref[0] = 0;
 662                     s->varcompref[1] = 1;
 663                 } else if (s->signbias[0] == s->signbias[2]) {
 664                     s->fixcompref    = 1;
 665                     s->varcompref[0] = 0;
 666                     s->varcompref[1] = 2;
 667                 } else {
 668                     s->fixcompref    = 0;
 669                     s->varcompref[0] = 1;
 670                     s->varcompref[1] = 2;
 671                 }
 672             }
 673
 674             for (i = 0; i < 3; i++) {
 675                 AVFrame *ref = s->refs[s->refidx[i]].f;
 676                 int refw = ref->width, refh = ref->height;
 677
 678                 if (ref->format != fmt) {
 679                     av_log(ctx, AV_LOG_ERROR,
 680                            "Ref pixfmt (%s) did not match current frame (%s)",
 681                            av_get_pix_fmt_name(ref->format),
 682                            av_get_pix_fmt_name(fmt));
 683                     return AVERROR_INVALIDDATA;
 684                 } else if (refw == w && refh == h) {
 685                     s->mvscale[i][0] = s->mvscale[i][1] = 0;
 686                 } else {
 687                     if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
 688                         av_log(ctx, AV_LOG_ERROR,
 689                                "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
 690                                refw, refh, w, h);
 691                         return AVERROR_INVALIDDATA;
 692                     }
 693                     s->mvscale[i][0] = (refw << 14) / w;
 694                     s->mvscale[i][1] = (refh << 14) / h;
 695                     s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
 696                     s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
 697                 }
 698             }
 699         }
 700     }
 701     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
 702     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
 703     s->framectxid   = c = get_bits(&s->gb, 2);
 704
 705     /* loopfilter header data */
 706     if (s->keyframe || s->errorres || s->intraonly) {
 707         // reset loopfilter defaults
 708         s->lf_delta.ref[0] = 1;
 709         s->lf_delta.ref[1] = 0;
 710         s->lf_delta.ref[2] = -1;
 711         s->lf_delta.ref[3] = -1;
 712         s->lf_delta.mode[0] = 0;
 713         s->lf_delta.mode[1] = 0;
 714         memset(s->segmentation.feat, 0, sizeof(s->segmentation.feat));
 715     }
 716     s->filter.level = get_bits(&s->gb, 6);
 717     sharp = get_bits(&s->gb, 3);
 718     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 719     // the old cache values since they are still valid
 720     if (s->filter.sharpness != sharp)
 721         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
 722     s->filter.sharpness = sharp;
 723     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
 724         if (get_bits1(&s->gb)) {
 725             for (i = 0; i < 4; i++)
 726                 if (get_bits1(&s->gb))
 727                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 728             for (i = 0; i < 2; i++)
 729                 if (get_bits1(&s->gb))
 730                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 731         }
 732     }
 733
 734     /* quantization header data */
 735     s->yac_qi      = get_bits(&s->gb, 8);
 736     s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 737     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 738     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 739     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
 740                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 741     if (s->lossless)
 742         ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
 743
 744     /* segmentation header info */
 745     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
 746         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
 747             for (i = 0; i < 7; i++)
 748                 s->prob.seg[i] = get_bits1(&s->gb) ?
 749                                  get_bits(&s->gb, 8) : 255;
 750             if ((s->segmentation.temporal = get_bits1(&s->gb))) {
 751                 for (i = 0; i < 3; i++)
 752                     s->prob.segpred[i] = get_bits1(&s->gb) ?
 753                                          get_bits(&s->gb, 8) : 255;
 754             }
 755         }
 756
 757         if (get_bits1(&s->gb)) {
 758             s->segmentation.absolute_vals = get_bits1(&s->gb);
 759             for (i = 0; i < 8; i++) {
 760                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 761                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 762                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 763                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 764                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 765                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 766                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 767             }
 768         }
 769     }
 770
 771     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 772     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
 773         int qyac, qydc, quvac, quvdc, lflvl, sh;
 774
 775         if (s->segmentation.enabled && s->segmentation.feat[i].q_enabled) {
 776             if (s->segmentation.absolute_vals)
 777                 qyac = av_clip_uintp2(s->segmentation.feat[i].q_val, 8);
 778             else
 779                 qyac = av_clip_uintp2(s->yac_qi + s->segmentation.feat[i].q_val, 8);
 780         } else {
 781             qyac  = s->yac_qi;
 782         }
 783         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
 784         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
 785         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
 786         qyac  = av_clip_uintp2(qyac, 8);
 787
 788         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
 789         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
 790         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
 791         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
 792
 793         sh = s->filter.level >= 32;
 794         if (s->segmentation.enabled && s->segmentation.feat[i].lf_enabled) {
 795             if (s->segmentation.absolute_vals)
 796                 lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
 797             else
 798                 lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
 799         } else {
 800             lflvl  = s->filter.level;
 801         }
 802         if (s->lf_delta.enabled) {
 803             s->segmentation.feat[i].lflvl[0][0] =
 804             s->segmentation.feat[i].lflvl[0][1] =
 805                 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
 806             for (j = 1; j < 4; j++) {
 807                 s->segmentation.feat[i].lflvl[j][0] =
 808                     av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 809                                              s->lf_delta.mode[0]) * (1 << sh)), 6);
 810                 s->segmentation.feat[i].lflvl[j][1] =
 811                     av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 812                                              s->lf_delta.mode[1]) * (1 << sh)), 6);
 813             }
 814         } else {
 815             memset(s->segmentation.feat[i].lflvl, lflvl,
 816                    sizeof(s->segmentation.feat[i].lflvl));
 817         }
 818     }
 819
 820     /* tiling info */
 821     if ((res = update_size(ctx, w, h, fmt)) < 0) {
 822         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
 823         return res;
 824     }
 825     for (s->tiling.log2_tile_cols = 0;
 826          s->sb_cols > (64 << s->tiling.log2_tile_cols);
 827          s->tiling.log2_tile_cols++) ;
 828     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 829     max = FFMAX(0, max - 1);
 830     while (max > s->tiling.log2_tile_cols) {
 831         if (get_bits1(&s->gb))
 832             s->tiling.log2_tile_cols++;
 833         else
 834             break;
 835     }
 836     s->tiling.log2_tile_rows = decode012(&s->gb);
 837     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
 838     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
 839         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
 840         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 841                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
 842         if (!s->c_b) {
 843             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 844             return AVERROR(ENOMEM);
 845         }
 846     }
 847
 848     if (s->keyframe || s->errorres || (s->intraonly && s->resetctx == 3)) {
 849         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 850                            s->prob_ctx[3].p = vp9_default_probs;
 851         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 852                sizeof(vp9_default_coef_probs));
 853         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 854                sizeof(vp9_default_coef_probs));
 855         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 856                sizeof(vp9_default_coef_probs));
 857         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 858                sizeof(vp9_default_coef_probs));
 859     } else if (s->intraonly && s->resetctx == 2) {
 860         s->prob_ctx[c].p = vp9_default_probs;
 861         memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
 862                sizeof(vp9_default_coef_probs));
 863     }
 864
 865     // next 16 bits is size of the rest of the header (arith-coded)
 866     size2 = get_bits(&s->gb, 16);
 867     data2 = align_get_bits(&s->gb);
 868     if (size2 > size - (data2 - data)) {
 869         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 870         return AVERROR_INVALIDDATA;
 871     }
 872     ff_vp56_init_range_decoder(&s->c, data2, size2);
 873     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 874         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 875         return AVERROR_INVALIDDATA;
 876     }
 877
 878     if (s->keyframe || s->intraonly) {
 879         memset(s->counts.coef, 0, sizeof(s->counts.coef));
 880         memset(s->counts.eob,  0, sizeof(s->counts.eob));
 881     } else {
 882         memset(&s->counts, 0, sizeof(s->counts));
 883     }
 884     // FIXME is it faster to not copy here, but do it down in the fw updates
 885     // as explicit copies if the fw update is missing (and skip the copy upon
 886     // fw update)?
 887     s->prob.p = s->prob_ctx[c].p;
 888
 889     // txfm updates
 890     if (s->lossless) {
 891         s->txfmmode = TX_4X4;
 892     } else {
 893         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
 894         if (s->txfmmode == 3)
 895             s->txfmmode += vp8_rac_get(&s->c);
 896
 897         if (s->txfmmode == TX_SWITCHABLE) {
 898             for (i = 0; i < 2; i++)
 899                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 900                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 901             for (i = 0; i < 2; i++)
 902                 for (j = 0; j < 2; j++)
 903                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 904                         s->prob.p.tx16p[i][j] =
 905                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 906             for (i = 0; i < 2; i++)
 907                 for (j = 0; j < 3; j++)
 908                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 909                         s->prob.p.tx32p[i][j] =
 910                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 911         }
 912     }
 913
 914     // coef updates
 915     for (i = 0; i < 4; i++) {
 916         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 917         if (vp8_rac_get(&s->c)) {
 918             for (j = 0; j < 2; j++)
 919                 for (k = 0; k < 2; k++)
 920                     for (l = 0; l < 6; l++)
 921                         for (m = 0; m < 6; m++) {
 922                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 923                             uint8_t *r = ref[j][k][l][m];
 924                             if (m >= 3 && l == 0) // dc only has 3 pt
 925                                 break;
 926                             for (n = 0; n < 3; n++) {
 927                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 928                                     p[n] = update_prob(&s->c, r[n]);
 929                                 } else {
 930                                     p[n] = r[n];
 931                                 }
 932                             }
 933                             p[3] = 0;
 934                         }
 935         } else {
 936             for (j = 0; j < 2; j++)
 937                 for (k = 0; k < 2; k++)
 938                     for (l = 0; l < 6; l++)
 939                         for (m = 0; m < 6; m++) {
 940                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 941                             uint8_t *r = ref[j][k][l][m];
 942                             if (m > 3 && l == 0) // dc only has 3 pt
 943                                 break;
 944                             memcpy(p, r, 3);
 945                             p[3] = 0;
 946                         }
 947         }
 948         if (s->txfmmode == i)
 949             break;
 950     }
 951
 952     // mode updates
 953     for (i = 0; i < 3; i++)
 954         if (vp56_rac_get_prob_branchy(&s->c, 252))
 955             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 956     if (!s->keyframe && !s->intraonly) {
 957         for (i = 0; i < 7; i++)
 958             for (j = 0; j < 3; j++)
 959                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 960                     s->prob.p.mv_mode[i][j] =
 961                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 962
 963         if (s->filtermode == FILTER_SWITCHABLE)
 964             for (i = 0; i < 4; i++)
 965                 for (j = 0; j < 2; j++)
 966                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 967                         s->prob.p.filter[i][j] =
 968                             update_prob(&s->c, s->prob.p.filter[i][j]);
 969
 970         for (i = 0; i < 4; i++)
 971             if (vp56_rac_get_prob_branchy(&s->c, 252))
 972                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 973
 974         if (s->allowcompinter) {
 975             s->comppredmode = vp8_rac_get(&s->c);
 976             if (s->comppredmode)
 977                 s->comppredmode += vp8_rac_get(&s->c);
 978             if (s->comppredmode == PRED_SWITCHABLE)
 979                 for (i = 0; i < 5; i++)
 980                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 981                         s->prob.p.comp[i] =
 982                             update_prob(&s->c, s->prob.p.comp[i]);
 983         } else {
 984             s->comppredmode = PRED_SINGLEREF;
 985         }
 986
 987         if (s->comppredmode != PRED_COMPREF) {
 988             for (i = 0; i < 5; i++) {
 989                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 990                     s->prob.p.single_ref[i][0] =
 991                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
 992                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 993                     s->prob.p.single_ref[i][1] =
 994                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
 995             }
 996         }
 997
 998         if (s->comppredmode != PRED_SINGLEREF) {
 999             for (i = 0; i < 5; i++)
1000                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1001                     s->prob.p.comp_ref[i] =
1002                         update_prob(&s->c, s->prob.p.comp_ref[i]);
1003         }
1004
1005         for (i = 0; i < 4; i++)
1006             for (j = 0; j < 9; j++)
1007                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1008                     s->prob.p.y_mode[i][j] =
1009                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
1010
1011         for (i = 0; i < 4; i++)
1012             for (j = 0; j < 4; j++)
1013                 for (k = 0; k < 3; k++)
1014                     if (vp56_rac_get_prob_branchy(&s->c, 252))
1015                         s->prob.p.partition[3 - i][j][k] =
1016                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
1017
1018         // mv fields don't use the update_prob subexp model for some reason
1019         for (i = 0; i < 3; i++)
1020             if (vp56_rac_get_prob_branchy(&s->c, 252))
1021                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1022
1023         for (i = 0; i < 2; i++) {
1024             if (vp56_rac_get_prob_branchy(&s->c, 252))
1025                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1026
1027             for (j = 0; j < 10; j++)
1028                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1029                     s->prob.p.mv_comp[i].classes[j] =
1030                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1031
1032             if (vp56_rac_get_prob_branchy(&s->c, 252))
1033                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1034
1035             for (j = 0; j < 10; j++)
1036                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1037                     s->prob.p.mv_comp[i].bits[j] =
1038                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1039         }
1040
1041         for (i = 0; i < 2; i++) {
1042             for (j = 0; j < 2; j++)
1043                 for (k = 0; k < 3; k++)
1044                     if (vp56_rac_get_prob_branchy(&s->c, 252))
1045                         s->prob.p.mv_comp[i].class0_fp[j][k] =
1046                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1047
1048             for (j = 0; j < 3; j++)
1049                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1050                     s->prob.p.mv_comp[i].fp[j] =
1051                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1052         }
1053
1054         if (s->highprecisionmvs) {
1055             for (i = 0; i < 2; i++) {
1056                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1057                     s->prob.p.mv_comp[i].class0_hp =
1058                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1059
1060                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1061                     s->prob.p.mv_comp[i].hp =
1062                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1063             }
1064         }
1065     }
1066
1067     return (data2 - data) + size2;
1068 }
1069
1070 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1071                                       VP9Context *s)
1072 {
1073     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1074     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
1075 }
1076
1077 static void find_ref_mvs(VP9Context *s,
1078                          VP56mv *pmv, int ref, int z, int idx, int sb)
1079 {
1080     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1081         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
1082                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
1083         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
1084                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
1085         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
1086                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
1087         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
1088                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1089         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
1090                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1091         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
1092                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
1093         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
1094                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1095         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
1096                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
1097         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
1098                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
1099         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1100                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1101         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1102                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1103         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1104                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1105         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1106                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1107     };
1108     VP9Block *b = s->b;
1109     int row = s->row, col = s->col, row7 = s->row7;
1110     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
1111 #define INVALID_MV 0x80008000U
1112     uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
1113     int i;
1114
1115 #define RETURN_DIRECT_MV(mv) \
1116     do { \
1117         uint32_t m = AV_RN32A(&mv); \
1118         if (!idx) { \
1119             AV_WN32A(pmv, m); \
1120             return; \
1121         } else if (mem == INVALID_MV) { \
1122             mem = m; \
1123         } else if (m != mem) { \
1124             AV_WN32A(pmv, m); \
1125             return; \
1126         } \
1127     } while (0)
1128
1129     if (sb >= 0) {
1130         if (sb == 2 || sb == 1) {
1131             RETURN_DIRECT_MV(b->mv[0][z]);
1132         } else if (sb == 3) {
1133             RETURN_DIRECT_MV(b->mv[2][z]);
1134             RETURN_DIRECT_MV(b->mv[1][z]);
1135             RETURN_DIRECT_MV(b->mv[0][z]);
1136         }
1137
1138 #define RETURN_MV(mv) \
1139     do { \
1140         if (sb > 0) { \
1141             VP56mv tmp; \
1142             uint32_t m; \
1143             av_assert2(idx == 1); \
1144             av_assert2(mem != INVALID_MV); \
1145             if (mem_sub8x8 == INVALID_MV) { \
1146                 clamp_mv(&tmp, &mv, s); \
1147                 m = AV_RN32A(&tmp); \
1148                 if (m != mem) { \
1149                     AV_WN32A(pmv, m); \
1150                     return; \
1151                 } \
1152                 mem_sub8x8 = AV_RN32A(&mv); \
1153             } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1154                 clamp_mv(&tmp, &mv, s); \
1155                 m = AV_RN32A(&tmp); \
1156                 if (m != mem) { \
1157                     AV_WN32A(pmv, m); \
1158                 } else { \
1159                     /* BUG I'm pretty sure this isn't the intention */ \
1160                     AV_WN32A(pmv, 0); \
1161                 } \
1162                 return; \
1163             } \
1164         } else { \
1165             uint32_t m = AV_RN32A(&mv); \
1166             if (!idx) { \
1167                 clamp_mv(pmv, &mv, s); \
1168                 return; \
1169             } else if (mem == INVALID_MV) { \
1170                 mem = m; \
1171             } else if (m != mem) { \
1172                 clamp_mv(pmv, &mv, s); \
1173                 return; \
1174             } \
1175         } \
1176     } while (0)
1177
1178         if (row > 0) {
1179             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1180             if (mv->ref[0] == ref) {
1181                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1182             } else if (mv->ref[1] == ref) {
1183                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1184             }
1185         }
1186         if (col > s->tiling.tile_col_start) {
1187             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1188             if (mv->ref[0] == ref) {
1189                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1190             } else if (mv->ref[1] == ref) {
1191                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1192             }
1193         }
1194         i = 2;
1195     } else {
1196         i = 0;
1197     }
1198
1199     // previously coded MVs in this neighbourhood, using same reference frame
1200     for (; i < 8; i++) {
1201         int c = p[i][0] + col, r = p[i][1] + row;
1202
1203         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1204             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1205
1206             if (mv->ref[0] == ref) {
1207                 RETURN_MV(mv->mv[0]);
1208             } else if (mv->ref[1] == ref) {
1209                 RETURN_MV(mv->mv[1]);
1210             }
1211         }
1212     }
1213
1214     // MV at this position in previous frame, using same reference frame
1215     if (s->use_last_frame_mvs) {
1216         struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1217
1218         if (!s->frames[REF_FRAME_MVPAIR].uses_2pass)
1219             ff_thread_await_progress(&s->frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1220         if (mv->ref[0] == ref) {
1221             RETURN_MV(mv->mv[0]);
1222         } else if (mv->ref[1] == ref) {
1223             RETURN_MV(mv->mv[1]);
1224         }
1225     }
1226
1227 #define RETURN_SCALE_MV(mv, scale) \
1228     do { \
1229         if (scale) { \
1230             VP56mv mv_temp = { -mv.x, -mv.y }; \
1231             RETURN_MV(mv_temp); \
1232         } else { \
1233             RETURN_MV(mv); \
1234         } \
1235     } while (0)
1236
1237     // previously coded MVs in this neighbourhood, using different reference frame
1238     for (i = 0; i < 8; i++) {
1239         int c = p[i][0] + col, r = p[i][1] + row;
1240
1241         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1242             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1243
1244             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1245                 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1246             }
1247             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1248                 // BUG - libvpx has this condition regardless of whether
1249                 // we used the first ref MV and pre-scaling
1250                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1251                 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1252             }
1253         }
1254     }
1255
1256     // MV at this position in previous frame, using different reference frame
1257     if (s->use_last_frame_mvs) {
1258         struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1259
1260         // no need to await_progress, because we already did that above
1261         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1262             RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1263         }
1264         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1265             // BUG - libvpx has this condition regardless of whether
1266             // we used the first ref MV and pre-scaling
1267             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1268             RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1269         }
1270     }
1271
1272     AV_ZERO32(pmv);
1273     clamp_mv(pmv, pmv, s);
1274 #undef INVALID_MV
1275 #undef RETURN_MV
1276 #undef RETURN_SCALE_MV
1277 }
1278
1279 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1280 {
1281     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1282     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1283                                 s->prob.p.mv_comp[idx].classes);
1284
1285     s->counts.mv_comp[idx].sign[sign]++;
1286     s->counts.mv_comp[idx].classes[c]++;
1287     if (c) {
1288         int m;
1289
1290         for (n = 0, m = 0; m < c; m++) {
1291             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1292             n |= bit << m;
1293             s->counts.mv_comp[idx].bits[m][bit]++;
1294         }
1295         n <<= 3;
1296         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1297         n |= bit << 1;
1298         s->counts.mv_comp[idx].fp[bit]++;
1299         if (hp) {
1300             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1301             s->counts.mv_comp[idx].hp[bit]++;
1302             n |= bit;
1303         } else {
1304             n |= 1;
1305             // bug in libvpx - we count for bw entropy purposes even if the
1306             // bit wasn't coded
1307             s->counts.mv_comp[idx].hp[1]++;
1308         }
1309         n += 8 << c;
1310     } else {
1311         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1312         s->counts.mv_comp[idx].class0[n]++;
1313         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1314                                s->prob.p.mv_comp[idx].class0_fp[n]);
1315         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1316         n = (n << 3) | (bit << 1);
1317         if (hp) {
1318             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1319             s->counts.mv_comp[idx].class0_hp[bit]++;
1320             n |= bit;
1321         } else {
1322             n |= 1;
1323             // bug in libvpx - we count for bw entropy purposes even if the
1324             // bit wasn't coded
1325             s->counts.mv_comp[idx].class0_hp[1]++;
1326         }
1327     }
1328
1329     return sign ? -(n + 1) : (n + 1);
1330 }
1331
1332 static void fill_mv(VP9Context *s,
1333                     VP56mv *mv, int mode, int sb)
1334 {
1335     VP9Block *b = s->b;
1336
1337     if (mode == ZEROMV) {
1338         AV_ZERO64(mv);
1339     } else {
1340         int hp;
1341
1342         // FIXME cache this value and reuse for other subblocks
1343         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1344                      mode == NEWMV ? -1 : sb);
1345         // FIXME maybe move this code into find_ref_mvs()
1346         if ((mode == NEWMV || sb == -1) &&
1347             !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1348             if (mv[0].y & 1) {
1349                 if (mv[0].y < 0)
1350                     mv[0].y++;
1351                 else
1352                     mv[0].y--;
1353             }
1354             if (mv[0].x & 1) {
1355                 if (mv[0].x < 0)
1356                     mv[0].x++;
1357                 else
1358                     mv[0].x--;
1359             }
1360         }
1361         if (mode == NEWMV) {
1362             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1363                                               s->prob.p.mv_joint);
1364
1365             s->counts.mv_joint[j]++;
1366             if (j >= MV_JOINT_V)
1367                 mv[0].y += read_mv_component(s, 0, hp);
1368             if (j & 1)
1369                 mv[0].x += read_mv_component(s, 1, hp);
1370         }
1371
1372         if (b->comp) {
1373             // FIXME cache this value and reuse for other subblocks
1374             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1375                          mode == NEWMV ? -1 : sb);
1376             if ((mode == NEWMV || sb == -1) &&
1377                 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1378                 if (mv[1].y & 1) {
1379                     if (mv[1].y < 0)
1380                         mv[1].y++;
1381                     else
1382                         mv[1].y--;
1383                 }
1384                 if (mv[1].x & 1) {
1385                     if (mv[1].x < 0)
1386                         mv[1].x++;
1387                     else
1388                         mv[1].x--;
1389                 }
1390             }
1391             if (mode == NEWMV) {
1392                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1393                                                   s->prob.p.mv_joint);
1394
1395                 s->counts.mv_joint[j]++;
1396                 if (j >= MV_JOINT_V)
1397                     mv[1].y += read_mv_component(s, 0, hp);
1398                 if (j & 1)
1399                     mv[1].x += read_mv_component(s, 1, hp);
1400             }
1401         }
1402     }
1403 }
1404
1405 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1406                                        ptrdiff_t stride, int v)
1407 {
1408     switch (w) {
1409     case 1:
1410         do {
1411             *ptr = v;
1412             ptr += stride;
1413         } while (--h);
1414         break;
1415     case 2: {
1416         int v16 = v * 0x0101;
1417         do {
1418             AV_WN16A(ptr, v16);
1419             ptr += stride;
1420         } while (--h);
1421         break;
1422     }
1423     case 4: {
1424         uint32_t v32 = v * 0x01010101;
1425         do {
1426             AV_WN32A(ptr, v32);
1427             ptr += stride;
1428         } while (--h);
1429         break;
1430     }
1431     case 8: {
1432 #if HAVE_FAST_64BIT
1433         uint64_t v64 = v * 0x0101010101010101ULL;
1434         do {
1435             AV_WN64A(ptr, v64);
1436             ptr += stride;
1437         } while (--h);
1438 #else
1439         uint32_t v32 = v * 0x01010101;
1440         do {
1441             AV_WN32A(ptr,     v32);
1442             AV_WN32A(ptr + 4, v32);
1443             ptr += stride;
1444         } while (--h);
1445 #endif
1446         break;
1447     }
1448     }
1449 }
1450
/**
 * Decode the mode information of the current block (s->b) from the range
 * coder s->c and record it for later use.
 *
 * Symbols are parsed in this fixed order: segment id, skip flag, intra/inter
 * flag, transform size, then either the intra prediction modes (luma
 * sub-block modes plus the chroma mode) or, for inter blocks, the compound
 * flag, the reference frame(s), the inter mode(s), the interpolation filter
 * and the motion vectors (through fill_mv()). Symbols read with the adaptive
 * probabilities in s->prob.p also bump the matching counter in s->counts.
 *
 * After parsing, the above/left context arrays in the VP9Context are updated
 * for the area covered by the block, and the reference indices / motion
 * vectors are written to s->frames[CUR_FRAME].mv for every position the
 * block covers inside the frame.
 *
 * The block position is taken from s->row, s->col and s->row7; the block
 * size from b->bs.
 *
 * @param ctx codec context; ctx->priv_data is used as the VP9Context
 */
1451 static void decode_mode(AVCodecContext *ctx)
1452 {
         // Per-block-size values written to s->left_partition_ctx by SET_CTXS
         // at the end of this function.
1453     static const uint8_t left_ctx[N_BS_SIZES] = {
1454         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1455     };
         // Same as left_ctx, but for s->above_partition_ctx.
1456     static const uint8_t above_ctx[N_BS_SIZES] = {
1457         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1458     };
         // Largest transform size that may be used for each block size.
1459     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1460         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1461         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1462     };
1463     VP9Context *s = ctx->priv_data;
1464     VP9Block *b = s->b;
1465     int row = s->row, col = s->col, row7 = s->row7;
1466     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
         // bw4/bh4: block width/height taken from bwh_tab[1];
         // w4/h4: the same, clipped to the columns/rows left in the frame.
1467     int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1468     int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
         // An above neighbour exists below the first row; a left neighbour
         // exists right of the first column of the current tile.
1469     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
         // vref is only assigned on the inter path and filter_id only when
         // s->filtermode == FILTER_SWITCHABLE; SET_CTXS reads them under the
         // same conditions.
1470     int vref, filter_id;
1471
         // Segment id: 0 without segmentation; on key/intra-only frames it is
         // read explicitly if the map is updated (0 otherwise); on other frames
         // it is either predicted from the reference segmentation map or read
         // explicitly.
1472     if (!s->segmentation.enabled) {
1473         b->seg_id = 0;
1474     } else if (s->keyframe || s->intraonly) {
1475         b->seg_id = !s->segmentation.update_map ? 0 :
1476                     vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1477     } else if (!s->segmentation.update_map ||
1478                (s->segmentation.temporal &&
1479                 vp56_rac_get_prob_branchy(&s->c,
1480                     s->prob.segpred[s->above_segpred_ctx[col] +
1481                                     s->left_segpred_ctx[row7]]))) {
             // Predicted segment id: minimum id found in the reference map over
             // the w4 x h4 area of this block. Falls back to 0 in error
             // resilient mode or when no reference map exists.
1482         if (!s->errorres && s->frames[REF_FRAME_SEGMAP].segmentation_map) {
1483             int pred = 8, x;
1484             uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
1485
                 // NOTE(review): presumably waits until the reference frame
                 // has progressed far enough for its map rows to be readable;
                 // confirm against ff_thread_await_progress().
1486             if (!s->frames[REF_FRAME_SEGMAP].uses_2pass)
1487                 ff_thread_await_progress(&s->frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1488             for (y = 0; y < h4; y++) {
1489                 int idx_base = (y + row) * 8 * s->sb_cols + col;
1490                 for (x = 0; x < w4; x++)
1491                     pred = FFMIN(pred, refsegmap[idx_base + x]);
1492             }
1493             av_assert1(pred < 8);
1494             b->seg_id = pred;
1495         } else {
1496             b->seg_id = 0;
1497         }
1498
             // Mark this area as "segment id was predicted" for later blocks.
1499         memset(&s->above_segpred_ctx[col], 1, w4);
1500         memset(&s->left_segpred_ctx[row7], 1, h4);
1501     } else {
1502         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1503                                      s->prob.seg);
1504
             // Mark this area as "segment id was coded explicitly".
1505         memset(&s->above_segpred_ctx[col], 0, w4);
1506         memset(&s->left_segpred_ctx[row7], 0, h4);
1507     }
         // Store the segment id of this block in the current frame's map
         // (through setctx_2d()) whenever the map is being (re)written.
1508     if (s->segmentation.enabled &&
1509         (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1510         setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1511                   bw4, bh4, 8 * s->sb_cols, b->seg_id);
1512     }
1513
         // Skip flag: forced by the segment's skip feature, otherwise read
         // with a context made of the left and above skip flags.
1514     b->skip = s->segmentation.enabled &&
1515         s->segmentation.feat[b->seg_id].skip_enabled;
1516     if (!b->skip) {
1517         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1518         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1519         s->counts.skip[c][b->skip]++;
1520     }
1521
         // Intra/inter decision: always intra on key/intra-only frames; fixed
         // by the segment's reference feature if enabled (ref_val 0 means
         // intra); otherwise read from the bitstream.
1522     if (s->keyframe || s->intraonly) {
1523         b->intra = 1;
1524     } else if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
1525         b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1526     } else {
1527         int c, bit;
1528
1529         if (have_a && have_l) {
1530             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
                 // both neighbours intra -> context 3 (2 is used below for a
                 // single available neighbour that is intra)
1531             c += (c == 2);
1532         } else {
1533             c = have_a ? 2 * s->above_intra_ctx[col] :
1534                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1535         }
             // the coded bit is "is inter", hence the inversion
1536         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1537         s->counts.intra[c][bit]++;
1538         b->intra = !bit;
1539     }
1540
         // Transform size: only coded for intra or non-skipped blocks when the
         // frame uses TX_SWITCHABLE; otherwise derived from the frame mode.
1541     if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1542         int c;
1543         if (have_a) {
1544             if (have_l) {
                     // '+' binds tighter than '>': c is 0 or 1, telling whether
                     // the sum of both neighbours' transform sizes (max_tx
                     // substituted for a skipped neighbour) exceeds max_tx.
1545                 c = (s->above_skip_ctx[col] ? max_tx :
1546                      s->above_txfm_ctx[col]) +
1547                     (s->left_skip_ctx[row7] ? max_tx :
1548                      s->left_txfm_ctx[row7]) > max_tx;
1549             } else {
1550                 c = s->above_skip_ctx[col] ? 1 :
1551                     (s->above_txfm_ctx[col] * 2 > max_tx);
1552             }
1553         } else if (have_l) {
1554             c = s->left_skip_ctx[row7] ? 1 :
1555                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1556         } else {
1557             c = 1;
1558         }
             // The size is coded as a run of up to three (32x32), two (16x16)
             // or one (8x8) binary decisions; b->tx is the number of set bits
             // before the first zero.
1559         switch (max_tx) {
1560         case TX_32X32:
1561             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1562             if (b->tx) {
1563                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1564                 if (b->tx == 2)
1565                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1566             }
1567             s->counts.tx32p[c][b->tx]++;
1568             break;
1569         case TX_16X16:
1570             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1571             if (b->tx)
1572                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1573             s->counts.tx16p[c][b->tx]++;
1574             break;
1575         case TX_8X8:
1576             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1577             s->counts.tx8p[c][b->tx]++;
1578             break;
1579         case TX_4X4:
1580             b->tx = TX_4X4;
1581             break;
1582         }
1583     } else {
             // not coded: frame-level transform mode, capped at max_tx
1584         b->tx = FFMIN(max_tx, s->txfmmode);
1585     }
1586
         // Prediction modes. Three cases: key/intra-only frame, intra block in
         // an inter frame, inter block.
1587     if (s->keyframe || s->intraonly) {
             // a/l address the above/left mode contexts, which hold two
             // entries per column/row position.
1588         uint8_t *a = &s->above_mode_ctx[col * 2];
1589         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1590
1591         b->comp = 0;
             // Block sizes past BS_8x8 in the enum (8x4, 4x8, 4x4) carry up to
             // four sub-block modes; each one is read with the modes above and
             // to the left of that sub-block as probability context, and a[]
             // and l[] are updated as the sub-blocks are decoded.
1592         if (b->bs > BS_8x8) {
1593             // FIXME the memory storage intermediates here aren't really
1594             // necessary, they're just there to make the code slightly
1595             // simpler for now
1596             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1597                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1598             if (b->bs != BS_8x4) {
1599                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1600                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1601                 l[0] = a[1] = b->mode[1];
1602             } else {
1603                 l[0] = a[1] = b->mode[1] = b->mode[0];
1604             }
1605             if (b->bs != BS_4x8) {
1606                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1607                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1608                 if (b->bs != BS_8x4) {
1609                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1610                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1611                     l[1] = a[1] = b->mode[3];
1612                 } else {
1613                     l[1] = a[1] = b->mode[3] = b->mode[2];
1614                 }
1615             } else {
1616                 b->mode[2] = b->mode[0];
1617                 l[1] = a[1] = b->mode[3] = b->mode[1];
1618             }
1619         } else {
                 // one luma mode for the whole block, replicated to all four
                 // slots and to the mode contexts
1620             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1621                                           vp9_default_kf_ymode_probs[*a][*l]);
1622             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1623             // FIXME this can probably be optimized
1624             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1625             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1626         }
             // chroma mode, conditioned on the last luma sub-block mode
1627         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1628                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1629     } else if (b->intra) {
             // Intra block in an inter frame: same sub-block layout as above,
             // but using the adaptive probabilities s->prob.p.y_mode (set 0
             // for sub-8x8 blocks) and updating s->counts.
1630         b->comp = 0;
1631         if (b->bs > BS_8x8) {
1632             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1633                                           s->prob.p.y_mode[0]);
1634             s->counts.y_mode[0][b->mode[0]]++;
1635             if (b->bs != BS_8x4) {
1636                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1637                                               s->prob.p.y_mode[0]);
1638                 s->counts.y_mode[0][b->mode[1]]++;
1639             } else {
1640                 b->mode[1] = b->mode[0];
1641             }
1642             if (b->bs != BS_4x8) {
1643                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1644                                               s->prob.p.y_mode[0]);
1645                 s->counts.y_mode[0][b->mode[2]]++;
1646                 if (b->bs != BS_8x4) {
1647                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1648                                                   s->prob.p.y_mode[0]);
1649                     s->counts.y_mode[0][b->mode[3]]++;
1650                 } else {
1651                     b->mode[3] = b->mode[2];
1652                 }
1653             } else {
1654                 b->mode[2] = b->mode[0];
1655                 b->mode[3] = b->mode[1];
1656             }
1657         } else {
                 // probability set index for block sizes BS_64x64..BS_8x8
1658             static const uint8_t size_group[10] = {
1659                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1660             };
1661             int sz = size_group[b->bs];
1662
1663             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1664                                           s->prob.p.y_mode[sz]);
1665             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1666             s->counts.y_mode[sz][b->mode[3]]++;
1667         }
1668         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1669                                      s->prob.p.uv_mode[b->mode[3]]);
1670         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1671     } else {
             // Inter block. This table maps the [above][left] mode context
             // values to the probability context used for the inter mode.
1672         static const uint8_t inter_mode_ctx_lut[14][14] = {
1673             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1674             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1675             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1676             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1677             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1678             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1679             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1680             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1681             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1682             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1683             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1684             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1685             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1686             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1687         };
1688
             // Reference selection: dictated by the segment's reference
             // feature if enabled (ref_val is 1-based here, 0 meant intra
             // above), otherwise coded in the bitstream.
1689         if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
1690             av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1691             b->comp = 0;
1692             b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1693         } else {
1694             // read comp_pred flag
1695             if (s->comppredmode != PRED_SWITCHABLE) {
1696                 b->comp = s->comppredmode == PRED_COMPREF;
1697             } else {
1698                 int c;
1699
1700                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1701                 if (have_a) {
1702                     if (have_l) {
1703                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1704                             c = 4;
1705                         } else if (s->above_comp_ctx[col]) {
1706                             c = 2 + (s->left_intra_ctx[row7] ||
1707                                      s->left_ref_ctx[row7] == s->fixcompref);
1708                         } else if (s->left_comp_ctx[row7]) {
1709                             c = 2 + (s->above_intra_ctx[col] ||
1710                                      s->above_ref_ctx[col] == s->fixcompref);
1711                         } else {
                                 // NOTE(review): the index "row & 7" below is
                                 // written as row7 everywhere else; presumably
                                 // the two are equal, confirm where s->row7 is
                                 // set.
1712                             c = (!s->above_intra_ctx[col] &&
1713                                  s->above_ref_ctx[col] == s->fixcompref) ^
1714                             (!s->left_intra_ctx[row7] &&
1715                              s->left_ref_ctx[row & 7] == s->fixcompref);
1716                         }
1717                     } else {
1718                         c = s->above_comp_ctx[col] ? 3 :
1719                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1720                     }
1721                 } else if (have_l) {
1722                     c = s->left_comp_ctx[row7] ? 3 :
1723                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1724                 } else {
1725                     c = 1;
1726                 }
1727                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1728                 s->counts.comp[c][b->comp]++;
1729             }
1730
1731             // read actual references
1732             // FIXME probably cache a few variables here to prevent repetitive
1733             // memory accesses below
1734             if (b->comp) /* two references */ {
                     // The fixed reference goes into slot fix_idx, the coded
                     // (variable) one into the other slot.
1735                 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1736
1737                 b->ref[fix_idx] = s->fixcompref;
1738                 // FIXME can this codeblob be replaced by some sort of LUT?
1739                 if (have_a) {
1740                     if (have_l) {
1741                         if (s->above_intra_ctx[col]) {
1742                             if (s->left_intra_ctx[row7]) {
1743                                 c = 2;
1744                             } else {
1745                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1746                             }
1747                         } else if (s->left_intra_ctx[row7]) {
1748                             c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1749                         } else {
1750                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1751
1752                             if (refl == refa && refa == s->varcompref[1]) {
1753                                 c = 0;
1754                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1755                                 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1756                                     (refl == s->fixcompref && refa == s->varcompref[0])) {
1757                                     c = 4;
1758                                 } else {
1759                                     c = (refa == refl) ? 3 : 1;
1760                                 }
1761                             } else if (!s->left_comp_ctx[row7]) {
1762                                 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1763                                     c = 1;
1764                                 } else {
1765                                     c = (refl == s->varcompref[1] &&
1766                                          refa != s->varcompref[1]) ? 2 : 4;
1767                                 }
1768                             } else if (!s->above_comp_ctx[col]) {
1769                                 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1770                                     c = 1;
1771                                 } else {
1772                                     c = (refa == s->varcompref[1] &&
1773                                          refl != s->varcompref[1]) ? 2 : 4;
1774                                 }
1775                             } else {
1776                                 c = (refl == refa) ? 4 : 2;
1777                             }
1778                         }
1779                     } else {
1780                         if (s->above_intra_ctx[col]) {
1781                             c = 2;
1782                         } else if (s->above_comp_ctx[col]) {
1783                             c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1784                         } else {
1785                             c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1786                         }
1787                     }
1788                 } else if (have_l) {
1789                     if (s->left_intra_ctx[row7]) {
1790                         c = 2;
1791                     } else if (s->left_comp_ctx[row7]) {
1792                         c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1793                     } else {
1794                         c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1795                     }
1796                 } else {
1797                     c = 2;
1798                 }
                     // the coded bit picks one of the two variable references
1799                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1800                 b->ref[var_idx] = s->varcompref[bit];
1801                 s->counts.comp_ref[c][bit]++;
1802             } else /* single reference */ {
                     // Up to two bits: the first selects reference 0 (bit == 0);
                     // otherwise a second bit selects reference 1 or 2.
1803                 int bit, c;
1804
1805                 if (have_a && !s->above_intra_ctx[col]) {
1806                     if (have_l && !s->left_intra_ctx[row7]) {
1807                         if (s->left_comp_ctx[row7]) {
1808                             if (s->above_comp_ctx[col]) {
1809                                 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1810                                          !s->above_ref_ctx[col]);
1811                             } else {
1812                                 c = (3 * !s->above_ref_ctx[col]) +
1813                                     (!s->fixcompref || !s->left_ref_ctx[row7]);
1814                             }
1815                         } else if (s->above_comp_ctx[col]) {
1816                             c = (3 * !s->left_ref_ctx[row7]) +
1817                                 (!s->fixcompref || !s->above_ref_ctx[col]);
1818                         } else {
1819                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1820                         }
                         // NOTE(review): the next test can never be true here,
                         // the enclosing branch already requires
                         // !s->above_intra_ctx[col].
1821                     } else if (s->above_intra_ctx[col]) {
1822                         c = 2;
1823                     } else if (s->above_comp_ctx[col]) {
1824                         c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1825                     } else {
1826                         c = 4 * (!s->above_ref_ctx[col]);
1827                     }
1828                 } else if (have_l && !s->left_intra_ctx[row7]) {
                         // NOTE(review): likewise unreachable, the enclosing
                         // branch already requires !s->left_intra_ctx[row7].
1829                     if (s->left_intra_ctx[row7]) {
1830                         c = 2;
1831                     } else if (s->left_comp_ctx[row7]) {
1832                         c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1833                     } else {
1834                         c = 4 * (!s->left_ref_ctx[row7]);
1835                     }
1836                 } else {
1837                     c = 2;
1838                 }
1839                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1840                 s->counts.single_ref[c][0][bit]++;
1841                 if (!bit) {
1842                     b->ref[0] = 0;
1843                 } else {
1844                     // FIXME can this codeblob be replaced by some sort of LUT?
1845                     if (have_a) {
1846                         if (have_l) {
1847                             if (s->left_intra_ctx[row7]) {
1848                                 if (s->above_intra_ctx[col]) {
1849                                     c = 2;
1850                                 } else if (s->above_comp_ctx[col]) {
1851                                     c = 1 + 2 * (s->fixcompref == 1 ||
1852                                                  s->above_ref_ctx[col] == 1);
1853                                 } else if (!s->above_ref_ctx[col]) {
1854                                     c = 3;
1855                                 } else {
1856                                     c = 4 * (s->above_ref_ctx[col] == 1);
1857                                 }
1858                             } else if (s->above_intra_ctx[col]) {
1859                                 if (s->left_intra_ctx[row7]) {
1860                                     c = 2;
1861                                 } else if (s->left_comp_ctx[row7]) {
1862                                     c = 1 + 2 * (s->fixcompref == 1 ||
1863                                                  s->left_ref_ctx[row7] == 1);
1864                                 } else if (!s->left_ref_ctx[row7]) {
1865                                     c = 3;
1866                                 } else {
1867                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1868                                 }
1869                             } else if (s->above_comp_ctx[col]) {
1870                                 if (s->left_comp_ctx[row7]) {
1871                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1872                                         c = 3 * (s->fixcompref == 1 ||
1873                                                  s->left_ref_ctx[row7] == 1);
1874                                     } else {
1875                                         c = 2;
1876                                     }
1877                                 } else if (!s->left_ref_ctx[row7]) {
1878                                     c = 1 + 2 * (s->fixcompref == 1 ||
1879                                                  s->above_ref_ctx[col] == 1);
1880                                 } else {
1881                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1882                                     (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1883                                 }
1884                             } else if (s->left_comp_ctx[row7]) {
1885                                 if (!s->above_ref_ctx[col]) {
1886                                     c = 1 + 2 * (s->fixcompref == 1 ||
1887                                                  s->left_ref_ctx[row7] == 1);
1888                                 } else {
1889                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1890                                     (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1891                                 }
1892                             } else if (!s->above_ref_ctx[col]) {
1893                                 if (!s->left_ref_ctx[row7]) {
1894                                     c = 3;
1895                                 } else {
1896                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1897                                 }
1898                             } else if (!s->left_ref_ctx[row7]) {
1899                                 c = 4 * (s->above_ref_ctx[col] == 1);
1900                             } else {
1901                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1902                                 2 * (s->above_ref_ctx[col] == 1);
1903                             }
1904                         } else {
1905                             if (s->above_intra_ctx[col] ||
1906                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1907                                 c = 2;
1908                             } else if (s->above_comp_ctx[col]) {
1909                                 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1910                             } else {
1911                                 c = 4 * (s->above_ref_ctx[col] == 1);
1912                             }
1913                         }
1914                     } else if (have_l) {
1915                         if (s->left_intra_ctx[row7] ||
1916                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1917                             c = 2;
1918                         } else if (s->left_comp_ctx[row7]) {
1919                             c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1920                         } else {
1921                             c = 4 * (s->left_ref_ctx[row7] == 1);
1922                         }
1923                     } else {
1924                         c = 2;
1925                     }
1926                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1927                     s->counts.single_ref[c][1][bit]++;
1928                     b->ref[0] = 1 + bit;
1929                 }
1930             }
1931         }
1932
             // Blocks of 8x8 and larger carry one inter mode, read before the
             // interpolation filter; sub-8x8 blocks read their per-sub-block
             // modes after the filter (see below).
1933         if (b->bs <= BS_8x8) {
1934             if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].skip_enabled) {
                     // the segment skip feature forces ZEROMV, nothing is read
1935                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1936             } else {
                     // offset into the mode contexts per block size: 3 for
                     // BS_64x64, 1 for BS_32x32, 0 for all other sizes
1937                 static const uint8_t off[10] = {
1938                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1939                 };
1940
1941                 // FIXME this needs to use the LUT tables from find_ref_mvs
1942                 // because not all are -1,0/0,-1
1943                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1944                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1945
1946                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1947                                               s->prob.p.mv_mode[c]);
1948                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
                     // NOTE(review): "- 10" presumably rebases the inter mode
                     // values to a zero-based counter index; confirm against
                     // the mode enum.
1949                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1950             }
1951         }
1952
             // Interpolation filter: coded per block only if the frame filter
             // mode is FILTER_SWITCHABLE. The context comes from the filter of
             // those neighbours whose mode context is >= NEARESTMV (presumably
             // i.e. inter-coded); 3 is used if there is none or they disagree.
1953         if (s->filtermode == FILTER_SWITCHABLE) {
1954             int c;
1955
1956             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1957                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1958                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1959                         s->left_filter_ctx[row7] : 3;
1960                 } else {
1961                     c = s->above_filter_ctx[col];
1962                 }
1963             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1964                 c = s->left_filter_ctx[row7];
1965             } else {
1966                 c = 3;
1967             }
1968
                 // filter_id is the coded symbol (kept for the filter context),
                 // b->filter the filter it maps to through vp9_filter_lut
1969             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1970                                          s->prob.p.filter[c]);
1971             s->counts.filter[c][filter_id]++;
1972             b->filter = vp9_filter_lut[filter_id];
1973         } else {
1974             b->filter = s->filtermode;
1975         }
1976
             // Motion vectors. Sub-8x8 blocks read a mode and fetch a motion
             // vector pair per coded sub-block (fill_mv() gets the sub-block
             // index); sub-blocks that are not coded separately for the given
             // shape copy mode and vectors from their sibling.
1977         if (b->bs > BS_8x8) {
1978             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1979
1980             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1981                                           s->prob.p.mv_mode[c]);
1982             s->counts.mv_mode[c][b->mode[0] - 10]++;
1983             fill_mv(s, b->mv[0], b->mode[0], 0);
1984
1985             if (b->bs != BS_8x4) {
1986                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1987                                               s->prob.p.mv_mode[c]);
1988                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1989                 fill_mv(s, b->mv[1], b->mode[1], 1);
1990             } else {
1991                 b->mode[1] = b->mode[0];
1992                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1993                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1994             }
1995
1996             if (b->bs != BS_4x8) {
1997                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1998                                               s->prob.p.mv_mode[c]);
1999                 s->counts.mv_mode[c][b->mode[2] - 10]++;
2000                 fill_mv(s, b->mv[2], b->mode[2], 2);
2001
2002                 if (b->bs != BS_8x4) {
2003                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2004                                                   s->prob.p.mv_mode[c]);
2005                     s->counts.mv_mode[c][b->mode[3] - 10]++;
2006                     fill_mv(s, b->mv[3], b->mode[3], 3);
2007                 } else {
2008                     b->mode[3] = b->mode[2];
2009                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
2010                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
2011                 }
2012             } else {
2013                 b->mode[2] = b->mode[0];
2014                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2015                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2016                 b->mode[3] = b->mode[1];
2017                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2018                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
2019             }
2020         } else {
                 // one motion vector pair for the whole block (sub-block index
                 // -1), replicated to all four slots
2021             fill_mv(s, b->mv[0], b->mode[0], -1);
2022             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2023             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2024             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2025             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2026             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2027             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2028         }
2029
             // Reference stored in the above/left ref contexts: ref[0] for
             // single prediction; for compound prediction the entry selected
             // by signbias[varcompref[0]] (presumably the variable reference
             // slot; confirm).
2030         vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
2031     }
2032
     // SPLAT_CTX(var, val, n): store val into the n consecutive bytes that start
     // at var, using aligned 16/32/64-bit writes; multiplying val by 0x0101...
     // replicates it into every byte of the word (assumes val fits in one
     // byte). Without fast 64-bit support, 8 and 16 byte runs are written as
     // 32-bit words.
2033 #if HAVE_FAST_64BIT
2034 #define SPLAT_CTX(var, val, n) \
2035     switch (n) { \
2036     case 1:  var = val;                                    break; \
2037     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
2038     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
2039     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
2040     case 16: { \
2041         uint64_t v64 = val * 0x0101010101010101ULL; \
2042         AV_WN64A(              &var,     v64); \
2043         AV_WN64A(&((uint8_t *) &var)[8], v64); \
2044         break; \
2045     } \
2046     }
2047 #else
2048 #define SPLAT_CTX(var, val, n) \
2049     switch (n) { \
2050     case 1:  var = val;                         break; \
2051     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
2052     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
2053     case 8: { \
2054         uint32_t v32 = val * 0x01010101; \
2055         AV_WN32A(              &var,     v32); \
2056         AV_WN32A(&((uint8_t *) &var)[4], v32); \
2057         break; \
2058     } \
2059     case 16: { \
2060         uint32_t v32 = val * 0x01010101; \
2061         AV_WN32A(              &var,      v32); \
2062         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
2063         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
2064         AV_WN32A(&((uint8_t *) &var)[12], v32); \
2065         break; \
2066     } \
2067     }
2068 #endif
2069
         // Update the above contexts over the block width and the left
         // contexts over the block height (unclipped sizes from bwh_tab[1]).
         // SET_CTXS(dir, off, n) writes skip, transform size and partition
         // context always; intra, comp and mode contexts only on inter frames;
         // reference and (if switchable) filter contexts only for inter blocks.
2070     switch (bwh_tab[1][b->bs][0]) {
2071 #define SET_CTXS(dir, off, n) \
2072     do { \
2073         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
2074         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
2075         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2076         if (!s->keyframe && !s->intraonly) { \
2077             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
2078             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
2079             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
2080             if (!b->intra) { \
2081                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2082                 if (s->filtermode == FILTER_SWITCHABLE) { \
2083                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2084                 } \
2085             } \
2086         } \
2087     } while (0)
2088     case 1: SET_CTXS(above, col, 1); break;
2089     case 2: SET_CTXS(above, col, 2); break;
2090     case 4: SET_CTXS(above, col, 4); break;
2091     case 8: SET_CTXS(above, col, 8); break;
2092     }
2093     switch (bwh_tab[1][b->bs][1]) {
2094     case 1: SET_CTXS(left, row7, 1); break;
2095     case 2: SET_CTXS(left, row7, 2); break;
2096     case 4: SET_CTXS(left, row7, 4); break;
2097     case 8: SET_CTXS(left, row7, 8); break;
2098     }
2099 #undef SPLAT_CTX
2100 #undef SET_CTXS
2101
         // Motion vector contexts (two entries per column/row position), only
         // maintained on inter frames.
2102     if (!s->keyframe && !s->intraonly) {
2103         if (b->bs > BS_8x8) {
                 // sub-8x8 block: the left context gets the vectors of
                 // sub-blocks 1 and 3, the above context those of sub-blocks
                 // 2 and 3
2104             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2105
2106             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2107             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2108             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2109             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2110             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2111             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2112             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2113             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2114         } else {
                 // larger blocks: the vectors of slot 3 are replicated over
                 // the clipped block extent
2115             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2116
2117             for (n = 0; n < w4 * 2; n++) {
2118                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2119                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2120             }
2121             for (n = 0; n < h4 * 2; n++) {
2122                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2123                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
2124             }
2125         }
2126     }
2127
         // Record references and motion vectors (those of slot 3) in the
         // current frame's mv array for every position covered by the block
         // inside the frame. A reference of -1 marks "unused": both for intra
         // blocks, the second one for single-reference blocks.
2128     // FIXME kinda ugly
2129     for (y = 0; y < h4; y++) {
2130         int x, o = (row + y) * s->sb_cols * 8 + col;
2131         struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2132
2133         if (b->intra) {
2134             for (x = 0; x < w4; x++) {
2135                 mv[x].ref[0] =
2136                 mv[x].ref[1] = -1;
2137             }
2138         } else if (b->comp) {
2139             for (x = 0; x < w4; x++) {
2140                 mv[x].ref[0] = b->ref[0];
2141                 mv[x].ref[1] = b->ref[1];
2142                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2143                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2144             }
2145         } else {
2146             for (x = 0; x < w4; x++) {
2147                 mv[x].ref[0] = b->ref[0];
2148                 mv[x].ref[1] = -1;
2149                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2150             }
2151         }
2152     }
2153 }
2154
2155 // FIXME merge cnt/eob arguments?
2156 static av_always_inline int
2157 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2158                         int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2159                         unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2160                         int nnz, const int16_t *scan, const int16_t (*nb)[2],
2161                         const int16_t *band_counts, const int16_t *qmul)
2162 {
2163     int i = 0, band = 0, band_left = band_counts[band];
2164     uint8_t *tp = p[0][nnz];
2165     uint8_t cache[1024];
2166
2167     do {
2168         int val, rc;
2169
2170         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2171         eob[band][nnz][val]++;
2172         if (!val)
2173             break;
2174
2175     skip_eob:
2176         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2177             cnt[band][nnz][0]++;
2178             if (!--band_left)
2179                 band_left = band_counts[++band];
2180             cache[scan[i]] = 0;
2181             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2182             tp = p[band][nnz];
2183             if (++i == n_coeffs)
2184                 break; //invalid input; blocks should end with EOB
2185             goto skip_eob;
2186         }
2187
2188         rc = scan[i];
2189         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2190             cnt[band][nnz][1]++;
2191             val = 1;
2192             cache[rc] = 1;
2193         } else {
2194             // fill in p[3-10] (model fill) - only once per frame for each pos
2195             if (!tp[3])
2196                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2197
2198             cnt[band][nnz][2]++;
2199             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2200                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2201                     cache[rc] = val = 2;
2202                 } else {
2203                     val = 3 + vp56_rac_get_prob(c, tp[5]);
2204                     cache[rc] = 3;
2205                 }
2206             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2207                 cache[rc] = 4;
2208                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2209                     val = 5 + vp56_rac_get_prob(c, 159);
2210                 } else {
2211                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
2212                     val +=      vp56_rac_get_prob(c, 145);
2213                 }
2214             } else { // cat 3-6
2215                 cache[rc] = 5;
2216                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2217                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2218                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
2219                         val +=      (vp56_rac_get_prob(c, 148) << 1);
2220                         val +=       vp56_rac_get_prob(c, 140);
2221                     } else {
2222                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
2223                         val +=      (vp56_rac_get_prob(c, 155) << 2);
2224                         val +=      (vp56_rac_get_prob(c, 140) << 1);
2225                         val +=       vp56_rac_get_prob(c, 135);
2226                     }
2227                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2228                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
2229                     val +=      (vp56_rac_get_prob(c, 157) << 3);
2230                     val +=      (vp56_rac_get_prob(c, 141) << 2);
2231                     val +=      (vp56_rac_get_prob(c, 134) << 1);
2232                     val +=       vp56_rac_get_prob(c, 130);
2233                 } else {
2234                     val = 67;
2235                     if (!is8bitsperpixel) {
2236                         if (bpp == 12) {
2237                             val += vp56_rac_get_prob(c, 255) << 17;
2238                             val += vp56_rac_get_prob(c, 255) << 16;
2239                         }
2240                         val +=  (vp56_rac_get_prob(c, 255) << 15);
2241                         val +=  (vp56_rac_get_prob(c, 255) << 14);
2242                     }
2243                     val +=      (vp56_rac_get_prob(c, 254) << 13);
2244                     val +=      (vp56_rac_get_prob(c, 254) << 12);
2245                     val +=      (vp56_rac_get_prob(c, 254) << 11);
2246                     val +=      (vp56_rac_get_prob(c, 252) << 10);
2247                     val +=      (vp56_rac_get_prob(c, 249) << 9);
2248                     val +=      (vp56_rac_get_prob(c, 243) << 8);
2249                     val +=      (vp56_rac_get_prob(c, 230) << 7);
2250                     val +=      (vp56_rac_get_prob(c, 196) << 6);
2251                     val +=      (vp56_rac_get_prob(c, 177) << 5);
2252                     val +=      (vp56_rac_get_prob(c, 153) << 4);
2253                     val +=      (vp56_rac_get_prob(c, 140) << 3);
2254                     val +=      (vp56_rac_get_prob(c, 133) << 2);
2255                     val +=      (vp56_rac_get_prob(c, 130) << 1);
2256                     val +=       vp56_rac_get_prob(c, 129);
2257                 }
2258             }
2259         }
2260 #define STORE_COEF(c, i, v) do { \
2261     if (is8bitsperpixel) { \
2262         c[i] = v; \
2263     } else { \
2264         AV_WN32A(&c[i * 2], v); \
2265     } \
2266 } while (0)
2267         if (!--band_left)
2268             band_left = band_counts[++band];
2269         if (is_tx32x32)
2270             STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2271         else
2272             STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2273         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2274         tp = p[band][nnz];
2275     } while (++i < n_coeffs);
2276
2277     return i;
2278 }
2279
2280 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2281                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2282                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2283                                 const int16_t (*nb)[2], const int16_t *band_counts,
2284                                 const int16_t *qmul)
2285 {
2286     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2287                                    nnz, scan, nb, band_counts, qmul);
2288 }
2289
2290 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2291                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2292                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2293                                   const int16_t (*nb)[2], const int16_t *band_counts,
2294                                   const int16_t *qmul)
2295 {
2296     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2297                                    nnz, scan, nb, band_counts, qmul);
2298 }
2299
2300 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2301                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2302                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2303                                  const int16_t (*nb)[2], const int16_t *band_counts,
2304                                  const int16_t *qmul)
2305 {
2306     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2307                                    nnz, scan, nb, band_counts, qmul);
2308 }
2309
2310 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2311                                    unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2312                                    uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2313                                    const int16_t (*nb)[2], const int16_t *band_counts,
2314                                    const int16_t *qmul)
2315 {
2316     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2317                                    nnz, scan, nb, band_counts, qmul);
2318 }
2319
2320 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2321 {
2322     VP9Context *s = ctx->priv_data;
2323     VP9Block *b = s->b;
2324     int row = s->row, col = s->col;
2325     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2326     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2327     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2328     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2329     int end_x = FFMIN(2 * (s->cols - col), w4);
2330     int end_y = FFMIN(2 * (s->rows - row), h4);
2331     int n, pl, x, y, res;
2332     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2333     int tx = 4 * s->lossless + b->tx;
2334     const int16_t * const *yscans = vp9_scans[tx];
2335     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2336     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2337     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2338     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2339     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2340     static const int16_t band_counts[4][8] = {
2341         { 1, 2, 3, 4,  3,   16 - 13 },
2342         { 1, 2, 3, 4, 11,   64 - 21 },
2343         { 1, 2, 3, 4, 11,  256 - 21 },
2344         { 1, 2, 3, 4, 11, 1024 - 21 },
2345     };
2346     const int16_t *y_band_counts = band_counts[b->tx];
2347     const int16_t *uv_band_counts = band_counts[b->uvtx];
2348     int bytesperpixel = is8bitsperpixel ? 1 : 2;
2349     int total_coeff = 0;
2350
2351 #define MERGE(la, end, step, rd) \
2352     for (n = 0; n < end; n += step) \
2353         la[n] = !!rd(&la[n])
2354 #define MERGE_CTX(step, rd) \
2355     do { \
2356         MERGE(l, end_y, step, rd); \
2357         MERGE(a, end_x, step, rd); \
2358     } while (0)
2359
2360 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2361     for (n = 0, y = 0; y < end_y; y += step) { \
2362         for (x = 0; x < end_x; x += step, n += step * step) { \
2363             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2364             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2365                                     (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2366                                      c, e, p, a[x] + l[y], yscans[txtp], \
2367                                      ynbs[txtp], y_band_counts, qmul[0]); \
2368             a[x] = l[y] = !!res; \
2369             total_coeff |= !!res; \
2370             if (step >= 4) { \
2371                 AV_WN16A(&s->eob[n], res); \
2372             } else { \
2373                 s->eob[n] = res; \
2374             } \
2375         } \
2376     }
2377
2378 #define SPLAT(la, end, step, cond) \
2379     if (step == 2) { \
2380         for (n = 1; n < end; n += step) \
2381             la[n] = la[n - 1]; \
2382     } else if (step == 4) { \
2383         if (cond) { \
2384             for (n = 0; n < end; n += step) \
2385                 AV_WN32A(&la[n], la[n] * 0x01010101); \
2386         } else { \
2387             for (n = 0; n < end; n += step) \
2388                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2389         } \
2390     } else /* step == 8 */ { \
2391         if (cond) { \
2392             if (HAVE_FAST_64BIT) { \
2393                 for (n = 0; n < end; n += step) \
2394                     AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2395             } else { \
2396                 for (n = 0; n < end; n += step) { \
2397                     uint32_t v32 = la[n] * 0x01010101; \
2398                     AV_WN32A(&la[n],     v32); \
2399                     AV_WN32A(&la[n + 4], v32); \
2400                 } \
2401             } \
2402         } else { \
2403             for (n = 0; n < end; n += step) \
2404                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2405         } \
2406     }
2407 #define SPLAT_CTX(step) \
2408     do { \
2409         SPLAT(a, end_x, step, end_x == w4); \
2410         SPLAT(l, end_y, step, end_y == h4); \
2411     } while (0)
2412
2413     /* y tokens */
2414     switch (b->tx) {
2415     case TX_4X4:
2416         DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2417         break;
2418     case TX_8X8:
2419         MERGE_CTX(2, AV_RN16A);
2420         DECODE_Y_COEF_LOOP(2, 0,);
2421         SPLAT_CTX(2);
2422         break;
2423     case TX_16X16:
2424         MERGE_CTX(4, AV_RN32A);
2425         DECODE_Y_COEF_LOOP(4, 0,);
2426         SPLAT_CTX(4);
2427         break;
2428     case TX_32X32:
2429         MERGE_CTX(8, AV_RN64A);
2430         DECODE_Y_COEF_LOOP(8, 0, 32);
2431         SPLAT_CTX(8);
2432         break;
2433     }
2434
2435 #define DECODE_UV_COEF_LOOP(step, v) \
2436     for (n = 0, y = 0; y < end_y; y += step) { \
2437         for (x = 0; x < end_x; x += step, n += step * step) { \
2438             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2439                                     (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2440                                      16 * step * step, c, e, p, a[x] + l[y], \
2441                                      uvscan, uvnb, uv_band_counts, qmul[1]); \
2442             a[x] = l[y] = !!res; \
2443             total_coeff |= !!res; \
2444             if (step >= 4) { \
2445                 AV_WN16A(&s->uveob[pl][n], res); \
2446             } else { \
2447                 s->uveob[pl][n] = res; \
2448             } \
2449         } \
2450     }
2451
2452     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2453     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2454     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2455     w4 >>= s->ss_h;
2456     end_x >>= s->ss_h;
2457     h4 >>= s->ss_v;
2458     end_y >>= s->ss_v;
2459     for (pl = 0; pl < 2; pl++) {
2460         a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2461         l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2462         switch (b->uvtx) {
2463         case TX_4X4:
2464             DECODE_UV_COEF_LOOP(1,);
2465             break;
2466         case TX_8X8:
2467             MERGE_CTX(2, AV_RN16A);
2468             DECODE_UV_COEF_LOOP(2,);
2469             SPLAT_CTX(2);
2470             break;
2471         case TX_16X16:
2472             MERGE_CTX(4, AV_RN32A);
2473             DECODE_UV_COEF_LOOP(4,);
2474             SPLAT_CTX(4);
2475             break;
2476         case TX_32X32:
2477             MERGE_CTX(8, AV_RN64A);
2478             DECODE_UV_COEF_LOOP(8, 32);
2479             SPLAT_CTX(8);
2480             break;
2481         }
2482     }
2483
2484     return total_coeff;
2485 }
2486
2487 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2488 {
2489     return decode_coeffs(ctx, 1);
2490 }
2491
2492 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2493 {
2494     return decode_coeffs(ctx, 0);
2495 }
2496
2497 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2498                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2499                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2500                                              uint8_t *l, int col, int x, int w,
2501                                              int row, int y, enum TxfmMode tx,
2502                                              int p, int ss_h, int ss_v, int bytesperpixel)
2503 {
2504     int have_top = row > 0 || y > 0;
2505     int have_left = col > s->tiling.tile_col_start || x > 0;
2506     int have_right = x < w - 1;
2507     int bpp = s->bpp;
2508     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2509         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2510                                    { DC_127_PRED,          VERT_PRED } },
2511         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2512                                    { HOR_PRED,             HOR_PRED } },
2513         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2514                                    { LEFT_DC_PRED,         DC_PRED } },
2515         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2516                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2517         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2518                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2519         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2520                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2521         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2522                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2523         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2524                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2525         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2526                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2527         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2528                                    { HOR_PRED,             TM_VP8_PRED } },
2529     };
2530     static const struct {
2531         uint8_t needs_left:1;
2532         uint8_t needs_top:1;
2533         uint8_t needs_topleft:1;
2534         uint8_t needs_topright:1;
2535         uint8_t invert_left:1;
2536     } edges[N_INTRA_PRED_MODES] = {
2537         [VERT_PRED]            = { .needs_top  = 1 },
2538         [HOR_PRED]             = { .needs_left = 1 },
2539         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2540         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2541         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2542         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2543         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2544         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2545         [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
2546         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2547         [LEFT_DC_PRED]         = { .needs_left = 1 },
2548         [TOP_DC_PRED]          = { .needs_top  = 1 },
2549         [DC_128_PRED]          = { 0 },
2550         [DC_127_PRED]          = { 0 },
2551         [DC_129_PRED]          = { 0 }
2552     };
2553
2554     av_assert2(mode >= 0 && mode < 10);
2555     mode = mode_conv[mode][have_left][have_top];
2556     if (edges[mode].needs_top) {
2557         uint8_t *top, *topleft;
2558         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2559         int n_px_need_tr = 0;
2560
2561         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2562             n_px_need_tr = 4;
2563
2564         // if top of sb64-row, use s->intra_pred_data[] instead of
2565         // dst[-stride] for intra prediction (it contains pre- instead of
2566         // post-loopfilter data)
2567         if (have_top) {
2568             top = !(row & 7) && !y ?
2569                 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2570                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2571             if (have_left)
2572                 topleft = !(row & 7) && !y ?
2573                     s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2574                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2575                     &dst_inner[-stride_inner];
2576         }
2577
2578         if (have_top &&
2579             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2580             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2581             n_px_need + n_px_need_tr <= n_px_have) {
2582             *a = top;
2583         } else {
2584             if (have_top) {
2585                 if (n_px_need <= n_px_have) {
2586                     memcpy(*a, top, n_px_need * bytesperpixel);
2587                 } else {
2588 #define memset_bpp(c, i1, v, i2, num) do { \
2589     if (bytesperpixel == 1) { \
2590         memset(&(c)[(i1)], (v)[(i2)], (num)); \
2591     } else { \
2592         int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2593         for (n = 0; n < (num); n++) { \
2594             AV_WN16A(&(c)[((i1) + n) * 2], val); \
2595         } \
2596     } \
2597 } while (0)
2598                     memcpy(*a, top, n_px_have * bytesperpixel);
2599                     memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2600                 }
2601             } else {
2602 #define memset_val(c, val, num) do { \
2603     if (bytesperpixel == 1) { \
2604         memset((c), (val), (num)); \
2605     } else { \
2606         int n; \
2607         for (n = 0; n < (num); n++) { \
2608             AV_WN16A(&(c)[n * 2], (val)); \
2609         } \
2610     } \
2611 } while (0)
2612                 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2613             }
2614             if (edges[mode].needs_topleft) {
2615                 if (have_left && have_top) {
2616 #define assign_bpp(c, i1, v, i2) do { \
2617     if (bytesperpixel == 1) { \
2618         (c)[(i1)] = (v)[(i2)]; \
2619     } else { \
2620         AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2621     } \
2622 } while (0)
2623                     assign_bpp(*a, -1, topleft, -1);
2624                 } else {
2625 #define assign_val(c, i, v) do { \
2626     if (bytesperpixel == 1) { \
2627         (c)[(i)] = (v); \
2628     } else { \
2629         AV_WN16A(&(c)[(i) * 2], (v)); \
2630     } \
2631 } while (0)
2632                     assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2633                 }
2634             }
2635             if (tx == TX_4X4 && edges[mode].needs_topright) {
2636                 if (have_top && have_right &&
2637                     n_px_need + n_px_need_tr <= n_px_have) {
2638                     memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2639                 } else {
2640                     memset_bpp(*a, 4, *a, 3, 4);
2641                 }
2642             }
2643         }
2644     }
2645     if (edges[mode].needs_left) {
2646         if (have_left) {
2647             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2648             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2649             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2650
2651             if (edges[mode].invert_left) {
2652                 if (n_px_need <= n_px_have) {
2653                     for (i = 0; i < n_px_need; i++)
2654                         assign_bpp(l, i, &dst[i * stride], -1);
2655                 } else {
2656                     for (i = 0; i < n_px_have; i++)
2657                         assign_bpp(l, i, &dst[i * stride], -1);
2658                     memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2659                 }
2660             } else {
2661                 if (n_px_need <= n_px_have) {
2662                     for (i = 0; i < n_px_need; i++)
2663                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2664                 } else {
2665                     for (i = 0; i < n_px_have; i++)
2666                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2667                     memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2668                 }
2669             }
2670         } else {
2671             memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2672         }
2673     }
2674
2675     return mode;
2676 }
2677
2678 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2679                                          ptrdiff_t uv_off, int bytesperpixel)
2680 {
2681     VP9Context *s = ctx->priv_data;
2682     VP9Block *b = s->b;
2683     int row = s->row, col = s->col;
2684     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2685     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2686     int end_x = FFMIN(2 * (s->cols - col), w4);
2687     int end_y = FFMIN(2 * (s->rows - row), h4);
2688     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2689     int uvstep1d = 1 << b->uvtx, p;
2690     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2691     LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2692     LOCAL_ALIGNED_32(uint8_t, l, [64]);
2693
2694     for (n = 0, y = 0; y < end_y; y += step1d) {
2695         uint8_t *ptr = dst, *ptr_r = dst_r;
2696         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2697                                ptr_r += 4 * step1d * bytesperpixel, n += step) {
2698             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2699                                y * 2 + x : 0];
2700             uint8_t *a = &a_buf[32];
2701             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2702             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2703
2704             mode = check_intra_mode(s, mode, &a, ptr_r,
2705                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2706                                     ptr, s->y_stride, l,
2707                                     col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2708             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2709             if (eob)
2710                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2711                                            s->block + 16 * n * bytesperpixel, eob);
2712         }
2713         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2714         dst   += 4 * step1d * s->y_stride;
2715     }
2716
2717     // U/V
2718     w4 >>= s->ss_h;
2719     end_x >>= s->ss_h;
2720     end_y >>= s->ss_v;
2721     step = 1 << (b->uvtx * 2);
2722     for (p = 0; p < 2; p++) {
2723         dst   = s->dst[1 + p];
2724         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2725         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2726             uint8_t *ptr = dst, *ptr_r = dst_r;
2727             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2728                                    ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2729                 int mode = b->uvmode;
2730                 uint8_t *a = &a_buf[32];
2731                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2732
2733                 mode = check_intra_mode(s, mode, &a, ptr_r,
2734                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2735                                         ptr, s->uv_stride, l, col, x, w4, row, y,
2736                                         b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2737                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2738                 if (eob)
2739                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2740                                                     s->uvblock[p] + 16 * n * bytesperpixel, eob);
2741             }
2742             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2743             dst   += 4 * uvstep1d * s->uv_stride;
2744         }
2745     }
2746 }
2747
2748 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2749 {
2750     intra_recon(ctx, y_off, uv_off, 1);
2751 }
2752
2753 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2754 {
2755     intra_recon(ctx, y_off, uv_off, 2);
2756 }
2757
2758 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2759                                               uint8_t *dst, ptrdiff_t dst_stride,
2760                                               const uint8_t *ref, ptrdiff_t ref_stride,
2761                                               ThreadFrame *ref_frame,
2762                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2763                                               int bw, int bh, int w, int h, int bytesperpixel)
2764 {
2765     int mx = mv->x, my = mv->y, th;
2766
2767     y += my >> 3;
2768     x += mx >> 3;
2769     ref += y * ref_stride + x * bytesperpixel;
2770     mx &= 7;
2771     my &= 7;
2772     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2773     // we use +7 because the last 7 pixels of each sbrow can be changed in
2774     // the longest loopfilter of the next sbrow
2775     th = (y + bh + 4 * !!my + 7) >> 6;
2776     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2777     if (x < !!mx * 3 || y < !!my * 3 ||
2778         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2779         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2780                                  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2781                                  160, ref_stride,
2782                                  bw + !!mx * 7, bh + !!my * 7,
2783                                  x - !!mx * 3, y - !!my * 3, w, h);
2784         ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2785         ref_stride = 160;
2786     }
2787     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2788 }
2789
2790 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2791                                                 uint8_t *dst_u, uint8_t *dst_v,
2792                                                 ptrdiff_t dst_stride,
2793                                                 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2794                                                 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2795                                                 ThreadFrame *ref_frame,
2796                                                 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2797                                                 int bw, int bh, int w, int h, int bytesperpixel)
2798 {
2799     int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2800
2801     y += my >> 4;
2802     x += mx >> 4;
2803     ref_u += y * src_stride_u + x * bytesperpixel;
2804     ref_v += y * src_stride_v + x * bytesperpixel;
2805     mx &= 15;
2806     my &= 15;
2807     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2808     // we use +7 because the last 7 pixels of each sbrow can be changed in
2809     // the longest loopfilter of the next sbrow
2810     th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2811     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2812     if (x < !!mx * 3 || y < !!my * 3 ||
2813         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2814         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2815                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2816                                  160, src_stride_u,
2817                                  bw + !!mx * 7, bh + !!my * 7,
2818                                  x - !!mx * 3, y - !!my * 3, w, h);
2819         ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2820         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2821
2822         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2823                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2824                                  160, src_stride_v,
2825                                  bw + !!mx * 7, bh + !!my * 7,
2826                                  x - !!mx * 3, y - !!my * 3, w, h);
2827         ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2828         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2829     } else {
2830         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2831         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2832     }
2833 }
2834
/*
 * Instantiate vp9_mc_template.c for the unscaled path, once with
 * BYTES_PER_PIXEL 1 (functions suffixed _8bpp) and once with
 * BYTES_PER_PIXEL 2 (functions suffixed _16bpp).
 *
 * mc_luma_dir/mc_chroma_dir forward to the unscaled helpers and drop the
 * px/py/pw/ph/i arguments, which are not used there. Note that "mc" is a
 * macro parameter, so "s->dsp.mc" expands to the dsp member named by the
 * argument supplied at the call site.
 */
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
                    px, py, pw, ph, bw, bh, w, h, i) \
    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                     mv, bw, bh, w, h, bytesperpixel)
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                       row, col, mv, bw, bh, w, h, bytesperpixel)
#define SCALED 0
#define FN(x) x##_8bpp
#define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c"
#undef FN
#undef BYTES_PER_PIXEL
#define FN(x) x##_16bpp
#define BYTES_PER_PIXEL 2
#include "vp9_mc_template.c"
// drop all template parameters again so the scaled instantiation can
// redefine them
#undef mc_luma_dir
#undef mc_chroma_dir
#undef FN
#undef BYTES_PER_PIXEL
#undef SCALED
2857
2858 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2859                                             vp9_mc_func (*mc)[2],
2860                                             uint8_t *dst, ptrdiff_t dst_stride,
2861                                             const uint8_t *ref, ptrdiff_t ref_stride,
2862                                             ThreadFrame *ref_frame,
2863                                             ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2864                                             int px, int py, int pw, int ph,
2865                                             int bw, int bh, int w, int h, int bytesperpixel,
2866                                             const uint16_t *scale, const uint8_t *step)
2867 {
2868     if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2869         s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2870         mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2871                          y, x, in_mv, bw, bh, w, h, bytesperpixel);
2872     } else {
2873 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2874     int mx, my;
2875     int refbw_m1, refbh_m1;
2876     int th;
2877     VP56mv mv;
2878
2879     mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2880     mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2881     // BUG libvpx seems to scale the two components separately. This introduces
2882     // rounding errors but we have to reproduce them to be exactly compatible
2883     // with the output from libvpx...
2884     mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2885     my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2886
2887     y = my >> 4;
2888     x = mx >> 4;
2889     ref += y * ref_stride + x * bytesperpixel;
2890     mx &= 15;
2891     my &= 15;
2892     refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2893     refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2894     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2895     // we use +7 because the last 7 pixels of each sbrow can be changed in
2896     // the longest loopfilter of the next sbrow
2897     th = (y + refbh_m1 + 4 + 7) >> 6;
2898     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2899     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2900         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2901                                  ref - 3 * ref_stride - 3 * bytesperpixel,
2902                                  288, ref_stride,
2903                                  refbw_m1 + 8, refbh_m1 + 8,
2904                                  x - 3, y - 3, w, h);
2905         ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2906         ref_stride = 288;
2907     }
2908     smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2909     }
2910 }
2911
2912 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2913                                               vp9_mc_func (*mc)[2],
2914                                               uint8_t *dst_u, uint8_t *dst_v,
2915                                               ptrdiff_t dst_stride,
2916                                               const uint8_t *ref_u, ptrdiff_t src_stride_u,
2917                                               const uint8_t *ref_v, ptrdiff_t src_stride_v,
2918                                               ThreadFrame *ref_frame,
2919                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2920                                               int px, int py, int pw, int ph,
2921                                               int bw, int bh, int w, int h, int bytesperpixel,
2922                                               const uint16_t *scale, const uint8_t *step)
2923 {
2924     if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2925         s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2926         mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2927                            ref_v, src_stride_v, ref_frame,
2928                            y, x, in_mv, bw, bh, w, h, bytesperpixel);
2929     } else {
2930     int mx, my;
2931     int refbw_m1, refbh_m1;
2932     int th;
2933     VP56mv mv;
2934
2935     if (s->ss_h) {
2936         // BUG https://code.google.com/p/webm/issues/detail?id=820
2937         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2938         mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2939     } else {
2940         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2941         mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
2942     }
2943     if (s->ss_v) {
2944         // BUG https://code.google.com/p/webm/issues/detail?id=820
2945         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2946         my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2947     } else {
2948         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2949         my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2950     }
2951 #undef scale_mv
2952     y = my >> 4;
2953     x = mx >> 4;
2954     ref_u += y * src_stride_u + x * bytesperpixel;
2955     ref_v += y * src_stride_v + x * bytesperpixel;
2956     mx &= 15;
2957     my &= 15;
2958     refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2959     refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2960     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2961     // we use +7 because the last 7 pixels of each sbrow can be changed in
2962     // the longest loopfilter of the next sbrow
2963     th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2964     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2965     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2966         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2967                                  ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2968                                  288, src_stride_u,
2969                                  refbw_m1 + 8, refbh_m1 + 8,
2970                                  x - 3, y - 3, w, h);
2971         ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2972         smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2973
2974         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2975                                  ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2976                                  288, src_stride_v,
2977                                  refbw_m1 + 8, refbh_m1 + 8,
2978                                  x - 3, y - 3, w, h);
2979         ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2980         smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2981     } else {
2982         smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2983         smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2984     }
2985     }
2986 }
2987
/*
 * Instantiate vp9_mc_template.c for the scaled path, once with
 * BYTES_PER_PIXEL 1 (functions suffixed _scaled_8bpp) and once with
 * BYTES_PER_PIXEL 2 (functions suffixed _scaled_16bpp).
 *
 * mc_luma_dir/mc_chroma_dir forward to the scaled helpers and add the
 * per-reference scale/step tables selected by b->ref[i]. "s" and "mc" are
 * macro parameters: "s##mc" pastes the two arguments into a single dsp
 * member name, and "s->dsp.mc" expands to the member named by the mc
 * argument.
 */
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
                    px, py, pw, ph, bw, bh, w, h, i) \
    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define SCALED 1
#define FN(x) x##_scaled_8bpp
#define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c"
#undef FN
#undef BYTES_PER_PIXEL
#define FN(x) x##_scaled_16bpp
#define BYTES_PER_PIXEL 2
#include "vp9_mc_template.c"
// template parameters are not needed past this point
#undef mc_luma_dir
#undef mc_chroma_dir
#undef FN
#undef BYTES_PER_PIXEL
#undef SCALED
3012
3013 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3014 {
3015     VP9Context *s = ctx->priv_data;
3016     VP9Block *b = s->b;
3017     int row = s->row, col = s->col;
3018
3019     if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3020         if (bytesperpixel == 1) {
3021             inter_pred_scaled_8bpp(ctx);
3022         } else {
3023             inter_pred_scaled_16bpp(ctx);
3024         }
3025     } else {
3026         if (bytesperpixel == 1) {
3027             inter_pred_8bpp(ctx);
3028         } else {
3029             inter_pred_16bpp(ctx);
3030         }
3031     }
3032     if (!b->skip) {
3033         /* mostly copied intra_recon() */
3034
3035         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3036         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3037         int end_x = FFMIN(2 * (s->cols - col), w4);
3038         int end_y = FFMIN(2 * (s->rows - row), h4);
3039         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
3040         int uvstep1d = 1 << b->uvtx, p;
3041         uint8_t *dst = s->dst[0];
3042
3043         // y itxfm add
3044         for (n = 0, y = 0; y < end_y; y += step1d) {
3045             uint8_t *ptr = dst;
3046             for (x = 0; x < end_x; x += step1d,
3047                  ptr += 4 * step1d * bytesperpixel, n += step) {
3048                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3049
3050                 if (eob)
3051                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3052                                                   s->block + 16 * n * bytesperpixel, eob);
3053             }
3054             dst += 4 * s->y_stride * step1d;
3055         }
3056
3057         // uv itxfm add
3058         end_x >>= s->ss_h;
3059         end_y >>= s->ss_v;
3060         step = 1 << (b->uvtx * 2);
3061         for (p = 0; p < 2; p++) {
3062             dst = s->dst[p + 1];
3063             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3064                 uint8_t *ptr = dst;
3065                 for (x = 0; x < end_x; x += uvstep1d,
3066                      ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3067                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3068
3069                     if (eob)
3070                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3071                                                         s->uvblock[p] + 16 * n * bytesperpixel, eob);
3072                 }
3073                 dst += 4 * uvstep1d * s->uv_stride;
3074             }
3075         }
3076     }
3077 }
3078
3079 static void inter_recon_8bpp(AVCodecContext *ctx)
3080 {
3081     inter_recon(ctx, 1);
3082 }
3083
3084 static void inter_recon_16bpp(AVCodecContext *ctx)
3085 {
3086     inter_recon(ctx, 2);
3087 }
3088
/**
 * OR the loopfilter edge bits of one block into a plane's mask array.
 *
 * Each mask entry is a bit field over columns: bit (col_and_7 + i) stands
 * for the i-th column covered by the block. The second index is the row
 * (row_and_7 .. row_and_7 + h - 1) and the third selects a filter class.
 * NOTE(review): mask[0] appears to collect the per-row left/inner column
 * edges and mask[1] the top/inner row edges, with the last index going from
 * the widest filter (0) to the narrowest (2) and 3 for inner 4x4 edges --
 * confirm against the loopfilter code that consumes the masks.
 *
 * @param ss_h, ss_v   subsampling flags of the plane (both 0 for luma)
 * @param row_and_7, col_and_7  block position inside the mask array
 * @param w, h         number of columns/rows of the mask covered by the block
 * @param col_end, row_end  non-zero when the caller signals an odd frame
 *                     edge; used to special-case subsampled planes
 * @param tx           transform size of the block in this plane
 * @param skip_inter   non-zero for a skipped inter block, for which only the
 *                     outer block edges are marked
 */
static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
                                        int row_and_7, int col_and_7,
                                        int w, int h, int col_end, int row_end,
                                        enum TxfmMode tx, int skip_inter)
{
    // indexed by ss_h / ss_v respectively
    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };

    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)

    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
    // edges. This means that for UV, we work on two subsampled blocks at
    // a time, and we only use the topleft block's mode information to set
    // things like block strength. Thus, for any block size smaller than
    // 16x16, ignore the odd portion of the block.
    if (tx == TX_4X4 && (ss_v | ss_h)) {
        if (h == ss_v) {
            if (row_and_7 & 1)
                return;
            if (!row_end)
                h += 1;
        }
        if (w == ss_h) {
            if (col_and_7 & 1)
                return;
            if (!col_end)
                w += 1;
        }
    }

    if (tx == TX_4X4 && !skip_inter) {
        // t is the bit of the block's first column, m_col has one bit set
        // for each of the w columns covered by the block
        int t = 1 << col_and_7, m_col = (t << w) - t, y;
        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
        int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;

        for (y = row_and_7; y < h + row_and_7; y++) {
            // 1 on rows where (y & wide_filter_row_mask) is zero, else 2
            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);

            mask[0][y][1] |= m_row_8;
            mask[0][y][2] |= m_row_4;
            // for odd lines, if the odd col is not being filtered,
            // skip odd row also:
            // .---. <-- a
            // |   |
            // |___| <-- b
            // ^   ^
            // c   d
            //
            // if a/c are even row/col and b/d are odd, and d is skipped,
            // e.g. right edge of size-66x66.webm, then skip b also (bug)
            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
                mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
            } else {
                mask[1][y][col_mask_id] |= m_col;
            }
            // index 3 is only written for directions that are not subsampled
            if (!ss_h)
                mask[0][y][3] |= m_col;
            if (!ss_v) {
                if (ss_h && (col_end & 1))
                    mask[1][y][3] |= (t << (w - 1)) - t;
                else
                    mask[1][y][3] |= m_col;
            }
        }
    } else {
        int y, t = 1 << col_and_7, m_col = (t << w) - t;

        if (!skip_inter) {
            int mask_id = (tx == TX_8X8);
            // keeps every 1st, 2nd, 4th or 8th column bit
            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
            int l2 = tx + ss_h - 1, step1d;
            int m_row = m_col & masks[l2];

            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
            // 8wd loopfilter to prevent going off the visible edge.
            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
                int m_row_8 = m_row - m_row_16;

                for (y = row_and_7; y < h + row_and_7; y++) {
                    mask[0][y][0] |= m_row_16;
                    mask[0][y][1] |= m_row_8;
                }
            } else {
                for (y = row_and_7; y < h + row_and_7; y++)
                    mask[0][y][mask_id] |= m_row;
            }

            // same treatment in the other direction: only every step1d-th
            // row gets its bits set in mask[1]
            l2 = tx + ss_v - 1;
            step1d = 1 << l2;
            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
                    mask[1][y][0] |= m_col;
                if (y - row_and_7 == h - 1)
                    mask[1][y][1] |= m_col;
            } else {
                for (y = row_and_7; y < h + row_and_7; y += step1d)
                    mask[1][y][mask_id] |= m_col;
            }
        } else if (tx != TX_4X4) {
            // skipped inter block: mark only the first row in mask[1] and
            // the first column bit of every row in mask[0]
            int mask_id;

            mask_id = (tx == TX_8X8) || (h == ss_v);
            mask[1][row_and_7][mask_id] |= m_col;
            mask_id = (tx == TX_8X8) || (w == ss_h);
            for (y = row_and_7; y < h + row_and_7; y++)
                mask[0][y][mask_id] |= t;
        } else {
            // skipped inter block with 4x4 transform: as above, but split
            // between filter classes 1 and 2
            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                mask[0][y][2] |= t4;
                mask[0][y][1] |= t8;
            }
            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
        }
    }
}
3210
/**
 * Decode and/or reconstruct a single block.
 *
 * Depending on s->pass the function parses the block (mode and
 * coefficients), reconstructs it (intra or inter), or both:
 *  - pass < 2 parses; in pass 1 the per-block buffers are then advanced and
 *    the function returns without reconstructing;
 *  - pass 2 skips parsing, reconstructs from the stored block data and
 *    advances the per-block buffers at the end.
 * After reconstruction the loopfilter level and edge masks for the block
 * are stored in lflvl.
 *
 * @param row, col     block position, stored in s->row / s->col
 * @param lflvl        loopfilter level/mask storage to update
 * @param yoff, uvoff  byte offsets of the block inside the luma and chroma
 *                     planes of the current frame
 * @param bl, bp       block level and partition, combined into the block
 *                     size as bl * 3 + bp
 */
static void decode_b(AVCodecContext *ctx, int row, int col,
                     struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
                     enum BlockLevel bl, enum BlockPartition bp)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    enum BlockSize bs = bl * 3 + bp;
    int bytesperpixel = s->bytesperpixel;
    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
    int emu[2];
    AVFrame *f = s->frames[CUR_FRAME].tf.f;

    s->row = row;
    s->row7 = row & 7;
    s->col = col;
    s->col7 = col & 7;
    // motion vector limits derived from the block position and size
    s->min_mv.x = -(128 + col * 64);
    s->min_mv.y = -(128 + row * 64);
    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
    if (s->pass < 2) {
        b->bs = bs;
        b->bl = bl;
        b->bp = bp;
        decode_mode(ctx);
        // the chroma transform is one size smaller than the luma one when
        // the luma transform spans the whole block in a subsampled direction
        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
                           (s->ss_v && h4 * 2 == (1 << b->tx)));

        if (!b->skip) {
            int has_coeffs;

            if (bytesperpixel == 1) {
                has_coeffs = decode_coeffs_8bpp(ctx);
            } else {
                has_coeffs = decode_coeffs_16bpp(ctx);
            }
            // an inter block of size 8x8 or larger without any coefficient
            // is turned into a skipped block after the fact
            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
                b->skip = 1;
                memset(&s->above_skip_ctx[col], 1, w4);
                memset(&s->left_skip_ctx[s->row7], 1, h4);
            }
        } else {
            int row7 = s->row7;

// SPLAT_ZERO_CTX clears n bytes starting at v with a single store;
// SPLAT_ZERO_YUV applies it to the luma context and to both chroma
// contexts, halving the chroma span when that direction is subsampled
#define SPLAT_ZERO_CTX(v, n) \
    switch (n) { \
    case 1:  v = 0;          break; \
    case 2:  AV_ZERO16(&v);  break; \
    case 4:  AV_ZERO32(&v);  break; \
    case 8:  AV_ZERO64(&v);  break; \
    case 16: AV_ZERO128(&v); break; \
    }
#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
    do { \
        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
        if (s->ss_##dir2) { \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
        } else { \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
        } \
    } while (0)

            // skipped block: reset the non-zero contexts above and to the
            // left of it
            switch (w4) {
            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
            }
            switch (h4) {
            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
            }
        }
        if (s->pass == 1) {
            // parse-only pass: step to the next block's storage and leave
            // reconstruction to pass 2
            s->b++;
            s->block += w4 * h4 * 64 * bytesperpixel;
            s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->eob += 4 * w4 * h4;
            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);

            return;
        }
    }

    // emulated overhangs if the stride of the target buffer can't hold. This
    // makes it possible to support emu-edge and so on even if we have large block
    // overhangs
    emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
             (row + h4) > s->rows;
    emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
             (row + h4) > s->rows;
    if (emu[0]) {
        s->dst[0] = s->tmp_y;
        s->y_stride = 128;
    } else {
        s->dst[0] = f->data[0] + yoff;
        s->y_stride = f->linesize[0];
    }
    if (emu[1]) {
        s->dst[1] = s->tmp_uv[0];
        s->dst[2] = s->tmp_uv[1];
        s->uv_stride = 128;
    } else {
        s->dst[1] = f->data[1] + uvoff;
        s->dst[2] = f->data[2] + uvoff;
        s->uv_stride = f->linesize[1];
    }
    if (b->intra) {
        if (s->bpp > 8) {
            intra_recon_16bpp(ctx, yoff, uvoff);
        } else {
            intra_recon_8bpp(ctx, yoff, uvoff);
        }
    } else {
        if (s->bpp > 8) {
            inter_recon_16bpp(ctx);
        } else {
            inter_recon_8bpp(ctx);
        }
    }
    // copy the visible part of an emulated block from the temporary buffer
    // into the frame: the visible width is decomposed into power-of-two
    // column strips (64, 32, ... pixels), one mc call per set bit of w
    if (emu[0]) {
        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;

        for (n = 0; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
                                         s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
                o += bw;
            }
        }
    }
    if (emu[1]) {
        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;

        for (n = s->ss_h; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
                                         s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
                                         s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
                o += bw;
            }
        }
    }

    // pick filter level and find edges to apply filter to
    if (s->filter.level &&
        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                    [b->mode[3] != ZEROMV]) > 0) {
        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;

        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
        // chroma gets its own masks only when it is subsampled; the
        // col_end/row_end arguments are non-zero only for a block touching
        // the right/bottom edge of a frame with an odd number of cols/rows
        if (s->ss_h || s->ss_v)
            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                       b->uvtx, skip_inter);

        // the limit tables are filled lazily, the first time a level is used
        if (!s->filter.lim_lut[lvl]) {
            int sharp = s->filter.sharpness;
            int limit = lvl;

            if (sharp > 0) {
                limit >>= (sharp + 3) >> 2;
                limit = FFMIN(limit, 9 - sharp);
            }
            limit = FFMAX(limit, 1);

            s->filter.lim_lut[lvl] = limit;
            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
        }
    }

    if (s->pass == 2) {
        // reconstruction pass: step to the next block's stored data
        s->b++;
        s->block += w4 * h4 * 64 * bytesperpixel;
        s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->eob += 4 * w4 * h4;
        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
    }
}
3409
3410 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3411                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3412 {
3413     VP9Context *s = ctx->priv_data;
3414     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3415             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3416     const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
3417                                                      s->prob.p.partition[bl][c];
3418     enum BlockPartition bp;
3419     ptrdiff_t hbs = 4 >> bl;
3420     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3421     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3422     int bytesperpixel = s->bytesperpixel;
3423
3424     if (bl == BL_8X8) {
3425         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3426         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3427     } else if (col + hbs < s->cols) { // FIXME why not <=?
3428         if (row + hbs < s->rows) { // FIXME why not <=?
3429             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3430             switch (bp) {
3431             case PARTITION_NONE:
3432                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3433                 break;
3434             case PARTITION_H:
3435                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3436                 yoff  += hbs * 8 * y_stride;
3437                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3438                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3439                 break;
3440             case PARTITION_V:
3441                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3442                 yoff  += hbs * 8 * bytesperpixel;
3443                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3444                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3445                 break;
3446             case PARTITION_SPLIT:
3447                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3448                 decode_sb(ctx, row, col + hbs, lflvl,
3449                           yoff + 8 * hbs * bytesperpixel,
3450                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3451                 yoff  += hbs * 8 * y_stride;
3452                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3453                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3454                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3455                           yoff + 8 * hbs * bytesperpixel,
3456                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3457                 break;
3458             default:
3459                 av_assert0(0);
3460             }
3461         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3462             bp = PARTITION_SPLIT;
3463             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3464             decode_sb(ctx, row, col + hbs, lflvl,
3465                       yoff + 8 * hbs * bytesperpixel,
3466                       uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3467         } else {
3468             bp = PARTITION_H;
3469             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3470         }
3471     } else if (row + hbs < s->rows) { // FIXME why not <=?
3472         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3473             bp = PARTITION_SPLIT;
3474             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3475             yoff  += hbs * 8 * y_stride;
3476             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3477             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3478         } else {
3479             bp = PARTITION_V;
3480             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3481         }
3482     } else {
3483         bp = PARTITION_SPLIT;
3484         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3485     }
3486     s->counts.partition[bl][c][bp]++;
3487 }
3488
3489 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3490                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3491 {
3492     VP9Context *s = ctx->priv_data;
3493     VP9Block *b = s->b;
3494     ptrdiff_t hbs = 4 >> bl;
3495     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3496     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3497     int bytesperpixel = s->bytesperpixel;
3498
3499     if (bl == BL_8X8) {
3500         av_assert2(b->bl == BL_8X8);
3501         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3502     } else if (s->b->bl == bl) {
3503         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3504         if (b->bp == PARTITION_H && row + hbs < s->rows) {
3505             yoff  += hbs * 8 * y_stride;
3506             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3507             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3508         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3509             yoff  += hbs * 8 * bytesperpixel;
3510             uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3511             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3512         }
3513     } else {
3514         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3515         if (col + hbs < s->cols) { // FIXME why not <=?
3516             if (row + hbs < s->rows) {
3517                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3518                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3519                 yoff  += hbs * 8 * y_stride;
3520                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3521                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3522                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3523                               yoff + 8 * hbs * bytesperpixel,
3524                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3525             } else {
3526                 yoff  += hbs * 8 * bytesperpixel;
3527                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3528                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3529             }
3530         } else if (row + hbs < s->rows) {
3531             yoff  += hbs * 8 * y_stride;
3532             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3533             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3534         }
3535     }
3536 }
3537
3538 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3539                                                uint8_t *lvl, uint8_t (*mask)[4],
3540                                                uint8_t *dst, ptrdiff_t ls)
3541 {
3542     int y, x, bytesperpixel = s->bytesperpixel;
3543
3544     // filter edges between columns (e.g. block1 | block2)
3545     for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3546         uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3547         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3548         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3549         unsigned hm = hm1 | hm2 | hm13 | hm23;
3550
3551         for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3552             if (col || x > 1) {
3553                 if (hm1 & x) {
3554                     int L = *l, H = L >> 4;
3555                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3556
3557                     if (hmask1[0] & x) {
3558                         if (hmask2[0] & x) {
3559                             av_assert2(l[8 << ss_v] == L);
3560                             s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3561                         } else {
3562                             s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3563                         }
3564                     } else if (hm2 & x) {
3565                         L = l[8 << ss_v];
3566                         H |= (L >> 4) << 8;
3567                         E |= s->filter.mblim_lut[L] << 8;
3568                         I |= s->filter.lim_lut[L] << 8;
3569                         s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3570                                                [!!(hmask2[1] & x)]
3571                                                [0](ptr, ls, E, I, H);
3572                     } else {
3573                         s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3574                                             [0](ptr, ls, E, I, H);
3575                     }
3576                 } else if (hm2 & x) {
3577                     int L = l[8 << ss_v], H = L >> 4;
3578                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3579
3580                     s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3581                                         [0](ptr + 8 * ls, ls, E, I, H);
3582                 }
3583             }
3584             if (ss_h) {
3585                 if (x & 0xAA)
3586                     l += 2;
3587             } else {
3588                 if (hm13 & x) {
3589                     int L = *l, H = L >> 4;
3590                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3591
3592                     if (hm23 & x) {
3593                         L = l[8 << ss_v];
3594                         H |= (L >> 4) << 8;
3595                         E |= s->filter.mblim_lut[L] << 8;
3596                         I |= s->filter.lim_lut[L] << 8;
3597                         s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3598                     } else {
3599                         s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3600                     }
3601                 } else if (hm23 & x) {
3602                     int L = l[8 << ss_v], H = L >> 4;
3603                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3604
3605                     s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
3606                 }
3607                 l++;
3608             }
3609         }
3610     }
3611 }
3612
3613 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3614                                                uint8_t *lvl, uint8_t (*mask)[4],
3615                                                uint8_t *dst, ptrdiff_t ls)
3616 {
3617     int y, x, bytesperpixel = s->bytesperpixel;
3618
3619     //                                 block1
3620     // filter edges between rows (e.g. ------)
3621     //                                 block2
3622     for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3623         uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3624         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3625
3626         for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3627             if (row || y) {
3628                 if (vm & x) {
3629                     int L = *l, H = L >> 4;
3630                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3631
3632                     if (vmask[0] & x) {
3633                         if (vmask[0] & (x << (1 + ss_h))) {
3634                             av_assert2(l[1 + ss_h] == L);
3635                             s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3636                         } else {
3637                             s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3638                         }
3639                     } else if (vm & (x << (1 + ss_h))) {
3640                         L = l[1 + ss_h];
3641                         H |= (L >> 4) << 8;
3642                         E |= s->filter.mblim_lut[L] << 8;
3643                         I |= s->filter.lim_lut[L] << 8;
3644                         s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3645                                                [!!(vmask[1] & (x << (1 + ss_h)))]
3646                                                [1](ptr, ls, E, I, H);
3647                     } else {
3648                         s->dsp.loop_filter_8[!!(vmask[1] & x)]
3649                                             [1](ptr, ls, E, I, H);
3650                     }
3651                 } else if (vm & (x << (1 + ss_h))) {
3652                     int L = l[1 + ss_h], H = L >> 4;
3653                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3654
3655                     s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3656                                         [1](ptr + 8 * bytesperpixel, ls, E, I, H);
3657                 }
3658             }
3659             if (!ss_v) {
3660                 if (vm3 & x) {
3661                     int L = *l, H = L >> 4;
3662                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3663
3664                     if (vm3 & (x << (1 + ss_h))) {
3665                         L = l[1 + ss_h];
3666                         H |= (L >> 4) << 8;
3667                         E |= s->filter.mblim_lut[L] << 8;
3668                         I |= s->filter.lim_lut[L] << 8;
3669                         s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3670                     } else {
3671                         s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3672                     }
3673                 } else if (vm3 & (x << (1 + ss_h))) {
3674                     int L = l[1 + ss_h], H = L >> 4;
3675                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3676
3677                     s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
3678                 }
3679             }
3680         }
3681         if (ss_v) {
3682             if (y & 1)
3683                 lvl += 16;
3684         } else {
3685             lvl += 8;
3686         }
3687     }
3688 }
3689
3690 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3691                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3692 {
3693     VP9Context *s = ctx->priv_data;
3694     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3695     uint8_t *dst = f->data[0] + yoff;
3696     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3697     uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3698     int p;
3699
3700     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3701     // if you think of them as acting on a 8x8 block max, we can interleave
3702     // each v/h within the single x loop, but that only works if we work on
3703     // 8 pixel blocks, and we won't always do that (we want at least 16px
3704     // to use SSE2 optimizations, perhaps 32 for AVX2)
3705
3706     filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3707     filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3708
3709     for (p = 0; p < 2; p++) {
3710         dst = f->data[1 + p] + uvoff;
3711         filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3712         filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
3713     }
3714 }
3715
/**
 * Compute the extent of tile number idx along one axis.
 *
 * The n superblocks of the axis are distributed over (1 << log2_n) tiles;
 * tile idx covers superblocks [(idx * n) >> log2_n, ((idx + 1) * n) >> log2_n),
 * with both bounds clamped to n. The results are multiplied by 8 (<< 3).
 *
 * @param start   receives the first position of the tile
 * @param end     receives the position one past the end of the tile
 * @param idx     tile index along this axis
 * @param log2_n  log2 of the number of tiles along this axis
 * @param n       number of superblocks along this axis
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first_sb = (idx * n) >> log2_n;
    int last_sb  = ((idx + 1) * n) >> log2_n;

    // never let a tile reach past the last superblock
    if (first_sb > n)
        first_sb = n;
    if (last_sb > n)
        last_sb = n;

    *start = first_sb << 3;
    *end   = last_sb  << 3;
}
3723
3724 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3725                                         int max_count, int update_factor)
3726 {
3727     unsigned ct = ct0 + ct1, p2, p1;
3728
3729     if (!ct)
3730         return;
3731
3732     p1 = *p;
3733     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3734     p2 = av_clip(p2, 1, 255);
3735     ct = FFMIN(ct, max_count);
3736     update_factor = FASTDIV(update_factor * ct, max_count);
3737
3738     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3739     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3740 }
3741
3742 static void adapt_probs(VP9Context *s)
3743 {
3744     int i, j, k, l, m;
3745     prob_context *p = &s->prob_ctx[s->framectxid].p;
3746     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3747
3748     // coefficients
3749     for (i = 0; i < 4; i++)
3750         for (j = 0; j < 2; j++)
3751             for (k = 0; k < 2; k++)
3752                 for (l = 0; l < 6; l++)
3753                     for (m = 0; m < 6; m++) {
3754                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3755                         unsigned *e = s->counts.eob[i][j][k][l][m];
3756                         unsigned *c = s->counts.coef[i][j][k][l][m];
3757
3758                         if (l == 0 && m >= 3) // dc only has 3 pt
3759                             break;
3760
3761                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3762                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3763                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3764                     }
3765
3766     if (s->keyframe || s->intraonly) {
3767         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3768         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3769         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3770         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3771         return;
3772     }
3773
3774     // skip flag
3775     for (i = 0; i < 3; i++)
3776         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3777
3778     // intra/inter flag
3779     for (i = 0; i < 4; i++)
3780         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3781
3782     // comppred flag
3783     if (s->comppredmode == PRED_SWITCHABLE) {
3784       for (i = 0; i < 5; i++)
3785           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3786     }
3787
3788     // reference frames
3789     if (s->comppredmode != PRED_SINGLEREF) {
3790       for (i = 0; i < 5; i++)
3791           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3792                      s->counts.comp_ref[i][1], 20, 128);
3793     }
3794
3795     if (s->comppredmode != PRED_COMPREF) {
3796       for (i = 0; i < 5; i++) {
3797           uint8_t *pp = p->single_ref[i];
3798           unsigned (*c)[2] = s->counts.single_ref[i];
3799
3800           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3801           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3802       }
3803     }
3804
3805     // block partitioning
3806     for (i = 0; i < 4; i++)
3807         for (j = 0; j < 4; j++) {
3808             uint8_t *pp = p->partition[i][j];
3809             unsigned *c = s->counts.partition[i][j];
3810
3811             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3812             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3813             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3814         }
3815
3816     // tx size
3817     if (s->txfmmode == TX_SWITCHABLE) {
3818       for (i = 0; i < 2; i++) {
3819           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3820
3821           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3822           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3823           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3824           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3825           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3826           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3827       }
3828     }
3829
3830     // interpolation filter
3831     if (s->filtermode == FILTER_SWITCHABLE) {
3832         for (i = 0; i < 4; i++) {
3833             uint8_t *pp = p->filter[i];
3834             unsigned *c = s->counts.filter[i];
3835
3836             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3837             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3838         }
3839     }
3840
3841     // inter modes
3842     for (i = 0; i < 7; i++) {
3843         uint8_t *pp = p->mv_mode[i];
3844         unsigned *c = s->counts.mv_mode[i];
3845
3846         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3847         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3848         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3849     }
3850
3851     // mv joints
3852     {
3853         uint8_t *pp = p->mv_joint;
3854         unsigned *c = s->counts.mv_joint;
3855
3856         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3857         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3858         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3859     }
3860
3861     // mv components
3862     for (i = 0; i < 2; i++) {
3863         uint8_t *pp;
3864         unsigned *c, (*c2)[2], sum;
3865
3866         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3867                    s->counts.mv_comp[i].sign[1], 20, 128);
3868
3869         pp = p->mv_comp[i].classes;
3870         c = s->counts.mv_comp[i].classes;
3871         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3872         adapt_prob(&pp[0], c[0], sum, 20, 128);
3873         sum -= c[1];
3874         adapt_prob(&pp[1], c[1], sum, 20, 128);
3875         sum -= c[2] + c[3];
3876         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3877         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3878         sum -= c[4] + c[5];
3879         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3880         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3881         sum -= c[6];
3882         adapt_prob(&pp[6], c[6], sum, 20, 128);
3883         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3884         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3885         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3886
3887         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3888                    s->counts.mv_comp[i].class0[1], 20, 128);
3889         pp = p->mv_comp[i].bits;
3890         c2 = s->counts.mv_comp[i].bits;
3891         for (j = 0; j < 10; j++)
3892             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3893
3894         for (j = 0; j < 2; j++) {
3895             pp = p->mv_comp[i].class0_fp[j];
3896             c = s->counts.mv_comp[i].class0_fp[j];
3897             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3898             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3899             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3900         }
3901         pp = p->mv_comp[i].fp;
3902         c = s->counts.mv_comp[i].fp;
3903         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3904         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3905         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3906
3907         if (s->highprecisionmvs) {
3908             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3909                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3910             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3911                        s->counts.mv_comp[i].hp[1], 20, 128);
3912         }
3913     }
3914
3915     // y intra modes
3916     for (i = 0; i < 4; i++) {
3917         uint8_t *pp = p->y_mode[i];
3918         unsigned *c = s->counts.y_mode[i], sum, s2;
3919
3920         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3921         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3922         sum -= c[TM_VP8_PRED];
3923         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3924         sum -= c[VERT_PRED];
3925         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3926         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3927         sum -= s2;
3928         adapt_prob(&pp[3], s2, sum, 20, 128);
3929         s2 -= c[HOR_PRED];
3930         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3931         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3932         sum -= c[DIAG_DOWN_LEFT_PRED];
3933         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3934         sum -= c[VERT_LEFT_PRED];
3935         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3936         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3937     }
3938
3939     // uv intra modes
3940     for (i = 0; i < 10; i++) {
3941         uint8_t *pp = p->uv_mode[i];
3942         unsigned *c = s->counts.uv_mode[i], sum, s2;
3943
3944         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3945         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3946         sum -= c[TM_VP8_PRED];
3947         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3948         sum -= c[VERT_PRED];
3949         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3950         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3951         sum -= s2;
3952         adapt_prob(&pp[3], s2, sum, 20, 128);
3953         s2 -= c[HOR_PRED];
3954         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3955         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3956         sum -= c[DIAG_DOWN_LEFT_PRED];
3957         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3958         sum -= c[VERT_LEFT_PRED];
3959         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3960         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3961     }
3962 }
3963
3964 static void free_buffers(VP9Context *s)
3965 {
3966     av_freep(&s->intra_pred_data[0]);
3967     av_freep(&s->b_base);
3968     av_freep(&s->block_base);
3969 }
3970
3971 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3972 {
3973     VP9Context *s = ctx->priv_data;
3974     int i;
3975
3976     for (i = 0; i < 3; i++) {
3977         if (s->frames[i].tf.f->data[0])
3978             vp9_unref_frame(ctx, &s->frames[i]);
3979         av_frame_free(&s->frames[i].tf.f);
3980     }
3981     for (i = 0; i < 8; i++) {
3982         if (s->refs[i].f->data[0])
3983             ff_thread_release_buffer(ctx, &s->refs[i]);
3984         av_frame_free(&s->refs[i].f);
3985         if (s->next_refs[i].f->data[0])
3986             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3987         av_frame_free(&s->next_refs[i].f);
3988     }
3989     free_buffers(s);
3990     av_freep(&s->c_b);
3991     s->c_b_size = 0;
3992
3993     return 0;
3994 }
3995
3996
3997 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3998                             int *got_frame, AVPacket *pkt)
3999 {
4000     const uint8_t *data = pkt->data;
4001     int size = pkt->size;
4002     VP9Context *s = ctx->priv_data;
4003     int res, tile_row, tile_col, i, ref, row, col;
4004     int retain_segmap_ref = s->frames[REF_FRAME_SEGMAP].segmentation_map &&
4005                             (!s->segmentation.enabled || !s->segmentation.update_map);
4006     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
4007     AVFrame *f;
4008     int bytesperpixel;
4009
4010     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
4011         return res;
4012     } else if (res == 0) {
4013         if (!s->refs[ref].f->data[0]) {
4014             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4015             return AVERROR_INVALIDDATA;
4016         }
4017         if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
4018             return res;
4019         ((AVFrame *)frame)->pkt_pts = pkt->pts;
4020         ((AVFrame *)frame)->pkt_dts = pkt->dts;
4021         for (i = 0; i < 8; i++) {
4022             if (s->next_refs[i].f->data[0])
4023                 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4024             if (s->refs[i].f->data[0] &&
4025                 (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
4026                 return res;
4027         }
4028         *got_frame = 1;
4029         return pkt->size;
4030     }
4031     data += res;
4032     size -= res;
4033
4034     if (!retain_segmap_ref || s->keyframe || s->intraonly) {
4035         if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0])
4036             vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
4037         if (!s->keyframe && !s->intraonly && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4038             (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_SEGMAP], &s->frames[CUR_FRAME])) < 0)
4039             return res;
4040     }
4041     if (s->frames[REF_FRAME_MVPAIR].tf.f->data[0])
4042         vp9_unref_frame(ctx, &s->frames[REF_FRAME_MVPAIR]);
4043     if (!s->intraonly && !s->keyframe && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4044         (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_MVPAIR], &s->frames[CUR_FRAME])) < 0)
4045         return res;
4046     if (s->frames[CUR_FRAME].tf.f->data[0])
4047         vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
4048     if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
4049         return res;
4050     f = s->frames[CUR_FRAME].tf.f;
4051     f->key_frame = s->keyframe;
4052     f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4053     ls_y = f->linesize[0];
4054     ls_uv =f->linesize[1];
4055
4056     if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0] &&
4057         (s->frames[REF_FRAME_MVPAIR].tf.f->width  != s->frames[CUR_FRAME].tf.f->width ||
4058          s->frames[REF_FRAME_MVPAIR].tf.f->height != s->frames[CUR_FRAME].tf.f->height)) {
4059         vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
4060     }
4061
4062     // ref frame setup
4063     for (i = 0; i < 8; i++) {
4064         if (s->next_refs[i].f->data[0])
4065             ff_thread_release_buffer(ctx, &s->next_refs[i]);
4066         if (s->refreshrefmask & (1 << i)) {
4067             res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
4068         } else if (s->refs[i].f->data[0]) {
4069             res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
4070         }
4071         if (res < 0)
4072             return res;
4073     }
4074
4075     // main tile decode loop
4076     bytesperpixel = s->bytesperpixel;
4077     memset(s->above_partition_ctx, 0, s->cols);
4078     memset(s->above_skip_ctx, 0, s->cols);
4079     if (s->keyframe || s->intraonly) {
4080         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4081     } else {
4082         memset(s->above_mode_ctx, NEARESTMV, s->cols);
4083     }
4084     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4085     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4086     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4087     memset(s->above_segpred_ctx, 0, s->cols);
4088     s->pass = s->frames[CUR_FRAME].uses_2pass =
4089         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
4090     if ((res = update_block_buffers(ctx)) < 0) {
4091         av_log(ctx, AV_LOG_ERROR,
4092                "Failed to allocate block buffers\n");
4093         return res;
4094     }
4095     if (s->refreshctx && s->parallelmode) {
4096         int j, k, l, m;
4097
4098         for (i = 0; i < 4; i++) {
4099             for (j = 0; j < 2; j++)
4100                 for (k = 0; k < 2; k++)
4101                     for (l = 0; l < 6; l++)
4102                         for (m = 0; m < 6; m++)
4103                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
4104                                    s->prob.coef[i][j][k][l][m], 3);
4105             if (s->txfmmode == i)
4106                 break;
4107         }
4108         s->prob_ctx[s->framectxid].p = s->prob.p;
4109         ff_thread_finish_setup(ctx);
4110     } else if (!s->refreshctx) {
4111         ff_thread_finish_setup(ctx);
4112     }
4113
4114     do {
4115         yoff = uvoff = 0;
4116         s->b = s->b_base;
4117         s->block = s->block_base;
4118         s->uvblock[0] = s->uvblock_base[0];
4119         s->uvblock[1] = s->uvblock_base[1];
4120         s->eob = s->eob_base;
4121         s->uveob[0] = s->uveob_base[0];
4122         s->uveob[1] = s->uveob_base[1];
4123
4124         for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
4125             set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
4126                             tile_row, s->tiling.log2_tile_rows, s->sb_rows);
4127             if (s->pass != 2) {
4128                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4129                     int64_t tile_size;
4130
4131                     if (tile_col == s->tiling.tile_cols - 1 &&
4132                         tile_row == s->tiling.tile_rows - 1) {
4133                         tile_size = size;
4134                     } else {
4135                         tile_size = AV_RB32(data);
4136                         data += 4;
4137                         size -= 4;
4138                     }
4139                     if (tile_size > size) {
4140                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4141                         return AVERROR_INVALIDDATA;
4142                     }
4143                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4144                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4145                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4146                         return AVERROR_INVALIDDATA;
4147                     }
4148                     data += tile_size;
4149                     size -= tile_size;
4150                 }
4151             }
4152
4153             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
4154                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4155                 struct VP9Filter *lflvl_ptr = s->lflvl;
4156                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4157
4158                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4159                     set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
4160                                     tile_col, s->tiling.log2_tile_cols, s->sb_cols);
4161
4162                     if (s->pass != 2) {
4163                         memset(s->left_partition_ctx, 0, 8);
4164                         memset(s->left_skip_ctx, 0, 8);
4165                         if (s->keyframe || s->intraonly) {
4166                             memset(s->left_mode_ctx, DC_PRED, 16);
4167                         } else {
4168                             memset(s->left_mode_ctx, NEARESTMV, 8);
4169                         }
4170                         memset(s->left_y_nnz_ctx, 0, 16);
4171                         memset(s->left_uv_nnz_ctx, 0, 32);
4172                         memset(s->left_segpred_ctx, 0, 8);
4173
4174                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4175                     }
4176
4177                     for (col = s->tiling.tile_col_start;
4178                          col < s->tiling.tile_col_end;
4179                          col += 8, yoff2 += 64 * bytesperpixel,
4180                          uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4181                         // FIXME integrate with lf code (i.e. zero after each
4182                         // use, similar to invtxfm coefficients, or similar)
4183                         if (s->pass != 1) {
4184                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
4185                         }
4186
4187                         if (s->pass == 2) {
4188                             decode_sb_mem(ctx, row, col, lflvl_ptr,
4189                                           yoff2, uvoff2, BL_64X64);
4190                         } else {
4191                             decode_sb(ctx, row, col, lflvl_ptr,
4192                                       yoff2, uvoff2, BL_64X64);
4193                         }
4194                     }
4195                     if (s->pass != 2) {
4196                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4197                     }
4198                 }
4199
4200                 if (s->pass == 1) {
4201                     continue;
4202                 }
4203
4204                 // backup pre-loopfilter reconstruction data for intra
4205                 // prediction of next row of sb64s
4206                 if (row + 8 < s->rows) {
4207                     memcpy(s->intra_pred_data[0],
4208                            f->data[0] + yoff + 63 * ls_y,
4209                            8 * s->cols * bytesperpixel);
4210                     memcpy(s->intra_pred_data[1],
4211                            f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4212                            8 * s->cols * bytesperpixel >> s->ss_h);
4213                     memcpy(s->intra_pred_data[2],
4214                            f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4215                            8 * s->cols * bytesperpixel >> s->ss_h);
4216                 }
4217
4218                 // loopfilter one row
4219                 if (s->filter.level) {
4220                     yoff2 = yoff;
4221                     uvoff2 = uvoff;
4222                     lflvl_ptr = s->lflvl;
4223                     for (col = 0; col < s->cols;
4224                          col += 8, yoff2 += 64 * bytesperpixel,
4225                          uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4226                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4227                     }
4228                 }
4229
4230                 // FIXME maybe we can make this more finegrained by running the
4231                 // loopfilter per-block instead of after each sbrow
4232                 // In fact that would also make intra pred left preparation easier?
4233                 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
4234             }
4235         }
4236
4237         if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
4238             adapt_probs(s);
4239             ff_thread_finish_setup(ctx);
4240         }
4241     } while (s->pass++ == 1);
4242     ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4243
4244     // ref frame setup
4245     for (i = 0; i < 8; i++) {
4246         if (s->refs[i].f->data[0])
4247             ff_thread_release_buffer(ctx, &s->refs[i]);
4248         if (s->next_refs[i].f->data[0] &&
4249             (res = ff_thread_ref_frame(&s->refs[i], &s->next_refs[i])) < 0)
4250             return res;
4251     }
4252
4253     if (!s->invisible) {
4254         if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
4255             return res;
4256         *got_frame = 1;
4257     }
4258
4259     return pkt->size;
4260 }
4261
4262 static void vp9_decode_flush(AVCodecContext *ctx)
4263 {
4264     VP9Context *s = ctx->priv_data;
4265     int i;
4266
4267     for (i = 0; i < 3; i++)
4268         vp9_unref_frame(ctx, &s->frames[i]);
4269     for (i = 0; i < 8; i++)
4270         ff_thread_release_buffer(ctx, &s->refs[i]);
4271 }
4272
4273 static int init_frames(AVCodecContext *ctx)
4274 {
4275     VP9Context *s = ctx->priv_data;
4276     int i;
4277
4278     for (i = 0; i < 3; i++) {
4279         s->frames[i].tf.f = av_frame_alloc();
4280         if (!s->frames[i].tf.f) {
4281             vp9_decode_free(ctx);
4282             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4283             return AVERROR(ENOMEM);
4284         }
4285     }
4286     for (i = 0; i < 8; i++) {
4287         s->refs[i].f = av_frame_alloc();
4288         s->next_refs[i].f = av_frame_alloc();
4289         if (!s->refs[i].f || !s->next_refs[i].f) {
4290             vp9_decode_free(ctx);
4291             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4292             return AVERROR(ENOMEM);
4293         }
4294     }
4295
4296     return 0;
4297 }
4298
4299 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4300 {
4301     VP9Context *s = ctx->priv_data;
4302
4303     ctx->internal->allocate_progress = 1;
4304     s->last_bpp = 0;
4305     s->filter.sharpness = -1;
4306
4307     return init_frames(ctx);
4308 }
4309
4310 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4311 {
4312     return init_frames(avctx);
4313 }
4314
4315 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4316 {
4317     int i, res;
4318     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4319
4320     // detect size changes in other threads
4321     if (s->intra_pred_data[0] &&
4322         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols ||
4323          s->rows != ssrc->rows || s->bpp != ssrc->bpp)) {
4324         free_buffers(s);
4325     }
4326
4327     for (i = 0; i < 3; i++) {
4328         if (s->frames[i].tf.f->data[0])
4329             vp9_unref_frame(dst, &s->frames[i]);
4330         if (ssrc->frames[i].tf.f->data[0]) {
4331             if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4332                 return res;
4333         }
4334     }
4335     for (i = 0; i < 8; i++) {
4336         if (s->refs[i].f->data[0])
4337             ff_thread_release_buffer(dst, &s->refs[i]);
4338         if (ssrc->next_refs[i].f->data[0]) {
4339             if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4340                 return res;
4341         }
4342     }
4343
4344     s->invisible = ssrc->invisible;
4345     s->keyframe = ssrc->keyframe;
4346     s->intraonly = ssrc->intraonly;
4347     s->ss_v = ssrc->ss_v;
4348     s->ss_h = ssrc->ss_h;
4349     s->segmentation.enabled = ssrc->segmentation.enabled;
4350     s->segmentation.update_map = ssrc->segmentation.update_map;
4351     s->segmentation.absolute_vals = ssrc->segmentation.absolute_vals;
4352     s->bytesperpixel = ssrc->bytesperpixel;
4353     s->bpp = ssrc->bpp;
4354     s->bpp_index = ssrc->bpp_index;
4355     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4356     memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4357     memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4358            sizeof(s->segmentation.feat));
4359
4360     return 0;
4361 }
4362
4363 static const AVProfile profiles[] = {
4364     { FF_PROFILE_VP9_0, "Profile 0" },
4365     { FF_PROFILE_VP9_1, "Profile 1" },
4366     { FF_PROFILE_VP9_2, "Profile 2" },
4367     { FF_PROFILE_VP9_3, "Profile 3" },
4368     { FF_PROFILE_UNKNOWN },
4369 };
4370
4371 AVCodec ff_vp9_decoder = {
4372     .name                  = "vp9",
4373     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
4374     .type                  = AVMEDIA_TYPE_VIDEO,
4375     .id                    = AV_CODEC_ID_VP9,
4376     .priv_data_size        = sizeof(VP9Context),
4377     .init                  = vp9_decode_init,
4378     .close                 = vp9_decode_free,
4379     .decode                = vp9_decode_frame,
4380     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4381     .flush                 = vp9_decode_flush,
4382     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4383     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4384     .profiles              = NULL_IF_CONFIG_SMALL(profiles),
4385 };