git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/pixdesc.h"
  35
/* 24-bit sync code that decode_frame_header() requires in the headers of
 * keyframes and intra-only frames. */
#define VP9_SYNCCODE 0x498342
  37
/* Value type of VP9Context.comppredmode.
 * NOTE(review): presumably single-reference only / compound only /
 * selectable per block -- confirm against the block decoder. */
enum CompPredMode {
    PRED_SINGLEREF,
    PRED_COMPREF,
    PRED_SWITCHABLE,
};
  43
/* Square block levels, largest (64x64) first; value type of VP9Block.bl. */
enum BlockLevel {
    BL_64X64,
    BL_32X32,
    BL_16X16,
    BL_8X8,
};
  50
/* Every block size from 64x64 down to 4x4, largest first; value type of
 * VP9Block.bs. The enumerator order is also the row order of bwh_tab[][]. */
enum BlockSize {
    BS_64x64,
    BS_64x32,
    BS_32x64,
    BS_32x32,
    BS_32x16,
    BS_16x32,
    BS_16x16,
    BS_16x8,
    BS_8x16,
    BS_8x8,
    BS_8x4,
    BS_4x8,
    BS_4x4,
    N_BS_SIZES, // number of block sizes, used to dimension lookup tables
};
  67
/* Two motion vectors with their two reference values; VP9Frame.mv points
 * at an array of these (allocated in vp9_alloc_frame()). */
struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};
  72
/* A picture buffer plus the per-frame side data that travels with it. */
typedef struct VP9Frame {
    ThreadFrame tf;            // picture buffer, obtained via ff_thread_get_buffer()
    AVBufferRef *extradata;    // one zeroed buffer backing segmentation_map and mv
    uint8_t *segmentation_map; // start of extradata->data
    struct VP9mvrefPair *mv;   // extradata->data + 64 * sb_cols * sb_rows
    int uses_2pass;            // compared against block_alloc_using_2pass in update_block_buffers()
} VP9Frame;
  80
/* Filter levels and edge masks; VP9Context.lflvl points at an array with
 * one entry per superblock column (see update_size()).
 * NOTE(review): presumably consumed by the loop filter -- confirm. */
struct VP9Filter {
    uint8_t level[8 * 8];
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};
  86
/* Per-block decoding state; VP9Context.b_base points at the storage
 * allocated for these in update_block_buffers(). */
typedef struct VP9Block {
    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
    enum FilterMode filter;
    VP56mv mv[4 /* b_idx */][2 /* ref */];
    enum BlockSize bs;
    enum TxfmMode tx, uvtx;
    enum BlockLevel bl;
    enum BlockPartition bp;
} VP9Block;
  96
/* Decoder private state; reached through AVCodecContext.priv_data. */
typedef struct VP9Context {
    VP9DSPContext dsp;
    VideoDSPContext vdsp;
    GetBitContext gb;
    VP56RangeCoder c;
    VP56RangeCoder *c_b;
    unsigned c_b_size;
    // b_base owns the VP9Block storage allocated in update_block_buffers()
    VP9Block *b_base, *b;
    int pass;
    int row, row7, col, col7;
    uint8_t *dst[3];
    ptrdiff_t y_stride, uv_stride;

    // bitstream header
    uint8_t keyframe, last_keyframe;
    // bpp = 8 + 2 * bpp_index, bytesperpixel = (bpp + 7) >> 3
    // (see read_colorspace_details())
    uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;
    uint8_t ss_h, ss_v;
    uint8_t intraonly;
    uint8_t resetctx;
    uint8_t refreshrefmask;
    uint8_t highprecisionmvs;
    enum FilterMode filtermode;
    uint8_t allowcompinter;
    uint8_t fixcompref;
    uint8_t refreshctx;
    uint8_t parallelmode;
    uint8_t framectxid;
    uint8_t refidx[3];
    uint8_t signbias[3];
    uint8_t varcompref[2];
    ThreadFrame refs[8], next_refs[8];
    // indices into frames[]
#define CUR_FRAME 0
#define REF_FRAME_MVPAIR 1
#define REF_FRAME_SEGMAP 2
    VP9Frame frames[3];

    // loop filter level/sharpness parsed in decode_frame_header()
    struct {
        uint8_t level;
        int8_t sharpness;
        uint8_t lim_lut[64];
        uint8_t mblim_lut[64];
    } filter;
    // loop filter deltas; decode_frame_header() resets them to defaults on
    // keyframes, error-resilient frames and intra-only frames
    struct {
        uint8_t enabled;
        int8_t mode[2];
        int8_t ref[4];
    } lf_delta;
    // quantizer index and deltas; lossless is set when all four are zero
    uint8_t yac_qi;
    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
    uint8_t lossless;
#define MAX_SEGMENT 8
    struct {
        uint8_t enabled;
        uint8_t temporal;
        uint8_t absolute_vals;
        uint8_t update_map;
        uint8_t ignore_refmap;
        struct {
            uint8_t q_enabled;
            uint8_t lf_enabled;
            uint8_t ref_enabled;
            uint8_t skip_enabled;
            uint8_t ref_val;
            int16_t q_val;
            int8_t lf_val;
            int16_t qmul[2][2];
            uint8_t lflvl[4][2];
        } feat[MAX_SEGMENT];
    } segmentation;
    struct {
        unsigned log2_tile_cols, log2_tile_rows;
        unsigned tile_cols, tile_rows;
        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    } tiling;
    // frame size rounded up to 64-pixel units (sb_*) and to 8-pixel units
    // (rows/cols); set by update_size()
    unsigned sb_cols, sb_rows, rows, cols;
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][11];
        uint8_t seg[7];
        uint8_t segpred[3];
    } prob;
    struct {
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;
    enum TxfmMode txfmmode;
    enum CompPredMode comppredmode;

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    // the above_* pointers are not separate allocations: update_size()
    // carves them out of the buffer whose base is intra_pred_data[0]
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-frame cache
    // intra_pred_data[0] is the base of the allocation made in update_size();
    // lflvl points into the same allocation
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];

    // block reconstruction intermediates
    // uses_2pass value the buffers below were last allocated for
    int block_alloc_using_2pass;
    // block_base owns the allocation; uvblock_base[], eob_base and
    // uveob_base[] point into it (see update_block_buffers())
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    // per-reference scaling, [ref][0] from widths and [ref][1] from heights:
    // mvscale is 0 when the reference has the current frame size, otherwise
    // (ref_dim << 14) / cur_dim; mvstep is 16 * mvscale >> 14
    uint16_t mvscale[3][2];
    uint8_t mvstep[3][2];
} VP9Context;
 258
/* Block dimensions for every enum BlockSize value, in enum order.
 * bwh_tab[0][bs] holds the two sizes from the enumerator name divided by 4
 * (BS_64x64 -> { 16, 16 }); bwh_tab[1][bs] holds them divided by 8, never
 * going below 1. */
static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
    {
        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
    }, {
        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
    }
};
 268
 269 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 270 {
 271     VP9Context *s = ctx->priv_data;
 272     int ret, sz;
 273
 274     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 275         return ret;
 276     sz = 64 * s->sb_cols * s->sb_rows;
 277     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 278         ff_thread_release_buffer(ctx, &f->tf);
 279         return AVERROR(ENOMEM);
 280     }
 281
 282     f->segmentation_map = f->extradata->data;
 283     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 284
 285     return 0;
 286 }
 287
 288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 289 {
 290     ff_thread_release_buffer(ctx, &f->tf);
 291     av_buffer_unref(&f->extradata);
 292     f->segmentation_map = NULL;
 293 }
 294
 295 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 296 {
 297     int res;
 298
 299     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 300         return res;
 301     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 302         vp9_unref_frame(ctx, dst);
 303         return AVERROR(ENOMEM);
 304     }
 305
 306     dst->segmentation_map = src->segmentation_map;
 307     dst->mv = src->mv;
 308     dst->uses_2pass = src->uses_2pass;
 309
 310     return 0;
 311 }
 312
 313 static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
 314 {
 315     VP9Context *s = ctx->priv_data;
 316     uint8_t *p;
 317     int bytesperpixel = s->bytesperpixel;
 318
 319     av_assert0(w > 0 && h > 0);
 320
 321     if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
 322         return 0;
 323
 324     ctx->width   = w;
 325     ctx->height  = h;
 326     ctx->pix_fmt = fmt;
 327     s->sb_cols   = (w + 63) >> 6;
 328     s->sb_rows   = (h + 63) >> 6;
 329     s->cols      = (w + 7) >> 3;
 330     s->rows      = (h + 7) >> 3;
 331
 332 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
 333     av_freep(&s->intra_pred_data[0]);
 334     // FIXME we slightly over-allocate here for subsampled chroma, but a little
 335     // bit of padding shouldn't affect performance...
 336     p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
 337                                 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 338     if (!p)
 339         return AVERROR(ENOMEM);
 340     assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
 341     assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
 342     assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
 343     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 344     assign(s->above_mode_ctx,      uint8_t *,             16);
 345     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 346     assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
 347     assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
 348     assign(s->above_partition_ctx, uint8_t *,              8);
 349     assign(s->above_skip_ctx,      uint8_t *,              8);
 350     assign(s->above_txfm_ctx,      uint8_t *,              8);
 351     assign(s->above_segpred_ctx,   uint8_t *,              8);
 352     assign(s->above_intra_ctx,     uint8_t *,              8);
 353     assign(s->above_comp_ctx,      uint8_t *,              8);
 354     assign(s->above_ref_ctx,       uint8_t *,              8);
 355     assign(s->above_filter_ctx,    uint8_t *,              8);
 356     assign(s->lflvl,               struct VP9Filter *,     1);
 357 #undef assign
 358
 359     // these will be re-allocated a little later
 360     av_freep(&s->b_base);
 361     av_freep(&s->block_base);
 362
 363     if (s->bpp != s->last_bpp) {
 364         ff_vp9dsp_init(&s->dsp, s->bpp);
 365         ff_videodsp_init(&s->vdsp, s->bpp);
 366         s->last_bpp = s->bpp;
 367     }
 368
 369     return 0;
 370 }
 371
 372 static int update_block_buffers(AVCodecContext *ctx)
 373 {
 374     VP9Context *s = ctx->priv_data;
 375     int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
 376
 377     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
 378         return 0;
 379
 380     av_free(s->b_base);
 381     av_free(s->block_base);
 382     chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
 383     chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
 384     if (s->frames[CUR_FRAME].uses_2pass) {
 385         int sbs = s->sb_cols * s->sb_rows;
 386
 387         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
 388         s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
 389                                     16 * 16 + 2 * chroma_eobs) * sbs);
 390         if (!s->b_base || !s->block_base)
 391             return AVERROR(ENOMEM);
 392         s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
 393         s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
 394         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
 395         s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
 396         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
 397     } else {
 398         s->b_base = av_malloc(sizeof(VP9Block));
 399         s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
 400                                    16 * 16 + 2 * chroma_eobs);
 401         if (!s->b_base || !s->block_base)
 402             return AVERROR(ENOMEM);
 403         s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
 404         s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
 405         s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
 406         s->uveob_base[0] = s->eob_base + 16 * 16;
 407         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
 408     }
 409     s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
 410
 411     return 0;
 412 }
 413
 414 // for some reason the sign bit is at the end, not the start, of a bit sequence
 415 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 416 {
 417     int v = get_bits(gb, n);
 418     return get_bits1(gb) ? -v : v;
 419 }
 420
 421 static av_always_inline int inv_recenter_nonneg(int v, int m)
 422 {
 423     return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 424 }
 425
 426 // differential forward probability updates
 427 static int update_prob(VP56RangeCoder *c, int p)
 428 {
 429     static const int inv_map_table[255] = {
 430           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 431         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 432          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 433          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 434          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 435          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 436          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 437          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 438         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 439         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 440         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 441         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 442         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 443         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 444         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 445         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 446         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 447         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 448         252, 253, 253,
 449     };
 450     int d;
 451
 452     /* This code is trying to do a differential probability update. For a
 453      * current probability A in the range [1, 255], the difference to a new
 454      * probability of any value can be expressed differentially as 1-A,255-A
 455      * where some part of this (absolute range) exists both in positive as
 456      * well as the negative part, whereas another part only exists in one
 457      * half. We're trying to code this shared part differentially, i.e.
 458      * times two where the value of the lowest bit specifies the sign, and
 459      * the single part is then coded on top of this. This absolute difference
 460      * then again has a value of [0,254], but a bigger value in this range
 461      * indicates that we're further away from the original value A, so we
 462      * can code this as a VLC code, since higher values are increasingly
 463      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 464      * updates vs. the 'fine, exact' updates further down the range, which
 465      * adds one extra dimension to this differential update model. */
 466
 467     if (!vp8_rac_get(c)) {
 468         d = vp8_rac_get_uint(c, 4) + 0;
 469     } else if (!vp8_rac_get(c)) {
 470         d = vp8_rac_get_uint(c, 4) + 16;
 471     } else if (!vp8_rac_get(c)) {
 472         d = vp8_rac_get_uint(c, 5) + 32;
 473     } else {
 474         d = vp8_rac_get_uint(c, 7);
 475         if (d >= 65)
 476             d = (d << 1) - 65 + vp8_rac_get(c);
 477         d += 64;
 478         av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
 479     }
 480
 481     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 482                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 483 }
 484
 485 static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
 486 {
 487     static const enum AVColorSpace colorspaces[8] = {
 488         AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
 489         AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
 490     };
 491     VP9Context *s = ctx->priv_data;
 492     enum AVPixelFormat res;
 493     int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
 494
 495     s->bpp_index = bits;
 496     s->bpp = 8 + bits * 2;
 497     s->bytesperpixel = (7 + s->bpp) >> 3;
 498     ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
 499     if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
 500         static const enum AVPixelFormat pix_fmt_rgb[3] = {
 501             AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
 502         };
 503         if (ctx->profile & 1) {
 504             s->ss_h = s->ss_v = 0;
 505             res = pix_fmt_rgb[bits];
 506             ctx->color_range = AVCOL_RANGE_JPEG;
 507             if (get_bits1(&s->gb)) {
 508                 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
 509                 return AVERROR_INVALIDDATA;
 510             }
 511         } else {
 512             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
 513                    ctx->profile);
 514             return AVERROR_INVALIDDATA;
 515         }
 516     } else {
 517         static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
 518             { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
 519               { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
 520             { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
 521               { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
 522             { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
 523               { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
 524         };
 525         ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
 526         if (ctx->profile & 1) {
 527             s->ss_h = get_bits1(&s->gb);
 528             s->ss_v = get_bits1(&s->gb);
 529             if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
 530                 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
 531                        ctx->profile);
 532                 return AVERROR_INVALIDDATA;
 533             } else if (get_bits1(&s->gb)) {
 534                 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
 535                        ctx->profile);
 536                 return AVERROR_INVALIDDATA;
 537             }
 538         } else {
 539             s->ss_h = s->ss_v = 1;
 540             res = pix_fmt_for_ss[bits][1][1];
 541         }
 542     }
 543
 544     return res;
 545 }
 546
 547 static int decode_frame_header(AVCodecContext *ctx,
 548                                const uint8_t *data, int size, int *ref)
 549 {
 550     VP9Context *s = ctx->priv_data;
 551     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 552     enum AVPixelFormat fmt = ctx->pix_fmt;
 553     int last_invisible;
 554     const uint8_t *data2;
 555
 556     /* general header */
 557     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 558         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 559         return res;
 560     }
 561     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 562         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 563         return AVERROR_INVALIDDATA;
 564     }
 565     ctx->profile  = get_bits1(&s->gb);
 566     ctx->profile |= get_bits1(&s->gb) << 1;
 567     if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
 568     if (ctx->profile > 3) {
 569         av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
 570         return AVERROR_INVALIDDATA;
 571     }
 572     if (get_bits1(&s->gb)) {
 573         *ref = get_bits(&s->gb, 3);
 574         return 0;
 575     }
 576     s->last_keyframe  = s->keyframe;
 577     s->keyframe       = !get_bits1(&s->gb);
 578     last_invisible    = s->invisible;
 579     s->invisible      = !get_bits1(&s->gb);
 580     s->errorres       = get_bits1(&s->gb);
 581     s->use_last_frame_mvs = !s->errorres && !last_invisible;
 582     if (s->keyframe) {
 583         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 584             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 585             return AVERROR_INVALIDDATA;
 586         }
 587         if ((fmt = read_colorspace_details(ctx)) < 0)
 588             return fmt;
 589         // for profile 1, here follows the subsampling bits
 590         s->refreshrefmask = 0xff;
 591         w = get_bits(&s->gb, 16) + 1;
 592         h = get_bits(&s->gb, 16) + 1;
 593         if (get_bits1(&s->gb)) // display size
 594             skip_bits(&s->gb, 32);
 595     } else {
 596         s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
 597         s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
 598         if (s->intraonly) {
 599             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 600                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 601                 return AVERROR_INVALIDDATA;
 602             }
 603             if (ctx->profile >= 1) {
 604                 if ((fmt = read_colorspace_details(ctx)) < 0)
 605                     return fmt;
 606             } else {
 607                 s->ss_h = s->ss_v = 1;
 608                 s->bpp = 8;
 609                 s->bpp_index = 0;
 610                 s->bytesperpixel = 1;
 611                 fmt = AV_PIX_FMT_YUV420P;
 612                 ctx->colorspace = AVCOL_SPC_BT470BG;
 613                 ctx->color_range = AVCOL_RANGE_JPEG;
 614             }
 615             s->refreshrefmask = get_bits(&s->gb, 8);
 616             w = get_bits(&s->gb, 16) + 1;
 617             h = get_bits(&s->gb, 16) + 1;
 618             if (get_bits1(&s->gb)) // display size
 619                 skip_bits(&s->gb, 32);
 620         } else {
 621             s->refreshrefmask = get_bits(&s->gb, 8);
 622             s->refidx[0]      = get_bits(&s->gb, 3);
 623             s->signbias[0]    = get_bits1(&s->gb) && !s->errorres;
 624             s->refidx[1]      = get_bits(&s->gb, 3);
 625             s->signbias[1]    = get_bits1(&s->gb) && !s->errorres;
 626             s->refidx[2]      = get_bits(&s->gb, 3);
 627             s->signbias[2]    = get_bits1(&s->gb) && !s->errorres;
 628             if (!s->refs[s->refidx[0]].f->data[0] ||
 629                 !s->refs[s->refidx[1]].f->data[0] ||
 630                 !s->refs[s->refidx[2]].f->data[0]) {
 631                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 632                 return AVERROR_INVALIDDATA;
 633             }
 634             if (get_bits1(&s->gb)) {
 635                 w = s->refs[s->refidx[0]].f->width;
 636                 h = s->refs[s->refidx[0]].f->height;
 637             } else if (get_bits1(&s->gb)) {
 638                 w = s->refs[s->refidx[1]].f->width;
 639                 h = s->refs[s->refidx[1]].f->height;
 640             } else if (get_bits1(&s->gb)) {
 641                 w = s->refs[s->refidx[2]].f->width;
 642                 h = s->refs[s->refidx[2]].f->height;
 643             } else {
 644                 w = get_bits(&s->gb, 16) + 1;
 645                 h = get_bits(&s->gb, 16) + 1;
 646             }
 647             // Note that in this code, "CUR_FRAME" is actually before we
 648             // have formally allocated a frame, and thus actually represents
 649             // the _last_ frame
 650             s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
 651                                      s->frames[CUR_FRAME].tf.f->height == h;
 652             if (get_bits1(&s->gb)) // display size
 653                 skip_bits(&s->gb, 32);
 654             s->highprecisionmvs = get_bits1(&s->gb);
 655             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 656                                                 get_bits(&s->gb, 2);
 657             s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
 658                                  s->signbias[0] != s->signbias[2]);
 659             if (s->allowcompinter) {
 660                 if (s->signbias[0] == s->signbias[1]) {
 661                     s->fixcompref    = 2;
 662                     s->varcompref[0] = 0;
 663                     s->varcompref[1] = 1;
 664                 } else if (s->signbias[0] == s->signbias[2]) {
 665                     s->fixcompref    = 1;
 666                     s->varcompref[0] = 0;
 667                     s->varcompref[1] = 2;
 668                 } else {
 669                     s->fixcompref    = 0;
 670                     s->varcompref[0] = 1;
 671                     s->varcompref[1] = 2;
 672                 }
 673             }
 674
 675             for (i = 0; i < 3; i++) {
 676                 AVFrame *ref = s->refs[s->refidx[i]].f;
 677                 int refw = ref->width, refh = ref->height;
 678
 679                 if (ref->format != fmt) {
 680                     av_log(ctx, AV_LOG_ERROR,
 681                            "Ref pixfmt (%s) did not match current frame (%s)",
 682                            av_get_pix_fmt_name(ref->format),
 683                            av_get_pix_fmt_name(fmt));
 684                     return AVERROR_INVALIDDATA;
 685                 } else if (refw == w && refh == h) {
 686                     s->mvscale[i][0] = s->mvscale[i][1] = 0;
 687                 } else {
 688                     if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
 689                         av_log(ctx, AV_LOG_ERROR,
 690                                "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
 691                                refw, refh, w, h);
 692                         return AVERROR_INVALIDDATA;
 693                     }
 694                     s->mvscale[i][0] = (refw << 14) / w;
 695                     s->mvscale[i][1] = (refh << 14) / h;
 696                     s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
 697                     s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
 698                 }
 699             }
 700         }
 701     }
 702     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
 703     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
 704     s->framectxid   = c = get_bits(&s->gb, 2);
 705
 706     /* loopfilter header data */
 707     if (s->keyframe || s->errorres || s->intraonly) {
 708         // reset loopfilter defaults
 709         s->lf_delta.ref[0] = 1;
 710         s->lf_delta.ref[1] = 0;
 711         s->lf_delta.ref[2] = -1;
 712         s->lf_delta.ref[3] = -1;
 713         s->lf_delta.mode[0] = 0;
 714         s->lf_delta.mode[1] = 0;
 715         memset(s->segmentation.feat, 0, sizeof(s->segmentation.feat));
 716     }
 717     s->filter.level = get_bits(&s->gb, 6);
 718     sharp = get_bits(&s->gb, 3);
 719     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 720     // the old cache values since they are still valid
 721     if (s->filter.sharpness != sharp)
 722         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
 723     s->filter.sharpness = sharp;
 724     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
 725         if (get_bits1(&s->gb)) {
 726             for (i = 0; i < 4; i++)
 727                 if (get_bits1(&s->gb))
 728                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 729             for (i = 0; i < 2; i++)
 730                 if (get_bits1(&s->gb))
 731                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 732         }
 733     }
 734
 735     /* quantization header data */
 736     s->yac_qi      = get_bits(&s->gb, 8);
 737     s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 738     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 739     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 740     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
 741                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 742     if (s->lossless)
 743         ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
 744
 745     /* segmentation header info */
 746     s->segmentation.ignore_refmap = 0;
 747     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
 748         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
 749             for (i = 0; i < 7; i++)
 750                 s->prob.seg[i] = get_bits1(&s->gb) ?
 751                                  get_bits(&s->gb, 8) : 255;
 752             if ((s->segmentation.temporal = get_bits1(&s->gb))) {
 753                 for (i = 0; i < 3; i++)
 754                     s->prob.segpred[i] = get_bits1(&s->gb) ?
 755                                          get_bits(&s->gb, 8) : 255;
 756             }
 757         }
 758         if ((!s->segmentation.update_map || s->segmentation.temporal) &&
 759             (w != s->frames[CUR_FRAME].tf.f->width ||
 760              h != s->frames[CUR_FRAME].tf.f->height)) {
 761             av_log(ctx, AV_LOG_WARNING,
 762                    "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
 763                    s->segmentation.temporal, s->segmentation.update_map);
 764             s->segmentation.ignore_refmap = 1;
 765             //return AVERROR_INVALIDDATA;
 766         }
 767
 768         if (get_bits1(&s->gb)) {
 769             s->segmentation.absolute_vals = get_bits1(&s->gb);
 770             for (i = 0; i < 8; i++) {
 771                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 772                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 773                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 774                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 775                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 776                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 777                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 778             }
 779         }
 780     }
 781
 782     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 783     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
 784         int qyac, qydc, quvac, quvdc, lflvl, sh;
 785
 786         if (s->segmentation.enabled && s->segmentation.feat[i].q_enabled) {
 787             if (s->segmentation.absolute_vals)
 788                 qyac = av_clip_uintp2(s->segmentation.feat[i].q_val, 8);
 789             else
 790                 qyac = av_clip_uintp2(s->yac_qi + s->segmentation.feat[i].q_val, 8);
 791         } else {
 792             qyac  = s->yac_qi;
 793         }
 794         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
 795         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
 796         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
 797         qyac  = av_clip_uintp2(qyac, 8);
 798
 799         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
 800         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
 801         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
 802         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
 803
 804         sh = s->filter.level >= 32;
 805         if (s->segmentation.enabled && s->segmentation.feat[i].lf_enabled) {
 806             if (s->segmentation.absolute_vals)
 807                 lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
 808             else
 809                 lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
 810         } else {
 811             lflvl  = s->filter.level;
 812         }
 813         if (s->lf_delta.enabled) {
 814             s->segmentation.feat[i].lflvl[0][0] =
 815             s->segmentation.feat[i].lflvl[0][1] =
 816                 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
 817             for (j = 1; j < 4; j++) {
 818                 s->segmentation.feat[i].lflvl[j][0] =
 819                     av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 820                                              s->lf_delta.mode[0]) * (1 << sh)), 6);
 821                 s->segmentation.feat[i].lflvl[j][1] =
 822                     av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 823                                              s->lf_delta.mode[1]) * (1 << sh)), 6);
 824             }
 825         } else {
 826             memset(s->segmentation.feat[i].lflvl, lflvl,
 827                    sizeof(s->segmentation.feat[i].lflvl));
 828         }
 829     }
 830
 831     /* tiling info */
 832     if ((res = update_size(ctx, w, h, fmt)) < 0) {
 833         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
 834         return res;
 835     }
 836     for (s->tiling.log2_tile_cols = 0;
 837          (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
 838          s->tiling.log2_tile_cols++) ;
 839     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 840     max = FFMAX(0, max - 1);
 841     while (max > s->tiling.log2_tile_cols) {
 842         if (get_bits1(&s->gb))
 843             s->tiling.log2_tile_cols++;
 844         else
 845             break;
 846     }
 847     s->tiling.log2_tile_rows = decode012(&s->gb);
 848     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
 849     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
 850         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
 851         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 852                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
 853         if (!s->c_b) {
 854             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 855             return AVERROR(ENOMEM);
 856         }
 857     }
 858
 859     if (s->keyframe || s->errorres || (s->intraonly && s->resetctx == 3)) {
 860         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 861                            s->prob_ctx[3].p = vp9_default_probs;
 862         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 863                sizeof(vp9_default_coef_probs));
 864         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 865                sizeof(vp9_default_coef_probs));
 866         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 867                sizeof(vp9_default_coef_probs));
 868         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 869                sizeof(vp9_default_coef_probs));
 870     } else if (s->intraonly && s->resetctx == 2) {
 871         s->prob_ctx[c].p = vp9_default_probs;
 872         memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
 873                sizeof(vp9_default_coef_probs));
 874     }
 875
 876     // next 16 bits is size of the rest of the header (arith-coded)
 877     size2 = get_bits(&s->gb, 16);
 878     data2 = align_get_bits(&s->gb);
 879     if (size2 > size - (data2 - data)) {
 880         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 881         return AVERROR_INVALIDDATA;
 882     }
 883     ff_vp56_init_range_decoder(&s->c, data2, size2);
 884     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 885         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 886         return AVERROR_INVALIDDATA;
 887     }
 888
 889     if (s->keyframe || s->intraonly) {
 890         memset(s->counts.coef, 0, sizeof(s->counts.coef));
 891         memset(s->counts.eob,  0, sizeof(s->counts.eob));
 892     } else {
 893         memset(&s->counts, 0, sizeof(s->counts));
 894     }
 895     // FIXME is it faster to not copy here, but do it down in the fw updates
 896     // as explicit copies if the fw update is missing (and skip the copy upon
 897     // fw update)?
 898     s->prob.p = s->prob_ctx[c].p;
 899
 900     // txfm updates
 901     if (s->lossless) {
 902         s->txfmmode = TX_4X4;
 903     } else {
 904         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
 905         if (s->txfmmode == 3)
 906             s->txfmmode += vp8_rac_get(&s->c);
 907
 908         if (s->txfmmode == TX_SWITCHABLE) {
 909             for (i = 0; i < 2; i++)
 910                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 911                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 912             for (i = 0; i < 2; i++)
 913                 for (j = 0; j < 2; j++)
 914                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 915                         s->prob.p.tx16p[i][j] =
 916                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 917             for (i = 0; i < 2; i++)
 918                 for (j = 0; j < 3; j++)
 919                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 920                         s->prob.p.tx32p[i][j] =
 921                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 922         }
 923     }
 924
 925     // coef updates
 926     for (i = 0; i < 4; i++) {
 927         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 928         if (vp8_rac_get(&s->c)) {
 929             for (j = 0; j < 2; j++)
 930                 for (k = 0; k < 2; k++)
 931                     for (l = 0; l < 6; l++)
 932                         for (m = 0; m < 6; m++) {
 933                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 934                             uint8_t *r = ref[j][k][l][m];
 935                             if (m >= 3 && l == 0) // dc only has 3 pt
 936                                 break;
 937                             for (n = 0; n < 3; n++) {
 938                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 939                                     p[n] = update_prob(&s->c, r[n]);
 940                                 } else {
 941                                     p[n] = r[n];
 942                                 }
 943                             }
 944                             p[3] = 0;
 945                         }
 946         } else {
 947             for (j = 0; j < 2; j++)
 948                 for (k = 0; k < 2; k++)
 949                     for (l = 0; l < 6; l++)
 950                         for (m = 0; m < 6; m++) {
 951                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 952                             uint8_t *r = ref[j][k][l][m];
 953                             if (m > 3 && l == 0) // dc only has 3 pt
 954                                 break;
 955                             memcpy(p, r, 3);
 956                             p[3] = 0;
 957                         }
 958         }
 959         if (s->txfmmode == i)
 960             break;
 961     }
 962
 963     // mode updates
 964     for (i = 0; i < 3; i++)
 965         if (vp56_rac_get_prob_branchy(&s->c, 252))
 966             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 967     if (!s->keyframe && !s->intraonly) {
 968         for (i = 0; i < 7; i++)
 969             for (j = 0; j < 3; j++)
 970                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 971                     s->prob.p.mv_mode[i][j] =
 972                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 973
 974         if (s->filtermode == FILTER_SWITCHABLE)
 975             for (i = 0; i < 4; i++)
 976                 for (j = 0; j < 2; j++)
 977                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 978                         s->prob.p.filter[i][j] =
 979                             update_prob(&s->c, s->prob.p.filter[i][j]);
 980
 981         for (i = 0; i < 4; i++)
 982             if (vp56_rac_get_prob_branchy(&s->c, 252))
 983                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 984
 985         if (s->allowcompinter) {
 986             s->comppredmode = vp8_rac_get(&s->c);
 987             if (s->comppredmode)
 988                 s->comppredmode += vp8_rac_get(&s->c);
 989             if (s->comppredmode == PRED_SWITCHABLE)
 990                 for (i = 0; i < 5; i++)
 991                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 992                         s->prob.p.comp[i] =
 993                             update_prob(&s->c, s->prob.p.comp[i]);
 994         } else {
 995             s->comppredmode = PRED_SINGLEREF;
 996         }
 997
 998         if (s->comppredmode != PRED_COMPREF) {
 999             for (i = 0; i < 5; i++) {
1000                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1001                     s->prob.p.single_ref[i][0] =
1002                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
1003                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1004                     s->prob.p.single_ref[i][1] =
1005                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
1006             }
1007         }
1008
1009         if (s->comppredmode != PRED_SINGLEREF) {
1010             for (i = 0; i < 5; i++)
1011                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1012                     s->prob.p.comp_ref[i] =
1013                         update_prob(&s->c, s->prob.p.comp_ref[i]);
1014         }
1015
1016         for (i = 0; i < 4; i++)
1017             for (j = 0; j < 9; j++)
1018                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1019                     s->prob.p.y_mode[i][j] =
1020                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
1021
1022         for (i = 0; i < 4; i++)
1023             for (j = 0; j < 4; j++)
1024                 for (k = 0; k < 3; k++)
1025                     if (vp56_rac_get_prob_branchy(&s->c, 252))
1026                         s->prob.p.partition[3 - i][j][k] =
1027                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
1028
1029         // mv fields don't use the update_prob subexp model for some reason
1030         for (i = 0; i < 3; i++)
1031             if (vp56_rac_get_prob_branchy(&s->c, 252))
1032                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1033
1034         for (i = 0; i < 2; i++) {
1035             if (vp56_rac_get_prob_branchy(&s->c, 252))
1036                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1037
1038             for (j = 0; j < 10; j++)
1039                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1040                     s->prob.p.mv_comp[i].classes[j] =
1041                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1042
1043             if (vp56_rac_get_prob_branchy(&s->c, 252))
1044                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1045
1046             for (j = 0; j < 10; j++)
1047                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1048                     s->prob.p.mv_comp[i].bits[j] =
1049                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1050         }
1051
1052         for (i = 0; i < 2; i++) {
1053             for (j = 0; j < 2; j++)
1054                 for (k = 0; k < 3; k++)
1055                     if (vp56_rac_get_prob_branchy(&s->c, 252))
1056                         s->prob.p.mv_comp[i].class0_fp[j][k] =
1057                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1058
1059             for (j = 0; j < 3; j++)
1060                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1061                     s->prob.p.mv_comp[i].fp[j] =
1062                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1063         }
1064
1065         if (s->highprecisionmvs) {
1066             for (i = 0; i < 2; i++) {
1067                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1068                     s->prob.p.mv_comp[i].class0_hp =
1069                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1070
1071                 if (vp56_rac_get_prob_branchy(&s->c, 252))
1072                     s->prob.p.mv_comp[i].hp =
1073                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1074             }
1075         }
1076     }
1077
1078     return (data2 - data) + size2;
1079 }
1080
1081 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1082                                       VP9Context *s)
1083 {
1084     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1085     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
1086 }
1087
1088 static void find_ref_mvs(VP9Context *s,
1089                          VP56mv *pmv, int ref, int z, int idx, int sb)
1090 {
1091     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1092         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
1093                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
1094         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
1095                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
1096         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
1097                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
1098         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
1099                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1100         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
1101                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1102         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
1103                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
1104         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
1105                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1106         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
1107                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
1108         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
1109                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
1110         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1111                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1112         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1113                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1114         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1115                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1116         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1117                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1118     };
1119     VP9Block *b = s->b;
1120     int row = s->row, col = s->col, row7 = s->row7;
1121     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
1122 #define INVALID_MV 0x80008000U
1123     uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
1124     int i;
1125
1126 #define RETURN_DIRECT_MV(mv) \
1127     do { \
1128         uint32_t m = AV_RN32A(&mv); \
1129         if (!idx) { \
1130             AV_WN32A(pmv, m); \
1131             return; \
1132         } else if (mem == INVALID_MV) { \
1133             mem = m; \
1134         } else if (m != mem) { \
1135             AV_WN32A(pmv, m); \
1136             return; \
1137         } \
1138     } while (0)
1139
1140     if (sb >= 0) {
1141         if (sb == 2 || sb == 1) {
1142             RETURN_DIRECT_MV(b->mv[0][z]);
1143         } else if (sb == 3) {
1144             RETURN_DIRECT_MV(b->mv[2][z]);
1145             RETURN_DIRECT_MV(b->mv[1][z]);
1146             RETURN_DIRECT_MV(b->mv[0][z]);
1147         }
1148
1149 #define RETURN_MV(mv) \
1150     do { \
1151         if (sb > 0) { \
1152             VP56mv tmp; \
1153             uint32_t m; \
1154             av_assert2(idx == 1); \
1155             av_assert2(mem != INVALID_MV); \
1156             if (mem_sub8x8 == INVALID_MV) { \
1157                 clamp_mv(&tmp, &mv, s); \
1158                 m = AV_RN32A(&tmp); \
1159                 if (m != mem) { \
1160                     AV_WN32A(pmv, m); \
1161                     return; \
1162                 } \
1163                 mem_sub8x8 = AV_RN32A(&mv); \
1164             } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1165                 clamp_mv(&tmp, &mv, s); \
1166                 m = AV_RN32A(&tmp); \
1167                 if (m != mem) { \
1168                     AV_WN32A(pmv, m); \
1169                 } else { \
1170                     /* BUG I'm pretty sure this isn't the intention */ \
1171                     AV_WN32A(pmv, 0); \
1172                 } \
1173                 return; \
1174             } \
1175         } else { \
1176             uint32_t m = AV_RN32A(&mv); \
1177             if (!idx) { \
1178                 clamp_mv(pmv, &mv, s); \
1179                 return; \
1180             } else if (mem == INVALID_MV) { \
1181                 mem = m; \
1182             } else if (m != mem) { \
1183                 clamp_mv(pmv, &mv, s); \
1184                 return; \
1185             } \
1186         } \
1187     } while (0)
1188
1189         if (row > 0) {
1190             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1191             if (mv->ref[0] == ref) {
1192                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1193             } else if (mv->ref[1] == ref) {
1194                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1195             }
1196         }
1197         if (col > s->tiling.tile_col_start) {
1198             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1199             if (mv->ref[0] == ref) {
1200                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1201             } else if (mv->ref[1] == ref) {
1202                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1203             }
1204         }
1205         i = 2;
1206     } else {
1207         i = 0;
1208     }
1209
1210     // previously coded MVs in this neighbourhood, using same reference frame
1211     for (; i < 8; i++) {
1212         int c = p[i][0] + col, r = p[i][1] + row;
1213
1214         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1215             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1216
1217             if (mv->ref[0] == ref) {
1218                 RETURN_MV(mv->mv[0]);
1219             } else if (mv->ref[1] == ref) {
1220                 RETURN_MV(mv->mv[1]);
1221             }
1222         }
1223     }
1224
1225     // MV at this position in previous frame, using same reference frame
1226     if (s->use_last_frame_mvs) {
1227         struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1228
1229         if (!s->frames[REF_FRAME_MVPAIR].uses_2pass)
1230             ff_thread_await_progress(&s->frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1231         if (mv->ref[0] == ref) {
1232             RETURN_MV(mv->mv[0]);
1233         } else if (mv->ref[1] == ref) {
1234             RETURN_MV(mv->mv[1]);
1235         }
1236     }
1237
1238 #define RETURN_SCALE_MV(mv, scale) \
1239     do { \
1240         if (scale) { \
1241             VP56mv mv_temp = { -mv.x, -mv.y }; \
1242             RETURN_MV(mv_temp); \
1243         } else { \
1244             RETURN_MV(mv); \
1245         } \
1246     } while (0)
1247
1248     // previously coded MVs in this neighbourhood, using different reference frame
1249     for (i = 0; i < 8; i++) {
1250         int c = p[i][0] + col, r = p[i][1] + row;
1251
1252         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1253             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1254
1255             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1256                 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1257             }
1258             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1259                 // BUG - libvpx has this condition regardless of whether
1260                 // we used the first ref MV and pre-scaling
1261                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1262                 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1263             }
1264         }
1265     }
1266
1267     // MV at this position in previous frame, using different reference frame
1268     if (s->use_last_frame_mvs) {
1269         struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1270
1271         // no need to await_progress, because we already did that above
1272         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1273             RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1274         }
1275         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1276             // BUG - libvpx has this condition regardless of whether
1277             // we used the first ref MV and pre-scaling
1278             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1279             RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1280         }
1281     }
1282
1283     AV_ZERO32(pmv);
1284     clamp_mv(pmv, pmv, s);
1285 #undef INVALID_MV
1286 #undef RETURN_MV
1287 #undef RETURN_SCALE_MV
1288 }
1289
1290 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1291 {
1292     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1293     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1294                                 s->prob.p.mv_comp[idx].classes);
1295
1296     s->counts.mv_comp[idx].sign[sign]++;
1297     s->counts.mv_comp[idx].classes[c]++;
1298     if (c) {
1299         int m;
1300
1301         for (n = 0, m = 0; m < c; m++) {
1302             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1303             n |= bit << m;
1304             s->counts.mv_comp[idx].bits[m][bit]++;
1305         }
1306         n <<= 3;
1307         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1308         n |= bit << 1;
1309         s->counts.mv_comp[idx].fp[bit]++;
1310         if (hp) {
1311             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1312             s->counts.mv_comp[idx].hp[bit]++;
1313             n |= bit;
1314         } else {
1315             n |= 1;
1316             // bug in libvpx - we count for bw entropy purposes even if the
1317             // bit wasn't coded
1318             s->counts.mv_comp[idx].hp[1]++;
1319         }
1320         n += 8 << c;
1321     } else {
1322         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1323         s->counts.mv_comp[idx].class0[n]++;
1324         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1325                                s->prob.p.mv_comp[idx].class0_fp[n]);
1326         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1327         n = (n << 3) | (bit << 1);
1328         if (hp) {
1329             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1330             s->counts.mv_comp[idx].class0_hp[bit]++;
1331             n |= bit;
1332         } else {
1333             n |= 1;
1334             // bug in libvpx - we count for bw entropy purposes even if the
1335             // bit wasn't coded
1336             s->counts.mv_comp[idx].class0_hp[1]++;
1337         }
1338     }
1339
1340     return sign ? -(n + 1) : (n + 1);
1341 }
1342
1343 static void fill_mv(VP9Context *s,
1344                     VP56mv *mv, int mode, int sb)
1345 {
1346     VP9Block *b = s->b;
1347
1348     if (mode == ZEROMV) {
1349         AV_ZERO64(mv);
1350     } else {
1351         int hp;
1352
1353         // FIXME cache this value and reuse for other subblocks
1354         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1355                      mode == NEWMV ? -1 : sb);
1356         // FIXME maybe move this code into find_ref_mvs()
1357         if ((mode == NEWMV || sb == -1) &&
1358             !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1359             if (mv[0].y & 1) {
1360                 if (mv[0].y < 0)
1361                     mv[0].y++;
1362                 else
1363                     mv[0].y--;
1364             }
1365             if (mv[0].x & 1) {
1366                 if (mv[0].x < 0)
1367                     mv[0].x++;
1368                 else
1369                     mv[0].x--;
1370             }
1371         }
1372         if (mode == NEWMV) {
1373             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1374                                               s->prob.p.mv_joint);
1375
1376             s->counts.mv_joint[j]++;
1377             if (j >= MV_JOINT_V)
1378                 mv[0].y += read_mv_component(s, 0, hp);
1379             if (j & 1)
1380                 mv[0].x += read_mv_component(s, 1, hp);
1381         }
1382
1383         if (b->comp) {
1384             // FIXME cache this value and reuse for other subblocks
1385             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1386                          mode == NEWMV ? -1 : sb);
1387             if ((mode == NEWMV || sb == -1) &&
1388                 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1389                 if (mv[1].y & 1) {
1390                     if (mv[1].y < 0)
1391                         mv[1].y++;
1392                     else
1393                         mv[1].y--;
1394                 }
1395                 if (mv[1].x & 1) {
1396                     if (mv[1].x < 0)
1397                         mv[1].x++;
1398                     else
1399                         mv[1].x--;
1400                 }
1401             }
1402             if (mode == NEWMV) {
1403                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1404                                                   s->prob.p.mv_joint);
1405
1406                 s->counts.mv_joint[j]++;
1407                 if (j >= MV_JOINT_V)
1408                     mv[1].y += read_mv_component(s, 0, hp);
1409                 if (j & 1)
1410                     mv[1].x += read_mv_component(s, 1, hp);
1411             }
1412         }
1413     }
1414 }
1415
1416 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1417                                        ptrdiff_t stride, int v)
1418 {
1419     switch (w) {
1420     case 1:
1421         do {
1422             *ptr = v;
1423             ptr += stride;
1424         } while (--h);
1425         break;
1426     case 2: {
1427         int v16 = v * 0x0101;
1428         do {
1429             AV_WN16A(ptr, v16);
1430             ptr += stride;
1431         } while (--h);
1432         break;
1433     }
1434     case 4: {
1435         uint32_t v32 = v * 0x01010101;
1436         do {
1437             AV_WN32A(ptr, v32);
1438             ptr += stride;
1439         } while (--h);
1440         break;
1441     }
1442     case 8: {
1443 #if HAVE_FAST_64BIT
1444         uint64_t v64 = v * 0x0101010101010101ULL;
1445         do {
1446             AV_WN64A(ptr, v64);
1447             ptr += stride;
1448         } while (--h);
1449 #else
1450         uint32_t v32 = v * 0x01010101;
1451         do {
1452             AV_WN32A(ptr,     v32);
1453             AV_WN32A(ptr + 4, v32);
1454             ptr += stride;
1455         } while (--h);
1456 #endif
1457         break;
1458     }
1459     }
1460 }
1461
/**
 * Decode the mode information of the current block (s->b) from the range
 * coder s->c.
 *
 * In order, this reads or derives: the segment id, the skip flag, the
 * intra/inter flag, the transform size, and then either the intra prediction
 * modes (luma and chroma) or the inter reference(s), interpolation filter,
 * inter modes and motion vectors (the latter through fill_mv()). Wherever a
 * probability from s->prob.p is used, the matching counter in s->counts is
 * incremented. Afterwards the above/left context arrays are updated for the
 * area covered by the block, and the block's reference/motion-vector pair is
 * stored in the current frame's mv array.
 *
 * @param ctx codec context; ctx->priv_data is the VP9Context that holds the
 *            block position (row, col, row7), the current block and all
 *            context arrays used here
 */
1462 static void decode_mode(AVCodecContext *ctx)
1463 {
         // Per-block-size values written to the left/above partition context
         // by SET_CTXS at the end of this function.
1464     static const uint8_t left_ctx[N_BS_SIZES] = {
1465         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1466     };
1467     static const uint8_t above_ctx[N_BS_SIZES] = {
1468         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1469     };
         // Upper bound on the transform size for each block size.
1470     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1471         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1472         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1473     };
1474     VP9Context *s = ctx->priv_data;
1475     VP9Block *b = s->b;
1476     int row = s->row, col = s->col, row7 = s->row7;
1477     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
         // bw4/bh4: block dimensions taken from bwh_tab[1]; w4/h4: the same,
         // clipped so that the block does not extend past s->cols / s->rows.
1478     int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1479     int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
         // have_a: there is a row above this block; have_l: there is a column
         // to the left of it inside the current tile.
1480     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
         // vref and filter_id are only assigned on the inter path (filter_id
         // only when the filter is switchable); SET_CTXS reads them under the
         // same conditions.
1481     int vref, filter_id;
1482
         // ---- segment id ----
1483     if (!s->segmentation.enabled) {
1484         b->seg_id = 0;
1485     } else if (s->keyframe || s->intraonly) {
             // intra frames: explicitly coded when the map is updated, else 0
1486         b->seg_id = !s->segmentation.update_map ? 0 :
1487                     vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1488     } else if (!s->segmentation.update_map ||
1489                (s->segmentation.temporal &&
1490                 vp56_rac_get_prob_branchy(&s->c,
1491                     s->prob.segpred[s->above_segpred_ctx[col] +
1492                                     s->left_segpred_ctx[row7]]))) {
             // Predicted segment id. Due to short-circuit evaluation the
             // prediction bit above is only read when the map is updated and
             // temporal prediction is enabled.
1493         if (!s->errorres && !s->segmentation.ignore_refmap) {
1494             int pred = 8, x;
1495             uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
1496
                 // NOTE(review): presumably waits until the reference frame has
                 // been decoded up to row >> 3 - confirm against thread.h
1497             if (!s->frames[REF_FRAME_SEGMAP].uses_2pass)
1498                 ff_thread_await_progress(&s->frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
                 // take the smallest id found in the reference map over the
                 // (clipped) area covered by this block
1499             for (y = 0; y < h4; y++) {
1500                 int idx_base = (y + row) * 8 * s->sb_cols + col;
1501                 for (x = 0; x < w4; x++)
1502                     pred = FFMIN(pred, refsegmap[idx_base + x]);
1503             }
1504             av_assert1(pred < 8);
1505             b->seg_id = pred;
1506         } else {
1507             b->seg_id = 0;
1508         }
1509
             // mark the covered positions as "predicted" (1); these flags form
             // the context of the segpred probability used above
1510         memset(&s->above_segpred_ctx[col], 1, w4);
1511         memset(&s->left_segpred_ctx[row7], 1, h4);
1512     } else {
             // explicitly coded segment id; mark positions as "not predicted"
1513         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1514                                      s->prob.seg);
1515
1516         memset(&s->above_segpred_ctx[col], 0, w4);
1517         memset(&s->left_segpred_ctx[row7], 0, h4);
1518     }
         // Write the segment id into the current frame's segmentation map for
         // the bw4 x bh4 area of this block (row stride 8 * s->sb_cols).
1519     if (s->segmentation.enabled &&
1520         (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1521         setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1522                   bw4, bh4, 8 * s->sb_cols, b->seg_id);
1523     }
1524
         // ---- skip flag ----
         // Forced on by the segment's skip feature; otherwise coded, with the
         // sum of the left and above skip flags as context.
1525     b->skip = s->segmentation.enabled &&
1526         s->segmentation.feat[b->seg_id].skip_enabled;
1527     if (!b->skip) {
1528         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1529         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1530         s->counts.skip[c][b->skip]++;
1531     }
1532
         // ---- intra/inter decision ----
1533     if (s->keyframe || s->intraonly) {
1534         b->intra = 1;
1535     } else if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
             // segment-level reference feature: ref_val == 0 selects intra
1536         b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1537     } else {
1538         int c, bit;
1539
             // context from the neighbours' intra flags; with both neighbours
             // available the values are 0, 1 or 3 (a sum of 2 is bumped to 3)
1540         if (have_a && have_l) {
1541             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1542             c += (c == 2);
1543         } else {
1544             c = have_a ? 2 * s->above_intra_ctx[col] :
1545                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1546         }
             // coded bit: 1 = inter, 0 = intra
1547         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1548         s->counts.intra[c][bit]++;
1549         b->intra = !bit;
1550     }
1551
         // ---- transform size ----
         // Coded per block only when the frame uses TX_SWITCHABLE and the
         // block is intra or not skipped.
1552     if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1553         int c;
1554         if (have_a) {
1555             if (have_l) {
                     // '+' binds tighter than '>': c is 1 if the sum of both
                     // neighbour sizes exceeds max_tx, else 0. A skipped
                     // neighbour counts as max_tx.
1556                 c = (s->above_skip_ctx[col] ? max_tx :
1557                      s->above_txfm_ctx[col]) +
1558                     (s->left_skip_ctx[row7] ? max_tx :
1559                      s->left_txfm_ctx[row7]) > max_tx;
1560             } else {
1561                 c = s->above_skip_ctx[col] ? 1 :
1562                     (s->above_txfm_ctx[col] * 2 > max_tx);
1563             }
1564         } else if (have_l) {
1565             c = s->left_skip_ctx[row7] ? 1 :
1566                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1567         } else {
1568             c = 1;
1569         }
             // The number of bits read depends on max_tx; every further 1 bit
             // increments b->tx by one, up to max_tx.
1570         switch (max_tx) {
1571         case TX_32X32:
1572             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1573             if (b->tx) {
1574                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1575                 if (b->tx == 2)
1576                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1577             }
1578             s->counts.tx32p[c][b->tx]++;
1579             break;
1580         case TX_16X16:
1581             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1582             if (b->tx)
1583                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1584             s->counts.tx16p[c][b->tx]++;
1585             break;
1586         case TX_8X8:
1587             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1588             s->counts.tx8p[c][b->tx]++;
1589             break;
1590         case TX_4X4:
1591             b->tx = TX_4X4;
1592             break;
1593         }
1594     } else {
             // nothing coded: the smaller of max_tx and the frame-level mode
1595         b->tx = FFMIN(max_tx, s->txfmmode);
1596     }
1597
         // ---- prediction modes ----
1598     if (s->keyframe || s->intraonly) {
             // Intra frame: modes are coded with the fixed
             // vp9_default_kf_*_probs tables, using the above (a) and left (l)
             // modes as context; no counts are gathered on this path. The mode
             // context is indexed at twice the col/row7 resolution.
1599         uint8_t *a = &s->above_mode_ctx[col * 2];
1600         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1601
1602         b->comp = 0;
1603         if (b->bs > BS_8x8) {
                 // Block sizes below 8x8 (BS_8x4, BS_4x8, BS_4x4): up to four
                 // sub-block modes. For BS_8x4 mode[1]/mode[3] copy
                 // mode[0]/mode[2]; for BS_4x8 mode[2]/mode[3] copy
                 // mode[0]/mode[1]. a[] and l[] are updated as each sub-block
                 // is decoded so later sub-blocks see them as context.
1604             // FIXME the memory storage intermediates here aren't really
1605             // necessary, they're just there to make the code slightly
1606             // simpler for now
1607             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1608                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1609             if (b->bs != BS_8x4) {
1610                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1611                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1612                 l[0] = a[1] = b->mode[1];
1613             } else {
1614                 l[0] = a[1] = b->mode[1] = b->mode[0];
1615             }
1616             if (b->bs != BS_4x8) {
1617                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1618                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1619                 if (b->bs != BS_8x4) {
1620                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1621                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1622                     l[1] = a[1] = b->mode[3];
1623                 } else {
1624                     l[1] = a[1] = b->mode[3] = b->mode[2];
1625                 }
1626             } else {
1627                 b->mode[2] = b->mode[0];
1628                 l[1] = a[1] = b->mode[3] = b->mode[1];
1629             }
1630         } else {
                 // 8x8 and larger: one mode for the whole block, replicated
                 // into all four mode slots and into the above/left mode
                 // context over the block's extent from bwh_tab[0]
1631             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1632                                           vp9_default_kf_ymode_probs[*a][*l]);
1633             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1634             // FIXME this can probably be optimized
1635             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1636             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1637         }
             // chroma mode, conditioned on the last luma mode (mode[3])
1638         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1639                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1640     } else if (b->intra) {
             // Intra block in an inter frame: same sub-block layout as above,
             // but probabilities come from s->prob.p (y_mode / uv_mode) without
             // neighbour context, and counts are recorded.
1641         b->comp = 0;
1642         if (b->bs > BS_8x8) {
                 // sub-8x8 blocks always use probability set y_mode[0]
1643             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1644                                           s->prob.p.y_mode[0]);
1645             s->counts.y_mode[0][b->mode[0]]++;
1646             if (b->bs != BS_8x4) {
1647                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1648                                               s->prob.p.y_mode[0]);
1649                 s->counts.y_mode[0][b->mode[1]]++;
1650             } else {
1651                 b->mode[1] = b->mode[0];
1652             }
1653             if (b->bs != BS_4x8) {
1654                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1655                                               s->prob.p.y_mode[0]);
1656                 s->counts.y_mode[0][b->mode[2]]++;
1657                 if (b->bs != BS_8x4) {
1658                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1659                                                   s->prob.p.y_mode[0]);
1660                     s->counts.y_mode[0][b->mode[3]]++;
1661                 } else {
1662                     b->mode[3] = b->mode[2];
1663                 }
1664             } else {
1665                 b->mode[2] = b->mode[0];
1666                 b->mode[3] = b->mode[1];
1667             }
1668         } else {
                 // 8x8 and larger: the probability set is selected by block
                 // size (indices BS_64x64 .. BS_8x8)
1669             static const uint8_t size_group[10] = {
1670                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1671             };
1672             int sz = size_group[b->bs];
1673
1674             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1675                                           s->prob.p.y_mode[sz]);
1676             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1677             s->counts.y_mode[sz][b->mode[3]]++;
1678         }
1679         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1680                                      s->prob.p.uv_mode[b->mode[3]]);
1681         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1682     } else {
             // Inter block. This table maps [above mode][left mode] to the
             // context index used for the s->prob.p.mv_mode probabilities.
1683         static const uint8_t inter_mode_ctx_lut[14][14] = {
1684             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1685             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1686             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1687             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1688             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1689             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1690             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1691             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1692             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1693             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1694             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1695             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1696             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1697             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1698         };
1699
1700         if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
                 // reference fixed by the segment feature; ref_val is nonzero
                 // here (0 was handled as intra above), so ref = ref_val - 1
1701             av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1702             b->comp = 0;
1703             b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1704         } else {
1705             // read comp_pred flag
                 // (taken from the frame-level mode unless it is
                 // PRED_SWITCHABLE, in which case one bit is coded with a
                 // context derived from the neighbours)
1706             if (s->comppredmode != PRED_SWITCHABLE) {
1707                 b->comp = s->comppredmode == PRED_COMPREF;
1708             } else {
1709                 int c;
1710
1711                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1712                 if (have_a) {
1713                     if (have_l) {
1714                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1715                             c = 4;
1716                         } else if (s->above_comp_ctx[col]) {
1717                             c = 2 + (s->left_intra_ctx[row7] ||
1718                                      s->left_ref_ctx[row7] == s->fixcompref);
1719                         } else if (s->left_comp_ctx[row7]) {
1720                             c = 2 + (s->above_intra_ctx[col] ||
1721                                      s->above_ref_ctx[col] == s->fixcompref);
1722                         } else {
                                 // NOTE(review): the last term indexes
                                 // left_ref_ctx with (row & 7) instead of row7;
                                 // presumably the same value - confirm
1723                             c = (!s->above_intra_ctx[col] &&
1724                                  s->above_ref_ctx[col] == s->fixcompref) ^
1725                             (!s->left_intra_ctx[row7] &&
1726                              s->left_ref_ctx[row & 7] == s->fixcompref);
1727                         }
1728                     } else {
1729                         c = s->above_comp_ctx[col] ? 3 :
1730                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1731                     }
1732                 } else if (have_l) {
1733                     c = s->left_comp_ctx[row7] ? 3 :
1734                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1735                 } else {
1736                     c = 1;
1737                 }
1738                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1739                 s->counts.comp[c][b->comp]++;
1740             }
1741
1742             // read actual references
1743             // FIXME probably cache a few variables here to prevent repetitive
1744             // memory accesses below
1745             if (b->comp) /* two references */ {
                     // One reference is fixed (s->fixcompref) and stored at the
                     // index given by its sign bias; the other slot receives
                     // s->varcompref[bit], selected by one coded bit.
1746                 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1747
1748                 b->ref[fix_idx] = s->fixcompref;
1749                 // FIXME can this codeblob be replaced by some sort of LUT?
1750                 if (have_a) {
1751                     if (have_l) {
1752                         if (s->above_intra_ctx[col]) {
1753                             if (s->left_intra_ctx[row7]) {
1754                                 c = 2;
1755                             } else {
1756                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1757                             }
1758                         } else if (s->left_intra_ctx[row7]) {
1759                             c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1760                         } else {
1761                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1762
1763                             if (refl == refa && refa == s->varcompref[1]) {
1764                                 c = 0;
1765                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1766                                 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1767                                     (refl == s->fixcompref && refa == s->varcompref[0])) {
1768                                     c = 4;
1769                                 } else {
1770                                     c = (refa == refl) ? 3 : 1;
1771                                 }
1772                             } else if (!s->left_comp_ctx[row7]) {
1773                                 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1774                                     c = 1;
1775                                 } else {
1776                                     c = (refl == s->varcompref[1] &&
1777                                          refa != s->varcompref[1]) ? 2 : 4;
1778                                 }
1779                             } else if (!s->above_comp_ctx[col]) {
1780                                 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1781                                     c = 1;
1782                                 } else {
1783                                     c = (refa == s->varcompref[1] &&
1784                                          refl != s->varcompref[1]) ? 2 : 4;
1785                                 }
1786                             } else {
1787                                 c = (refl == refa) ? 4 : 2;
1788                             }
1789                         }
1790                     } else {
1791                         if (s->above_intra_ctx[col]) {
1792                             c = 2;
1793                         } else if (s->above_comp_ctx[col]) {
1794                             c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1795                         } else {
1796                             c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1797                         }
1798                     }
1799                 } else if (have_l) {
1800                     if (s->left_intra_ctx[row7]) {
1801                         c = 2;
1802                     } else if (s->left_comp_ctx[row7]) {
1803                         c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1804                     } else {
1805                         c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1806                     }
1807                 } else {
1808                     c = 2;
1809                 }
1810                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1811                 b->ref[var_idx] = s->varcompref[bit];
1812                 s->counts.comp_ref[c][bit]++;
1813             } else /* single reference */ {
                     // Up to two bits: first bit 0 -> ref 0; otherwise a
                     // second bit selects ref 1 or ref 2. Each bit has its own
                     // neighbour-derived context c.
1814                 int bit, c;
1815
1816                 if (have_a && !s->above_intra_ctx[col]) {
1817                     if (have_l && !s->left_intra_ctx[row7]) {
1818                         if (s->left_comp_ctx[row7]) {
1819                             if (s->above_comp_ctx[col]) {
1820                                 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1821                                          !s->above_ref_ctx[col]);
1822                             } else {
1823                                 c = (3 * !s->above_ref_ctx[col]) +
1824                                     (!s->fixcompref || !s->left_ref_ctx[row7]);
1825                             }
1826                         } else if (s->above_comp_ctx[col]) {
1827                             c = (3 * !s->left_ref_ctx[row7]) +
1828                                 (!s->fixcompref || !s->above_ref_ctx[col]);
1829                         } else {
1830                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1831                         }
1832                     } else if (s->above_intra_ctx[col]) {
                             // never taken: the enclosing condition already
                             // requires !s->above_intra_ctx[col]
1833                         c = 2;
1834                     } else if (s->above_comp_ctx[col]) {
1835                         c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1836                     } else {
1837                         c = 4 * (!s->above_ref_ctx[col]);
1838                     }
1839                 } else if (have_l && !s->left_intra_ctx[row7]) {
1840                     if (s->left_intra_ctx[row7]) {
                             // never taken: the enclosing condition already
                             // requires !s->left_intra_ctx[row7]
1841                         c = 2;
1842                     } else if (s->left_comp_ctx[row7]) {
1843                         c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1844                     } else {
1845                         c = 4 * (!s->left_ref_ctx[row7]);
1846                     }
1847                 } else {
1848                     c = 2;
1849                 }
1850                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1851                 s->counts.single_ref[c][0][bit]++;
1852                 if (!bit) {
1853                     b->ref[0] = 0;
1854                 } else {
1855                     // FIXME can this codeblob be replaced by some sort of LUT?
1856                     if (have_a) {
1857                         if (have_l) {
1858                             if (s->left_intra_ctx[row7]) {
1859                                 if (s->above_intra_ctx[col]) {
1860                                     c = 2;
1861                                 } else if (s->above_comp_ctx[col]) {
1862                                     c = 1 + 2 * (s->fixcompref == 1 ||
1863                                                  s->above_ref_ctx[col] == 1);
1864                                 } else if (!s->above_ref_ctx[col]) {
1865                                     c = 3;
1866                                 } else {
1867                                     c = 4 * (s->above_ref_ctx[col] == 1);
1868                                 }
1869                             } else if (s->above_intra_ctx[col]) {
1870                                 if (s->left_intra_ctx[row7]) {
                                         // never taken: left_intra_ctx[row7]
                                         // was already found to be 0 above
1871                                     c = 2;
1872                                 } else if (s->left_comp_ctx[row7]) {
1873                                     c = 1 + 2 * (s->fixcompref == 1 ||
1874                                                  s->left_ref_ctx[row7] == 1);
1875                                 } else if (!s->left_ref_ctx[row7]) {
1876                                     c = 3;
1877                                 } else {
1878                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1879                                 }
1880                             } else if (s->above_comp_ctx[col]) {
1881                                 if (s->left_comp_ctx[row7]) {
1882                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1883                                         c = 3 * (s->fixcompref == 1 ||
1884                                                  s->left_ref_ctx[row7] == 1);
1885                                     } else {
1886                                         c = 2;
1887                                     }
1888                                 } else if (!s->left_ref_ctx[row7]) {
1889                                     c = 1 + 2 * (s->fixcompref == 1 ||
1890                                                  s->above_ref_ctx[col] == 1);
1891                                 } else {
1892                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1893                                     (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1894                                 }
1895                             } else if (s->left_comp_ctx[row7]) {
1896                                 if (!s->above_ref_ctx[col]) {
1897                                     c = 1 + 2 * (s->fixcompref == 1 ||
1898                                                  s->left_ref_ctx[row7] == 1);
1899                                 } else {
1900                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1901                                     (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1902                                 }
1903                             } else if (!s->above_ref_ctx[col]) {
1904                                 if (!s->left_ref_ctx[row7]) {
1905                                     c = 3;
1906                                 } else {
1907                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1908                                 }
1909                             } else if (!s->left_ref_ctx[row7]) {
1910                                 c = 4 * (s->above_ref_ctx[col] == 1);
1911                             } else {
1912                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1913                                 2 * (s->above_ref_ctx[col] == 1);
1914                             }
1915                         } else {
1916                             if (s->above_intra_ctx[col] ||
1917                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1918                                 c = 2;
1919                             } else if (s->above_comp_ctx[col]) {
1920                                 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1921                             } else {
1922                                 c = 4 * (s->above_ref_ctx[col] == 1);
1923                             }
1924                         }
1925                     } else if (have_l) {
1926                         if (s->left_intra_ctx[row7] ||
1927                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1928                             c = 2;
1929                         } else if (s->left_comp_ctx[row7]) {
1930                             c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1931                         } else {
1932                             c = 4 * (s->left_ref_ctx[row7] == 1);
1933                         }
1934                     } else {
1935                         c = 2;
1936                     }
1937                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1938                     s->counts.single_ref[c][1][bit]++;
1939                     b->ref[0] = 1 + bit;
1940                 }
1941             }
1942         }
1943
             // Inter mode for blocks of 8x8 and larger: forced to ZEROMV by
             // the segment's skip feature, otherwise coded. The mv_mode
             // counters are indexed with (mode - 10).
             // NOTE(review): presumably the inter modes start at value 10 in
             // the mode enum - confirm against vp9.h
1944         if (b->bs <= BS_8x8) {
1945             if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].skip_enabled) {
1946                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1947             } else {
                     // extra offset into the mode context, per block size
1948                 static const uint8_t off[10] = {
1949                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1950                 };
1951
1952                 // FIXME this needs to use the LUT tables from find_ref_mvs
1953                 // because not all are -1,0/0,-1
1954                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1955                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1956
1957                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1958                                               s->prob.p.mv_mode[c]);
1959                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1960                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1961             }
1962         }
1963
             // Interpolation filter: coded per block only when the frame-level
             // filter mode is FILTER_SWITCHABLE. The context is the
             // neighbours' filter when the neighbour exists and its mode is
             // >= NEARESTMV, with 3 as the fallback / disagreement value.
1964         if (s->filtermode == FILTER_SWITCHABLE) {
1965             int c;
1966
1967             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1968                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1969                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1970                         s->left_filter_ctx[row7] : 3;
1971                 } else {
1972                     c = s->above_filter_ctx[col];
1973                 }
1974             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1975                 c = s->left_filter_ctx[row7];
1976             } else {
1977                 c = 3;
1978             }
1979
                 // filter_id is the coded symbol (kept for the filter context);
                 // b->filter is its translation through vp9_filter_lut
1980             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1981                                          s->prob.p.filter[c]);
1982             s->counts.filter[c][filter_id]++;
1983             b->filter = vp9_filter_lut[filter_id];
1984         } else {
1985             b->filter = s->filtermode;
1986         }
1987
             // Motion vectors. b->mv[n][r] is the vector of sub-block n for
             // reference slot r.
             // NOTE(review): fill_mv(s, mv, mode, sb) is defined elsewhere;
             // presumably it derives/reads the vectors for sub-block sb
             // (-1 = whole block) - confirm
1988         if (b->bs > BS_8x8) {
                 // sub-8x8: one inter mode + mv per coded sub-block, all using
                 // the same context c; uncoded sub-blocks copy their neighbour
                 // (same layout rules as for the intra sub-block modes)
1989             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1990
1991             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1992                                           s->prob.p.mv_mode[c]);
1993             s->counts.mv_mode[c][b->mode[0] - 10]++;
1994             fill_mv(s, b->mv[0], b->mode[0], 0);
1995
1996             if (b->bs != BS_8x4) {
1997                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1998                                               s->prob.p.mv_mode[c]);
1999                 s->counts.mv_mode[c][b->mode[1] - 10]++;
2000                 fill_mv(s, b->mv[1], b->mode[1], 1);
2001             } else {
2002                 b->mode[1] = b->mode[0];
2003                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2004                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2005             }
2006
2007             if (b->bs != BS_4x8) {
2008                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2009                                               s->prob.p.mv_mode[c]);
2010                 s->counts.mv_mode[c][b->mode[2] - 10]++;
2011                 fill_mv(s, b->mv[2], b->mode[2], 2);
2012
2013                 if (b->bs != BS_8x4) {
2014                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
2015                                                   s->prob.p.mv_mode[c]);
2016                     s->counts.mv_mode[c][b->mode[3] - 10]++;
2017                     fill_mv(s, b->mv[3], b->mode[3], 3);
2018                 } else {
2019                     b->mode[3] = b->mode[2];
2020                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
2021                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
2022                 }
2023             } else {
2024                 b->mode[2] = b->mode[0];
2025                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2026                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2027                 b->mode[3] = b->mode[1];
2028                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2029                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
2030             }
2031         } else {
                 // 8x8 and larger: one vector pair, replicated to all four
                 // sub-block slots
2032             fill_mv(s, b->mv[0], b->mode[0], -1);
2033             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2034             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2035             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2036             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2037             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2038             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2039         }
2040
             // Reference stored in the above/left ref context: for compound
             // blocks the entry at index signbias[varcompref[0]], else ref[0].
2041         vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
2042     }
2043
         // SPLAT_CTX(var, val, n): store n copies of the byte value val into
         // the n consecutive bytes starting at var (n in {1, 2, 4, 8, 16}),
         // using 64-bit stores when HAVE_FAST_64BIT is set and 32-bit stores
         // otherwise.
2044 #if HAVE_FAST_64BIT
2045 #define SPLAT_CTX(var, val, n) \
2046     switch (n) { \
2047     case 1:  var = val;                                    break; \
2048     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
2049     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
2050     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
2051     case 16: { \
2052         uint64_t v64 = val * 0x0101010101010101ULL; \
2053         AV_WN64A(              &var,     v64); \
2054         AV_WN64A(&((uint8_t *) &var)[8], v64); \
2055         break; \
2056     } \
2057     }
2058 #else
2059 #define SPLAT_CTX(var, val, n) \
2060     switch (n) { \
2061     case 1:  var = val;                         break; \
2062     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
2063     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
2064     case 8: { \
2065         uint32_t v32 = val * 0x01010101; \
2066         AV_WN32A(              &var,     v32); \
2067         AV_WN32A(&((uint8_t *) &var)[4], v32); \
2068         break; \
2069     } \
2070     case 16: { \
2071         uint32_t v32 = val * 0x01010101; \
2072         AV_WN32A(              &var,      v32); \
2073         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
2074         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
2075         AV_WN32A(&((uint8_t *) &var)[12], v32); \
2076         break; \
2077     } \
2078     }
2079 #endif
2080
         // SET_CTXS(dir, off, n): update n entries of every "above" or "left"
         // context array starting at off. skip, tx and partition are always
         // written; intra/comp/mode only on inter frames; ref (and the filter,
         // if switchable) only for inter blocks. The partition value comes
         // from the above_ctx[] / left_ctx[] tables at the top of the function.
         // The first switch handles the block width (above contexts), the
         // second the block height (left contexts); note that these use the
         // unclipped dimensions from bwh_tab[1].
2081     switch (bwh_tab[1][b->bs][0]) {
2082 #define SET_CTXS(dir, off, n) \
2083     do { \
2084         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
2085         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
2086         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2087         if (!s->keyframe && !s->intraonly) { \
2088             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
2089             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
2090             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
2091             if (!b->intra) { \
2092                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2093                 if (s->filtermode == FILTER_SWITCHABLE) { \
2094                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2095                 } \
2096             } \
2097         } \
2098     } while (0)
2099     case 1: SET_CTXS(above, col, 1); break;
2100     case 2: SET_CTXS(above, col, 2); break;
2101     case 4: SET_CTXS(above, col, 4); break;
2102     case 8: SET_CTXS(above, col, 8); break;
2103     }
2104     switch (bwh_tab[1][b->bs][1]) {
2105     case 1: SET_CTXS(left, row7, 1); break;
2106     case 2: SET_CTXS(left, row7, 2); break;
2107     case 4: SET_CTXS(left, row7, 4); break;
2108     case 8: SET_CTXS(left, row7, 8); break;
2109     }
2110 #undef SPLAT_CTX
2111 #undef SET_CTXS
2112
         // Motion vector context (inter frames only); it has two entries per
         // col/row7 unit. mv0/mv1 are the two vectors of sub-block 3, read as
         // raw 32-bit values.
2113     if (!s->keyframe && !s->intraonly) {
2114         if (b->bs > BS_8x8) {
                 // sub-8x8: the left context receives sub-blocks 1 and 3, the
                 // above context sub-blocks 2 and 3
                 // NOTE(review): presumably the right column and the bottom
                 // row of the block - confirm sub-block ordering
2115             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2116
2117             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2118             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2119             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2120             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2121             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2122             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2123             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2124             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2125         } else {
                 // 8x8 and larger: the same vector pair for every covered
                 // entry (clipped width/height, two entries per unit)
2126             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2127
2128             for (n = 0; n < w4 * 2; n++) {
2129                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2130                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2131             }
2132             for (n = 0; n < h4 * 2; n++) {
2133                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2134                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
2135             }
2136         }
2137     }
2138
         // Store the block's references and the vectors of sub-block 3 in the
         // current frame's mv array for every position covered by the
         // (clipped) block. A reference of -1 marks an unused slot: both for
         // intra blocks, the second one for single-reference blocks. The mv
         // fields are left untouched for unused slots.
2139     // FIXME kinda ugly
2140     for (y = 0; y < h4; y++) {
2141         int x, o = (row + y) * s->sb_cols * 8 + col;
2142         struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2143
2144         if (b->intra) {
2145             for (x = 0; x < w4; x++) {
2146                 mv[x].ref[0] =
2147                 mv[x].ref[1] = -1;
2148             }
2149         } else if (b->comp) {
2150             for (x = 0; x < w4; x++) {
2151                 mv[x].ref[0] = b->ref[0];
2152                 mv[x].ref[1] = b->ref[1];
2153                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2154                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2155             }
2156         } else {
2157             for (x = 0; x < w4; x++) {
2158                 mv[x].ref[0] = b->ref[0];
2159                 mv[x].ref[1] = -1;
2160                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2161             }
2162         }
2163     }
2164 }
2165
2166 // FIXME merge cnt/eob arguments?
2167 static av_always_inline int
2168 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2169                         int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2170                         unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2171                         int nnz, const int16_t *scan, const int16_t (*nb)[2],
2172                         const int16_t *band_counts, const int16_t *qmul)
2173 {
2174     int i = 0, band = 0, band_left = band_counts[band];
2175     uint8_t *tp = p[0][nnz];
2176     uint8_t cache[1024];
2177
2178     do {
2179         int val, rc;
2180
2181         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2182         eob[band][nnz][val]++;
2183         if (!val)
2184             break;
2185
2186     skip_eob:
2187         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2188             cnt[band][nnz][0]++;
2189             if (!--band_left)
2190                 band_left = band_counts[++band];
2191             cache[scan[i]] = 0;
2192             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2193             tp = p[band][nnz];
2194             if (++i == n_coeffs)
2195                 break; //invalid input; blocks should end with EOB
2196             goto skip_eob;
2197         }
2198
2199         rc = scan[i];
2200         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2201             cnt[band][nnz][1]++;
2202             val = 1;
2203             cache[rc] = 1;
2204         } else {
2205             // fill in p[3-10] (model fill) - only once per frame for each pos
2206             if (!tp[3])
2207                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2208
2209             cnt[band][nnz][2]++;
2210             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2211                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2212                     cache[rc] = val = 2;
2213                 } else {
2214                     val = 3 + vp56_rac_get_prob(c, tp[5]);
2215                     cache[rc] = 3;
2216                 }
2217             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2218                 cache[rc] = 4;
2219                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2220                     val = 5 + vp56_rac_get_prob(c, 159);
2221                 } else {
2222                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
2223                     val +=      vp56_rac_get_prob(c, 145);
2224                 }
2225             } else { // cat 3-6
2226                 cache[rc] = 5;
2227                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2228                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2229                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
2230                         val +=      (vp56_rac_get_prob(c, 148) << 1);
2231                         val +=       vp56_rac_get_prob(c, 140);
2232                     } else {
2233                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
2234                         val +=      (vp56_rac_get_prob(c, 155) << 2);
2235                         val +=      (vp56_rac_get_prob(c, 140) << 1);
2236                         val +=       vp56_rac_get_prob(c, 135);
2237                     }
2238                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2239                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
2240                     val +=      (vp56_rac_get_prob(c, 157) << 3);
2241                     val +=      (vp56_rac_get_prob(c, 141) << 2);
2242                     val +=      (vp56_rac_get_prob(c, 134) << 1);
2243                     val +=       vp56_rac_get_prob(c, 130);
2244                 } else {
2245                     val = 67;
2246                     if (!is8bitsperpixel) {
2247                         if (bpp == 12) {
2248                             val += vp56_rac_get_prob(c, 255) << 17;
2249                             val += vp56_rac_get_prob(c, 255) << 16;
2250                         }
2251                         val +=  (vp56_rac_get_prob(c, 255) << 15);
2252                         val +=  (vp56_rac_get_prob(c, 255) << 14);
2253                     }
2254                     val +=      (vp56_rac_get_prob(c, 254) << 13);
2255                     val +=      (vp56_rac_get_prob(c, 254) << 12);
2256                     val +=      (vp56_rac_get_prob(c, 254) << 11);
2257                     val +=      (vp56_rac_get_prob(c, 252) << 10);
2258                     val +=      (vp56_rac_get_prob(c, 249) << 9);
2259                     val +=      (vp56_rac_get_prob(c, 243) << 8);
2260                     val +=      (vp56_rac_get_prob(c, 230) << 7);
2261                     val +=      (vp56_rac_get_prob(c, 196) << 6);
2262                     val +=      (vp56_rac_get_prob(c, 177) << 5);
2263                     val +=      (vp56_rac_get_prob(c, 153) << 4);
2264                     val +=      (vp56_rac_get_prob(c, 140) << 3);
2265                     val +=      (vp56_rac_get_prob(c, 133) << 2);
2266                     val +=      (vp56_rac_get_prob(c, 130) << 1);
2267                     val +=       vp56_rac_get_prob(c, 129);
2268                 }
2269             }
2270         }
2271 #define STORE_COEF(c, i, v) do { \
2272     if (is8bitsperpixel) { \
2273         c[i] = v; \
2274     } else { \
2275         AV_WN32A(&c[i * 2], v); \
2276     } \
2277 } while (0)
2278         if (!--band_left)
2279             band_left = band_counts[++band];
2280         if (is_tx32x32)
2281             STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2282         else
2283             STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2284         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2285         tp = p[band][nnz];
2286     } while (++i < n_coeffs);
2287
2288     return i;
2289 }
2290
2291 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2292                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2293                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2294                                 const int16_t (*nb)[2], const int16_t *band_counts,
2295                                 const int16_t *qmul)
2296 {
2297     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2298                                    nnz, scan, nb, band_counts, qmul);
2299 }
2300
2301 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2302                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2303                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2304                                   const int16_t (*nb)[2], const int16_t *band_counts,
2305                                   const int16_t *qmul)
2306 {
2307     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2308                                    nnz, scan, nb, band_counts, qmul);
2309 }
2310
2311 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2312                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2313                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2314                                  const int16_t (*nb)[2], const int16_t *band_counts,
2315                                  const int16_t *qmul)
2316 {
2317     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2318                                    nnz, scan, nb, band_counts, qmul);
2319 }
2320
2321 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2322                                    unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2323                                    uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2324                                    const int16_t (*nb)[2], const int16_t *band_counts,
2325                                    const int16_t *qmul)
2326 {
2327     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2328                                    nnz, scan, nb, band_counts, qmul);
2329 }
2330
/**
 * Decode the coefficient tokens of the current block (s->b): first every
 * luma transform block, then every transform block of the two chroma planes.
 *
 * For each transform block the matching decode_coeffs_b*() variant is called
 * with a context built from the above/left non-zero flags; its return value
 * is written to s->eob[] (luma) or s->uveob[pl][] (chroma) and the
 * above/left flags are updated to record whether it was non-zero.
 *
 * @param ctx             codec context; priv_data is the VP9Context
 * @param is8bitsperpixel non-zero selects the *_8bpp token decoders and one
 *                        16-bit slot per coefficient, zero selects the
 *                        *_16bpp decoders and two slots per coefficient
 * @return 1 if at least one transform block returned a non-zero value,
 *         0 otherwise
 */
2331 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2332 {
2333     VP9Context *s = ctx->priv_data;
2334     VP9Block *b = s->b;
2335     int row = s->row, col = s->col;
2336     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2337     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2338     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2339     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
    /* end_x/end_y: block extent limited to s->cols/s->rows, so positions
     * past the frame edge are not decoded */
2340     int end_x = FFMIN(2 * (s->cols - col), w4);
2341     int end_y = FFMIN(2 * (s->rows - row), h4);
2342     int n, pl, x, y, res;
2343     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2344     int tx = 4 * s->lossless + b->tx;
2345     const int16_t * const *yscans = vp9_scans[tx];
2346     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2347     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2348     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2349     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2350     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
    /* one row per transform size; each row sums to that size's coefficient
     * count (16, 64, 256, 1024) */
2351     static const int16_t band_counts[4][8] = {
2352         { 1, 2, 3, 4,  3,   16 - 13 },
2353         { 1, 2, 3, 4, 11,   64 - 21 },
2354         { 1, 2, 3, 4, 11,  256 - 21 },
2355         { 1, 2, 3, 4, 11, 1024 - 21 },
2356     };
2357     const int16_t *y_band_counts = band_counts[b->tx];
2358     const int16_t *uv_band_counts = band_counts[b->uvtx];
2359     int bytesperpixel = is8bitsperpixel ? 1 : 2;
2360     int total_coeff = 0;
2361
    /* MERGE_CTX: before decoding with a transform spanning `step` context
     * entries, reduce each group of `step` above/left flags to one flag in
     * the group's first entry (1 if any byte read by `rd` was non-zero). */
2362 #define MERGE(la, end, step, rd) \
2363     for (n = 0; n < end; n += step) \
2364         la[n] = !!rd(&la[n])
2365 #define MERGE_CTX(step, rd) \
2366     do { \
2367         MERGE(l, end_y, step, rd); \
2368         MERGE(a, end_x, step, rd); \
2369     } while (0)
2370
    /* DECODE_Y_COEF_LOOP: decode every luma transform block inside
     * end_x/end_y. `mode_index` picks the entry of b->mode[] whose transform
     * type selects the scan tables, `v` is pasted into the decoder name
     * (empty or 32). The result goes to s->eob[n], using a 16-bit store
     * when step >= 4. */
2371 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2372     for (n = 0, y = 0; y < end_y; y += step) { \
2373         for (x = 0; x < end_x; x += step, n += step * step) { \
2374             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2375             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2376                                     (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2377                                      c, e, p, a[x] + l[y], yscans[txtp], \
2378                                      ynbs[txtp], y_band_counts, qmul[0]); \
2379             a[x] = l[y] = !!res; \
2380             total_coeff |= !!res; \
2381             if (step >= 4) { \
2382                 AV_WN16A(&s->eob[n], res); \
2383             } else { \
2384                 s->eob[n] = res; \
2385             } \
2386         } \
2387     }
2388
    /* SPLAT_CTX: after decoding, copy the flag held in the first entry of
     * each group of `step` context entries to the rest of the group. When
     * the block is not clipped (`cond`) whole aligned words are written;
     * otherwise memset() is used and bounded by `end`. */
2389 #define SPLAT(la, end, step, cond) \
2390     if (step == 2) { \
2391         for (n = 1; n < end; n += step) \
2392             la[n] = la[n - 1]; \
2393     } else if (step == 4) { \
2394         if (cond) { \
2395             for (n = 0; n < end; n += step) \
2396                 AV_WN32A(&la[n], la[n] * 0x01010101); \
2397         } else { \
2398             for (n = 0; n < end; n += step) \
2399                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2400         } \
2401     } else /* step == 8 */ { \
2402         if (cond) { \
2403             if (HAVE_FAST_64BIT) { \
2404                 for (n = 0; n < end; n += step) \
2405                     AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2406             } else { \
2407                 for (n = 0; n < end; n += step) { \
2408                     uint32_t v32 = la[n] * 0x01010101; \
2409                     AV_WN32A(&la[n],     v32); \
2410                     AV_WN32A(&la[n + 4], v32); \
2411                 } \
2412             } \
2413         } else { \
2414             for (n = 0; n < end; n += step) \
2415                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2416         } \
2417     }
2418 #define SPLAT_CTX(step) \
2419     do { \
2420         SPLAT(a, end_x, step, end_x == w4); \
2421         SPLAT(l, end_y, step, end_y == h4); \
2422     } while (0)
2423
2424     /* y tokens */
2425     switch (b->tx) {
2426     case TX_4X4:
        /* the mode is looked up per 4x4 block only when b->bs > BS_8x8 */
2427         DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2428         break;
2429     case TX_8X8:
2430         MERGE_CTX(2, AV_RN16A);
2431         DECODE_Y_COEF_LOOP(2, 0,);
2432         SPLAT_CTX(2);
2433         break;
2434     case TX_16X16:
2435         MERGE_CTX(4, AV_RN32A);
2436         DECODE_Y_COEF_LOOP(4, 0,);
2437         SPLAT_CTX(4);
2438         break;
2439     case TX_32X32:
2440         MERGE_CTX(8, AV_RN64A);
2441         DECODE_Y_COEF_LOOP(8, 0, 32);
2442         SPLAT_CTX(8);
2443         break;
2444     }
2445
    /* DECODE_UV_COEF_LOOP: same as the luma loop, but for chroma plane `pl`:
     * always uses the uvscan/uvnb tables and qmul[1], and stores the result
     * in s->uveob[pl][n]. */
2446 #define DECODE_UV_COEF_LOOP(step, v) \
2447     for (n = 0, y = 0; y < end_y; y += step) { \
2448         for (x = 0; x < end_x; x += step, n += step * step) { \
2449             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2450                                     (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2451                                      16 * step * step, c, e, p, a[x] + l[y], \
2452                                      uvscan, uvnb, uv_band_counts, qmul[1]); \
2453             a[x] = l[y] = !!res; \
2454             total_coeff |= !!res; \
2455             if (step >= 4) { \
2456                 AV_WN16A(&s->uveob[pl][n], res); \
2457             } else { \
2458                 s->uveob[pl][n] = res; \
2459             } \
2460         } \
2461     }
2462
    /* switch to the chroma probability/counter tables and shrink the block
     * extent by the chroma subsampling shifts */
2463     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2464     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2465     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2466     w4 >>= s->ss_h;
2467     end_x >>= s->ss_h;
2468     h4 >>= s->ss_v;
2469     end_y >>= s->ss_v;
2470     for (pl = 0; pl < 2; pl++) {
2471         a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2472         l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2473         switch (b->uvtx) {
2474         case TX_4X4:
2475             DECODE_UV_COEF_LOOP(1,);
2476             break;
2477         case TX_8X8:
2478             MERGE_CTX(2, AV_RN16A);
2479             DECODE_UV_COEF_LOOP(2,);
2480             SPLAT_CTX(2);
2481             break;
2482         case TX_16X16:
2483             MERGE_CTX(4, AV_RN32A);
2484             DECODE_UV_COEF_LOOP(4,);
2485             SPLAT_CTX(4);
2486             break;
2487         case TX_32X32:
2488             MERGE_CTX(8, AV_RN64A);
2489             DECODE_UV_COEF_LOOP(8, 32);
2490             SPLAT_CTX(8);
2491             break;
2492         }
2493     }
2494
2495     return total_coeff;
2496 }
2497
2498 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2499 {
2500     return decode_coeffs(ctx, 1);
2501 }
2502
2503 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2504 {
2505     return decode_coeffs(ctx, 0);
2506 }
2507
/**
 * Prepare intra prediction of one transform block: replace the coded mode by
 * the variant that matches the available neighbours and provide the top and
 * left edge samples that variant reads.
 *
 * @param s             decoder context (cols, rows, bpp, tile column start
 *                      and intra_pred_data[] are read)
 * @param mode          coded intra mode, asserted to be in 0..9
 * @param a             in: pointer to a scratch buffer for the top edge;
 *                      out: either redirected to the top row itself when it
 *                      can be used in place, or left pointing at the scratch
 *                      buffer, which is then filled (index -1 holds the
 *                      top-left sample when the mode needs it)
 * @param dst_edge      pointer/stride used for the row above when y == 0
 * @param stride_edge   and for the column to the left when x == 0
 * @param dst_inner     pointer/stride used for neighbours in all other
 * @param stride_inner  cases
 * @param l             buffer receiving the left edge (4 << tx samples)
 * @param col,row       block position as stored in s->col / s->row by the
 *                      caller
 * @param x,y           transform-block offset inside the block, in units of
 *                      4 samples
 * @param w             block width in the same units as x; a right
 *                      neighbour exists when x < w - 1
 * @param tx            transform size; 4 << tx edge samples are needed
 * @param p             plane index into s->intra_pred_data[]
 * @param ss_h,ss_v     subsampling shifts of the plane
 * @param bytesperpixel bytes per sample (1 or 2 per the callers in this
 *                      file)
 * @return the mode to hand to the intra predictor
 */
2508 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2509                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2510                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2511                                              uint8_t *l, int col, int x, int w,
2512                                              int row, int y, enum TxfmMode tx,
2513                                              int p, int ss_h, int ss_v, int bytesperpixel)
2514 {
2515     int have_top = row > 0 || y > 0;
    /* a left neighbour never comes from before the current tile's first
     * column */
2516     int have_left = col > s->tiling.tile_col_start || x > 0;
2517     int have_right = x < w - 1;
2518     int bpp = s->bpp;
    /* coded mode -> mode actually predicted with, depending on which
     * neighbours exist */
2519     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2520         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2521                                    { DC_127_PRED,          VERT_PRED } },
2522         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2523                                    { HOR_PRED,             HOR_PRED } },
2524         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2525                                    { LEFT_DC_PRED,         DC_PRED } },
2526         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2527                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2528         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2529                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2530         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2531                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2532         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2533                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2534         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2535                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2536         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2537                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2538         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2539                                    { HOR_PRED,             TM_VP8_PRED } },
2540     };
    /* edge samples each (already converted) mode reads; invert_left selects
     * the storage order of the left edge */
2541     static const struct {
2542         uint8_t needs_left:1;
2543         uint8_t needs_top:1;
2544         uint8_t needs_topleft:1;
2545         uint8_t needs_topright:1;
2546         uint8_t invert_left:1;
2547     } edges[N_INTRA_PRED_MODES] = {
2548         [VERT_PRED]            = { .needs_top  = 1 },
2549         [HOR_PRED]             = { .needs_left = 1 },
2550         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2551         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2552         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2553         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2554         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2555         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2556         [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
2557         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2558         [LEFT_DC_PRED]         = { .needs_left = 1 },
2559         [TOP_DC_PRED]          = { .needs_top  = 1 },
2560         [DC_128_PRED]          = { 0 },
2561         [DC_127_PRED]          = { 0 },
2562         [DC_129_PRED]          = { 0 }
2563     };
2564
2565     av_assert2(mode >= 0 && mode < 10);
2566     mode = mode_conv[mode][have_left][have_top];
2567     if (edges[mode].needs_top) {
        /* topleft is assigned only when have_top && have_left, and every
         * read below is guarded by the same condition */
2568         uint8_t *top, *topleft;
        /* n_px_have: samples between this transform block and the right
         * frame edge */
2569         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2570         int n_px_need_tr = 0;
2571
2572         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2573             n_px_need_tr = 4;
2574
2575         // if top of sb64-row, use s->intra_pred_data[] instead of
2576         // dst[-stride] for intra prediction (it contains pre- instead of
2577         // post-loopfilter data)
2578         if (have_top) {
2579             top = !(row & 7) && !y ?
2580                 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2581                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2582             if (have_left)
2583                 topleft = !(row & 7) && !y ?
2584                     s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2585                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2586                     &dst_inner[-stride_inner];
2587         }
2588
        /* in-place path: the top row can be used directly when it exists,
         * the top-left sample (if needed) sits right before it in the same
         * buffer, and enough samples including top-right are available */
2589         if (have_top &&
2590             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2591             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2592             n_px_need + n_px_need_tr <= n_px_have) {
2593             *a = top;
2594         } else {
            /* otherwise build the edge in the caller's scratch buffer */
2595             if (have_top) {
2596                 if (n_px_need <= n_px_have) {
2597                     memcpy(*a, top, n_px_need * bytesperpixel);
2598                 } else {
2599 #define memset_bpp(c, i1, v, i2, num) do { \
2600     if (bytesperpixel == 1) { \
2601         memset(&(c)[(i1)], (v)[(i2)], (num)); \
2602     } else { \
2603         int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2604         for (n = 0; n < (num); n++) { \
2605             AV_WN16A(&(c)[((i1) + n) * 2], val); \
2606         } \
2607     } \
2608 } while (0)
                    /* copy what exists, then repeat the last sample */
2609                     memcpy(*a, top, n_px_have * bytesperpixel);
2610                     memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2611                 }
2612             } else {
2613 #define memset_val(c, val, num) do { \
2614     if (bytesperpixel == 1) { \
2615         memset((c), (val), (num)); \
2616     } else { \
2617         int n; \
2618         for (n = 0; n < (num); n++) { \
2619             AV_WN16A(&(c)[n * 2], (val)); \
2620         } \
2621     } \
2622 } while (0)
                /* no row above: constant fill, 127 at 8 bits per sample */
2623                 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2624             }
2625             if (edges[mode].needs_topleft) {
2626                 if (have_left && have_top) {
2627 #define assign_bpp(c, i1, v, i2) do { \
2628     if (bytesperpixel == 1) { \
2629         (c)[(i1)] = (v)[(i2)]; \
2630     } else { \
2631         AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2632     } \
2633 } while (0)
2634                     assign_bpp(*a, -1, topleft, -1);
2635                 } else {
2636 #define assign_val(c, i, v) do { \
2637     if (bytesperpixel == 1) { \
2638         (c)[(i)] = (v); \
2639     } else { \
2640         AV_WN16A(&(c)[(i) * 2], (v)); \
2641     } \
2642 } while (0)
                    /* missing top-left: 129 when a top row exists, else 127
                     * (values for 8 bits per sample) */
2643                     assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2644                 }
2645             }
2646             if (tx == TX_4X4 && edges[mode].needs_topright) {
2647                 if (have_top && have_right &&
2648                     n_px_need + n_px_need_tr <= n_px_have) {
2649                     memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2650                 } else {
                    /* no top-right samples: repeat the last top sample */
2651                     memset_bpp(*a, 4, *a, 3, 4);
2652                 }
2653             }
2654         }
2655     }
2656     if (edges[mode].needs_left) {
2657         if (have_left) {
            /* n_px_have: rows between this transform block and the bottom
             * frame edge */
2658             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2659             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2660             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2661
2662             if (edges[mode].invert_left) {
                /* l[0] is the topmost left neighbour; missing rows repeat
                 * the last available one */
2663                 if (n_px_need <= n_px_have) {
2664                     for (i = 0; i < n_px_need; i++)
2665                         assign_bpp(l, i, &dst[i * stride], -1);
2666                 } else {
2667                     for (i = 0; i < n_px_have; i++)
2668                         assign_bpp(l, i, &dst[i * stride], -1);
2669                     memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2670                 }
2671             } else {
                /* l[] is stored bottom-to-top: row i lands at index
                 * n_px_need - 1 - i, missing rows fill the low indices */
2672                 if (n_px_need <= n_px_have) {
2673                     for (i = 0; i < n_px_need; i++)
2674                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2675                 } else {
2676                     for (i = 0; i < n_px_have; i++)
2677                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2678                     memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2679                 }
2680             }
2681         } else {
            /* no left column: constant fill, 129 at 8 bits per sample */
2682             memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2683         }
2684     }
2685
2686     return mode;
2687 }
2688
2689 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2690                                          ptrdiff_t uv_off, int bytesperpixel)
2691 {
2692     VP9Context *s = ctx->priv_data;
2693     VP9Block *b = s->b;
2694     int row = s->row, col = s->col;
2695     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2696     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2697     int end_x = FFMIN(2 * (s->cols - col), w4);
2698     int end_y = FFMIN(2 * (s->rows - row), h4);
2699     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2700     int uvstep1d = 1 << b->uvtx, p;
2701     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2702     LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2703     LOCAL_ALIGNED_32(uint8_t, l, [64]);
2704
2705     for (n = 0, y = 0; y < end_y; y += step1d) {
2706         uint8_t *ptr = dst, *ptr_r = dst_r;
2707         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2708                                ptr_r += 4 * step1d * bytesperpixel, n += step) {
2709             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2710                                y * 2 + x : 0];
2711             uint8_t *a = &a_buf[32];
2712             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2713             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2714
2715             mode = check_intra_mode(s, mode, &a, ptr_r,
2716                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2717                                     ptr, s->y_stride, l,
2718                                     col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2719             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2720             if (eob)
2721                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2722                                            s->block + 16 * n * bytesperpixel, eob);
2723         }
2724         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2725         dst   += 4 * step1d * s->y_stride;
2726     }
2727
2728     // U/V
2729     w4 >>= s->ss_h;
2730     end_x >>= s->ss_h;
2731     end_y >>= s->ss_v;
2732     step = 1 << (b->uvtx * 2);
2733     for (p = 0; p < 2; p++) {
2734         dst   = s->dst[1 + p];
2735         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2736         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2737             uint8_t *ptr = dst, *ptr_r = dst_r;
2738             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2739                                    ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2740                 int mode = b->uvmode;
2741                 uint8_t *a = &a_buf[32];
2742                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2743
2744                 mode = check_intra_mode(s, mode, &a, ptr_r,
2745                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2746                                         ptr, s->uv_stride, l, col, x, w4, row, y,
2747                                         b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2748                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2749                 if (eob)
2750                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2751                                                     s->uvblock[p] + 16 * n * bytesperpixel, eob);
2752             }
2753             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2754             dst   += 4 * uvstep1d * s->uv_stride;
2755         }
2756     }
2757 }
2758
2759 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2760 {
2761     intra_recon(ctx, y_off, uv_off, 1);
2762 }
2763
2764 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2765 {
2766     intra_recon(ctx, y_off, uv_off, 2);
2767 }
2768
2769 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2770                                             uint8_t *dst, ptrdiff_t dst_stride,
2771                                             const uint8_t *ref, ptrdiff_t ref_stride,
2772                                             ThreadFrame *ref_frame,
2773                                             ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2774                                             int px, int py, int pw, int ph,
2775                                             int bw, int bh, int w, int h, int bytesperpixel,
2776                                             const uint16_t *scale, const uint8_t *step)
2777 {
2778 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2779     int mx, my;
2780     int refbw_m1, refbh_m1;
2781     int th;
2782     VP56mv mv;
2783
2784     mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2785     mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2786     // BUG libvpx seems to scale the two components separately. This introduces
2787     // rounding errors but we have to reproduce them to be exactly compatible
2788     // with the output from libvpx...
2789     mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2790     my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2791
2792     y = my >> 4;
2793     x = mx >> 4;
2794     ref += y * ref_stride + x * bytesperpixel;
2795     mx &= 15;
2796     my &= 15;
2797     refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2798     refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2799     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2800     // we use +7 because the last 7 pixels of each sbrow can be changed in
2801     // the longest loopfilter of the next sbrow
2802     th = (y + refbh_m1 + 4 + 7) >> 6;
2803     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2804     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2805         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2806                                  ref - 3 * ref_stride - 3 * bytesperpixel,
2807                                  288, ref_stride,
2808                                  refbw_m1 + 8, refbh_m1 + 8,
2809                                  x - 3, y - 3, w, h);
2810         ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2811         ref_stride = 288;
2812     }
2813     smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2814 }
2815
2816 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2817                                               uint8_t *dst_u, uint8_t *dst_v,
2818                                               ptrdiff_t dst_stride,
2819                                               const uint8_t *ref_u, ptrdiff_t src_stride_u,
2820                                               const uint8_t *ref_v, ptrdiff_t src_stride_v,
2821                                               ThreadFrame *ref_frame,
2822                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2823                                               int px, int py, int pw, int ph,
2824                                               int bw, int bh, int w, int h, int bytesperpixel,
2825                                               const uint16_t *scale, const uint8_t *step)
2826 {
2827     int mx, my;
2828     int refbw_m1, refbh_m1;
2829     int th;
2830     VP56mv mv;
2831
2832     if (s->ss_h) {
2833         // BUG https://code.google.com/p/webm/issues/detail?id=820
2834         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2835         mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2836     } else {
2837         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2838         mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
2839     }
2840     if (s->ss_v) {
2841         // BUG https://code.google.com/p/webm/issues/detail?id=820
2842         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2843         my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2844     } else {
2845         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2846         my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2847     }
2848 #undef scale_mv
2849     y = my >> 4;
2850     x = mx >> 4;
2851     ref_u += y * src_stride_u + x * bytesperpixel;
2852     ref_v += y * src_stride_v + x * bytesperpixel;
2853     mx &= 15;
2854     my &= 15;
2855     refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2856     refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2857     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2858     // we use +7 because the last 7 pixels of each sbrow can be changed in
2859     // the longest loopfilter of the next sbrow
2860     th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2861     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2862     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2863         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2864                                  ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2865                                  288, src_stride_u,
2866                                  refbw_m1 + 8, refbh_m1 + 8,
2867                                  x - 3, y - 3, w, h);
2868         ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2869         smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2870
2871         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2872                                  ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2873                                  288, src_stride_v,
2874                                  refbw_m1 + 8, refbh_m1 + 8,
2875                                  x - 3, y - 3, w, h);
2876         ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2877         smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2878     } else {
2879         smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2880         smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2881     }
2882 }
2883
2884 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2885                     px, py, pw, ph, bw, bh, w, h, i) \
2886     mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
2887                    mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2888                    s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2889 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2890                       row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2891     mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2892                      row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2893                      s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2894 #define SCALED 1
2895 #define FN(x) x##_scaled_8bpp
2896 #define BYTES_PER_PIXEL 1
2897 #include "vp9_mc_template.c"
2898 #undef FN
2899 #undef BYTES_PER_PIXEL
2900 #define FN(x) x##_scaled_16bpp
2901 #define BYTES_PER_PIXEL 2
2902 #include "vp9_mc_template.c"
2903 #undef mc_luma_dir
2904 #undef mc_chroma_dir
2905 #undef FN
2906 #undef BYTES_PER_PIXEL
2907 #undef SCALED
2908
2909 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2910                                               uint8_t *dst, ptrdiff_t dst_stride,
2911                                               const uint8_t *ref, ptrdiff_t ref_stride,
2912                                               ThreadFrame *ref_frame,
2913                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2914                                               int bw, int bh, int w, int h, int bytesperpixel)
2915 {
2916     int mx = mv->x, my = mv->y, th;
2917
2918     y += my >> 3;
2919     x += mx >> 3;
2920     ref += y * ref_stride + x * bytesperpixel;
2921     mx &= 7;
2922     my &= 7;
2923     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2924     // we use +7 because the last 7 pixels of each sbrow can be changed in
2925     // the longest loopfilter of the next sbrow
2926     th = (y + bh + 4 * !!my + 7) >> 6;
2927     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2928     if (x < !!mx * 3 || y < !!my * 3 ||
2929         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2930         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2931                                  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2932                                  160, ref_stride,
2933                                  bw + !!mx * 7, bh + !!my * 7,
2934                                  x - !!mx * 3, y - !!my * 3, w, h);
2935         ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2936         ref_stride = 160;
2937     }
2938     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2939 }
2940
2941 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2942                                                 uint8_t *dst_u, uint8_t *dst_v,
2943                                                 ptrdiff_t dst_stride,
2944                                                 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2945                                                 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2946                                                 ThreadFrame *ref_frame,
2947                                                 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2948                                                 int bw, int bh, int w, int h, int bytesperpixel)
2949 {
2950     int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2951
2952     y += my >> 4;
2953     x += mx >> 4;
2954     ref_u += y * src_stride_u + x * bytesperpixel;
2955     ref_v += y * src_stride_v + x * bytesperpixel;
2956     mx &= 15;
2957     my &= 15;
2958     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2959     // we use +7 because the last 7 pixels of each sbrow can be changed in
2960     // the longest loopfilter of the next sbrow
2961     th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2962     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2963     if (x < !!mx * 3 || y < !!my * 3 ||
2964         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2965         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2966                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2967                                  160, src_stride_u,
2968                                  bw + !!mx * 7, bh + !!my * 7,
2969                                  x - !!mx * 3, y - !!my * 3, w, h);
2970         ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2971         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2972
2973         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2974                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2975                                  160, src_stride_v,
2976                                  bw + !!mx * 7, bh + !!my * 7,
2977                                  x - !!mx * 3, y - !!my * 3, w, h);
2978         ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2979         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2980     } else {
2981         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2982         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2983     }
2984 }
2985
2986 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2987                     px, py, pw, ph, bw, bh, w, h, i) \
2988     mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2989                      mv, bw, bh, w, h, bytesperpixel)
2990 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2991                       row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2992     mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2993                        row, col, mv, bw, bh, w, h, bytesperpixel)
2994 #define SCALED 0
2995 #define FN(x) x##_8bpp
2996 #define BYTES_PER_PIXEL 1
2997 #include "vp9_mc_template.c"
2998 #undef FN
2999 #undef BYTES_PER_PIXEL
3000 #define FN(x) x##_16bpp
3001 #define BYTES_PER_PIXEL 2
3002 #include "vp9_mc_template.c"
3003 #undef mc_luma_dir_dir
3004 #undef mc_chroma_dir_dir
3005 #undef FN
3006 #undef BYTES_PER_PIXEL
3007 #undef SCALED
3008
3009 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3010 {
3011     VP9Context *s = ctx->priv_data;
3012     VP9Block *b = s->b;
3013     int row = s->row, col = s->col;
3014
3015     if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3016         if (bytesperpixel == 1) {
3017             inter_pred_scaled_8bpp(ctx);
3018         } else {
3019             inter_pred_scaled_16bpp(ctx);
3020         }
3021     } else {
3022         if (bytesperpixel == 1) {
3023             inter_pred_8bpp(ctx);
3024         } else {
3025             inter_pred_16bpp(ctx);
3026         }
3027     }
3028     if (!b->skip) {
3029         /* mostly copied intra_recon() */
3030
3031         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3032         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3033         int end_x = FFMIN(2 * (s->cols - col), w4);
3034         int end_y = FFMIN(2 * (s->rows - row), h4);
3035         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
3036         int uvstep1d = 1 << b->uvtx, p;
3037         uint8_t *dst = s->dst[0];
3038
3039         // y itxfm add
3040         for (n = 0, y = 0; y < end_y; y += step1d) {
3041             uint8_t *ptr = dst;
3042             for (x = 0; x < end_x; x += step1d,
3043                  ptr += 4 * step1d * bytesperpixel, n += step) {
3044                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3045
3046                 if (eob)
3047                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3048                                                   s->block + 16 * n * bytesperpixel, eob);
3049             }
3050             dst += 4 * s->y_stride * step1d;
3051         }
3052
3053         // uv itxfm add
3054         end_x >>= s->ss_h;
3055         end_y >>= s->ss_v;
3056         step = 1 << (b->uvtx * 2);
3057         for (p = 0; p < 2; p++) {
3058             dst = s->dst[p + 1];
3059             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3060                 uint8_t *ptr = dst;
3061                 for (x = 0; x < end_x; x += uvstep1d,
3062                      ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3063                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3064
3065                     if (eob)
3066                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3067                                                         s->uvblock[p] + 16 * n * bytesperpixel, eob);
3068                 }
3069                 dst += 4 * uvstep1d * s->uv_stride;
3070             }
3071         }
3072     }
3073 }
3074
3075 static void inter_recon_8bpp(AVCodecContext *ctx)
3076 {
3077     inter_recon(ctx, 1);
3078 }
3079
3080 static void inter_recon_16bpp(AVCodecContext *ctx)
3081 {
3082     inter_recon(ctx, 2);
3083 }
3084
3085 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3086                                         int row_and_7, int col_and_7,
3087                                         int w, int h, int col_end, int row_end,
3088                                         enum TxfmMode tx, int skip_inter)
3089 {
3090     static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3091     static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3092
3093     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3094     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3095     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3096     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3097
3098     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3099     // edges. This means that for UV, we work on two subsampled blocks at
3100     // a time, and we only use the topleft block's mode information to set
3101     // things like block strength. Thus, for any block size smaller than
3102     // 16x16, ignore the odd portion of the block.
3103     if (tx == TX_4X4 && (ss_v | ss_h)) {
3104         if (h == ss_v) {
3105             if (row_and_7 & 1)
3106                 return;
3107             if (!row_end)
3108                 h += 1;
3109         }
3110         if (w == ss_h) {
3111             if (col_and_7 & 1)
3112                 return;
3113             if (!col_end)
3114                 w += 1;
3115         }
3116     }
3117
3118     if (tx == TX_4X4 && !skip_inter) {
3119         int t = 1 << col_and_7, m_col = (t << w) - t, y;
3120         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3121         int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3122
3123         for (y = row_and_7; y < h + row_and_7; y++) {
3124             int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3125
3126             mask[0][y][1] |= m_row_8;
3127             mask[0][y][2] |= m_row_4;
3128             // for odd lines, if the odd col is not being filtered,
3129             // skip odd row also:
3130             // .---. <-- a
3131             // |   |
3132             // |___| <-- b
3133             // ^   ^
3134             // c   d
3135             //
3136             // if a/c are even row/col and b/d are odd, and d is skipped,
3137             // e.g. right edge of size-66x66.webm, then skip b also (bug)
3138             if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3139                 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3140             } else {
3141                 mask[1][y][col_mask_id] |= m_col;
3142             }
3143             if (!ss_h)
3144                 mask[0][y][3] |= m_col;
3145             if (!ss_v) {
3146                 if (ss_h && (col_end & 1))
3147                     mask[1][y][3] |= (t << (w - 1)) - t;
3148                 else
3149                     mask[1][y][3] |= m_col;
3150             }
3151         }
3152     } else {
3153         int y, t = 1 << col_and_7, m_col = (t << w) - t;
3154
3155         if (!skip_inter) {
3156             int mask_id = (tx == TX_8X8);
3157             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3158             int l2 = tx + ss_h - 1, step1d;
3159             int m_row = m_col & masks[l2];
3160
3161             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3162             // 8wd loopfilter to prevent going off the visible edge.
3163             if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3164                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3165                 int m_row_8 = m_row - m_row_16;
3166
3167                 for (y = row_and_7; y < h + row_and_7; y++) {
3168                     mask[0][y][0] |= m_row_16;
3169                     mask[0][y][1] |= m_row_8;
3170                 }
3171             } else {
3172                 for (y = row_and_7; y < h + row_and_7; y++)
3173                     mask[0][y][mask_id] |= m_row;
3174             }
3175
3176             l2 = tx + ss_v - 1;
3177             step1d = 1 << l2;
3178             if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3179                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3180                     mask[1][y][0] |= m_col;
3181                 if (y - row_and_7 == h - 1)
3182                     mask[1][y][1] |= m_col;
3183             } else {
3184                 for (y = row_and_7; y < h + row_and_7; y += step1d)
3185                     mask[1][y][mask_id] |= m_col;
3186             }
3187         } else if (tx != TX_4X4) {
3188             int mask_id;
3189
3190             mask_id = (tx == TX_8X8) || (h == ss_v);
3191             mask[1][row_and_7][mask_id] |= m_col;
3192             mask_id = (tx == TX_8X8) || (w == ss_h);
3193             for (y = row_and_7; y < h + row_and_7; y++)
3194                 mask[0][y][mask_id] |= t;
3195         } else {
3196             int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3197
3198             for (y = row_and_7; y < h + row_and_7; y++) {
3199                 mask[0][y][2] |= t4;
3200                 mask[0][y][1] |= t8;
3201             }
3202             mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
3203         }
3204     }
3205 }
3206
3207 static void decode_b(AVCodecContext *ctx, int row, int col,
3208                      struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3209                      enum BlockLevel bl, enum BlockPartition bp)
3210 {
3211     VP9Context *s = ctx->priv_data;
3212     VP9Block *b = s->b;
3213     enum BlockSize bs = bl * 3 + bp;
3214     int bytesperpixel = s->bytesperpixel;
3215     int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3216     int emu[2];
3217     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3218
3219     s->row = row;
3220     s->row7 = row & 7;
3221     s->col = col;
3222     s->col7 = col & 7;
3223     s->min_mv.x = -(128 + col * 64);
3224     s->min_mv.y = -(128 + row * 64);
3225     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3226     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
3227     if (s->pass < 2) {
3228         b->bs = bs;
3229         b->bl = bl;
3230         b->bp = bp;
3231         decode_mode(ctx);
3232         b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3233                            (s->ss_v && h4 * 2 == (1 << b->tx)));
3234
3235         if (!b->skip) {
3236             int has_coeffs;
3237
3238             if (bytesperpixel == 1) {
3239                 has_coeffs = decode_coeffs_8bpp(ctx);
3240             } else {
3241                 has_coeffs = decode_coeffs_16bpp(ctx);
3242             }
3243             if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3244                 b->skip = 1;
3245                 memset(&s->above_skip_ctx[col], 1, w4);
3246                 memset(&s->left_skip_ctx[s->row7], 1, h4);
3247             }
3248         } else {
3249             int row7 = s->row7;
3250
3251 #define SPLAT_ZERO_CTX(v, n) \
3252     switch (n) { \
3253     case 1:  v = 0;          break; \
3254     case 2:  AV_ZERO16(&v);  break; \
3255     case 4:  AV_ZERO32(&v);  break; \
3256     case 8:  AV_ZERO64(&v);  break; \
3257     case 16: AV_ZERO128(&v); break; \
3258     }
3259 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3260     do { \
3261         SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3262         if (s->ss_##dir2) { \
3263             SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3264             SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3265         } else { \
3266             SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3267             SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3268         } \
3269     } while (0)
3270
3271             switch (w4) {
3272             case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3273             case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3274             case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3275             case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3276             }
3277             switch (h4) {
3278             case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3279             case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3280             case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3281             case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
3282             }
3283         }
3284         if (s->pass == 1) {
3285             s->b++;
3286             s->block += w4 * h4 * 64 * bytesperpixel;
3287             s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3288             s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3289             s->eob += 4 * w4 * h4;
3290             s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3291             s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3292
3293             return;
3294         }
3295     }
3296
3297     // emulated overhangs if the stride of the target buffer can't hold. This
3298     // makes it possible to support emu-edge and so on even if we have large block
3299     // overhangs
3300     emu[0] = (col + w4) * 8 > f->linesize[0] ||
3301              (row + h4) > s->rows;
3302     emu[1] = (col + w4) * 4 > f->linesize[1] ||
3303              (row + h4) > s->rows;
3304     if (emu[0]) {
3305         s->dst[0] = s->tmp_y;
3306         s->y_stride = 128;
3307     } else {
3308         s->dst[0] = f->data[0] + yoff;
3309         s->y_stride = f->linesize[0];
3310     }
3311     if (emu[1]) {
3312         s->dst[1] = s->tmp_uv[0];
3313         s->dst[2] = s->tmp_uv[1];
3314         s->uv_stride = 128;
3315     } else {
3316         s->dst[1] = f->data[1] + uvoff;
3317         s->dst[2] = f->data[2] + uvoff;
3318         s->uv_stride = f->linesize[1];
3319     }
3320     if (b->intra) {
3321         if (s->bpp > 8) {
3322             intra_recon_16bpp(ctx, yoff, uvoff);
3323         } else {
3324             intra_recon_8bpp(ctx, yoff, uvoff);
3325         }
3326     } else {
3327         if (s->bpp > 8) {
3328             inter_recon_16bpp(ctx);
3329         } else {
3330             inter_recon_8bpp(ctx);
3331         }
3332     }
3333     if (emu[0]) {
3334         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3335
3336         for (n = 0; o < w; n++) {
3337             int bw = 64 >> n;
3338
3339             av_assert2(n <= 4);
3340             if (w & bw) {
3341                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3342                                          s->tmp_y + o, 128, h, 0, 0);
3343                 o += bw * bytesperpixel;
3344             }
3345         }
3346     }
3347     if (emu[1]) {
3348         int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3349         int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3350
3351         for (n = s->ss_h; o < w; n++) {
3352             int bw = 64 >> n;
3353
3354             av_assert2(n <= 4);
3355             if (w & bw) {
3356                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3357                                          s->tmp_uv[0] + o, 128, h, 0, 0);
3358                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3359                                          s->tmp_uv[1] + o, 128, h, 0, 0);
3360                 o += bw * bytesperpixel;
3361             }
3362         }
3363     }
3364
3365     // pick filter level and find edges to apply filter to
3366     if (s->filter.level &&
3367         (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3368                                                     [b->mode[3] != ZEROMV]) > 0) {
3369         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3370         int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3371
3372         setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3373         mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3374         if (s->ss_h || s->ss_v)
3375             mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3376                        s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3377                        s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3378                        b->uvtx, skip_inter);
3379
3380         if (!s->filter.lim_lut[lvl]) {
3381             int sharp = s->filter.sharpness;
3382             int limit = lvl;
3383
3384             if (sharp > 0) {
3385                 limit >>= (sharp + 3) >> 2;
3386                 limit = FFMIN(limit, 9 - sharp);
3387             }
3388             limit = FFMAX(limit, 1);
3389
3390             s->filter.lim_lut[lvl] = limit;
3391             s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3392         }
3393     }
3394
3395     if (s->pass == 2) {
3396         s->b++;
3397         s->block += w4 * h4 * 64 * bytesperpixel;
3398         s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3399         s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3400         s->eob += 4 * w4 * h4;
3401         s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3402         s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3403     }
3404 }
3405
3406 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3407                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3408 {
3409     VP9Context *s = ctx->priv_data;
3410     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3411             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3412     const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
3413                                                      s->prob.p.partition[bl][c];
3414     enum BlockPartition bp;
3415     ptrdiff_t hbs = 4 >> bl;
3416     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3417     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3418     int bytesperpixel = s->bytesperpixel;
3419
3420     if (bl == BL_8X8) {
3421         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3422         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3423     } else if (col + hbs < s->cols) { // FIXME why not <=?
3424         if (row + hbs < s->rows) { // FIXME why not <=?
3425             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3426             switch (bp) {
3427             case PARTITION_NONE:
3428                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3429                 break;
3430             case PARTITION_H:
3431                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3432                 yoff  += hbs * 8 * y_stride;
3433                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3434                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3435                 break;
3436             case PARTITION_V:
3437                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3438                 yoff  += hbs * 8 * bytesperpixel;
3439                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3440                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3441                 break;
3442             case PARTITION_SPLIT:
3443                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3444                 decode_sb(ctx, row, col + hbs, lflvl,
3445                           yoff + 8 * hbs * bytesperpixel,
3446                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3447                 yoff  += hbs * 8 * y_stride;
3448                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3449                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3450                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3451                           yoff + 8 * hbs * bytesperpixel,
3452                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3453                 break;
3454             default:
3455                 av_assert0(0);
3456             }
3457         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3458             bp = PARTITION_SPLIT;
3459             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3460             decode_sb(ctx, row, col + hbs, lflvl,
3461                       yoff + 8 * hbs * bytesperpixel,
3462                       uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3463         } else {
3464             bp = PARTITION_H;
3465             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3466         }
3467     } else if (row + hbs < s->rows) { // FIXME why not <=?
3468         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3469             bp = PARTITION_SPLIT;
3470             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3471             yoff  += hbs * 8 * y_stride;
3472             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3473             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3474         } else {
3475             bp = PARTITION_V;
3476             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3477         }
3478     } else {
3479         bp = PARTITION_SPLIT;
3480         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3481     }
3482     s->counts.partition[bl][c][bp]++;
3483 }
3484
3485 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3486                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3487 {
3488     VP9Context *s = ctx->priv_data;
3489     VP9Block *b = s->b;
3490     ptrdiff_t hbs = 4 >> bl;
3491     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3492     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3493     int bytesperpixel = s->bytesperpixel;
3494
3495     if (bl == BL_8X8) {
3496         av_assert2(b->bl == BL_8X8);
3497         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3498     } else if (s->b->bl == bl) {
3499         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3500         if (b->bp == PARTITION_H && row + hbs < s->rows) {
3501             yoff  += hbs * 8 * y_stride;
3502             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3503             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3504         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3505             yoff  += hbs * 8 * bytesperpixel;
3506             uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3507             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3508         }
3509     } else {
3510         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3511         if (col + hbs < s->cols) { // FIXME why not <=?
3512             if (row + hbs < s->rows) {
3513                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3514                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3515                 yoff  += hbs * 8 * y_stride;
3516                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3517                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3518                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3519                               yoff + 8 * hbs * bytesperpixel,
3520                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3521             } else {
3522                 yoff  += hbs * 8 * bytesperpixel;
3523                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3524                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3525             }
3526         } else if (row + hbs < s->rows) {
3527             yoff  += hbs * 8 * y_stride;
3528             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3529             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3530         }
3531     }
3532 }
3533
/**
 * Loop-filter the edges between horizontally adjacent blocks of one plane,
 * driven by the edge bitmasks in mask and the filter levels in lvl.
 *
 * @param s     decoder context; supplies bytesperpixel, the mblim/lim lookup
 *              tables and the dsp loop filter function tables
 * @param col   column position of the filtered area; when it is 0 the edge
 *              selected by the lowest mask bit (x == 1) is not filtered
 * @param ss_h  horizontal subsampling shift of the plane (loopfilter_sb()
 *              passes 0 for luma and s->ss_h for chroma)
 * @param ss_v  vertical subsampling shift of the plane (0 for luma,
 *              s->ss_v for chroma)
 * @param lvl   filter level table, stepped in parallel with the mask bits
 * @param mask  rows of four bitmask bytes; bytes 0..2 select between the
 *              loop_filter_16 / loop_filter_8[] / loop_filter_mix2 variants,
 *              byte 3 marks edges filtered 4 samples further right
 *              (presumably the 16/8/4-sample and inner-4x4 edge classes --
 *              TODO confirm against the mask builder)
 * @param dst   first byte of the plane area to filter
 * @param ls    line stride of dst, in bytes
 */
3534 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3535                                                uint8_t *lvl, uint8_t (*mask)[4],
3536                                                uint8_t *dst, ptrdiff_t ls)
3537 {
3538     int y, x, bytesperpixel = s->bytesperpixel;
3539
3540     // filter edges between columns (e.g. block1 | block2)
         // Each iteration handles a pair of mask rows (y and y + 1 + ss_v) and
         // advances dst by 16 lines and lvl by 16 << ss_v entries.
3541     for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3542         uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
             // hm1/hm2: bytes 0..2 of the first row / bytes 1..2 of the second
             // row; hm13/hm23: byte 3 of each row; hm: union used only to know
             // when no set bit remains.
3543         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3544         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3545         unsigned hm = hm1 | hm2 | hm13 | hm23;
3546
             // x is a single-bit probe; loop while any mask bit at or above x
             // is still set. ptr moves right by 8 samples >> ss_h per bit.
3547         for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
                 // skip the very first edge (x == 1) when col == 0
3548             if (col || x > 1) {
3549                 if (hm1 & x) {
                         // L: level of the first row's block; H = L >> 4,
                         // E and I come from the per-level lookup tables.
3550                     int L = *l, H = L >> 4;
3551                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3552
3553                     if (hmask1[0] & x) {
3554                         if (hmask2[0] & x) {
                                 // both rows set byte 0: one 16-line call; the
                                 // second row is asserted to share the level
3555                             av_assert2(l[8 << ss_v] == L);
3556                             s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3557                         } else {
3558                             s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3559                         }
3560                     } else if (hm2 & x) {
                             // both rows need filtering with possibly different
                             // levels: pack the second row's E/I/H into bits 8+
                             // and let mix2 filter both rows in one call
3561                         L = l[8 << ss_v];
3562                         H |= (L >> 4) << 8;
3563                         E |= s->filter.mblim_lut[L] << 8;
3564                         I |= s->filter.lim_lut[L] << 8;
3565                         s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3566                                                [!!(hmask2[1] & x)]
3567                                                [0](ptr, ls, E, I, H);
3568                     } else {
                             // only the first row: variant picked by byte 1
3569                         s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3570                                             [0](ptr, ls, E, I, H);
3571                     }
3572                 } else if (hm2 & x) {
                         // only the second row: use its level and start
                         // 8 lines below ptr
3573                     int L = l[8 << ss_v], H = L >> 4;
3574                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3575
3576                     s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3577                                         [0](ptr + 8 * ls, ls, E, I, H);
3578                 }
3579             }
3580             if (ss_h) {
                     // 0xAA = odd bit positions: with horizontal subsampling
                     // the level pointer moves by 2 after every second bit,
                     // and byte-3 edges are not processed at all
3581                 if (x & 0xAA)
3582                     l += 2;
3583             } else {
                     // byte-3 edges: same row pairing as above, but filtered
                     // 4 samples to the right of ptr and always with the
                     // [0] filter variants (not gated by the col check)
3584                 if (hm13 & x) {
3585                     int L = *l, H = L >> 4;
3586                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3587
3588                     if (hm23 & x) {
3589                         L = l[8 << ss_v];
3590                         H |= (L >> 4) << 8;
3591                         E |= s->filter.mblim_lut[L] << 8;
3592                         I |= s->filter.lim_lut[L] << 8;
3593                         s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3594                     } else {
3595                         s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3596                     }
3597                 } else if (hm23 & x) {
3598                     int L = l[8 << ss_v], H = L >> 4;
3599                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3600
3601                     s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
3602                 }
                     // without subsampling: one level entry per mask bit
3603                 l++;
3604             }
3605         }
3606     }
3607 }
3608
/**
 * Loop-filter the edges between vertically adjacent blocks of one plane,
 * driven by the edge bitmasks in mask and the filter levels in lvl.
 *
 * @param s     decoder context; supplies bytesperpixel, the mblim/lim lookup
 *              tables and the dsp loop filter function tables
 * @param row   row position of the filtered area; when it is 0 the edges of
 *              the first mask row (y == 0) are not filtered
 * @param ss_h  horizontal subsampling shift of the plane (loopfilter_sb()
 *              passes 0 for luma and s->ss_h for chroma)
 * @param ss_v  vertical subsampling shift of the plane (0 for luma,
 *              s->ss_v for chroma)
 * @param lvl   filter level table, stepped in parallel with the mask rows
 * @param mask  eight rows of four bitmask bytes; bytes 0..2 select between
 *              the loop_filter_16 / loop_filter_8[] / loop_filter_mix2
 *              variants, byte 3 marks edges filtered 4 lines further down
 * @param dst   first byte of the plane area to filter
 * @param ls    line stride of dst, in bytes
 */
3609 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3610                                                uint8_t *lvl, uint8_t (*mask)[4],
3611                                                uint8_t *dst, ptrdiff_t ls)
3612 {
3613     int y, x, bytesperpixel = s->bytesperpixel;
3614
3615     //                                 block1
3616     // filter edges between rows (e.g. ------)
3617     //                                 block2
         // One mask row per iteration; dst moves down by 8 lines >> ss_v.
3618     for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3619         uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
             // vm: union of bytes 0..2; vm3: byte 3 of this mask row
3620         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3621
             // Mask bits are consumed in pairs: bit x and bit x << (1 + ss_h).
             // Each step advances x by 2 << ss_h bits, ptr by 16 samples and
             // the level pointer by 2 << ss_h entries; the loop stops once no
             // bit of vm at or above x is set.
3622         for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
                 // skip the very first edge row (y == 0) when row == 0
3623             if (row || y) {
3624                 if (vm & x) {
                         // L: level of the first block of the pair; H = L >> 4,
                         // E and I come from the per-level lookup tables.
3625                     int L = *l, H = L >> 4;
3626                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3627
3628                     if (vmask[0] & x) {
3629                         if (vmask[0] & (x << (1 + ss_h))) {
                                 // both blocks of the pair set byte 0: one
                                 // 16-sample call; levels asserted equal
3630                             av_assert2(l[1 + ss_h] == L);
3631                             s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3632                         } else {
3633                             s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3634                         }
3635                     } else if (vm & (x << (1 + ss_h))) {
                             // both blocks need filtering with possibly
                             // different levels: pack the second block's
                             // E/I/H into bits 8+ for a single mix2 call
3636                         L = l[1 + ss_h];
3637                         H |= (L >> 4) << 8;
3638                         E |= s->filter.mblim_lut[L] << 8;
3639                         I |= s->filter.lim_lut[L] << 8;
3640                         s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3641                                                [!!(vmask[1] & (x << (1 + ss_h)))]
3642                                                [1](ptr, ls, E, I, H);
3643                     } else {
                             // only the first block: variant picked by byte 1
3644                         s->dsp.loop_filter_8[!!(vmask[1] & x)]
3645                                             [1](ptr, ls, E, I, H);
3646                     }
3647                 } else if (vm & (x << (1 + ss_h))) {
                         // only the second block of the pair: use its level
                         // and start 8 samples to the right of ptr
3648                     int L = l[1 + ss_h], H = L >> 4;
3649                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3650
3651                     s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3652                                         [1](ptr + 8 * bytesperpixel, ls, E, I, H);
3653                 }
3654             }
                 // byte-3 edges are only processed without vertical
                 // subsampling; they are filtered 4 lines below ptr, always
                 // with the [0] filter variants, and are not gated by the
                 // row/y check above
3655             if (!ss_v) {
3656                 if (vm3 & x) {
3657                     int L = *l, H = L >> 4;
3658                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3659
3660                     if (vm3 & (x << (1 + ss_h))) {
3661                         L = l[1 + ss_h];
3662                         H |= (L >> 4) << 8;
3663                         E |= s->filter.mblim_lut[L] << 8;
3664                         I |= s->filter.lim_lut[L] << 8;
3665                         s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3666                     } else {
3667                         s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3668                     }
3669                 } else if (vm3 & (x << (1 + ss_h))) {
3670                     int L = l[1 + ss_h], H = L >> 4;
3671                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3672
3673                     s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
3674                 }
3675             }
3676         }
             // Advance the level table to the next mask row: 8 entries per row
             // normally; with vertical subsampling 16 entries after every
             // second (odd) row.
3677         if (ss_v) {
3678             if (y & 1)
3679                 lvl += 16;
3680         } else {
3681             lvl += 8;
3682         }
3683     }
3684 }
3685
3686 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3687                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3688 {
3689     VP9Context *s = ctx->priv_data;
3690     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3691     uint8_t *dst = f->data[0] + yoff;
3692     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3693     uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3694     int p;
3695
3696     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3697     // if you think of them as acting on a 8x8 block max, we can interleave
3698     // each v/h within the single x loop, but that only works if we work on
3699     // 8 pixel blocks, and we won't always do that (we want at least 16px
3700     // to use SSE2 optimizations, perhaps 32 for AVX2)
3701
3702     filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3703     filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3704
3705     for (p = 0; p < 2; p++) {
3706         dst = f->data[1 + p] + uvoff;
3707         filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3708         filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
3709     }
3710 }
3711
/**
 * Compute the extent of tile number idx along one dimension.
 *
 * The n superblocks of that dimension are distributed over 1 << log2_n
 * tiles; tile idx covers superblocks [(idx * n) >> log2_n,
 * ((idx + 1) * n) >> log2_n), both bounds clamped to n.
 *
 * @param start   receives the first position of the tile, in units of one
 *                eighth of a superblock (superblock index << 3)
 * @param end     receives the position one past the tile, same units
 * @param idx     tile index
 * @param log2_n  log2 of the number of tiles in this dimension
 * @param n       number of superblocks in this dimension
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first_sb = (idx * n) >> log2_n;
    int last_sb  = ((idx + 1) * n) >> log2_n;

    /* never run past the last superblock */
    if (first_sb > n)
        first_sb = n;
    if (last_sb > n)
        last_sb = n;

    *start = first_sb << 3;
    *end   = last_sb  << 3;
}
3719
3720 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3721                                         int max_count, int update_factor)
3722 {
3723     unsigned ct = ct0 + ct1, p2, p1;
3724
3725     if (!ct)
3726         return;
3727
3728     p1 = *p;
3729     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3730     p2 = av_clip(p2, 1, 255);
3731     ct = FFMIN(ct, max_count);
3732     update_factor = FASTDIV(update_factor * ct, max_count);
3733
3734     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3735     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3736 }
3737
3738 static void adapt_probs(VP9Context *s)
3739 {
3740     int i, j, k, l, m;
3741     prob_context *p = &s->prob_ctx[s->framectxid].p;
3742     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3743
3744     // coefficients
3745     for (i = 0; i < 4; i++)
3746         for (j = 0; j < 2; j++)
3747             for (k = 0; k < 2; k++)
3748                 for (l = 0; l < 6; l++)
3749                     for (m = 0; m < 6; m++) {
3750                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3751                         unsigned *e = s->counts.eob[i][j][k][l][m];
3752                         unsigned *c = s->counts.coef[i][j][k][l][m];
3753
3754                         if (l == 0 && m >= 3) // dc only has 3 pt
3755                             break;
3756
3757                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3758                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3759                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3760                     }
3761
3762     if (s->keyframe || s->intraonly) {
3763         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3764         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3765         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3766         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3767         return;
3768     }
3769
3770     // skip flag
3771     for (i = 0; i < 3; i++)
3772         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3773
3774     // intra/inter flag
3775     for (i = 0; i < 4; i++)
3776         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3777
3778     // comppred flag
3779     if (s->comppredmode == PRED_SWITCHABLE) {
3780       for (i = 0; i < 5; i++)
3781           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3782     }
3783
3784     // reference frames
3785     if (s->comppredmode != PRED_SINGLEREF) {
3786       for (i = 0; i < 5; i++)
3787           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3788                      s->counts.comp_ref[i][1], 20, 128);
3789     }
3790
3791     if (s->comppredmode != PRED_COMPREF) {
3792       for (i = 0; i < 5; i++) {
3793           uint8_t *pp = p->single_ref[i];
3794           unsigned (*c)[2] = s->counts.single_ref[i];
3795
3796           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3797           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3798       }
3799     }
3800
3801     // block partitioning
3802     for (i = 0; i < 4; i++)
3803         for (j = 0; j < 4; j++) {
3804             uint8_t *pp = p->partition[i][j];
3805             unsigned *c = s->counts.partition[i][j];
3806
3807             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3808             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3809             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3810         }
3811
3812     // tx size
3813     if (s->txfmmode == TX_SWITCHABLE) {
3814       for (i = 0; i < 2; i++) {
3815           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3816
3817           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3818           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3819           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3820           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3821           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3822           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3823       }
3824     }
3825
3826     // interpolation filter
3827     if (s->filtermode == FILTER_SWITCHABLE) {
3828         for (i = 0; i < 4; i++) {
3829             uint8_t *pp = p->filter[i];
3830             unsigned *c = s->counts.filter[i];
3831
3832             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3833             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3834         }
3835     }
3836
3837     // inter modes
3838     for (i = 0; i < 7; i++) {
3839         uint8_t *pp = p->mv_mode[i];
3840         unsigned *c = s->counts.mv_mode[i];
3841
3842         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3843         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3844         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3845     }
3846
3847     // mv joints
3848     {
3849         uint8_t *pp = p->mv_joint;
3850         unsigned *c = s->counts.mv_joint;
3851
3852         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3853         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3854         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3855     }
3856
3857     // mv components
3858     for (i = 0; i < 2; i++) {
3859         uint8_t *pp;
3860         unsigned *c, (*c2)[2], sum;
3861
3862         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3863                    s->counts.mv_comp[i].sign[1], 20, 128);
3864
3865         pp = p->mv_comp[i].classes;
3866         c = s->counts.mv_comp[i].classes;
3867         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3868         adapt_prob(&pp[0], c[0], sum, 20, 128);
3869         sum -= c[1];
3870         adapt_prob(&pp[1], c[1], sum, 20, 128);
3871         sum -= c[2] + c[3];
3872         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3873         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3874         sum -= c[4] + c[5];
3875         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3876         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3877         sum -= c[6];
3878         adapt_prob(&pp[6], c[6], sum, 20, 128);
3879         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3880         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3881         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3882
3883         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3884                    s->counts.mv_comp[i].class0[1], 20, 128);
3885         pp = p->mv_comp[i].bits;
3886         c2 = s->counts.mv_comp[i].bits;
3887         for (j = 0; j < 10; j++)
3888             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3889
3890         for (j = 0; j < 2; j++) {
3891             pp = p->mv_comp[i].class0_fp[j];
3892             c = s->counts.mv_comp[i].class0_fp[j];
3893             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3894             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3895             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3896         }
3897         pp = p->mv_comp[i].fp;
3898         c = s->counts.mv_comp[i].fp;
3899         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3900         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3901         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3902
3903         if (s->highprecisionmvs) {
3904             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3905                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3906             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3907                        s->counts.mv_comp[i].hp[1], 20, 128);
3908         }
3909     }
3910
3911     // y intra modes
3912     for (i = 0; i < 4; i++) {
3913         uint8_t *pp = p->y_mode[i];
3914         unsigned *c = s->counts.y_mode[i], sum, s2;
3915
3916         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3917         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3918         sum -= c[TM_VP8_PRED];
3919         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3920         sum -= c[VERT_PRED];
3921         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3922         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3923         sum -= s2;
3924         adapt_prob(&pp[3], s2, sum, 20, 128);
3925         s2 -= c[HOR_PRED];
3926         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3927         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3928         sum -= c[DIAG_DOWN_LEFT_PRED];
3929         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3930         sum -= c[VERT_LEFT_PRED];
3931         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3932         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3933     }
3934
3935     // uv intra modes
3936     for (i = 0; i < 10; i++) {
3937         uint8_t *pp = p->uv_mode[i];
3938         unsigned *c = s->counts.uv_mode[i], sum, s2;
3939
3940         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3941         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3942         sum -= c[TM_VP8_PRED];
3943         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3944         sum -= c[VERT_PRED];
3945         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3946         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3947         sum -= s2;
3948         adapt_prob(&pp[3], s2, sum, 20, 128);
3949         s2 -= c[HOR_PRED];
3950         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3951         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3952         sum -= c[DIAG_DOWN_LEFT_PRED];
3953         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3954         sum -= c[VERT_LEFT_PRED];
3955         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3956         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3957     }
3958 }
3959
3960 static void free_buffers(VP9Context *s)
3961 {
3962     av_freep(&s->intra_pred_data[0]);
3963     av_freep(&s->b_base);
3964     av_freep(&s->block_base);
3965 }
3966
3967 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3968 {
3969     VP9Context *s = ctx->priv_data;
3970     int i;
3971
3972     for (i = 0; i < 3; i++) {
3973         if (s->frames[i].tf.f->data[0])
3974             vp9_unref_frame(ctx, &s->frames[i]);
3975         av_frame_free(&s->frames[i].tf.f);
3976     }
3977     for (i = 0; i < 8; i++) {
3978         if (s->refs[i].f->data[0])
3979             ff_thread_release_buffer(ctx, &s->refs[i]);
3980         av_frame_free(&s->refs[i].f);
3981         if (s->next_refs[i].f->data[0])
3982             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3983         av_frame_free(&s->next_refs[i].f);
3984     }
3985     free_buffers(s);
3986     av_freep(&s->c_b);
3987     s->c_b_size = 0;
3988
3989     return 0;
3990 }
3991
3992
3993 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3994                             int *got_frame, AVPacket *pkt)
3995 {
3996     const uint8_t *data = pkt->data;
3997     int size = pkt->size;
3998     VP9Context *s = ctx->priv_data;
3999     int res, tile_row, tile_col, i, ref, row, col;
4000     int retain_segmap_ref = s->frames[REF_FRAME_SEGMAP].segmentation_map &&
4001                             (!s->segmentation.enabled || !s->segmentation.update_map);
4002     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
4003     AVFrame *f;
4004     int bytesperpixel;
4005
4006     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
4007         return res;
4008     } else if (res == 0) {
4009         if (!s->refs[ref].f->data[0]) {
4010             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4011             return AVERROR_INVALIDDATA;
4012         }
4013         if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
4014             return res;
4015         ((AVFrame *)frame)->pkt_pts = pkt->pts;
4016         ((AVFrame *)frame)->pkt_dts = pkt->dts;
4017         for (i = 0; i < 8; i++) {
4018             if (s->next_refs[i].f->data[0])
4019                 ff_thread_release_buffer(ctx, &s->next_refs[i]);
4020             if (s->refs[i].f->data[0] &&
4021                 (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
4022                 return res;
4023         }
4024         *got_frame = 1;
4025         return pkt->size;
4026     }
4027     data += res;
4028     size -= res;
4029
4030     if (!retain_segmap_ref || s->keyframe || s->intraonly) {
4031         if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0])
4032             vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
4033         if (!s->keyframe && !s->intraonly && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4034             (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_SEGMAP], &s->frames[CUR_FRAME])) < 0)
4035             return res;
4036     }
4037     if (s->frames[REF_FRAME_MVPAIR].tf.f->data[0])
4038         vp9_unref_frame(ctx, &s->frames[REF_FRAME_MVPAIR]);
4039     if (!s->intraonly && !s->keyframe && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
4040         (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_MVPAIR], &s->frames[CUR_FRAME])) < 0)
4041         return res;
4042     if (s->frames[CUR_FRAME].tf.f->data[0])
4043         vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
4044     if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
4045         return res;
4046     f = s->frames[CUR_FRAME].tf.f;
4047     f->key_frame = s->keyframe;
4048     f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4049     ls_y = f->linesize[0];
4050     ls_uv =f->linesize[1];
4051
4052     // ref frame setup
4053     for (i = 0; i < 8; i++) {
4054         if (s->next_refs[i].f->data[0])
4055             ff_thread_release_buffer(ctx, &s->next_refs[i]);
4056         if (s->refreshrefmask & (1 << i)) {
4057             res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
4058         } else if (s->refs[i].f->data[0]) {
4059             res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
4060         }
4061         if (res < 0)
4062             return res;
4063     }
4064
4065     // main tile decode loop
4066     bytesperpixel = s->bytesperpixel;
4067     memset(s->above_partition_ctx, 0, s->cols);
4068     memset(s->above_skip_ctx, 0, s->cols);
4069     if (s->keyframe || s->intraonly) {
4070         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4071     } else {
4072         memset(s->above_mode_ctx, NEARESTMV, s->cols);
4073     }
4074     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4075     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4076     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4077     memset(s->above_segpred_ctx, 0, s->cols);
4078     s->pass = s->frames[CUR_FRAME].uses_2pass =
4079         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
4080     if ((res = update_block_buffers(ctx)) < 0) {
4081         av_log(ctx, AV_LOG_ERROR,
4082                "Failed to allocate block buffers\n");
4083         return res;
4084     }
4085     if (s->refreshctx && s->parallelmode) {
4086         int j, k, l, m;
4087
4088         for (i = 0; i < 4; i++) {
4089             for (j = 0; j < 2; j++)
4090                 for (k = 0; k < 2; k++)
4091                     for (l = 0; l < 6; l++)
4092                         for (m = 0; m < 6; m++)
4093                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
4094                                    s->prob.coef[i][j][k][l][m], 3);
4095             if (s->txfmmode == i)
4096                 break;
4097         }
4098         s->prob_ctx[s->framectxid].p = s->prob.p;
4099         ff_thread_finish_setup(ctx);
4100     } else if (!s->refreshctx) {
4101         ff_thread_finish_setup(ctx);
4102     }
4103
4104     do {
4105         yoff = uvoff = 0;
4106         s->b = s->b_base;
4107         s->block = s->block_base;
4108         s->uvblock[0] = s->uvblock_base[0];
4109         s->uvblock[1] = s->uvblock_base[1];
4110         s->eob = s->eob_base;
4111         s->uveob[0] = s->uveob_base[0];
4112         s->uveob[1] = s->uveob_base[1];
4113
4114         for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
4115             set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
4116                             tile_row, s->tiling.log2_tile_rows, s->sb_rows);
4117             if (s->pass != 2) {
4118                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4119                     int64_t tile_size;
4120
4121                     if (tile_col == s->tiling.tile_cols - 1 &&
4122                         tile_row == s->tiling.tile_rows - 1) {
4123                         tile_size = size;
4124                     } else {
4125                         tile_size = AV_RB32(data);
4126                         data += 4;
4127                         size -= 4;
4128                     }
4129                     if (tile_size > size) {
4130                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4131                         return AVERROR_INVALIDDATA;
4132                     }
4133                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
4134                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
4135                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4136                         return AVERROR_INVALIDDATA;
4137                     }
4138                     data += tile_size;
4139                     size -= tile_size;
4140                 }
4141             }
4142
4143             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
4144                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
4145                 struct VP9Filter *lflvl_ptr = s->lflvl;
4146                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
4147
4148                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
4149                     set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
4150                                     tile_col, s->tiling.log2_tile_cols, s->sb_cols);
4151
4152                     if (s->pass != 2) {
4153                         memset(s->left_partition_ctx, 0, 8);
4154                         memset(s->left_skip_ctx, 0, 8);
4155                         if (s->keyframe || s->intraonly) {
4156                             memset(s->left_mode_ctx, DC_PRED, 16);
4157                         } else {
4158                             memset(s->left_mode_ctx, NEARESTMV, 8);
4159                         }
4160                         memset(s->left_y_nnz_ctx, 0, 16);
4161                         memset(s->left_uv_nnz_ctx, 0, 32);
4162                         memset(s->left_segpred_ctx, 0, 8);
4163
4164                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
4165                     }
4166
4167                     for (col = s->tiling.tile_col_start;
4168                          col < s->tiling.tile_col_end;
4169                          col += 8, yoff2 += 64 * bytesperpixel,
4170                          uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4171                         // FIXME integrate with lf code (i.e. zero after each
4172                         // use, similar to invtxfm coefficients, or similar)
4173                         if (s->pass != 1) {
4174                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
4175                         }
4176
4177                         if (s->pass == 2) {
4178                             decode_sb_mem(ctx, row, col, lflvl_ptr,
4179                                           yoff2, uvoff2, BL_64X64);
4180                         } else {
4181                             decode_sb(ctx, row, col, lflvl_ptr,
4182                                       yoff2, uvoff2, BL_64X64);
4183                         }
4184                     }
4185                     if (s->pass != 2) {
4186                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
4187                     }
4188                 }
4189
4190                 if (s->pass == 1) {
4191                     continue;
4192                 }
4193
4194                 // backup pre-loopfilter reconstruction data for intra
4195                 // prediction of next row of sb64s
4196                 if (row + 8 < s->rows) {
4197                     memcpy(s->intra_pred_data[0],
4198                            f->data[0] + yoff + 63 * ls_y,
4199                            8 * s->cols * bytesperpixel);
4200                     memcpy(s->intra_pred_data[1],
4201                            f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4202                            8 * s->cols * bytesperpixel >> s->ss_h);
4203                     memcpy(s->intra_pred_data[2],
4204                            f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
4205                            8 * s->cols * bytesperpixel >> s->ss_h);
4206                 }
4207
4208                 // loopfilter one row
4209                 if (s->filter.level) {
4210                     yoff2 = yoff;
4211                     uvoff2 = uvoff;
4212                     lflvl_ptr = s->lflvl;
4213                     for (col = 0; col < s->cols;
4214                          col += 8, yoff2 += 64 * bytesperpixel,
4215                          uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
4216                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
4217                     }
4218                 }
4219
4220                 // FIXME maybe we can make this more finegrained by running the
4221                 // loopfilter per-block instead of after each sbrow
4222                 // In fact that would also make intra pred left preparation easier?
4223                 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
4224             }
4225         }
4226
4227         if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
4228             adapt_probs(s);
4229             ff_thread_finish_setup(ctx);
4230         }
4231     } while (s->pass++ == 1);
4232     ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
4233
4234     // ref frame setup
4235     for (i = 0; i < 8; i++) {
4236         if (s->refs[i].f->data[0])
4237             ff_thread_release_buffer(ctx, &s->refs[i]);
4238         ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
4239     }
4240
4241     if (!s->invisible) {
4242         if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
4243             return res;
4244         *got_frame = 1;
4245     }
4246
4247     return pkt->size;
4248 }
4249
4250 static void vp9_decode_flush(AVCodecContext *ctx)
4251 {
4252     VP9Context *s = ctx->priv_data;
4253     int i;
4254
4255     for (i = 0; i < 3; i++)
4256         vp9_unref_frame(ctx, &s->frames[i]);
4257     for (i = 0; i < 8; i++)
4258         ff_thread_release_buffer(ctx, &s->refs[i]);
4259 }
4260
4261 static int init_frames(AVCodecContext *ctx)
4262 {
4263     VP9Context *s = ctx->priv_data;
4264     int i;
4265
4266     for (i = 0; i < 3; i++) {
4267         s->frames[i].tf.f = av_frame_alloc();
4268         if (!s->frames[i].tf.f) {
4269             vp9_decode_free(ctx);
4270             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4271             return AVERROR(ENOMEM);
4272         }
4273     }
4274     for (i = 0; i < 8; i++) {
4275         s->refs[i].f = av_frame_alloc();
4276         s->next_refs[i].f = av_frame_alloc();
4277         if (!s->refs[i].f || !s->next_refs[i].f) {
4278             vp9_decode_free(ctx);
4279             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4280             return AVERROR(ENOMEM);
4281         }
4282     }
4283
4284     return 0;
4285 }
4286
4287 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4288 {
4289     VP9Context *s = ctx->priv_data;
4290
4291     ctx->internal->allocate_progress = 1;
4292     s->last_bpp = 0;
4293     s->filter.sharpness = -1;
4294
4295     return init_frames(ctx);
4296 }
4297
4298 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4299 {
4300     return init_frames(avctx);
4301 }
4302
4303 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4304 {
4305     int i, res;
4306     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4307
4308     // detect size changes in other threads
4309     if (s->intra_pred_data[0] &&
4310         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4311         free_buffers(s);
4312     }
4313
4314     for (i = 0; i < 3; i++) {
4315         if (s->frames[i].tf.f->data[0])
4316             vp9_unref_frame(dst, &s->frames[i]);
4317         if (ssrc->frames[i].tf.f->data[0]) {
4318             if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4319                 return res;
4320         }
4321     }
4322     for (i = 0; i < 8; i++) {
4323         if (s->refs[i].f->data[0])
4324             ff_thread_release_buffer(dst, &s->refs[i]);
4325         if (ssrc->next_refs[i].f->data[0]) {
4326             if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4327                 return res;
4328         }
4329     }
4330
4331     s->invisible = ssrc->invisible;
4332     s->keyframe = ssrc->keyframe;
4333     s->intraonly = ssrc->intraonly;
4334     s->ss_v = ssrc->ss_v;
4335     s->ss_h = ssrc->ss_h;
4336     s->segmentation.enabled = ssrc->segmentation.enabled;
4337     s->segmentation.update_map = ssrc->segmentation.update_map;
4338     s->bytesperpixel = ssrc->bytesperpixel;
4339     s->bpp = ssrc->bpp;
4340     s->bpp_index = ssrc->bpp_index;
4341     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4342     memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4343     if (ssrc->segmentation.enabled) {
4344         memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4345                sizeof(s->segmentation.feat));
4346     }
4347
4348     return 0;
4349 }
4350
4351 static const AVProfile profiles[] = {
4352     { FF_PROFILE_VP9_0, "Profile 0" },
4353     { FF_PROFILE_VP9_1, "Profile 1" },
4354     { FF_PROFILE_VP9_2, "Profile 2" },
4355     { FF_PROFILE_VP9_3, "Profile 3" },
4356     { FF_PROFILE_UNKNOWN },
4357 };
4358
4359 AVCodec ff_vp9_decoder = {
4360     .name                  = "vp9",
4361     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
4362     .type                  = AVMEDIA_TYPE_VIDEO,
4363     .id                    = AV_CODEC_ID_VP9,
4364     .priv_data_size        = sizeof(VP9Context),
4365     .init                  = vp9_decode_init,
4366     .close                 = vp9_decode_free,
4367     .decode                = vp9_decode_frame,
4368     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4369     .flush                 = vp9_decode_flush,
4370     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4371     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4372     .profiles              = NULL_IF_CONFIG_SMALL(profiles),
4373 };