git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34
/* 24-bit sync code that decode_frame_header() expects in the headers of
 * keyframes and intra-only frames. */
#define VP9_SYNCCODE 0x498342
  36
  37 enum CompPredMode {
  38     PRED_SINGLEREF,
  39     PRED_COMPREF,
  40     PRED_SWITCHABLE,
  41 };
  42
  43 enum BlockLevel {
  44     BL_64X64,
  45     BL_32X32,
  46     BL_16X16,
  47     BL_8X8,
  48 };
  49
  50 enum BlockSize {
  51     BS_64x64,
  52     BS_64x32,
  53     BS_32x64,
  54     BS_32x32,
  55     BS_32x16,
  56     BS_16x32,
  57     BS_16x16,
  58     BS_16x8,
  59     BS_8x16,
  60     BS_8x8,
  61     BS_8x4,
  62     BS_4x8,
  63     BS_4x4,
  64     N_BS_SIZES,
  65 };
  66
/* Element type of the per-frame VP9Frame.mv array: two motion vectors and
 * two signed reference values (presumably the reference index belonging to
 * each vector -- confirm against the code that fills the array). */
struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};
  71
/* A frame buffer together with the side data that vp9_alloc_frame()
 * allocates for it. */
typedef struct VP9Frame {
    ThreadFrame tf;
    AVBufferRef *extradata;       // one buffer backing both pointers below
    uint8_t *segmentation_map;    // set to the start of extradata->data
    struct VP9mvrefPair *mv;      // set to extradata->data + size of the segmentation map
} VP9Frame;
  78
  79 struct VP9Filter {
  80     uint8_t level[8 * 8];
  81     uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
  82                               [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
  83 };
  84
  85 typedef struct VP9Block {
  86     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
  87     enum FilterMode filter;
  88     VP56mv mv[4 /* b_idx */][2 /* ref */];
  89     enum BlockSize bs;
  90     enum TxfmMode tx, uvtx;
  91     enum BlockLevel bl;
  92     enum BlockPartition bp;
  93 } VP9Block;
  94
  95 typedef struct VP9Context {
  96     VP9DSPContext dsp;
  97     VideoDSPContext vdsp;
  98     GetBitContext gb;
  99     VP56RangeCoder c;
 100     VP56RangeCoder *c_b;
 101     unsigned c_b_size;
 102     VP9Block *b_base, *b;
 103     int pass, uses_2pass, last_uses_2pass;
 104     int row, row7, col, col7;
 105     uint8_t *dst[3];
 106     ptrdiff_t y_stride, uv_stride;
 107
 108     // bitstream header
 109     uint8_t profile;
 110     uint8_t keyframe, last_keyframe;
 111     uint8_t invisible;
 112     uint8_t use_last_frame_mvs;
 113     uint8_t errorres;
 114     uint8_t colorspace;
 115     uint8_t fullrange;
 116     uint8_t intraonly;
 117     uint8_t resetctx;
 118     uint8_t refreshrefmask;
 119     uint8_t highprecisionmvs;
 120     enum FilterMode filtermode;
 121     uint8_t allowcompinter;
 122     uint8_t fixcompref;
 123     uint8_t refreshctx;
 124     uint8_t parallelmode;
 125     uint8_t framectxid;
 126     uint8_t refidx[3];
 127     uint8_t signbias[3];
 128     uint8_t varcompref[2];
 129     ThreadFrame refs[8], next_refs[8];
 130 #define CUR_FRAME 0
 131 #define LAST_FRAME 1
 132     VP9Frame frames[2];
 133
 134     struct {
 135         uint8_t level;
 136         int8_t sharpness;
 137         uint8_t lim_lut[64];
 138         uint8_t mblim_lut[64];
 139     } filter;
 140     struct {
 141         uint8_t enabled;
 142         int8_t mode[2];
 143         int8_t ref[4];
 144     } lf_delta;
 145     uint8_t yac_qi;
 146     int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
 147     uint8_t lossless;
 148     struct {
 149         uint8_t enabled;
 150         uint8_t temporal;
 151         uint8_t absolute_vals;
 152         uint8_t update_map;
 153         struct {
 154             uint8_t q_enabled;
 155             uint8_t lf_enabled;
 156             uint8_t ref_enabled;
 157             uint8_t skip_enabled;
 158             uint8_t ref_val;
 159             int16_t q_val;
 160             int8_t lf_val;
 161             int16_t qmul[2][2];
 162             uint8_t lflvl[4][2];
 163         } feat[8];
 164     } segmentation;
 165     struct {
 166         unsigned log2_tile_cols, log2_tile_rows;
 167         unsigned tile_cols, tile_rows;
 168         unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
 169     } tiling;
 170     unsigned sb_cols, sb_rows, rows, cols;
 171     struct {
 172         prob_context p;
 173         uint8_t coef[4][2][2][6][6][3];
 174     } prob_ctx[4];
 175     struct {
 176         prob_context p;
 177         uint8_t coef[4][2][2][6][6][11];
 178         uint8_t seg[7];
 179         uint8_t segpred[3];
 180     } prob;
 181     struct {
 182         unsigned y_mode[4][10];
 183         unsigned uv_mode[10][10];
 184         unsigned filter[4][3];
 185         unsigned mv_mode[7][4];
 186         unsigned intra[4][2];
 187         unsigned comp[5][2];
 188         unsigned single_ref[5][2][2];
 189         unsigned comp_ref[5][2];
 190         unsigned tx32p[2][4];
 191         unsigned tx16p[2][3];
 192         unsigned tx8p[2][2];
 193         unsigned skip[3][2];
 194         unsigned mv_joint[4];
 195         struct {
 196             unsigned sign[2];
 197             unsigned classes[11];
 198             unsigned class0[2];
 199             unsigned bits[10][2];
 200             unsigned class0_fp[2][4];
 201             unsigned fp[4];
 202             unsigned class0_hp[2];
 203             unsigned hp[2];
 204         } mv_comp[2];
 205         unsigned partition[4][4][4];
 206         unsigned coef[4][2][2][6][6][3];
 207         unsigned eob[4][2][2][6][6][2];
 208     } counts;
 209     enum TxfmMode txfmmode;
 210     enum CompPredMode comppredmode;
 211
 212     // contextual (left/above) cache
 213     uint8_t left_partition_ctx[8], *above_partition_ctx;
 214     uint8_t left_mode_ctx[16], *above_mode_ctx;
 215     // FIXME maybe merge some of the below in a flags field?
 216     uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
 217     uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
 218     uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
 219     uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
 220     uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
 221     uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
 222     uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
 223     uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
 224     uint8_t left_filter_ctx[8], *above_filter_ctx;
 225     VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
 226
 227     // whole-frame cache
 228     uint8_t *intra_pred_data[3];
 229     struct VP9Filter *lflvl;
 230     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
 231
 232     // block reconstruction intermediates
 233     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
 234     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
 235     struct { int x, y; } min_mv, max_mv;
 236     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
 237     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
 238 } VP9Context;
 239
 240 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
 241     {
 242         { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
 243         { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
 244     }, {
 245         { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
 246         { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
 247     }
 248 };
 249
 250 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 251 {
 252     VP9Context *s = ctx->priv_data;
 253     int ret, sz;
 254
 255     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 256         return ret;
 257     sz = 64 * s->sb_cols * s->sb_rows;
 258     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 259         ff_thread_release_buffer(ctx, &f->tf);
 260         return AVERROR(ENOMEM);
 261     }
 262
 263     f->segmentation_map = f->extradata->data;
 264     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 265
 266     // retain segmentation map if it doesn't update
 267     if (s->segmentation.enabled && !s->segmentation.update_map &&
 268         !s->keyframe && !s->intraonly) {
 269         memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
 270     }
 271
 272     return 0;
 273 }
 274
 275 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 276 {
 277     ff_thread_release_buffer(ctx, &f->tf);
 278     av_buffer_unref(&f->extradata);
 279 }
 280
 281 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 282 {
 283     int res;
 284
 285     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 286         return res;
 287     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 288         vp9_unref_frame(ctx, dst);
 289         return AVERROR(ENOMEM);
 290     }
 291
 292     dst->segmentation_map = src->segmentation_map;
 293     dst->mv = src->mv;
 294
 295     return 0;
 296 }
 297
 298 static int update_size(AVCodecContext *ctx, int w, int h)
 299 {
 300     VP9Context *s = ctx->priv_data;
 301     uint8_t *p;
 302
 303     av_assert0(w > 0 && h > 0);
 304
 305     if (s->above_partition_ctx && w == ctx->width && h == ctx->height)
 306         return 0;
 307
 308     ctx->width  = w;
 309     ctx->height = h;
 310     s->sb_cols  = (w + 63) >> 6;
 311     s->sb_rows  = (h + 63) >> 6;
 312     s->cols     = (w + 7) >> 3;
 313     s->rows     = (h + 7) >> 3;
 314
 315 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
 316     av_freep(&s->above_partition_ctx);
 317     p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 318     if (!p)
 319         return AVERROR(ENOMEM);
 320     assign(s->above_partition_ctx, uint8_t *,              8);
 321     assign(s->above_skip_ctx,      uint8_t *,              8);
 322     assign(s->above_txfm_ctx,      uint8_t *,              8);
 323     assign(s->above_mode_ctx,      uint8_t *,             16);
 324     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 325     assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
 326     assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
 327     assign(s->intra_pred_data[0],  uint8_t *,             64);
 328     assign(s->intra_pred_data[1],  uint8_t *,             32);
 329     assign(s->intra_pred_data[2],  uint8_t *,             32);
 330     assign(s->above_segpred_ctx,   uint8_t *,              8);
 331     assign(s->above_intra_ctx,     uint8_t *,              8);
 332     assign(s->above_comp_ctx,      uint8_t *,              8);
 333     assign(s->above_ref_ctx,       uint8_t *,              8);
 334     assign(s->above_filter_ctx,    uint8_t *,              8);
 335     assign(s->lflvl,               struct VP9Filter *,     1);
 336     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 337 #undef assign
 338
 339     av_free(s->b_base);
 340     av_free(s->block_base);
 341     if (ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode) {
 342         int sbs = s->sb_cols * s->sb_rows;
 343
 344         s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
 345         s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
 346         if (!s->b_base || !s->block_base)
 347             return AVERROR(ENOMEM);
 348         s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
 349         s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
 350         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
 351         s->uveob_base[0] = s->eob_base + 256 * sbs;
 352         s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
 353     } else {
 354         s->b_base = av_malloc(sizeof(VP9Block));
 355         s->block_base = av_mallocz((64 * 64 + 128) * 3);
 356         if (!s->b_base || !s->block_base)
 357             return AVERROR(ENOMEM);
 358         s->uvblock_base[0] = s->block_base + 64 * 64;
 359         s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
 360         s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
 361         s->uveob_base[0] = s->eob_base + 256;
 362         s->uveob_base[1] = s->uveob_base[0] + 64;
 363     }
 364
 365     return 0;
 366 }
 367
 368 // for some reason the sign bit is at the end, not the start, of a bit sequence
 369 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 370 {
 371     int v = get_bits(gb, n);
 372     return get_bits1(gb) ? -v : v;
 373 }
 374
 375 static av_always_inline int inv_recenter_nonneg(int v, int m)
 376 {
 377     return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 378 }
 379
 380 // differential forward probability updates
 381 static int update_prob(VP56RangeCoder *c, int p)
 382 {
 383     static const int inv_map_table[254] = {
 384           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 385         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 386          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 387          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 388          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 389          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 390          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 391          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 392         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 393         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 394         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 395         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 396         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 397         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 398         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 399         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 400         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 401         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 402         252, 253,
 403     };
 404     int d;
 405
 406     /* This code is trying to do a differential probability update. For a
 407      * current probability A in the range [1, 255], the difference to a new
 408      * probability of any value can be expressed differentially as 1-A,255-A
 409      * where some part of this (absolute range) exists both in positive as
 410      * well as the negative part, whereas another part only exists in one
 411      * half. We're trying to code this shared part differentially, i.e.
 412      * times two where the value of the lowest bit specifies the sign, and
 413      * the single part is then coded on top of this. This absolute difference
 414      * then again has a value of [0,254], but a bigger value in this range
 415      * indicates that we're further away from the original value A, so we
 416      * can code this as a VLC code, since higher values are increasingly
 417      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 418      * updates vs. the 'fine, exact' updates further down the range, which
 419      * adds one extra dimension to this differential update model. */
 420
 421     if (!vp8_rac_get(c)) {
 422         d = vp8_rac_get_uint(c, 4) + 0;
 423     } else if (!vp8_rac_get(c)) {
 424         d = vp8_rac_get_uint(c, 4) + 16;
 425     } else if (!vp8_rac_get(c)) {
 426         d = vp8_rac_get_uint(c, 5) + 32;
 427     } else {
 428         d = vp8_rac_get_uint(c, 7);
 429         if (d >= 65)
 430             d = (d << 1) - 65 + vp8_rac_get(c);
 431         d += 64;
 432     }
 433
 434     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 435                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 436 }
 437
 438 static int decode_frame_header(AVCodecContext *ctx,
 439                                const uint8_t *data, int size, int *ref)
 440 {
 441     VP9Context *s = ctx->priv_data;
 442     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 443     int last_invisible;
 444     const uint8_t *data2;
 445
 446     /* general header */
 447     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 448         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 449         return res;
 450     }
 451     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 452         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 453         return AVERROR_INVALIDDATA;
 454     }
 455     s->profile = get_bits1(&s->gb);
 456     if (get_bits1(&s->gb)) { // reserved bit
 457         av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
 458         return AVERROR_INVALIDDATA;
 459     }
 460     if (get_bits1(&s->gb)) {
 461         *ref = get_bits(&s->gb, 3);
 462         return 0;
 463     }
 464     s->last_uses_2pass = s->uses_2pass;
 465     s->last_keyframe  = s->keyframe;
 466     s->keyframe       = !get_bits1(&s->gb);
 467     last_invisible    = s->invisible;
 468     s->invisible      = !get_bits1(&s->gb);
 469     s->errorres       = get_bits1(&s->gb);
 470     s->use_last_frame_mvs = !s->errorres && !last_invisible;
 471     if (s->keyframe) {
 472         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 473             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 474             return AVERROR_INVALIDDATA;
 475         }
 476         s->colorspace = get_bits(&s->gb, 3);
 477         if (s->colorspace == 7) { // RGB = profile 1
 478             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
 479             return AVERROR_INVALIDDATA;
 480         }
 481         s->fullrange  = get_bits1(&s->gb);
 482         // for profile 1, here follows the subsampling bits
 483         s->refreshrefmask = 0xff;
 484         w = get_bits(&s->gb, 16) + 1;
 485         h = get_bits(&s->gb, 16) + 1;
 486         if (get_bits1(&s->gb)) // display size
 487             skip_bits(&s->gb, 32);
 488     } else {
 489         s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
 490         s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
 491         if (s->intraonly) {
 492             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 493                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 494                 return AVERROR_INVALIDDATA;
 495             }
 496             s->refreshrefmask = get_bits(&s->gb, 8);
 497             w = get_bits(&s->gb, 16) + 1;
 498             h = get_bits(&s->gb, 16) + 1;
 499             if (get_bits1(&s->gb)) // display size
 500                 skip_bits(&s->gb, 32);
 501         } else {
 502             s->refreshrefmask = get_bits(&s->gb, 8);
 503             s->refidx[0]      = get_bits(&s->gb, 3);
 504             s->signbias[0]    = get_bits1(&s->gb);
 505             s->refidx[1]      = get_bits(&s->gb, 3);
 506             s->signbias[1]    = get_bits1(&s->gb);
 507             s->refidx[2]      = get_bits(&s->gb, 3);
 508             s->signbias[2]    = get_bits1(&s->gb);
 509             if (!s->refs[s->refidx[0]].f->data[0] ||
 510                 !s->refs[s->refidx[1]].f->data[0] ||
 511                 !s->refs[s->refidx[2]].f->data[0]) {
 512                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 513                 return AVERROR_INVALIDDATA;
 514             }
 515             if (get_bits1(&s->gb)) {
 516                 w = s->refs[s->refidx[0]].f->width;
 517                 h = s->refs[s->refidx[0]].f->height;
 518             } else if (get_bits1(&s->gb)) {
 519                 w = s->refs[s->refidx[1]].f->width;
 520                 h = s->refs[s->refidx[1]].f->height;
 521             } else if (get_bits1(&s->gb)) {
 522                 w = s->refs[s->refidx[2]].f->width;
 523                 h = s->refs[s->refidx[2]].f->height;
 524             } else {
 525                 w = get_bits(&s->gb, 16) + 1;
 526                 h = get_bits(&s->gb, 16) + 1;
 527             }
 528             s->use_last_frame_mvs &= s->frames[LAST_FRAME].tf.f->width == w &&
 529                                      s->frames[LAST_FRAME].tf.f->height == h;
 530             if (get_bits1(&s->gb)) // display size
 531                 skip_bits(&s->gb, 32);
 532             s->highprecisionmvs = get_bits1(&s->gb);
 533             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 534                                                 get_bits(&s->gb, 2);
 535             s->allowcompinter = s->signbias[0] != s->signbias[1] ||
 536                                 s->signbias[0] != s->signbias[2];
 537             if (s->allowcompinter) {
 538                 if (s->signbias[0] == s->signbias[1]) {
 539                     s->fixcompref    = 2;
 540                     s->varcompref[0] = 0;
 541                     s->varcompref[1] = 1;
 542                 } else if (s->signbias[0] == s->signbias[2]) {
 543                     s->fixcompref    = 1;
 544                     s->varcompref[0] = 0;
 545                     s->varcompref[1] = 2;
 546                 } else {
 547                     s->fixcompref    = 0;
 548                     s->varcompref[0] = 1;
 549                     s->varcompref[1] = 2;
 550                 }
 551             }
 552         }
 553     }
 554     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
 555     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
 556     s->framectxid   = c = get_bits(&s->gb, 2);
 557
 558     /* loopfilter header data */
 559     s->filter.level = get_bits(&s->gb, 6);
 560     sharp = get_bits(&s->gb, 3);
 561     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 562     // the old cache values since they are still valid
 563     if (s->filter.sharpness != sharp)
 564         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
 565     s->filter.sharpness = sharp;
 566     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
 567         if (get_bits1(&s->gb)) {
 568             for (i = 0; i < 4; i++)
 569                 if (get_bits1(&s->gb))
 570                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 571             for (i = 0; i < 2; i++)
 572                 if (get_bits1(&s->gb))
 573                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 574         }
 575     } else {
 576         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 577     }
 578
 579     /* quantization header data */
 580     s->yac_qi      = get_bits(&s->gb, 8);
 581     s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 582     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 583     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 584     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
 585                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 586
 587     /* segmentation header info */
 588     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
 589         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
 590             for (i = 0; i < 7; i++)
 591                 s->prob.seg[i] = get_bits1(&s->gb) ?
 592                                  get_bits(&s->gb, 8) : 255;
 593             if ((s->segmentation.temporal = get_bits1(&s->gb)))
 594                 for (i = 0; i < 3; i++)
 595                     s->prob.segpred[i] = get_bits1(&s->gb) ?
 596                                          get_bits(&s->gb, 8) : 255;
 597         }
 598
 599         if (get_bits1(&s->gb)) {
 600             s->segmentation.absolute_vals = get_bits1(&s->gb);
 601             for (i = 0; i < 8; i++) {
 602                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 603                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 604                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 605                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 606                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 607                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 608                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 609             }
 610         }
 611     } else {
 612         s->segmentation.feat[0].q_enabled    = 0;
 613         s->segmentation.feat[0].lf_enabled   = 0;
 614         s->segmentation.feat[0].skip_enabled = 0;
 615         s->segmentation.feat[0].ref_enabled  = 0;
 616     }
 617
 618     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 619     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
 620         int qyac, qydc, quvac, quvdc, lflvl, sh;
 621
 622         if (s->segmentation.feat[i].q_enabled) {
 623             if (s->segmentation.absolute_vals)
 624                 qyac = s->segmentation.feat[i].q_val;
 625             else
 626                 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
 627         } else {
 628             qyac  = s->yac_qi;
 629         }
 630         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
 631         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
 632         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
 633         qyac  = av_clip_uintp2(qyac, 8);
 634
 635         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
 636         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
 637         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
 638         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
 639
 640         sh = s->filter.level >= 32;
 641         if (s->segmentation.feat[i].lf_enabled) {
 642             if (s->segmentation.absolute_vals)
 643                 lflvl = s->segmentation.feat[i].lf_val;
 644             else
 645                 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
 646         } else {
 647             lflvl  = s->filter.level;
 648         }
 649         s->segmentation.feat[i].lflvl[0][0] =
 650         s->segmentation.feat[i].lflvl[0][1] =
 651             av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
 652         for (j = 1; j < 4; j++) {
 653             s->segmentation.feat[i].lflvl[j][0] =
 654                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 655                                          s->lf_delta.mode[0]) << sh), 6);
 656             s->segmentation.feat[i].lflvl[j][1] =
 657                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 658                                          s->lf_delta.mode[1]) << sh), 6);
 659         }
 660     }
 661
 662     /* tiling info */
 663     if ((res = update_size(ctx, w, h)) < 0) {
 664         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
 665         return res;
 666     }
 667     for (s->tiling.log2_tile_cols = 0;
 668          (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
 669          s->tiling.log2_tile_cols++) ;
 670     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 671     max = FFMAX(0, max - 1);
 672     while (max > s->tiling.log2_tile_cols) {
 673         if (get_bits1(&s->gb))
 674             s->tiling.log2_tile_cols++;
 675         else
 676             break;
 677     }
 678     s->tiling.log2_tile_rows = decode012(&s->gb);
 679     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
 680     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
 681         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
 682         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 683                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
 684         if (!s->c_b) {
 685             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 686             return AVERROR(ENOMEM);
 687         }
 688     }
 689
 690     if (s->keyframe || s->errorres || s->intraonly) {
 691         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 692                            s->prob_ctx[3].p = vp9_default_probs;
 693         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 694                sizeof(vp9_default_coef_probs));
 695         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 696                sizeof(vp9_default_coef_probs));
 697         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 698                sizeof(vp9_default_coef_probs));
 699         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 700                sizeof(vp9_default_coef_probs));
 701     }
 702
 703     // next 16 bits is size of the rest of the header (arith-coded)
 704     size2 = get_bits(&s->gb, 16);
 705     data2 = align_get_bits(&s->gb);
 706     if (size2 > size - (data2 - data)) {
 707         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 708         return AVERROR_INVALIDDATA;
 709     }
 710     ff_vp56_init_range_decoder(&s->c, data2, size2);
 711     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 712         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 713         return AVERROR_INVALIDDATA;
 714     }
 715
 716     if (s->keyframe || s->intraonly) {
 717         memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
 718     } else {
 719         memset(&s->counts, 0, sizeof(s->counts));
 720     }
 721     // FIXME is it faster to not copy here, but do it down in the fw updates
 722     // as explicit copies if the fw update is missing (and skip the copy upon
 723     // fw update)?
 724     s->prob.p = s->prob_ctx[c].p;
 725
 726     // txfm updates
 727     if (s->lossless) {
 728         s->txfmmode = TX_4X4;
 729     } else {
 730         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
 731         if (s->txfmmode == 3)
 732             s->txfmmode += vp8_rac_get(&s->c);
 733
 734         if (s->txfmmode == TX_SWITCHABLE) {
 735             for (i = 0; i < 2; i++)
 736                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 737                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 738             for (i = 0; i < 2; i++)
 739                 for (j = 0; j < 2; j++)
 740                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 741                         s->prob.p.tx16p[i][j] =
 742                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 743             for (i = 0; i < 2; i++)
 744                 for (j = 0; j < 3; j++)
 745                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 746                         s->prob.p.tx32p[i][j] =
 747                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 748         }
 749     }
 750
 751     // coef updates
 752     for (i = 0; i < 4; i++) {
 753         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 754         if (vp8_rac_get(&s->c)) {
 755             for (j = 0; j < 2; j++)
 756                 for (k = 0; k < 2; k++)
 757                     for (l = 0; l < 6; l++)
 758                         for (m = 0; m < 6; m++) {
 759                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 760                             uint8_t *r = ref[j][k][l][m];
 761                             if (m >= 3 && l == 0) // dc only has 3 pt
 762                                 break;
 763                             for (n = 0; n < 3; n++) {
 764                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 765                                     p[n] = update_prob(&s->c, r[n]);
 766                                 } else {
 767                                     p[n] = r[n];
 768                                 }
 769                             }
 770                             p[3] = 0;
 771                         }
 772         } else {
 773             for (j = 0; j < 2; j++)
 774                 for (k = 0; k < 2; k++)
 775                     for (l = 0; l < 6; l++)
 776                         for (m = 0; m < 6; m++) {
 777                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 778                             uint8_t *r = ref[j][k][l][m];
 779                             if (m > 3 && l == 0) // dc only has 3 pt
 780                                 break;
 781                             memcpy(p, r, 3);
 782                             p[3] = 0;
 783                         }
 784         }
 785         if (s->txfmmode == i)
 786             break;
 787     }
 788
 789     // mode updates
 790     for (i = 0; i < 3; i++)
 791         if (vp56_rac_get_prob_branchy(&s->c, 252))
 792             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 793     if (!s->keyframe && !s->intraonly) {
 794         for (i = 0; i < 7; i++)
 795             for (j = 0; j < 3; j++)
 796                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 797                     s->prob.p.mv_mode[i][j] =
 798                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 799
 800         if (s->filtermode == FILTER_SWITCHABLE)
 801             for (i = 0; i < 4; i++)
 802                 for (j = 0; j < 2; j++)
 803                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 804                         s->prob.p.filter[i][j] =
 805                             update_prob(&s->c, s->prob.p.filter[i][j]);
 806
 807         for (i = 0; i < 4; i++)
 808             if (vp56_rac_get_prob_branchy(&s->c, 252))
 809                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 810
 811         if (s->allowcompinter) {
 812             s->comppredmode = vp8_rac_get(&s->c);
 813             if (s->comppredmode)
 814                 s->comppredmode += vp8_rac_get(&s->c);
 815             if (s->comppredmode == PRED_SWITCHABLE)
 816                 for (i = 0; i < 5; i++)
 817                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 818                         s->prob.p.comp[i] =
 819                             update_prob(&s->c, s->prob.p.comp[i]);
 820         } else {
 821             s->comppredmode = PRED_SINGLEREF;
 822         }
 823
 824         if (s->comppredmode != PRED_COMPREF) {
 825             for (i = 0; i < 5; i++) {
 826                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 827                     s->prob.p.single_ref[i][0] =
 828                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
 829                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 830                     s->prob.p.single_ref[i][1] =
 831                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
 832             }
 833         }
 834
 835         if (s->comppredmode != PRED_SINGLEREF) {
 836             for (i = 0; i < 5; i++)
 837                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 838                     s->prob.p.comp_ref[i] =
 839                         update_prob(&s->c, s->prob.p.comp_ref[i]);
 840         }
 841
 842         for (i = 0; i < 4; i++)
 843             for (j = 0; j < 9; j++)
 844                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 845                     s->prob.p.y_mode[i][j] =
 846                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
 847
 848         for (i = 0; i < 4; i++)
 849             for (j = 0; j < 4; j++)
 850                 for (k = 0; k < 3; k++)
 851                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 852                         s->prob.p.partition[3 - i][j][k] =
 853                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 854
 855         // mv fields don't use the update_prob subexp model for some reason
 856         for (i = 0; i < 3; i++)
 857             if (vp56_rac_get_prob_branchy(&s->c, 252))
 858                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 859
 860         for (i = 0; i < 2; i++) {
 861             if (vp56_rac_get_prob_branchy(&s->c, 252))
 862                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 863
 864             for (j = 0; j < 10; j++)
 865                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 866                     s->prob.p.mv_comp[i].classes[j] =
 867                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 868
 869             if (vp56_rac_get_prob_branchy(&s->c, 252))
 870                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 871
 872             for (j = 0; j < 10; j++)
 873                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 874                     s->prob.p.mv_comp[i].bits[j] =
 875                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 876         }
 877
 878         for (i = 0; i < 2; i++) {
 879             for (j = 0; j < 2; j++)
 880                 for (k = 0; k < 3; k++)
 881                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 882                         s->prob.p.mv_comp[i].class0_fp[j][k] =
 883                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 884
 885             for (j = 0; j < 3; j++)
 886                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 887                     s->prob.p.mv_comp[i].fp[j] =
 888                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 889         }
 890
 891         if (s->highprecisionmvs) {
 892             for (i = 0; i < 2; i++) {
 893                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 894                     s->prob.p.mv_comp[i].class0_hp =
 895                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 896
 897                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 898                     s->prob.p.mv_comp[i].hp =
 899                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 900             }
 901         }
 902     }
 903
 904     return (data2 - data) + size2;
 905 }
 906
 907 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
 908                                       VP9Context *s)
 909 {
 910     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
 911     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
 912 }
 913
 914 static void find_ref_mvs(VP9Context *s,
 915                          VP56mv *pmv, int ref, int z, int idx, int sb)
 916 {
 917     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
 918         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
 919                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
 920         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
 921                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
 922         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
 923                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
 924         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
 925                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 926         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
 927                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 928         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
 929                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
 930         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
 931                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 932         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
 933                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
 934         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
 935                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
 936         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 937                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 938         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 939                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 940         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 941                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 942         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 943                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 944     };
 945     VP9Block *b = s->b;
 946     int row = s->row, col = s->col, row7 = s->row7;
 947     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
 948 #define INVALID_MV 0x80008000U
 949     uint32_t mem = INVALID_MV;
 950     int i;
 951
 952 #define RETURN_DIRECT_MV(mv) \
 953     do { \
 954         uint32_t m = AV_RN32A(&mv); \
 955         if (!idx) { \
 956             AV_WN32A(pmv, m); \
 957             return; \
 958         } else if (mem == INVALID_MV) { \
 959             mem = m; \
 960         } else if (m != mem) { \
 961             AV_WN32A(pmv, m); \
 962             return; \
 963         } \
 964     } while (0)
 965
 966     if (sb >= 0) {
 967         if (sb == 2 || sb == 1) {
 968             RETURN_DIRECT_MV(b->mv[0][z]);
 969         } else if (sb == 3) {
 970             RETURN_DIRECT_MV(b->mv[2][z]);
 971             RETURN_DIRECT_MV(b->mv[1][z]);
 972             RETURN_DIRECT_MV(b->mv[0][z]);
 973         }
 974
 975 #define RETURN_MV(mv) \
 976     do { \
 977         if (sb > 0) { \
 978             VP56mv tmp; \
 979             uint32_t m; \
 980             clamp_mv(&tmp, &mv, s); \
 981             m = AV_RN32A(&tmp); \
 982             if (!idx) { \
 983                 AV_WN32A(pmv, m); \
 984                 return; \
 985             } else if (mem == INVALID_MV) { \
 986                 mem = m; \
 987             } else if (m != mem) { \
 988                 AV_WN32A(pmv, m); \
 989                 return; \
 990             } \
 991         } else { \
 992             uint32_t m = AV_RN32A(&mv); \
 993             if (!idx) { \
 994                 clamp_mv(pmv, &mv, s); \
 995                 return; \
 996             } else if (mem == INVALID_MV) { \
 997                 mem = m; \
 998             } else if (m != mem) { \
 999                 clamp_mv(pmv, &mv, s); \
1000                 return; \
1001             } \
1002         } \
1003     } while (0)
1004
1005         if (row > 0) {
1006             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1007             if (mv->ref[0] == ref) {
1008                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1009             } else if (mv->ref[1] == ref) {
1010                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1011             }
1012         }
1013         if (col > s->tiling.tile_col_start) {
1014             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1015             if (mv->ref[0] == ref) {
1016                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1017             } else if (mv->ref[1] == ref) {
1018                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1019             }
1020         }
1021         i = 2;
1022     } else {
1023         i = 0;
1024     }
1025
1026     // previously coded MVs in this neighbourhood, using same reference frame
1027     for (; i < 8; i++) {
1028         int c = p[i][0] + col, r = p[i][1] + row;
1029
1030         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1031             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1032
1033             if (mv->ref[0] == ref) {
1034                 RETURN_MV(mv->mv[0]);
1035             } else if (mv->ref[1] == ref) {
1036                 RETURN_MV(mv->mv[1]);
1037             }
1038         }
1039     }
1040
1041     // MV at this position in previous frame, using same reference frame
1042     if (s->use_last_frame_mvs) {
1043         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1044
1045         if (!s->last_uses_2pass)
1046             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1047         if (mv->ref[0] == ref) {
1048             RETURN_MV(mv->mv[0]);
1049         } else if (mv->ref[1] == ref) {
1050             RETURN_MV(mv->mv[1]);
1051         }
1052     }
1053
1054 #define RETURN_SCALE_MV(mv, scale) \
1055     do { \
1056         if (scale) { \
1057             VP56mv mv_temp = { -mv.x, -mv.y }; \
1058             RETURN_MV(mv_temp); \
1059         } else { \
1060             RETURN_MV(mv); \
1061         } \
1062     } while (0)
1063
1064     // previously coded MVs in this neighbourhood, using different reference frame
1065     for (i = 0; i < 8; i++) {
1066         int c = p[i][0] + col, r = p[i][1] + row;
1067
1068         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1069             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1070
1071             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1072                 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1073             }
1074             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1075                 // BUG - libvpx has this condition regardless of whether
1076                 // we used the first ref MV and pre-scaling
1077                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1078                 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1079             }
1080         }
1081     }
1082
1083     // MV at this position in previous frame, using different reference frame
1084     if (s->use_last_frame_mvs) {
1085         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1086
1087         // no need to await_progress, because we already did that above
1088         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1089             RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1090         }
1091         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1092             // BUG - libvpx has this condition regardless of whether
1093             // we used the first ref MV and pre-scaling
1094             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1095             RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1096         }
1097     }
1098
1099     AV_ZERO32(pmv);
1100 #undef INVALID_MV
1101 #undef RETURN_MV
1102 #undef RETURN_SCALE_MV
1103 }
1104
1105 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1106 {
1107     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1108     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1109                                 s->prob.p.mv_comp[idx].classes);
1110
1111     s->counts.mv_comp[idx].sign[sign]++;
1112     s->counts.mv_comp[idx].classes[c]++;
1113     if (c) {
1114         int m;
1115
1116         for (n = 0, m = 0; m < c; m++) {
1117             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1118             n |= bit << m;
1119             s->counts.mv_comp[idx].bits[m][bit]++;
1120         }
1121         n <<= 3;
1122         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1123         n |= bit << 1;
1124         s->counts.mv_comp[idx].fp[bit]++;
1125         if (hp) {
1126             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1127             s->counts.mv_comp[idx].hp[bit]++;
1128             n |= bit;
1129         } else {
1130             n |= 1;
1131             // bug in libvpx - we count for bw entropy purposes even if the
1132             // bit wasn't coded
1133             s->counts.mv_comp[idx].hp[1]++;
1134         }
1135         n += 8 << c;
1136     } else {
1137         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1138         s->counts.mv_comp[idx].class0[n]++;
1139         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1140                                s->prob.p.mv_comp[idx].class0_fp[n]);
1141         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1142         n = (n << 3) | (bit << 1);
1143         if (hp) {
1144             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1145             s->counts.mv_comp[idx].class0_hp[bit]++;
1146             n |= bit;
1147         } else {
1148             n |= 1;
1149             // bug in libvpx - we count for bw entropy purposes even if the
1150             // bit wasn't coded
1151             s->counts.mv_comp[idx].class0_hp[1]++;
1152         }
1153     }
1154
1155     return sign ? -(n + 1) : (n + 1);
1156 }
1157
1158 static void fill_mv(VP9Context *s,
1159                     VP56mv *mv, int mode, int sb)
1160 {
1161     VP9Block *b = s->b;
1162
1163     if (mode == ZEROMV) {
1164         memset(mv, 0, sizeof(*mv) * 2);
1165     } else {
1166         int hp;
1167
1168         // FIXME cache this value and reuse for other subblocks
1169         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1170                      mode == NEWMV ? -1 : sb);
1171         // FIXME maybe move this code into find_ref_mvs()
1172         if ((mode == NEWMV || sb == -1) &&
1173             !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1174             if (mv[0].y & 1) {
1175                 if (mv[0].y < 0)
1176                     mv[0].y++;
1177                 else
1178                     mv[0].y--;
1179             }
1180             if (mv[0].x & 1) {
1181                 if (mv[0].x < 0)
1182                     mv[0].x++;
1183                 else
1184                     mv[0].x--;
1185             }
1186         }
1187         if (mode == NEWMV) {
1188             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1189                                               s->prob.p.mv_joint);
1190
1191             s->counts.mv_joint[j]++;
1192             if (j >= MV_JOINT_V)
1193                 mv[0].y += read_mv_component(s, 0, hp);
1194             if (j & 1)
1195                 mv[0].x += read_mv_component(s, 1, hp);
1196         }
1197
1198         if (b->comp) {
1199             // FIXME cache this value and reuse for other subblocks
1200             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1201                          mode == NEWMV ? -1 : sb);
1202             if ((mode == NEWMV || sb == -1) &&
1203                 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1204                 if (mv[1].y & 1) {
1205                     if (mv[1].y < 0)
1206                         mv[1].y++;
1207                     else
1208                         mv[1].y--;
1209                 }
1210                 if (mv[1].x & 1) {
1211                     if (mv[1].x < 0)
1212                         mv[1].x++;
1213                     else
1214                         mv[1].x--;
1215                 }
1216             }
1217             if (mode == NEWMV) {
1218                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1219                                                   s->prob.p.mv_joint);
1220
1221                 s->counts.mv_joint[j]++;
1222                 if (j >= MV_JOINT_V)
1223                     mv[1].y += read_mv_component(s, 0, hp);
1224                 if (j & 1)
1225                     mv[1].x += read_mv_component(s, 1, hp);
1226             }
1227         }
1228     }
1229 }
1230
1231 static void decode_mode(AVCodecContext *ctx)
1232 {
1233     static const uint8_t left_ctx[N_BS_SIZES] = {
1234         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1235     };
1236     static const uint8_t above_ctx[N_BS_SIZES] = {
1237         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1238     };
1239     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1240         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1241         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1242     };
1243     VP9Context *s = ctx->priv_data;
1244     VP9Block *b = s->b;
1245     int row = s->row, col = s->col, row7 = s->row7;
1246     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1247     int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1248     int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1249     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1250
1251     if (!s->segmentation.enabled) {
1252         b->seg_id = 0;
1253     } else if (s->keyframe || s->intraonly) {
1254         b->seg_id = s->segmentation.update_map ?
1255             vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg) : 0;
1256     } else if (!s->segmentation.update_map ||
1257                (s->segmentation.temporal &&
1258                 vp56_rac_get_prob_branchy(&s->c,
1259                     s->prob.segpred[s->above_segpred_ctx[col] +
1260                                     s->left_segpred_ctx[row7]]))) {
1261         int pred = 8, x;
1262         uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1263
1264         if (!s->last_uses_2pass)
1265             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1266         for (y = 0; y < h4; y++)
1267             for (x = 0; x < w4; x++)
1268                 pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1269         av_assert1(pred < 8);
1270         b->seg_id = pred;
1271
1272         memset(&s->above_segpred_ctx[col], 1, w4);
1273         memset(&s->left_segpred_ctx[row7], 1, h4);
1274     } else {
1275         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1276                                      s->prob.seg);
1277
1278         memset(&s->above_segpred_ctx[col], 0, w4);
1279         memset(&s->left_segpred_ctx[row7], 0, h4);
1280     }
1281     if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
1282         uint8_t *segmap = s->frames[CUR_FRAME].segmentation_map;
1283
1284         for (y = 0; y < h4; y++)
1285             memset(&segmap[(y + row) * 8 * s->sb_cols + col], b->seg_id, w4);
1286     }
1287
1288     b->skip = s->segmentation.enabled &&
1289         s->segmentation.feat[b->seg_id].skip_enabled;
1290     if (!b->skip) {
1291         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1292         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1293         s->counts.skip[c][b->skip]++;
1294     }
1295
1296     if (s->keyframe || s->intraonly) {
1297         b->intra = 1;
1298     } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1299         b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1300     } else {
1301         int c, bit;
1302
1303         if (have_a && have_l) {
1304             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1305             c += (c == 2);
1306         } else {
1307             c = have_a ? 2 * s->above_intra_ctx[col] :
1308                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1309         }
1310         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1311         s->counts.intra[c][bit]++;
1312         b->intra = !bit;
1313     }
1314
1315     if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1316         int c;
1317         if (have_a) {
1318             if (have_l) {
1319                 c = (s->above_skip_ctx[col] ? max_tx :
1320                      s->above_txfm_ctx[col]) +
1321                     (s->left_skip_ctx[row7] ? max_tx :
1322                      s->left_txfm_ctx[row7]) > max_tx;
1323             } else {
1324                 c = s->above_skip_ctx[col] ? 1 :
1325                     (s->above_txfm_ctx[col] * 2 > max_tx);
1326             }
1327         } else if (have_l) {
1328             c = s->left_skip_ctx[row7] ? 1 :
1329                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1330         } else {
1331             c = 1;
1332         }
1333         switch (max_tx) {
1334         case TX_32X32:
1335             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1336             if (b->tx) {
1337                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1338                 if (b->tx == 2)
1339                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1340             }
1341             s->counts.tx32p[c][b->tx]++;
1342             break;
1343         case TX_16X16:
1344             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1345             if (b->tx)
1346                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1347             s->counts.tx16p[c][b->tx]++;
1348             break;
1349         case TX_8X8:
1350             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1351             s->counts.tx8p[c][b->tx]++;
1352             break;
1353         case TX_4X4:
1354             b->tx = TX_4X4;
1355             break;
1356         }
1357     } else {
1358         b->tx = FFMIN(max_tx, s->txfmmode);
1359     }
1360
1361     if (s->keyframe || s->intraonly) {
1362         uint8_t *a = &s->above_mode_ctx[col * 2];
1363         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1364
1365         b->comp = 0;
1366         if (b->bs > BS_8x8) {
1367             // FIXME the memory storage intermediates here aren't really
1368             // necessary, they're just there to make the code slightly
1369             // simpler for now
1370             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1371                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1372             if (b->bs != BS_8x4) {
1373                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1374                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1375                 l[0] = a[1] = b->mode[1];
1376             } else {
1377                 l[0] = a[1] = b->mode[1] = b->mode[0];
1378             }
1379             if (b->bs != BS_4x8) {
1380                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1381                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1382                 if (b->bs != BS_8x4) {
1383                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1384                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1385                     l[1] = a[1] = b->mode[3];
1386                 } else {
1387                     l[1] = a[1] = b->mode[3] = b->mode[2];
1388                 }
1389             } else {
1390                 b->mode[2] = b->mode[0];
1391                 l[1] = a[1] = b->mode[3] = b->mode[1];
1392             }
1393         } else {
1394             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1395                                           vp9_default_kf_ymode_probs[*a][*l]);
1396             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1397             // FIXME this can probably be optimized
1398             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1399             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1400         }
1401         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1402                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1403     } else if (b->intra) {
1404         b->comp = 0;
1405         if (b->bs > BS_8x8) {
1406             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1407                                           s->prob.p.y_mode[0]);
1408             s->counts.y_mode[0][b->mode[0]]++;
1409             if (b->bs != BS_8x4) {
1410                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1411                                               s->prob.p.y_mode[0]);
1412                 s->counts.y_mode[0][b->mode[1]]++;
1413             } else {
1414                 b->mode[1] = b->mode[0];
1415             }
1416             if (b->bs != BS_4x8) {
1417                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1418                                               s->prob.p.y_mode[0]);
1419                 s->counts.y_mode[0][b->mode[2]]++;
1420                 if (b->bs != BS_8x4) {
1421                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1422                                                   s->prob.p.y_mode[0]);
1423                     s->counts.y_mode[0][b->mode[3]]++;
1424                 } else {
1425                     b->mode[3] = b->mode[2];
1426                 }
1427             } else {
1428                 b->mode[2] = b->mode[0];
1429                 b->mode[3] = b->mode[1];
1430             }
1431         } else {
1432             static const uint8_t size_group[10] = {
1433                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1434             };
1435             int sz = size_group[b->bs];
1436
1437             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1438                                           s->prob.p.y_mode[sz]);
1439             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1440             s->counts.y_mode[sz][b->mode[3]]++;
1441         }
1442         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1443                                      s->prob.p.uv_mode[b->mode[3]]);
1444         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1445     } else {
1446         static const uint8_t inter_mode_ctx_lut[14][14] = {
1447             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1448             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1449             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1450             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1451             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1452             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1453             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1454             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1455             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1456             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1457             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1458             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1459             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1460             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1461         };
1462
1463         if (s->segmentation.feat[b->seg_id].ref_enabled) {
1464             av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1465             b->comp = 0;
1466             b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1467         } else {
1468             // read comp_pred flag
1469             if (s->comppredmode != PRED_SWITCHABLE) {
1470                 b->comp = s->comppredmode == PRED_COMPREF;
1471             } else {
1472                 int c;
1473
1474                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1475                 if (have_a) {
1476                     if (have_l) {
1477                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1478                             c = 4;
1479                         } else if (s->above_comp_ctx[col]) {
1480                             c = 2 + (s->left_intra_ctx[row7] ||
1481                                      s->left_ref_ctx[row7] == s->fixcompref);
1482                         } else if (s->left_comp_ctx[row7]) {
1483                             c = 2 + (s->above_intra_ctx[col] ||
1484                                      s->above_ref_ctx[col] == s->fixcompref);
1485                         } else {
1486                             c = (!s->above_intra_ctx[col] &&
1487                                  s->above_ref_ctx[col] == s->fixcompref) ^
1488                             (!s->left_intra_ctx[row7] &&
1489                              s->left_ref_ctx[row & 7] == s->fixcompref);
1490                         }
1491                     } else {
1492                         c = s->above_comp_ctx[col] ? 3 :
1493                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1494                     }
1495                 } else if (have_l) {
1496                     c = s->left_comp_ctx[row7] ? 3 :
1497                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1498                 } else {
1499                     c = 1;
1500                 }
1501                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1502                 s->counts.comp[c][b->comp]++;
1503             }
1504
1505             // read actual references
1506             // FIXME probably cache a few variables here to prevent repetitive
1507             // memory accesses below
1508             if (b->comp) /* two references */ {
1509                 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1510
1511                 b->ref[fix_idx] = s->fixcompref;
1512                 // FIXME can this codeblob be replaced by some sort of LUT?
1513                 if (have_a) {
1514                     if (have_l) {
1515                         if (s->above_intra_ctx[col]) {
1516                             if (s->left_intra_ctx[row7]) {
1517                                 c = 2;
1518                             } else {
1519                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1520                             }
1521                         } else if (s->left_intra_ctx[row7]) {
1522                             c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1523                         } else {
1524                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1525
1526                             if (refl == refa && refa == s->varcompref[1]) {
1527                                 c = 0;
1528                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1529                                 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1530                                     (refl == s->fixcompref && refa == s->varcompref[0])) {
1531                                     c = 4;
1532                                 } else {
1533                                     c = (refa == refl) ? 3 : 1;
1534                                 }
1535                             } else if (!s->left_comp_ctx[row7]) {
1536                                 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1537                                     c = 1;
1538                                 } else {
1539                                     c = (refl == s->varcompref[1] &&
1540                                          refa != s->varcompref[1]) ? 2 : 4;
1541                                 }
1542                             } else if (!s->above_comp_ctx[col]) {
1543                                 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1544                                     c = 1;
1545                                 } else {
1546                                     c = (refa == s->varcompref[1] &&
1547                                          refl != s->varcompref[1]) ? 2 : 4;
1548                                 }
1549                             } else {
1550                                 c = (refl == refa) ? 4 : 2;
1551                             }
1552                         }
1553                     } else {
1554                         if (s->above_intra_ctx[col]) {
1555                             c = 2;
1556                         } else if (s->above_comp_ctx[col]) {
1557                             c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1558                         } else {
1559                             c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1560                         }
1561                     }
1562                 } else if (have_l) {
1563                     if (s->left_intra_ctx[row7]) {
1564                         c = 2;
1565                     } else if (s->left_comp_ctx[row7]) {
1566                         c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1567                     } else {
1568                         c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1569                     }
1570                 } else {
1571                     c = 2;
1572                 }
1573                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1574                 b->ref[var_idx] = s->varcompref[bit];
1575                 s->counts.comp_ref[c][bit]++;
1576             } else /* single reference */ {
1577                 int bit, c;
1578
1579                 if (have_a && !s->above_intra_ctx[col]) {
1580                     if (have_l && !s->left_intra_ctx[row7]) {
1581                         if (s->left_comp_ctx[row7]) {
1582                             if (s->above_comp_ctx[col]) {
1583                                 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1584                                          !s->above_ref_ctx[col]);
1585                             } else {
1586                                 c = (3 * !s->above_ref_ctx[col]) +
1587                                     (!s->fixcompref || !s->left_ref_ctx[row7]);
1588                             }
1589                         } else if (s->above_comp_ctx[col]) {
1590                             c = (3 * !s->left_ref_ctx[row7]) +
1591                                 (!s->fixcompref || !s->above_ref_ctx[col]);
1592                         } else {
1593                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1594                         }
1595                     } else if (s->above_intra_ctx[col]) {
1596                         c = 2;
1597                     } else if (s->above_comp_ctx[col]) {
1598                         c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1599                     } else {
1600                         c = 4 * (!s->above_ref_ctx[col]);
1601                     }
1602                 } else if (have_l && !s->left_intra_ctx[row7]) {
1603                     if (s->left_intra_ctx[row7]) {
1604                         c = 2;
1605                     } else if (s->left_comp_ctx[row7]) {
1606                         c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1607                     } else {
1608                         c = 4 * (!s->left_ref_ctx[row7]);
1609                     }
1610                 } else {
1611                     c = 2;
1612                 }
1613                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1614                 s->counts.single_ref[c][0][bit]++;
1615                 if (!bit) {
1616                     b->ref[0] = 0;
1617                 } else {
1618                     // FIXME can this codeblob be replaced by some sort of LUT?
1619                     if (have_a) {
1620                         if (have_l) {
1621                             if (s->left_intra_ctx[row7]) {
1622                                 if (s->above_intra_ctx[col]) {
1623                                     c = 2;
1624                                 } else if (s->above_comp_ctx[col]) {
1625                                     c = 1 + 2 * (s->fixcompref == 1 ||
1626                                                  s->above_ref_ctx[col] == 1);
1627                                 } else if (!s->above_ref_ctx[col]) {
1628                                     c = 3;
1629                                 } else {
1630                                     c = 4 * (s->above_ref_ctx[col] == 1);
1631                                 }
1632                             } else if (s->above_intra_ctx[col]) {
1633                                 if (s->left_intra_ctx[row7]) {
1634                                     c = 2;
1635                                 } else if (s->left_comp_ctx[row7]) {
1636                                     c = 1 + 2 * (s->fixcompref == 1 ||
1637                                                  s->left_ref_ctx[row7] == 1);
1638                                 } else if (!s->left_ref_ctx[row7]) {
1639                                     c = 3;
1640                                 } else {
1641                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1642                                 }
1643                             } else if (s->above_comp_ctx[col]) {
1644                                 if (s->left_comp_ctx[row7]) {
1645                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1646                                         c = 3 * (s->fixcompref == 1 ||
1647                                                  s->left_ref_ctx[row7] == 1);
1648                                     } else {
1649                                         c = 2;
1650                                     }
1651                                 } else if (!s->left_ref_ctx[row7]) {
1652                                     c = 1 + 2 * (s->fixcompref == 1 ||
1653                                                  s->above_ref_ctx[col] == 1);
1654                                 } else {
1655                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1656                                     (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1657                                 }
1658                             } else if (s->left_comp_ctx[row7]) {
1659                                 if (!s->above_ref_ctx[col]) {
1660                                     c = 1 + 2 * (s->fixcompref == 1 ||
1661                                                  s->left_ref_ctx[row7] == 1);
1662                                 } else {
1663                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1664                                     (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1665                                 }
1666                             } else if (!s->above_ref_ctx[col]) {
1667                                 if (!s->left_ref_ctx[row7]) {
1668                                     c = 3;
1669                                 } else {
1670                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1671                                 }
1672                             } else if (!s->left_ref_ctx[row7]) {
1673                                 c = 4 * (s->above_ref_ctx[col] == 1);
1674                             } else {
1675                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1676                                 2 * (s->above_ref_ctx[col] == 1);
1677                             }
1678                         } else {
1679                             if (s->above_intra_ctx[col] ||
1680                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1681                                 c = 2;
1682                             } else if (s->above_comp_ctx[col]) {
1683                                 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1684                             } else {
1685                                 c = 4 * (s->above_ref_ctx[col] == 1);
1686                             }
1687                         }
1688                     } else if (have_l) {
1689                         if (s->left_intra_ctx[row7] ||
1690                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1691                             c = 2;
1692                         } else if (s->left_comp_ctx[row7]) {
1693                             c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1694                         } else {
1695                             c = 4 * (s->left_ref_ctx[row7] == 1);
1696                         }
1697                     } else {
1698                         c = 2;
1699                     }
1700                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1701                     s->counts.single_ref[c][1][bit]++;
1702                     b->ref[0] = 1 + bit;
1703                 }
1704             }
1705         }
1706
1707         if (b->bs <= BS_8x8) {
1708             if (s->segmentation.feat[b->seg_id].skip_enabled) {
1709                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1710             } else {
1711                 static const uint8_t off[10] = {
1712                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1713                 };
1714
1715                 // FIXME this needs to use the LUT tables from find_ref_mvs
1716                 // because not all are -1,0/0,-1
1717                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1718                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1719
1720                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1721                                               s->prob.p.mv_mode[c]);
1722                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1723                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1724             }
1725         }
1726
1727         if (s->filtermode == FILTER_SWITCHABLE) {
1728             int c;
1729
1730             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1731                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1732                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1733                         s->left_filter_ctx[row7] : 3;
1734                 } else {
1735                     c = s->above_filter_ctx[col];
1736                 }
1737             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1738                 c = s->left_filter_ctx[row7];
1739             } else {
1740                 c = 3;
1741             }
1742
1743             b->filter = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1744                                          s->prob.p.filter[c]);
1745             s->counts.filter[c][b->filter]++;
1746         } else {
1747             b->filter = s->filtermode;
1748         }
1749
1750         if (b->bs > BS_8x8) {
1751             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1752
1753             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1754                                           s->prob.p.mv_mode[c]);
1755             s->counts.mv_mode[c][b->mode[0] - 10]++;
1756             fill_mv(s, b->mv[0], b->mode[0], 0);
1757
1758             if (b->bs != BS_8x4) {
1759                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1760                                               s->prob.p.mv_mode[c]);
1761                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1762                 fill_mv(s, b->mv[1], b->mode[1], 1);
1763             } else {
1764                 b->mode[1] = b->mode[0];
1765                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1766                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1767             }
1768
1769             if (b->bs != BS_4x8) {
1770                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1771                                               s->prob.p.mv_mode[c]);
1772                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1773                 fill_mv(s, b->mv[2], b->mode[2], 2);
1774
1775                 if (b->bs != BS_8x4) {
1776                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1777                                                   s->prob.p.mv_mode[c]);
1778                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1779                     fill_mv(s, b->mv[3], b->mode[3], 3);
1780                 } else {
1781                     b->mode[3] = b->mode[2];
1782                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1783                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1784                 }
1785             } else {
1786                 b->mode[2] = b->mode[0];
1787                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1788                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1789                 b->mode[3] = b->mode[1];
1790                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1791                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1792             }
1793         } else {
1794             fill_mv(s, b->mv[0], b->mode[0], -1);
1795             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1796             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1797             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1798             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1799             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1800             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1801         }
1802     }
1803
1804     // FIXME this can probably be optimized
1805     memset(&s->above_skip_ctx[col], b->skip, w4);
1806     memset(&s->left_skip_ctx[row7], b->skip, h4);
1807     memset(&s->above_txfm_ctx[col], b->tx, w4);
1808     memset(&s->left_txfm_ctx[row7], b->tx, h4);
1809     memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
1810     memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
1811     if (!s->keyframe && !s->intraonly) {
1812         memset(&s->above_intra_ctx[col], b->intra, w4);
1813         memset(&s->left_intra_ctx[row7], b->intra, h4);
1814         memset(&s->above_comp_ctx[col], b->comp, w4);
1815         memset(&s->left_comp_ctx[row7], b->comp, h4);
1816         memset(&s->above_mode_ctx[col], b->mode[3], w4);
1817         memset(&s->left_mode_ctx[row7], b->mode[3], h4);
1818         if (s->filtermode == FILTER_SWITCHABLE && !b->intra ) {
1819             memset(&s->above_filter_ctx[col], b->filter, w4);
1820             memset(&s->left_filter_ctx[row7], b->filter, h4);
1821             b->filter = vp9_filter_lut[b->filter];
1822         }
1823         if (b->bs > BS_8x8) {
1824             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1825
1826             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1827             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1828             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1829             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1830             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1831             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1832             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1833             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1834         } else {
1835             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1836
1837             for (n = 0; n < w4 * 2; n++) {
1838                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1839                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1840             }
1841             for (n = 0; n < h4 * 2; n++) {
1842                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1843                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1844             }
1845         }
1846
1847         if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
1848                          // as a direct check in above branches
1849             int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1850
1851             memset(&s->above_ref_ctx[col], vref, w4);
1852             memset(&s->left_ref_ctx[row7], vref, h4);
1853         }
1854     }
1855
1856     // FIXME kinda ugly
1857     for (y = 0; y < h4; y++) {
1858         int x, o = (row + y) * s->sb_cols * 8 + col;
1859         struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1860
1861         if (b->intra) {
1862             for (x = 0; x < w4; x++) {
1863                 mv[x].ref[0] =
1864                 mv[x].ref[1] = -1;
1865             }
1866         } else if (b->comp) {
1867             for (x = 0; x < w4; x++) {
1868                 mv[x].ref[0] = b->ref[0];
1869                 mv[x].ref[1] = b->ref[1];
1870                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
1871                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
1872             }
1873         } else {
1874             for (x = 0; x < w4; x++) {
1875                 mv[x].ref[0] = b->ref[0];
1876                 mv[x].ref[1] = -1;
1877                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
1878             }
1879         }
1880     }
1881 }
1882
1883 // FIXME remove tx argument, and merge cnt/eob arguments?
/*
 * Decode the coefficient tokens of one transform block from the range coder.
 *
 * Tokens are read in coding order. Non-zero values are multiplied by qmul and
 * stored into coef[]; positions that decode as zero are not written here.
 * The symbol counters in cnt and eob are incremented for every symbol read.
 *
 * c           range coder to read from
 * coef        output coefficients, addressed through scan[]
 * n_coeffs    maximum number of coefficients in the block
 * tx          transform size; only consulted to halve TX_32X32 values
 * cnt         counters [band][ctx][0 = zero, 1 = one, 2 = larger token]
 * eob         counters [band][ctx][value of the end-of-block flag]
 * p           probabilities [band][ctx][11]: entry 0 is the end-of-block
 *             flag, 1 the zero token, 2 the one token; entries 3..10 of a
 *             row are copied in from vp9_model_pareto8 on first use, so the
 *             table is written to as well as read
 * nnz         context for the first coefficient
 * scan        maps coding order to coefficient position
 * nb          per coding index, the two positions whose cached token classes
 *             form the context of the next coefficient
 * band_counts number of coefficients in each band
 * qmul        multipliers: qmul[0] for the first coefficient, qmul[1] after
 *
 * Returns the coding index at which decoding stopped: 0 if the very first
 * end-of-block flag was clear, n_coeffs if the block ran to its end.
 */
1884 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
1885                            enum TxfmMode tx, unsigned (*cnt)[6][3],
1886                            unsigned (*eob)[6][2], uint8_t (*p)[6][11],
1887                            int nnz, const int16_t *scan, const int16_t (*nb)[2],
1888                            const int16_t *band_counts, const int16_t *qmul)
1889 {
1890     int i = 0, band = 0, band_left = band_counts[band];
1891     uint8_t *tp = p[0][nnz];
         // token class (0..5) of every position decoded so far; it is only
         // read back through nb[] to derive the next context
1892     uint8_t cache[1024];
1893
1894     do {
1895         int val, rc;
1896
1897         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
1898         eob[band][nnz][val]++;
1899         if (!val)
1900             break;
1901
         // after a zero token the next token is read without an end-of-block
         // flag: the zero branch below jumps back to this label
1902     skip_eob:
1903         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
1904             cnt[band][nnz][0]++;
                 // move on to the next band once this one is used up
1905             if (!--band_left)
1906                 band_left = band_counts[++band];
1907             cache[scan[i]] = 0;
                 // next context: rounded mean of the two neighbours' classes
1908             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1909             tp = p[band][nnz];
1910             if (++i == n_coeffs)
1911                 break; //invalid input; blocks should end with EOB
1912             goto skip_eob;
1913         }
1914
             // position of this coefficient in coef[] and cache[]
1915         rc = scan[i];
1916         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
1917             cnt[band][nnz][1]++;
1918             val = 1;
1919             cache[rc] = 1;
1920         } else {
1921             // fill in p[3-10] (model fill) - only once per frame for each pos
1922             if (!tp[3])
1923                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
1924
1925             cnt[band][nnz][2]++;
1926             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
1927                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
1928                     cache[rc] = val = 2;
1929                 } else {
1930                     val = 3 + vp56_rac_get_prob(c, tp[5]);
1931                     cache[rc] = 3;
1932                 }
1933             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
1934                 cache[rc] = 4;
1935                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                         // base 5 plus 1 extra bit: 5..6
1936                     val = 5 + vp56_rac_get_prob(c, 159);
1937                 } else {
                         // base 7 plus 2 extra bits: 7..10
1938                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
1939                     val +=      vp56_rac_get_prob(c, 145);
1940                 }
1941             } else { // cat 3-6
1942                 cache[rc] = 5;
1943                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
1944                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                             // base 11 plus 3 extra bits: 11..18
1945                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
1946                         val +=      (vp56_rac_get_prob(c, 148) << 1);
1947                         val +=       vp56_rac_get_prob(c, 140);
1948                     } else {
                             // base 19 plus 4 extra bits: 19..34
1949                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
1950                         val +=      (vp56_rac_get_prob(c, 155) << 2);
1951                         val +=      (vp56_rac_get_prob(c, 140) << 1);
1952                         val +=       vp56_rac_get_prob(c, 135);
1953                     }
1954                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                         // base 35 plus 5 extra bits: 35..66
1955                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
1956                     val +=      (vp56_rac_get_prob(c, 157) << 3);
1957                     val +=      (vp56_rac_get_prob(c, 141) << 2);
1958                     val +=      (vp56_rac_get_prob(c, 134) << 1);
1959                     val +=       vp56_rac_get_prob(c, 130);
1960                 } else {
                         // base 67 plus 14 extra bits, most significant first
1961                     val  = 67 + (vp56_rac_get_prob(c, 254) << 13);
1962                     val +=      (vp56_rac_get_prob(c, 254) << 12);
1963                     val +=      (vp56_rac_get_prob(c, 254) << 11);
1964                     val +=      (vp56_rac_get_prob(c, 252) << 10);
1965                     val +=      (vp56_rac_get_prob(c, 249) << 9);
1966                     val +=      (vp56_rac_get_prob(c, 243) << 8);
1967                     val +=      (vp56_rac_get_prob(c, 230) << 7);
1968                     val +=      (vp56_rac_get_prob(c, 196) << 6);
1969                     val +=      (vp56_rac_get_prob(c, 177) << 5);
1970                     val +=      (vp56_rac_get_prob(c, 153) << 4);
1971                     val +=      (vp56_rac_get_prob(c, 140) << 3);
1972                     val +=      (vp56_rac_get_prob(c, 133) << 2);
1973                     val +=      (vp56_rac_get_prob(c, 130) << 1);
1974                     val +=       vp56_rac_get_prob(c, 129);
1975                 }
1976             }
1977         }
1978         if (!--band_left)
1979             band_left = band_counts[++band];
             // the value returned by vp8_rac_get() selects the sign; the result
             // is scaled by qmul[0] for the first coefficient and by qmul[1]
             // for all later ones, and halved for TX_32X32
1980         if (tx == TX_32X32) // FIXME slow
1981             coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
1982         else
1983             coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
             // context and probability row for the coefficient that follows
1984         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1985         tp = p[band][nnz];
1986     } while (++i < n_coeffs);
1987
1988     return i;
1989 }
1990
/*
 * Decode the luma and chroma coefficient tokens of the current block (s->b).
 *
 * Every transform block is handed to decode_coeffs_b(). Its return value is
 * stored in s->eob[] / s->uveob[pl][], the coefficients go to s->block /
 * s->uvblock[pl], and the above/left non-zero contexts are set to whether
 * that return value was non-zero.
 */
1991 static void decode_coeffs(AVCodecContext *ctx)
1992 {
1993     VP9Context *s = ctx->priv_data;
1994     VP9Block *b = s->b;
1995     int row = s->row, col = s->col;
         // probabilities and counters for the luma plane, selected by
         // transform size and by !b->intra
1996     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
1997     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
1998     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
1999     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
         // limit the decoded area to 2 * (s->cols - col) by 2 * (s->rows - row)
         // context entries; presumably this is the part of the block that
         // lies inside the frame - confirm against the callers
2000     int end_x = FFMIN(2 * (s->cols - col), w4);
2001     int end_y = FFMIN(2 * (s->rows - row), h4);
         // step1d: context entries covered by one transform in each direction;
         // step: how far n advances per transform (each unit of n stands for
         // 16 coefficients in the calls below)
2002     int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
2003     int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
2004     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
         // s->lossless shifts the luma scan table index by 4
2005     int tx = 4 * s->lossless + b->tx;
2006     const int16_t * const *yscans = vp9_scans[tx];
2007     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2008     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2009     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2010     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2011     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
         // coefficients per band, one row per transform size; the last listed
         // band takes whatever is left of the 16/64/256/1024 coefficients and
         // the two unlisted entries of each row are zero
2012     static const int16_t band_counts[4][8] = {
2013         { 1, 2, 3, 4,  3,   16 - 13 },
2014         { 1, 2, 3, 4, 11,   64 - 21 },
2015         { 1, 2, 3, 4, 11,  256 - 21 },
2016         { 1, 2, 3, 4, 11, 1024 - 21 },
2017     };
2018     const int16_t *y_band_counts = band_counts[b->tx];
2019     const int16_t *uv_band_counts = band_counts[b->uvtx];
2020
2021     /* y tokens */
         // a transform that covers several context entries uses the OR of all
         // of them, folded into its first entry, as its context
2022     if (b->tx > TX_4X4) { // FIXME slow
2023         for (y = 0; y < end_y; y += step1d)
2024             for (x = 1; x < step1d; x++)
2025                 l[y] |= l[y + x];
2026         for (x = 0; x < end_x; x += step1d)
2027             for (y = 1; y < step1d; y++)
2028                 a[x] |= a[x + y];
2029     }
2030     for (n = 0, y = 0; y < end_y; y += step1d) {
2031         for (x = 0; x < end_x; x += step1d, n += step) {
                 // block sizes listed after BS_8x8 that use 4x4 transforms take
                 // the transform type from the mode of sub-block n; every other
                 // case uses mode[0]
2032             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
2033                                                              b->bs > BS_8x8 ?
2034                                                              n : 0]];
2035             int nnz = a[x] + l[y];
2036             res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
2037                                   b->tx, c, e, p, nnz, yscans[txtp],
2038                                   ynbs[txtp], y_band_counts, qmul[0]);
2039             a[x] = l[y] = !!res;
                 // NOTE(review): the 16-bit store is presumably needed because
                 // s->eob entries are byte-sized and larger transforms can
                 // return values above 255 - confirm against the declaration
2040             if (b->tx > TX_8X8) {
2041                 AV_WN16A(&s->eob[n], res);
2042             } else {
2043                 s->eob[n] = res;
2044             }
2045         }
2046     }
         // copy each transform's flag to the other context entries it covers,
         // without going past end_y / end_x
2047     if (b->tx > TX_4X4) { // FIXME slow
2048         for (y = 0; y < end_y; y += step1d)
2049             memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
2050         for (x = 0; x < end_x; x += step1d)
2051             memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
2052     }
2053
         // chroma: same procedure for both planes, using the uv tables, the
         // DCT_DCT scan, qmul[1] and half as many context entries per direction
2054     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2055     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2056     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2057     w4 >>= 1;
2058     h4 >>= 1;
2059     end_x >>= 1;
2060     end_y >>= 1;
2061     for (pl = 0; pl < 2; pl++) {
2062         a = &s->above_uv_nnz_ctx[pl][col];
2063         l = &s->left_uv_nnz_ctx[pl][row & 7];
2064         if (b->uvtx > TX_4X4) { // FIXME slow
2065             for (y = 0; y < end_y; y += uvstep1d)
2066                 for (x = 1; x < uvstep1d; x++)
2067                     l[y] |= l[y + x];
2068             for (x = 0; x < end_x; x += uvstep1d)
2069                 for (y = 1; y < uvstep1d; y++)
2070                     a[x] |= a[x + y];
2071         }
2072         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2073             for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
2074                 int nnz = a[x] + l[y];
2075                 res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
2076                                       16 * uvstep, b->uvtx, c, e, p, nnz,
2077                                       uvscan, uvnb, uv_band_counts, qmul[1]);
2078                 a[x] = l[y] = !!res;
2079                 if (b->uvtx > TX_8X8) {
2080                     AV_WN16A(&s->uveob[pl][n], res);
2081                 } else {
2082                     s->uveob[pl][n] = res;
2083                 }
2084             }
2085         }
2086         if (b->uvtx > TX_4X4) { // FIXME slow
2087             for (y = 0; y < end_y; y += uvstep1d)
2088                 memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
2089             for (x = 0; x < end_x; x += uvstep1d)
2090                 memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
2091         }
2092     }
2093 }
2094
2095 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2096                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2097                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2098                                              uint8_t *l, int col, int x, int w,
2099                                              int row, int y, enum TxfmMode tx,
2100                                              int p)
2101 {
2102     int have_top = row > 0 || y > 0;
2103     int have_left = col > s->tiling.tile_col_start || x > 0;
2104     int have_right = x < w - 1;
2105     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2106         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2107                                    { DC_127_PRED,          VERT_PRED } },
2108         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2109                                    { HOR_PRED,             HOR_PRED } },
2110         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2111                                    { LEFT_DC_PRED,         DC_PRED } },
2112         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2113                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2114         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2115                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2116         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2117                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2118         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2119                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2120         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2121                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2122         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2123                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2124         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2125                                    { HOR_PRED,             TM_VP8_PRED } },
2126     };
2127     static const struct {
2128         uint8_t needs_left:1;
2129         uint8_t needs_top:1;
2130         uint8_t needs_topleft:1;
2131         uint8_t needs_topright:1;
2132     } edges[N_INTRA_PRED_MODES] = {
2133         [VERT_PRED]            = { .needs_top  = 1 },
2134         [HOR_PRED]             = { .needs_left = 1 },
2135         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2136         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2137         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2138         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2139         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2140         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2141         [HOR_UP_PRED]          = { .needs_left = 1 },
2142         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2143         [LEFT_DC_PRED]         = { .needs_left = 1 },
2144         [TOP_DC_PRED]          = { .needs_top  = 1 },
2145         [DC_128_PRED]          = { 0 },
2146         [DC_127_PRED]          = { 0 },
2147         [DC_129_PRED]          = { 0 }
2148     };
2149
2150     av_assert2(mode >= 0 && mode < 10);
2151     mode = mode_conv[mode][have_left][have_top];
2152     if (edges[mode].needs_top) {
2153         uint8_t *top, *topleft;
2154         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2155         int n_px_need_tr = 0;
2156
2157         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2158             n_px_need_tr = 4;
2159
2160         // if top of sb64-row, use s->intra_pred_data[] instead of
2161         // dst[-stride] for intra prediction (it contains pre- instead of
2162         // post-loopfilter data)
2163         if (have_top) {
2164             top = !(row & 7) && !y ?
2165                 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2166                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2167             if (have_left)
2168                 topleft = !(row & 7) && !y ?
2169                     s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2170                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2171                     &dst_inner[-stride_inner];
2172         }
2173
2174         if (have_top &&
2175             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2176             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2177             n_px_need + n_px_need_tr <= n_px_have) {
2178             *a = top;
2179         } else {
2180             if (have_top) {
2181                 if (n_px_need <= n_px_have) {
2182                     memcpy(*a, top, n_px_need);
2183                 } else {
2184                     memcpy(*a, top, n_px_have);
2185                     memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2186                            n_px_need - n_px_have);
2187                 }
2188             } else {
2189                 memset(*a, 127, n_px_need);
2190             }
2191             if (edges[mode].needs_topleft) {
2192                 if (have_left && have_top) {
2193                     (*a)[-1] = topleft[-1];
2194                 } else {
2195                     (*a)[-1] = have_top ? 129 : 127;
2196                 }
2197             }
2198             if (tx == TX_4X4 && edges[mode].needs_topright) {
2199                 if (have_top && have_right &&
2200                     n_px_need + n_px_need_tr <= n_px_have) {
2201                     memcpy(&(*a)[4], &top[4], 4);
2202                 } else {
2203                     memset(&(*a)[4], (*a)[3], 4);
2204                 }
2205             }
2206         }
2207     }
2208     if (edges[mode].needs_left) {
2209         if (have_left) {
2210             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2211             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2212             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2213
2214             if (n_px_need <= n_px_have) {
2215                 for (i = 0; i < n_px_need; i++)
2216                     l[i] = dst[i * stride - 1];
2217             } else {
2218                 for (i = 0; i < n_px_have; i++)
2219                     l[i] = dst[i * stride - 1];
2220                 memset(&l[i], l[i - 1], n_px_need - n_px_have);
2221             }
2222         } else {
2223             memset(l, 129, 4 << tx);
2224         }
2225     }
2226
2227     return mode;
2228 }
2229
2230 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2231 {
2232     VP9Context *s = ctx->priv_data;
2233     VP9Block *b = s->b;
2234     int row = s->row, col = s->col;
2235     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2236     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2237     int end_x = FFMIN(2 * (s->cols - col), w4);
2238     int end_y = FFMIN(2 * (s->rows - row), h4);
2239     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2240     int uvstep1d = 1 << b->uvtx, p;
2241     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2242
2243     for (n = 0, y = 0; y < end_y; y += step1d) {
2244         uint8_t *ptr = dst, *ptr_r = dst_r;
2245         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2246                                ptr_r += 4 * step1d, n += step) {
2247             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2248                                y * 2 + x : 0];
2249             LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2250             uint8_t *a = &a_buf[16], l[32];
2251             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2252             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2253
2254             mode = check_intra_mode(s, mode, &a, ptr_r,
2255                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2256                                     ptr, s->y_stride, l,
2257                                     col, x, w4, row, y, b->tx, 0);
2258             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2259             if (eob)
2260                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2261                                            s->block + 16 * n, eob);
2262         }
2263         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2264         dst   += 4 * step1d * s->y_stride;
2265     }
2266
2267     // U/V
2268     h4 >>= 1;
2269     w4 >>= 1;
2270     end_x >>= 1;
2271     end_y >>= 1;
2272     step = 1 << (b->uvtx * 2);
2273     for (p = 0; p < 2; p++) {
2274         dst   = s->dst[1 + p];
2275         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2276         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2277             uint8_t *ptr = dst, *ptr_r = dst_r;
2278             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2279                                    ptr_r += 4 * uvstep1d, n += step) {
2280                 int mode = b->uvmode;
2281                 LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2282                 uint8_t *a = &a_buf[16], l[32];
2283                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2284
2285                 mode = check_intra_mode(s, mode, &a, ptr_r,
2286                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2287                                         ptr, s->uv_stride, l,
2288                                         col, x, w4, row, y, b->uvtx, p + 1);
2289                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2290                 if (eob)
2291                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2292                                                     s->uvblock[p] + 16 * n, eob);
2293             }
2294             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2295             dst   += 4 * uvstep1d * s->uv_stride;
2296         }
2297     }
2298 }
2299
2300 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2301                                          uint8_t *dst, ptrdiff_t dst_stride,
2302                                          const uint8_t *ref, ptrdiff_t ref_stride,
2303                                          ThreadFrame *ref_frame,
2304                                          ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2305                                          int bw, int bh, int w, int h)
2306 {
2307     int mx = mv->x, my = mv->y, th;
2308
2309     y += my >> 3;
2310     x += mx >> 3;
2311     ref += y * ref_stride + x;
2312     mx &= 7;
2313     my &= 7;
2314     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2315     // we use +7 because the last 7 pixels of each sbrow can be changed in
2316     // the longest loopfilter of the next sbrow
2317     th = (y + bh + 4 * !!my + 7) >> 6;
2318     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2319     if (x < !!mx * 3 || y < !!my * 3 ||
2320         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2321         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2322                                  ref - !!my * 3 * ref_stride - !!mx * 3,
2323                                  80, ref_stride,
2324                                  bw + !!mx * 7, bh + !!my * 7,
2325                                  x - !!mx * 3, y - !!my * 3, w, h);
2326         ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2327         ref_stride = 80;
2328     }
2329     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2330 }
2331
2332 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2333                                            uint8_t *dst_u, uint8_t *dst_v,
2334                                            ptrdiff_t dst_stride,
2335                                            const uint8_t *ref_u, ptrdiff_t src_stride_u,
2336                                            const uint8_t *ref_v, ptrdiff_t src_stride_v,
2337                                            ThreadFrame *ref_frame,
2338                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2339                                            int bw, int bh, int w, int h)
2340 {
2341     int mx = mv->x, my = mv->y, th;
2342
2343     y += my >> 4;
2344     x += mx >> 4;
2345     ref_u += y * src_stride_u + x;
2346     ref_v += y * src_stride_v + x;
2347     mx &= 15;
2348     my &= 15;
2349     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2350     // we use +7 because the last 7 pixels of each sbrow can be changed in
2351     // the longest loopfilter of the next sbrow
2352     th = (y + bh + 4 * !!my + 7) >> 5;
2353     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2354     if (x < !!mx * 3 || y < !!my * 3 ||
2355         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2356         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2357                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2358                                  80, src_stride_u,
2359                                  bw + !!mx * 7, bh + !!my * 7,
2360                                  x - !!mx * 3, y - !!my * 3, w, h);
2361         ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2362         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2363
2364         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2365                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2366                                  80, src_stride_v,
2367                                  bw + !!mx * 7, bh + !!my * 7,
2368                                  x - !!mx * 3, y - !!my * 3, w, h);
2369         ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2370         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2371     } else {
2372         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2373         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2374     }
2375 }
2376
2377 static void inter_recon(AVCodecContext *ctx)
2378 {
2379     static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2380         { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2381         { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2382     };
2383     VP9Context *s = ctx->priv_data;
2384     VP9Block *b = s->b;
2385     int row = s->row, col = s->col;
2386     ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2387     AVFrame *ref1 = tref1->f, *ref2;
2388     int w1 = ref1->width, h1 = ref1->height, w2, h2;
2389     ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2390
2391     if (b->comp) {
2392         tref2 = &s->refs[s->refidx[b->ref[1]]];
2393         ref2 = tref2->f;
2394         w2 = ref2->width;
2395         h2 = ref2->height;
2396     }
2397
2398     // y inter pred
2399     if (b->bs > BS_8x8) {
2400         if (b->bs == BS_8x4) {
2401             mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2402                         ref1->data[0], ref1->linesize[0], tref1,
2403                         row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2404             mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2405                         s->dst[0] + 4 * ls_y, ls_y,
2406                         ref1->data[0], ref1->linesize[0], tref1,
2407                         (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2408
2409             if (b->comp) {
2410                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2411                             ref2->data[0], ref2->linesize[0], tref2,
2412                             row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2413                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2414                             s->dst[0] + 4 * ls_y, ls_y,
2415                             ref2->data[0], ref2->linesize[0], tref2,
2416                             (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2417             }
2418         } else if (b->bs == BS_4x8) {
2419             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2420                         ref1->data[0], ref1->linesize[0], tref1,
2421                         row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2422             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2423                         ref1->data[0], ref1->linesize[0], tref1,
2424                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2425
2426             if (b->comp) {
2427                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2428                             ref2->data[0], ref2->linesize[0], tref2,
2429                             row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2430                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2431                             ref2->data[0], ref2->linesize[0], tref2,
2432                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2433             }
2434         } else {
2435             av_assert2(b->bs == BS_4x4);
2436
2437             // FIXME if two horizontally adjacent blocks have the same MV,
2438             // do a w8 instead of a w4 call
2439             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2440                         ref1->data[0], ref1->linesize[0], tref1,
2441                         row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2442             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2443                         ref1->data[0], ref1->linesize[0], tref1,
2444                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2445             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2446                         s->dst[0] + 4 * ls_y, ls_y,
2447                         ref1->data[0], ref1->linesize[0], tref1,
2448                         (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2449             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2450                         s->dst[0] + 4 * ls_y + 4, ls_y,
2451                         ref1->data[0], ref1->linesize[0], tref1,
2452                         (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2453
2454             if (b->comp) {
2455                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2456                             ref2->data[0], ref2->linesize[0], tref2,
2457                             row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2458                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2459                             ref2->data[0], ref2->linesize[0], tref2,
2460                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2461                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2462                             s->dst[0] + 4 * ls_y, ls_y,
2463                             ref2->data[0], ref2->linesize[0], tref2,
2464                             (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2465                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2466                             s->dst[0] + 4 * ls_y + 4, ls_y,
2467                             ref2->data[0], ref2->linesize[0], tref2,
2468                             (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2469             }
2470         }
2471     } else {
2472         int bwl = bwlog_tab[0][b->bs];
2473         int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2474
2475         mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2476                     ref1->data[0], ref1->linesize[0], tref1,
2477                     row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2478
2479         if (b->comp)
2480             mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2481                         ref2->data[0], ref2->linesize[0], tref2,
2482                         row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2483     }
2484
2485     // uv inter pred
2486     {
2487         int bwl = bwlog_tab[1][b->bs];
2488         int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2489         VP56mv mvuv;
2490
2491         w1 = (w1 + 1) >> 1;
2492         h1 = (h1 + 1) >> 1;
2493         if (b->comp) {
2494             w2 = (w2 + 1) >> 1;
2495             h2 = (h2 + 1) >> 1;
2496         }
2497         if (b->bs > BS_8x8) {
2498             mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2499             mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2500         } else {
2501             mvuv = b->mv[0][0];
2502         }
2503
2504         mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2505                       s->dst[1], s->dst[2], ls_uv,
2506                       ref1->data[1], ref1->linesize[1],
2507                       ref1->data[2], ref1->linesize[2], tref1,
2508                       row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2509
2510         if (b->comp) {
2511             if (b->bs > BS_8x8) {
2512                 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2513                 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2514             } else {
2515                 mvuv = b->mv[0][1];
2516             }
2517             mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2518                           s->dst[1], s->dst[2], ls_uv,
2519                           ref2->data[1], ref2->linesize[1],
2520                           ref2->data[2], ref2->linesize[2], tref2,
2521                           row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2522         }
2523     }
2524
2525     if (!b->skip) {
2526         /* mostly copied intra_reconn() */
2527
2528         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2529         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2530         int end_x = FFMIN(2 * (s->cols - col), w4);
2531         int end_y = FFMIN(2 * (s->rows - row), h4);
2532         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2533         int uvstep1d = 1 << b->uvtx, p;
2534         uint8_t *dst = s->dst[0];
2535
2536         // y itxfm add
2537         for (n = 0, y = 0; y < end_y; y += step1d) {
2538             uint8_t *ptr = dst;
2539             for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2540                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2541
2542                 if (eob)
2543                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2544                                                   s->block + 16 * n, eob);
2545             }
2546             dst += 4 * s->y_stride * step1d;
2547         }
2548
2549         // uv itxfm add
2550         h4 >>= 1;
2551         w4 >>= 1;
2552         end_x >>= 1;
2553         end_y >>= 1;
2554         step = 1 << (b->uvtx * 2);
2555         for (p = 0; p < 2; p++) {
2556             dst = s->dst[p + 1];
2557             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2558                 uint8_t *ptr = dst;
2559                 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2560                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2561
2562                     if (eob)
2563                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2564                                                         s->uvblock[p] + 16 * n, eob);
2565                 }
2566                 dst += 4 * uvstep1d * s->uv_stride;
2567             }
2568         }
2569     }
2570 }
2571
2572 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2573                                         int row_and_7, int col_and_7,
2574                                         int w, int h, int col_end, int row_end,
2575                                         enum TxfmMode tx, int skip_inter)
2576 {
2577     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2578     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2579     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2580     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2581
2582     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2583     // edges. This means that for UV, we work on two subsampled blocks at
2584     // a time, and we only use the topleft block's mode information to set
2585     // things like block strength. Thus, for any block size smaller than
2586     // 16x16, ignore the odd portion of the block.
2587     if (tx == TX_4X4 && is_uv) {
2588         if (h == 1) {
2589             if (row_and_7 & 1)
2590                 return;
2591             if (!row_end)
2592                 h += 1;
2593         }
2594         if (w == 1) {
2595             if (col_and_7 & 1)
2596                 return;
2597             if (!col_end)
2598                 w += 1;
2599         }
2600     }
2601
2602     if (tx == TX_4X4 && !skip_inter) {
2603         int t = 1 << col_and_7, m_col = (t << w) - t, y;
2604         int m_col_odd = (t << (w - 1)) - t;
2605
2606         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2607         if (is_uv) {
2608             int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2609
2610             for (y = row_and_7; y < h + row_and_7; y++) {
2611                 int col_mask_id = 2 - !(y & 7);
2612
2613                 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2614                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2615                 // for odd lines, if the odd col is not being filtered,
2616                 // skip odd row also:
2617                 // .---. <-- a
2618                 // |   |
2619                 // |___| <-- b
2620                 // ^   ^
2621                 // c   d
2622                 //
2623                 // if a/c are even row/col and b/d are odd, and d is skipped,
2624                 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2625                 if ((col_end & 1) && (y & 1)) {
2626                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2627                 } else {
2628                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2629                 }
2630             }
2631         } else {
2632             int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2633
2634             for (y = row_and_7; y < h + row_and_7; y++) {
2635                 int col_mask_id = 2 - !(y & 3);
2636
2637                 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2638                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2639                 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2640                 lflvl->mask[is_uv][0][y][3] |= m_col;
2641                 lflvl->mask[is_uv][1][y][3] |= m_col;
2642             }
2643         }
2644     } else {
2645         int y, t = 1 << col_and_7, m_col = (t << w) - t;
2646
2647         if (!skip_inter) {
2648             int mask_id = (tx == TX_8X8);
2649             int l2 = tx + is_uv - 1, step1d = 1 << l2;
2650             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2651             int m_row = m_col & masks[l2];
2652
2653             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2654             // 8wd loopfilter to prevent going off the visible edge.
2655             if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2656                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2657                 int m_row_8 = m_row - m_row_16;
2658
2659                 for (y = row_and_7; y < h + row_and_7; y++) {
2660                     lflvl->mask[is_uv][0][y][0] |= m_row_16;
2661                     lflvl->mask[is_uv][0][y][1] |= m_row_8;
2662                 }
2663             } else {
2664                 for (y = row_and_7; y < h + row_and_7; y++)
2665                     lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2666             }
2667
2668             if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2669                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2670                     lflvl->mask[is_uv][1][y][0] |= m_col;
2671                 if (y - row_and_7 == h - 1)
2672                     lflvl->mask[is_uv][1][y][1] |= m_col;
2673             } else {
2674                 for (y = row_and_7; y < h + row_and_7; y += step1d)
2675                     lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2676             }
2677         } else if (tx != TX_4X4) {
2678             int mask_id;
2679
2680             mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2681             lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2682             mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2683             for (y = row_and_7; y < h + row_and_7; y++)
2684                 lflvl->mask[is_uv][0][y][mask_id] |= t;
2685         } else if (is_uv) {
2686             int t8 = t & 0x01, t4 = t - t8;
2687
2688             for (y = row_and_7; y < h + row_and_7; y++) {
2689                 lflvl->mask[is_uv][0][y][2] |= t4;
2690                 lflvl->mask[is_uv][0][y][1] |= t8;
2691             }
2692             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2693         } else {
2694             int t8 = t & 0x11, t4 = t - t8;
2695
2696             for (y = row_and_7; y < h + row_and_7; y++) {
2697                 lflvl->mask[is_uv][0][y][2] |= t4;
2698                 lflvl->mask[is_uv][0][y][1] |= t8;
2699             }
2700             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2701         }
2702     }
2703 }
2704
2705 static void decode_b(AVCodecContext *ctx, int row, int col,
2706                      struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2707                      enum BlockLevel bl, enum BlockPartition bp)
2708 {
2709     VP9Context *s = ctx->priv_data;
2710     VP9Block *b = s->b;
2711     enum BlockSize bs = bl * 3 + bp;
2712     int y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2713     int emu[2];
2714     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2715
2716     s->row = row;
2717     s->row7 = row & 7;
2718     s->col = col;
2719     s->col7 = col & 7;
2720     s->min_mv.x = -(128 + col * 64);
2721     s->min_mv.y = -(128 + row * 64);
2722     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2723     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2724     if (s->pass < 2) {
2725         b->bs = bs;
2726         b->bl = bl;
2727         b->bp = bp;
2728         decode_mode(ctx);
2729         b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2730
2731         if (!b->skip) {
2732             decode_coeffs(ctx);
2733         } else {
2734             int pl;
2735
2736             memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
2737             memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
2738             for (pl = 0; pl < 2; pl++) {
2739                 memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
2740                 memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
2741             }
2742         }
2743         if (s->pass == 1) {
2744             s->b++;
2745             s->block += w4 * h4 * 64;
2746             s->uvblock[0] += w4 * h4 * 16;
2747             s->uvblock[1] += w4 * h4 * 16;
2748             s->eob += 4 * w4 * h4;
2749             s->uveob[0] += w4 * h4;
2750             s->uveob[1] += w4 * h4;
2751
2752             return;
2753         }
2754     }
2755
2756     // emulated overhangs if the stride of the target buffer can't hold. This
2757     // allows to support emu-edge and so on even if we have large block
2758     // overhangs
2759     emu[0] = (col + w4) * 8 > f->linesize[0] ||
2760              (row + h4) > s->rows;
2761     emu[1] = (col + w4) * 4 > f->linesize[1] ||
2762              (row + h4) > s->rows;
2763     if (emu[0]) {
2764         s->dst[0] = s->tmp_y;
2765         s->y_stride = 64;
2766     } else {
2767         s->dst[0] = f->data[0] + yoff;
2768         s->y_stride = f->linesize[0];
2769     }
2770     if (emu[1]) {
2771         s->dst[1] = s->tmp_uv[0];
2772         s->dst[2] = s->tmp_uv[1];
2773         s->uv_stride = 32;
2774     } else {
2775         s->dst[1] = f->data[1] + uvoff;
2776         s->dst[2] = f->data[2] + uvoff;
2777         s->uv_stride = f->linesize[1];
2778     }
2779     if (b->intra) {
2780         intra_recon(ctx, yoff, uvoff);
2781     } else {
2782         inter_recon(ctx);
2783     }
2784     if (emu[0]) {
2785         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
2786
2787         for (n = 0; o < w; n++) {
2788             int bw = 64 >> n;
2789
2790             av_assert2(n <= 4);
2791             if (w & bw) {
2792                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
2793                                          s->tmp_y + o, 64, h, 0, 0);
2794                 o += bw;
2795             }
2796         }
2797     }
2798     if (emu[1]) {
2799         int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
2800
2801         for (n = 1; o < w; n++) {
2802             int bw = 64 >> n;
2803
2804             av_assert2(n <= 4);
2805             if (w & bw) {
2806                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
2807                                          s->tmp_uv[0] + o, 32, h, 0, 0);
2808                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
2809                                          s->tmp_uv[1] + o, 32, h, 0, 0);
2810                 o += bw;
2811             }
2812         }
2813     }
2814
2815     // pick filter level and find edges to apply filter to
2816     if (s->filter.level &&
2817         (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
2818                                                     [b->mode[3] != ZEROMV]) > 0) {
2819         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
2820         int skip_inter = !b->intra && b->skip;
2821
2822         for (y = 0; y < h4; y++)
2823             memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
2824         mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
2825         mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
2826                    s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
2827                    s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
2828                    b->uvtx, skip_inter);
2829
2830         if (!s->filter.lim_lut[lvl]) {
2831             int sharp = s->filter.sharpness;
2832             int limit = lvl;
2833
2834             if (sharp > 0) {
2835                 limit >>= (sharp + 3) >> 2;
2836                 limit = FFMIN(limit, 9 - sharp);
2837             }
2838             limit = FFMAX(limit, 1);
2839
2840             s->filter.lim_lut[lvl] = limit;
2841             s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
2842         }
2843     }
2844
2845     if (s->pass == 2) {
2846         s->b++;
2847         s->block += w4 * h4 * 64;
2848         s->uvblock[0] += w4 * h4 * 16;
2849         s->uvblock[1] += w4 * h4 * 16;
2850         s->eob += 4 * w4 * h4;
2851         s->uveob[0] += w4 * h4;
2852         s->uveob[1] += w4 * h4;
2853     }
2854 }
2855
2856 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2857                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2858 {
2859     VP9Context *s = ctx->priv_data;
2860     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
2861             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
2862     const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
2863                                      s->prob.p.partition[bl][c];
2864     enum BlockPartition bp;
2865     ptrdiff_t hbs = 4 >> bl;
2866     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2867     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
2868
2869     if (bl == BL_8X8) {
2870         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2871         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2872     } else if (col + hbs < s->cols) { // FIXME why not <=?
2873         if (row + hbs < s->rows) { // FIXME why not <=?
2874             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2875             switch (bp) {
2876             case PARTITION_NONE:
2877                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2878                 break;
2879             case PARTITION_H:
2880                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2881                 yoff  += hbs * 8 * y_stride;
2882                 uvoff += hbs * 4 * uv_stride;
2883                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
2884                 break;
2885             case PARTITION_V:
2886                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2887                 yoff  += hbs * 8;
2888                 uvoff += hbs * 4;
2889                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
2890                 break;
2891             case PARTITION_SPLIT:
2892                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2893                 decode_sb(ctx, row, col + hbs, lflvl,
2894                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2895                 yoff  += hbs * 8 * y_stride;
2896                 uvoff += hbs * 4 * uv_stride;
2897                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2898                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
2899                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2900                 break;
2901             default:
2902                 av_assert0(0);
2903             }
2904         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
2905             bp = PARTITION_SPLIT;
2906             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2907             decode_sb(ctx, row, col + hbs, lflvl,
2908                       yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2909         } else {
2910             bp = PARTITION_H;
2911             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2912         }
2913     } else if (row + hbs < s->rows) { // FIXME why not <=?
2914         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
2915             bp = PARTITION_SPLIT;
2916             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2917             yoff  += hbs * 8 * y_stride;
2918             uvoff += hbs * 4 * uv_stride;
2919             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2920         } else {
2921             bp = PARTITION_V;
2922             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2923         }
2924     } else {
2925         bp = PARTITION_SPLIT;
2926         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2927     }
2928     s->counts.partition[bl][c][bp]++;
2929 }
2930
2931 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2932                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2933 {
2934     VP9Context *s = ctx->priv_data;
2935     VP9Block *b = s->b;
2936     ptrdiff_t hbs = 4 >> bl;
2937     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2938     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
2939
2940     if (bl == BL_8X8) {
2941         av_assert2(b->bl == BL_8X8);
2942         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
2943     } else if (s->b->bl == bl) {
2944         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
2945         if (b->bp == PARTITION_H && row + hbs < s->rows) {
2946             yoff  += hbs * 8 * y_stride;
2947             uvoff += hbs * 4 * uv_stride;
2948             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
2949         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
2950             yoff  += hbs * 8;
2951             uvoff += hbs * 4;
2952             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
2953         }
2954     } else {
2955         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2956         if (col + hbs < s->cols) { // FIXME why not <=?
2957             if (row + hbs < s->rows) {
2958                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
2959                               uvoff + 4 * hbs, bl + 1);
2960                 yoff  += hbs * 8 * y_stride;
2961                 uvoff += hbs * 4 * uv_stride;
2962                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2963                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
2964                                     yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2965             } else {
2966                 yoff  += hbs * 8;
2967                 uvoff += hbs * 4;
2968                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
2969             }
2970         } else if (row + hbs < s->rows) {
2971             yoff  += hbs * 8 * y_stride;
2972             uvoff += hbs * 4 * uv_stride;
2973             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2974         }
2975     }
2976 }
2977
/**
 * Run the loop filter over the area described by lflvl, starting at
 * yoff/uvoff inside the current frame.
 *
 * lflvl->level holds one filter level per 8x8 luma unit (8 entries per row)
 * and lflvl->mask[plane][direction][y][n] holds bit masks of the edges to
 * filter; both are filled in while blocks are decoded. For every selected
 * edge the values E (from filter.mblim_lut), I (from filter.lim_lut) and
 * H (level >> 4) are handed to the DSP loop filter routines. When two
 * neighbouring 8-sample edges are filtered by one loop_filter_mix2 call, the
 * values of the second edge are packed into bits 8 and up of E, I and H.
 *
 * @param ctx   codec context; ctx->priv_data is the VP9Context
 * @param lflvl filter levels and edge masks for this area
 * @param row   block row of the area; 0 suppresses filtering of the top edge
 * @param col   block column of the area; 0 suppresses filtering of the
 *              leftmost edge
 * @param yoff  offset of the area inside the luma plane
 * @param uvoff offset of the area inside the chroma planes
 */
static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
{
    VP9Context *s = ctx->priv_data;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
    int y, x, p;

    // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
    // if you think of them as acting on a 8x8 block max, we can interleave
    // each v/h within the single x loop, but that only works if we work on
    // 8 pixel blocks, and we won't always do that (we want at least 16px
    // to use SSE2 optimizations, perhaps 32 for AVX2)

    // filter edges between columns, Y plane (e.g. block1 | block2)
    // two mask rows (16 pixel rows) are handled per iteration
    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
        unsigned hm = hm1 | hm2 | hm13 | hm23;

        // x is a single-bit cursor; stop once no mask bit at or above x is set
        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
            if (hm1 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                // col == 0 && x == 1 is the leftmost edge, which is skipped
                if (col || x > 1) {
                    if (hmask1[0] & x) {
                        if (hmask2[0] & x) {
                            // same edge type in both rows: one 16-row call
                            av_assert2(l[8] == L);
                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
                        }
                    } else if (hm2 & x) {
                        // both rows need filtering: pack the lower row's
                        // values into the upper byte and filter them together
                        L = l[8];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                               [!!(hmask2[1] & x)]
                                               [0](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                            [0](ptr, ls_y, E, I, H);
                    }
                }
            } else if (hm2 & x) {
                // only the lower of the two rows has an edge at this column
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (col || x > 1) {
                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
                }
            }
            // mask index 3: edges located 4 pixels into the 8-pixel unit
            if (hm13 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (hm23 & x) {
                    L = l[8];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
                }
            } else if (hm23 & x) {
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
            }
        }
    }

    //                                          block1
    // filter edges between rows, Y plane (e.g. ------)
    //                                          block2
    dst = f->data[0] + yoff;
    lvl = lflvl->level;
    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];

        // two mask bits (16 pixels, two level entries) per iteration
        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
            // row == 0 && y == 0 is the top edge, which is skipped
            if (row || y) {
                if (vm & x) {
                    int L = *l, H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    if (vmask[0] & x) {
                        if (vmask[0] & (x << 1)) {
                            av_assert2(l[1] == L);
                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
                        }
                    } else if (vm & (x << 1)) {
                        // neighbouring unit to the right needs filtering too:
                        // pack its values and filter both with one call
                        L = l[1];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                               [!!(vmask[1] & (x << 1))]
                                               [1](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                            [1](ptr, ls_y, E, I, H);
                    }
                } else if (vm & (x << 1)) {
                    int L = l[1], H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
                                        [1](ptr + 8, ls_y, E, I, H);
                }
            }
            // mask index 3: edges located 4 pixel rows into the unit; these
            // are filtered regardless of row/y
            if (vm3 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (vm3 & (x << 1)) {
                    L = l[1];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
                }
            } else if (vm3 & (x << 1)) {
                int L = l[1], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
            }
        }
    }

    // same principle but for U/V planes
    for (p = 0; p < 2; p++) {
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        // edges between columns; mask rows y and y + 2 are paired and the
        // level pointer moves 32 entries per iteration
        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;

            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
                if (col || x > 1) {
                    if (hm1 & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (hmask1[0] & x) {
                            if (hmask2[0] & x) {
                                av_assert2(l[16] == L);
                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
                            }
                        } else if (hm2 & x) {
                            L = l[16];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                                   [!!(hmask2[1] & x)]
                                                   [0](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                                [0](ptr, ls_uv, E, I, H);
                        }
                    } else if (hm2 & x) {
                        int L = l[16], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
                    }
                }
                // the level pointer only advances (by 2 entries) after the
                // odd bit positions 1, 3, 5 and 7
                if (x & 0xAA)
                    l += 2;
            }
        }
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        // edges between rows; each mask row covers 4 chroma pixel rows
        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
            unsigned vm = vmask[0] | vmask[1] | vmask[2];

            // mask bits x and x << 2 are paired: 16 chroma pixels and four
            // level entries per iteration
            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
                if (row || y) {
                    if (vm & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (vmask[0] & x) {
                            if (vmask[0] & (x << 2)) {
                                av_assert2(l[2] == L);
                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
                            }
                        } else if (vm & (x << 2)) {
                            L = l[2];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                                   [!!(vmask[1] & (x << 2))]
                                                   [1](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                                [1](ptr, ls_uv, E, I, H);
                        }
                    } else if (vm & (x << 2)) {
                        int L = l[2], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
                                            [1](ptr + 8, ls_uv, E, I, H);
                    }
                }
            }
            // the level pointer moves on by 16 entries after every second
            // mask row
            if (y & 1)
                lvl += 16;
        }
    }
}
3214
/**
 * Compute the half-open range covered by tile idx when n units are divided
 * into (1 << log2_n) tiles.
 *
 * Both boundaries are clamped to n and then scaled by 8 (<< 3).
 * NOTE(review): n is presumably a superblock count and the result a
 * position in 8x8 block units - confirm against the callers.
 *
 * @param start  receives the first position of the tile
 * @param end    receives the position one past the end of the tile
 * @param idx    tile index
 * @param log2_n log2 of the number of tiles
 * @param n      number of units to distribute
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first = (idx * n) >> log2_n;
    int past  = ((idx + 1) * n) >> log2_n;

    // never run past the last unit
    if (first > n)
        first = n;
    if (past > n)
        past = n;

    *start = first << 3;
    *end   = past  << 3;
}
3222
3223 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3224                                         int max_count, int update_factor)
3225 {
3226     unsigned ct = ct0 + ct1, p2, p1;
3227
3228     if (!ct)
3229         return;
3230
3231     p1 = *p;
3232     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3233     p2 = av_clip(p2, 1, 255);
3234     ct = FFMIN(ct, max_count);
3235     update_factor = FASTDIV(update_factor * ct, max_count);
3236
3237     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3238     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3239 }
3240
3241 static void adapt_probs(VP9Context *s)
3242 {
3243     int i, j, k, l, m;
3244     prob_context *p = &s->prob_ctx[s->framectxid].p;
3245     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3246
3247     // coefficients
3248     for (i = 0; i < 4; i++)
3249         for (j = 0; j < 2; j++)
3250             for (k = 0; k < 2; k++)
3251                 for (l = 0; l < 6; l++)
3252                     for (m = 0; m < 6; m++) {
3253                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3254                         unsigned *e = s->counts.eob[i][j][k][l][m];
3255                         unsigned *c = s->counts.coef[i][j][k][l][m];
3256
3257                         if (l == 0 && m >= 3) // dc only has 3 pt
3258                             break;
3259
3260                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3261                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3262                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3263                     }
3264
3265     if (s->keyframe || s->intraonly) {
3266         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3267         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3268         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3269         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3270         return;
3271     }
3272
3273     // skip flag
3274     for (i = 0; i < 3; i++)
3275         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3276
3277     // intra/inter flag
3278     for (i = 0; i < 4; i++)
3279         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3280
3281     // comppred flag
3282     if (s->comppredmode == PRED_SWITCHABLE) {
3283       for (i = 0; i < 5; i++)
3284           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3285     }
3286
3287     // reference frames
3288     if (s->comppredmode != PRED_SINGLEREF) {
3289       for (i = 0; i < 5; i++)
3290           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3291                      s->counts.comp_ref[i][1], 20, 128);
3292     }
3293
3294     if (s->comppredmode != PRED_COMPREF) {
3295       for (i = 0; i < 5; i++) {
3296           uint8_t *pp = p->single_ref[i];
3297           unsigned (*c)[2] = s->counts.single_ref[i];
3298
3299           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3300           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3301       }
3302     }
3303
3304     // block partitioning
3305     for (i = 0; i < 4; i++)
3306         for (j = 0; j < 4; j++) {
3307             uint8_t *pp = p->partition[i][j];
3308             unsigned *c = s->counts.partition[i][j];
3309
3310             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3311             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3312             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3313         }
3314
3315     // tx size
3316     if (s->txfmmode == TX_SWITCHABLE) {
3317       for (i = 0; i < 2; i++) {
3318           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3319
3320           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3321           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3322           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3323           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3324           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3325           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3326       }
3327     }
3328
3329     // interpolation filter
3330     if (s->filtermode == FILTER_SWITCHABLE) {
3331         for (i = 0; i < 4; i++) {
3332             uint8_t *pp = p->filter[i];
3333             unsigned *c = s->counts.filter[i];
3334
3335             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3336             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3337         }
3338     }
3339
3340     // inter modes
3341     for (i = 0; i < 7; i++) {
3342         uint8_t *pp = p->mv_mode[i];
3343         unsigned *c = s->counts.mv_mode[i];
3344
3345         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3346         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3347         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3348     }
3349
3350     // mv joints
3351     {
3352         uint8_t *pp = p->mv_joint;
3353         unsigned *c = s->counts.mv_joint;
3354
3355         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3356         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3357         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3358     }
3359
3360     // mv components
3361     for (i = 0; i < 2; i++) {
3362         uint8_t *pp;
3363         unsigned *c, (*c2)[2], sum;
3364
3365         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3366                    s->counts.mv_comp[i].sign[1], 20, 128);
3367
3368         pp = p->mv_comp[i].classes;
3369         c = s->counts.mv_comp[i].classes;
3370         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3371         adapt_prob(&pp[0], c[0], sum, 20, 128);
3372         sum -= c[1];
3373         adapt_prob(&pp[1], c[1], sum, 20, 128);
3374         sum -= c[2] + c[3];
3375         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3376         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3377         sum -= c[4] + c[5];
3378         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3379         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3380         sum -= c[6];
3381         adapt_prob(&pp[6], c[6], sum, 20, 128);
3382         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3383         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3384         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3385
3386         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3387                    s->counts.mv_comp[i].class0[1], 20, 128);
3388         pp = p->mv_comp[i].bits;
3389         c2 = s->counts.mv_comp[i].bits;
3390         for (j = 0; j < 10; j++)
3391             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3392
3393         for (j = 0; j < 2; j++) {
3394             pp = p->mv_comp[i].class0_fp[j];
3395             c = s->counts.mv_comp[i].class0_fp[j];
3396             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3397             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3398             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3399         }
3400         pp = p->mv_comp[i].fp;
3401         c = s->counts.mv_comp[i].fp;
3402         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3403         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3404         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3405
3406         if (s->highprecisionmvs) {
3407             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3408                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3409             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3410                        s->counts.mv_comp[i].hp[1], 20, 128);
3411         }
3412     }
3413
3414     // y intra modes
3415     for (i = 0; i < 4; i++) {
3416         uint8_t *pp = p->y_mode[i];
3417         unsigned *c = s->counts.y_mode[i], sum, s2;
3418
3419         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3420         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3421         sum -= c[TM_VP8_PRED];
3422         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3423         sum -= c[VERT_PRED];
3424         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3425         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3426         sum -= s2;
3427         adapt_prob(&pp[3], s2, sum, 20, 128);
3428         s2 -= c[HOR_PRED];
3429         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3430         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3431         sum -= c[DIAG_DOWN_LEFT_PRED];
3432         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3433         sum -= c[VERT_LEFT_PRED];
3434         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3435         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3436     }
3437
3438     // uv intra modes
3439     for (i = 0; i < 10; i++) {
3440         uint8_t *pp = p->uv_mode[i];
3441         unsigned *c = s->counts.uv_mode[i], sum, s2;
3442
3443         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3444         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3445         sum -= c[TM_VP8_PRED];
3446         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3447         sum -= c[VERT_PRED];
3448         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3449         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3450         sum -= s2;
3451         adapt_prob(&pp[3], s2, sum, 20, 128);
3452         s2 -= c[HOR_PRED];
3453         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3454         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3455         sum -= c[DIAG_DOWN_LEFT_PRED];
3456         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3457         sum -= c[VERT_LEFT_PRED];
3458         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3459         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3460     }
3461 }
3462
3463 static void free_buffers(VP9Context *s)
3464 {
3465     av_freep(&s->above_partition_ctx);
3466     av_freep(&s->b_base);
3467     av_freep(&s->block_base);
3468 }
3469
3470 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3471 {
3472     VP9Context *s = ctx->priv_data;
3473     int i;
3474
3475     for (i = 0; i < 2; i++) {
3476         if (s->frames[i].tf.f->data[0])
3477             vp9_unref_frame(ctx, &s->frames[i]);
3478         av_frame_free(&s->frames[i].tf.f);
3479     }
3480     for (i = 0; i < 8; i++) {
3481         if (s->refs[i].f->data[0])
3482             ff_thread_release_buffer(ctx, &s->refs[i]);
3483         av_frame_free(&s->refs[i].f);
3484         if (s->next_refs[i].f->data[0])
3485             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3486         av_frame_free(&s->next_refs[i].f);
3487     }
3488     free_buffers(s);
3489     av_freep(&s->c_b);
3490     s->c_b_size = 0;
3491
3492     return 0;
3493 }
3494
3495
3496 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3497                             int *got_frame, AVPacket *pkt)
3498 {
3499     const uint8_t *data = pkt->data;
3500     int size = pkt->size;
3501     VP9Context *s = ctx->priv_data;
3502     int res, tile_row, tile_col, i, ref, row, col;
3503     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3504     AVFrame *f;
3505
3506     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3507         return res;
3508     } else if (res == 0) {
3509         if (!s->refs[ref].f->data[0]) {
3510             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3511             return AVERROR_INVALIDDATA;
3512         }
3513         if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3514             return res;
3515         *got_frame = 1;
3516         return 0;
3517     }
3518     data += res;
3519     size -= res;
3520
3521     if (s->frames[LAST_FRAME].tf.f->data[0])
3522         vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3523     if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3524         (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3525         return res;
3526     if (s->frames[CUR_FRAME].tf.f->data[0])
3527         vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3528     if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3529         return res;
3530     f = s->frames[CUR_FRAME].tf.f;
3531     f->key_frame = s->keyframe;
3532     f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3533     ls_y = f->linesize[0];
3534     ls_uv =f->linesize[1];
3535
3536     // ref frame setup
3537     for (i = 0; i < 8; i++) {
3538         if (s->next_refs[i].f->data[0])
3539             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3540         if (s->refreshrefmask & (1 << i)) {
3541             res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3542         } else {
3543             res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3544         }
3545         if (res < 0)
3546             return res;
3547     }
3548
3549     // main tile decode loop
3550     memset(s->above_partition_ctx, 0, s->cols);
3551     memset(s->above_skip_ctx, 0, s->cols);
3552     if (s->keyframe || s->intraonly) {
3553         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3554     } else {
3555         memset(s->above_mode_ctx, NEARESTMV, s->cols);
3556     }
3557     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3558     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3559     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3560     memset(s->above_segpred_ctx, 0, s->cols);
3561     s->pass = s->uses_2pass =
3562         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3563     if (s->refreshctx && s->parallelmode) {
3564         int j, k, l, m;
3565
3566         for (i = 0; i < 4; i++) {
3567             for (j = 0; j < 2; j++)
3568                 for (k = 0; k < 2; k++)
3569                     for (l = 0; l < 6; l++)
3570                         for (m = 0; m < 6; m++)
3571                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3572                                    s->prob.coef[i][j][k][l][m], 3);
3573             if (s->txfmmode == i)
3574                 break;
3575         }
3576         s->prob_ctx[s->framectxid].p = s->prob.p;
3577         ff_thread_finish_setup(ctx);
3578     }
3579
3580     do {
3581         yoff = uvoff = 0;
3582         s->b = s->b_base;
3583         s->block = s->block_base;
3584         s->uvblock[0] = s->uvblock_base[0];
3585         s->uvblock[1] = s->uvblock_base[1];
3586         s->eob = s->eob_base;
3587         s->uveob[0] = s->uveob_base[0];
3588         s->uveob[1] = s->uveob_base[1];
3589
3590         for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3591             set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3592                             tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3593             if (s->pass != 2) {
3594                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3595                     unsigned tile_size;
3596
3597                     if (tile_col == s->tiling.tile_cols - 1 &&
3598                         tile_row == s->tiling.tile_rows - 1) {
3599                         tile_size = size;
3600                     } else {
3601                         tile_size = AV_RB32(data);
3602                         data += 4;
3603                         size -= 4;
3604                     }
3605                     if (tile_size > size) {
3606                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3607                         return AVERROR_INVALIDDATA;
3608                     }
3609                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3610                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3611                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3612                         return AVERROR_INVALIDDATA;
3613                     }
3614                     data += tile_size;
3615                     size -= tile_size;
3616                 }
3617             }
3618
3619             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3620                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3621                 struct VP9Filter *lflvl_ptr = s->lflvl;
3622                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3623
3624                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3625                     set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3626                                     tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3627
3628                     if (s->pass != 2) {
3629                         memset(s->left_partition_ctx, 0, 8);
3630                         memset(s->left_skip_ctx, 0, 8);
3631                         if (s->keyframe || s->intraonly) {
3632                             memset(s->left_mode_ctx, DC_PRED, 16);
3633                         } else {
3634                             memset(s->left_mode_ctx, NEARESTMV, 8);
3635                         }
3636                         memset(s->left_y_nnz_ctx, 0, 16);
3637                         memset(s->left_uv_nnz_ctx, 0, 16);
3638                         memset(s->left_segpred_ctx, 0, 8);
3639
3640                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3641                     }
3642
3643                     for (col = s->tiling.tile_col_start;
3644                          col < s->tiling.tile_col_end;
3645                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3646                         // FIXME integrate with lf code (i.e. zero after each
3647                         // use, similar to invtxfm coefficients, or similar)
3648                         if (s->pass != 1) {
3649                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3650                         }
3651
3652                         if (s->pass == 2) {
3653                             decode_sb_mem(ctx, row, col, lflvl_ptr,
3654                                           yoff2, uvoff2, BL_64X64);
3655                         } else {
3656                             decode_sb(ctx, row, col, lflvl_ptr,
3657                                       yoff2, uvoff2, BL_64X64);
3658                         }
3659                     }
3660                     if (s->pass != 2) {
3661                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3662                     }
3663                 }
3664
3665                 if (s->pass == 1) {
3666                     continue;
3667                 }
3668
3669                 // backup pre-loopfilter reconstruction data for intra
3670                 // prediction of next row of sb64s
3671                 if (row + 8 < s->rows) {
3672                     memcpy(s->intra_pred_data[0],
3673                            f->data[0] + yoff + 63 * ls_y,
3674                            8 * s->cols);
3675                     memcpy(s->intra_pred_data[1],
3676                            f->data[1] + uvoff + 31 * ls_uv,
3677                            4 * s->cols);
3678                     memcpy(s->intra_pred_data[2],
3679                            f->data[2] + uvoff + 31 * ls_uv,
3680                            4 * s->cols);
3681                 }
3682
3683                 // loopfilter one row
3684                 if (s->filter.level) {
3685                     yoff2 = yoff;
3686                     uvoff2 = uvoff;
3687                     lflvl_ptr = s->lflvl;
3688                     for (col = 0; col < s->cols;
3689                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3690                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3691                     }
3692                 }
3693
3694                 // FIXME maybe we can make this more finegrained by running the
3695                 // loopfilter per-block instead of after each sbrow
3696                 // In fact that would also make intra pred left preparation easier?
3697                 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3698             }
3699         }
3700
3701         if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3702             adapt_probs(s);
3703             ff_thread_finish_setup(ctx);
3704         }
3705     } while (s->pass++ == 1);
3706     ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3707
3708     // ref frame setup
3709     for (i = 0; i < 8; i++) {
3710         if (s->refs[i].f->data[0])
3711             ff_thread_release_buffer(ctx, &s->refs[i]);
3712         ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3713     }
3714
3715     if (!s->invisible) {
3716         if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3717             return res;
3718         *got_frame = 1;
3719     }
3720
3721     return 0;
3722 }
3723
3724 static void vp9_decode_flush(AVCodecContext *ctx)
3725 {
3726     VP9Context *s = ctx->priv_data;
3727     int i;
3728
3729     for (i = 0; i < 2; i++)
3730         vp9_unref_frame(ctx, &s->frames[i]);
3731     for (i = 0; i < 8; i++)
3732         ff_thread_release_buffer(ctx, &s->refs[i]);
3733 }
3734
3735 static int init_frames(AVCodecContext *ctx)
3736 {
3737     VP9Context *s = ctx->priv_data;
3738     int i;
3739
3740     for (i = 0; i < 2; i++) {
3741         s->frames[i].tf.f = av_frame_alloc();
3742         if (!s->frames[i].tf.f) {
3743             vp9_decode_free(ctx);
3744             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3745             return AVERROR(ENOMEM);
3746         }
3747     }
3748     for (i = 0; i < 8; i++) {
3749         s->refs[i].f = av_frame_alloc();
3750         s->next_refs[i].f = av_frame_alloc();
3751         if (!s->refs[i].f || !s->next_refs[i].f) {
3752             vp9_decode_free(ctx);
3753             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3754             return AVERROR(ENOMEM);
3755         }
3756     }
3757
3758     return 0;
3759 }
3760
3761 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3762 {
3763     VP9Context *s = ctx->priv_data;
3764
3765     ctx->internal->allocate_progress = 1;
3766     ctx->pix_fmt = AV_PIX_FMT_YUV420P;
3767     ff_vp9dsp_init(&s->dsp);
3768     ff_videodsp_init(&s->vdsp, 8);
3769     s->filter.sharpness = -1;
3770
3771     return init_frames(ctx);
3772 }
3773
3774 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
3775 {
3776     return init_frames(avctx);
3777 }
3778
3779 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
3780 {
3781     int i, res;
3782     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
3783
3784     // detect size changes in other threads
3785     if (s->above_partition_ctx &&
3786         (!ssrc->above_partition_ctx || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
3787         free_buffers(s);
3788     }
3789
3790     for (i = 0; i < 2; i++) {
3791         if (s->frames[i].tf.f->data[0])
3792             vp9_unref_frame(dst, &s->frames[i]);
3793         if (ssrc->frames[i].tf.f->data[0]) {
3794             if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
3795                 return res;
3796         }
3797     }
3798     for (i = 0; i < 8; i++) {
3799         if (s->refs[i].f->data[0])
3800             ff_thread_release_buffer(dst, &s->refs[i]);
3801         if (ssrc->next_refs[i].f->data[0]) {
3802             if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
3803                 return res;
3804         }
3805     }
3806
3807     s->invisible = ssrc->invisible;
3808     s->keyframe = ssrc->keyframe;
3809     s->uses_2pass = ssrc->uses_2pass;
3810     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
3811     memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
3812     if (ssrc->segmentation.enabled) {
3813         memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
3814                sizeof(s->segmentation.feat));
3815     }
3816
3817     return 0;
3818 }
3819
3820 AVCodec ff_vp9_decoder = {
3821     .name                  = "vp9",
3822     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
3823     .type                  = AVMEDIA_TYPE_VIDEO,
3824     .id                    = AV_CODEC_ID_VP9,
3825     .priv_data_size        = sizeof(VP9Context),
3826     .init                  = vp9_decode_init,
3827     .close                 = vp9_decode_free,
3828     .decode                = vp9_decode_frame,
3829     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
3830     .flush                 = vp9_decode_flush,
3831     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
3832     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
3833 };