2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33 #include "libavutil/avassert.h"
35 #define VP9_SYNCCODE 0x498342
// Per-frame state: the decoded picture plus side data (segmentation map and
// per-8x8-block motion-vector/reference pairs) shared between frame threads.
// NOTE(review): excerpt is missing interior lines (closing brace, the
// VP9Filter struct header) — the mask[] below belongs to struct VP9Filter.
72 typedef struct VP9Frame {
74 AVBufferRef *extradata;
75 uint8_t *segmentation_map;
76 struct VP9mvrefPair *mv;
// Loop-filter application mask, indexed by plane, edge direction, row within
// the superblock, and transform-size class (comments inline below).
81 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
// Per-block mode decision state filled in by decode_mode() and consumed by
// the reconstruction code: segment id, intra/inter flags, reference indices,
// per-sub-block modes and motion vectors, transform sizes and partition.
85 typedef struct VP9Block {
86 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
87 enum FilterMode filter;
88 VP56mv mv[4 /* b_idx */][2 /* ref */];
90 enum TxfmMode tx, uvtx;
92 enum BlockPartition bp;
// Decoder private context. Groups (in order): current-block bookkeeping,
// uncompressed frame-header fields, loop-filter/quantizer/segmentation state,
// tiling layout, probability tables and adaptation counts, and the
// left/above contextual caches used during block decoding.
95 typedef struct VP9Context {
102 VP9Block *b_base, *b;
103 int pass, uses_2pass, last_uses_2pass;
// row/col are in 8x8-block units; row7/col7 are the low 3 bits (position
// within the 64x64 superblock).
104 int row, row7, col, col7;
106 ptrdiff_t y_stride, uv_stride;
// Frame-header flags parsed in decode_frame_header().
110 uint8_t keyframe, last_keyframe;
112 uint8_t use_last_frame_mvs;
118 uint8_t refreshrefmask;
119 uint8_t highprecisionmvs;
120 enum FilterMode filtermode;
121 uint8_t allowcompinter;
124 uint8_t parallelmode;
128 uint8_t varcompref[2];
129 ThreadFrame refs[8], next_refs[8];
// Loop-filter limit LUT, rebuilt when sharpness changes.
138 uint8_t mblim_lut[64];
146 int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
151 uint8_t absolute_vals;
157 uint8_t skip_enabled;
// Tiling layout derived from the frame header.
166 unsigned log2_tile_cols, log2_tile_rows;
167 unsigned tile_cols, tile_rows;
168 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
// Frame dimensions in superblocks (64x64) and in 8x8 blocks.
170 unsigned sb_cols, sb_rows, rows, cols;
// Saved coefficient probabilities (3-entry model, prob_ctx)...
173 uint8_t coef[4][2][2][6][6][3];
// ...and the expanded 11-entry working model (prob).
177 uint8_t coef[4][2][2][6][6][11];
// Symbol occurrence counts used for backward probability adaptation.
182 unsigned y_mode[4][10];
183 unsigned uv_mode[10][10];
184 unsigned filter[4][3];
185 unsigned mv_mode[7][4];
186 unsigned intra[4][2];
188 unsigned single_ref[5][2][2];
189 unsigned comp_ref[5][2];
190 unsigned tx32p[2][4];
191 unsigned tx16p[2][3];
194 unsigned mv_joint[4];
197 unsigned classes[11];
199 unsigned bits[10][2];
200 unsigned class0_fp[2][4];
202 unsigned class0_hp[2];
205 unsigned partition[4][4][4];
206 unsigned coef[4][2][2][6][6][3];
207 unsigned eob[4][2][2][6][6][2];
209 enum TxfmMode txfmmode;
210 enum CompPredMode comppredmode;
212 // contextual (left/above) cache
213 uint8_t left_partition_ctx[8], *above_partition_ctx;
214 uint8_t left_mode_ctx[16], *above_mode_ctx;
215 // FIXME maybe merge some of the below in a flags field?
216 uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
217 uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
218 uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
219 uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
220 uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
221 uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
222 uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
223 uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
224 uint8_t left_filter_ctx[8], *above_filter_ctx;
225 VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
// Whole-frame cache: last row of reconstructed pixels, for intra prediction.
228 uint8_t *intra_pred_data[3];
229 struct VP9Filter *lflvl;
230 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
232 // block reconstruction intermediates
233 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
234 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
// MV clamping range for the current block (see clamp_mv()).
235 struct { int x, y; } min_mv, max_mv;
236 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
237 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
240 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
242 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
243 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
245 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
246 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
250 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
252 VP9Context *s = ctx->priv_data;
255 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
257 sz = 64 * s->sb_cols * s->sb_rows;
258 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
259 ff_thread_release_buffer(ctx, &f->tf);
260 return AVERROR(ENOMEM);
263 f->segmentation_map = f->extradata->data;
264 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
266 // retain segmentation map if it doesn't update
267 if (s->segmentation.enabled && !s->segmentation.update_map &&
268 !s->keyframe && !s->intraonly) {
269 memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
275 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
277 ff_thread_release_buffer(ctx, &f->tf);
278 av_buffer_unref(&f->extradata);
281 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
285 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
287 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
288 vp9_unref_frame(ctx, dst);
289 return AVERROR(ENOMEM);
292 dst->segmentation_map = src->segmentation_map;
// (Re)initialize all per-resolution state: frame geometry, the packed
// "above" context allocation, and the block/coefficient intermediates whose
// size depends on whether two-pass (frame-threaded) decoding is used.
// Returns 0 or a negative AVERROR. NOTE(review): excerpt is missing interior
// lines (early-return, error labels, #undef, final return) — do not assume
// the visible lines are contiguous.
298 static int update_size(AVCodecContext *ctx, int w, int h)
300 VP9Context *s = ctx->priv_data;
303 av_assert0(w > 0 && h > 0);
// Fast path: nothing to do if the size is unchanged and buffers exist.
305 if (s->above_partition_ctx && w == ctx->width && h == ctx->height)
// Geometry: superblocks are 64x64, block units are 8x8.
310 s->sb_cols = (w + 63) >> 6;
311 s->sb_rows = (h + 63) >> 6;
312 s->cols = (w + 7) >> 3;
313 s->rows = (h + 7) >> 3;
// Carve all "above" context arrays out of one packed allocation; the 240
// below must equal the sum of the per-sb_col byte counts assigned here.
315 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
316 av_freep(&s->above_partition_ctx);
317 p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
319 return AVERROR(ENOMEM);
320 assign(s->above_partition_ctx, uint8_t *, 8);
321 assign(s->above_skip_ctx, uint8_t *, 8);
322 assign(s->above_txfm_ctx, uint8_t *, 8);
323 assign(s->above_mode_ctx, uint8_t *, 16);
324 assign(s->above_y_nnz_ctx, uint8_t *, 16);
325 assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
326 assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
327 assign(s->intra_pred_data[0], uint8_t *, 64);
328 assign(s->intra_pred_data[1], uint8_t *, 32);
329 assign(s->intra_pred_data[2], uint8_t *, 32);
330 assign(s->above_segpred_ctx, uint8_t *, 8);
331 assign(s->above_intra_ctx, uint8_t *, 8);
332 assign(s->above_comp_ctx, uint8_t *, 8);
333 assign(s->above_ref_ctx, uint8_t *, 8);
334 assign(s->above_filter_ctx, uint8_t *, 8);
335 assign(s->lflvl, struct VP9Filter *, 1);
336 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
// Block/coefficient intermediates: whole-frame sized for two-pass
// frame-threaded decoding, single-superblock sized otherwise.
340 av_free(s->block_base);
341 if (ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode) {
342 int sbs = s->sb_cols * s->sb_rows;
344 s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
// 64*64 luma + 2*32*32 chroma coefficients, +128 bytes of eob flags, per sb.
345 s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
346 if (!s->b_base || !s->block_base)
347 return AVERROR(ENOMEM);
348 s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
349 s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
350 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
351 s->uveob_base[0] = s->eob_base + 256 * sbs;
352 s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
// Single-superblock variant of the same layout.
354 s->b_base = av_malloc(sizeof(VP9Block));
355 s->block_base = av_mallocz((64 * 64 + 128) * 3);
356 if (!s->b_base || !s->block_base)
357 return AVERROR(ENOMEM);
358 s->uvblock_base[0] = s->block_base + 64 * 64;
359 s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
360 s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
361 s->uveob_base[0] = s->eob_base + 256;
362 s->uveob_base[1] = s->uveob_base[0] + 64;
368 // for some reason the sign bit is at the end, not the start, of a bit sequence
369 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
371 int v = get_bits(gb, n);
372 return get_bits1(gb) ? -v : v;
375 static av_always_inline int inv_recenter_nonneg(int v, int m)
377 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
380 // differential forward probability updates
// Decode a forward probability update for current probability p (range
// [1,255]) and return the new probability. The coded symbol d is read with a
// short VLC (4/4/5/7-bit classes, see below) and mapped through
// inv_map_table[] before being "un-recentered" around p. NOTE(review): the
// excerpt is missing the tail of inv_map_table (entries 252, 253), the
// declaration of d, and the final VLC else-branch — visible lines are not
// contiguous.
381 static int update_prob(VP56RangeCoder *c, int p)
383 static const int inv_map_table[254] = {
384 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
385 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
386 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
387 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
388 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
389 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
390 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
391 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
392 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
393 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
394 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
395 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
396 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
397 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
398 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
399 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
400 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
401 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
406 /* This code is trying to do a differential probability update. For a
407 * current probability A in the range [1, 255], the difference to a new
408 * probability of any value can be expressed differentially as 1-A,255-A
409 * where some part of this (absolute range) exists both in positive as
410 * well as the negative part, whereas another part only exists in one
411 * half. We're trying to code this shared part differentially, i.e.
412 * times two where the value of the lowest bit specifies the sign, and
413 * the single part is then coded on top of this. This absolute difference
414 * then again has a value of [0,254], but a bigger value in this range
415 * indicates that we're further away from the original value A, so we
416 * can code this as a VLC code, since higher values are increasingly
417 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
418 * updates vs. the 'fine, exact' updates further down the range, which
419 * adds one extra dimension to this differential update model. */
// VLC decode of d: three prefix bits select 4/4/5/7-bit magnitude classes.
421 if (!vp8_rac_get(c)) {
422 d = vp8_rac_get_uint(c, 4) + 0;
423 } else if (!vp8_rac_get(c)) {
424 d = vp8_rac_get_uint(c, 4) + 16;
425 } else if (!vp8_rac_get(c)) {
426 d = vp8_rac_get_uint(c, 5) + 32;
428 d = vp8_rac_get_uint(c, 7);
430 d = (d << 1) - 65 + vp8_rac_get(c);
// Mirror around 128 so the update is symmetric in probability space.
434 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
435 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
// Parse the VP9 uncompressed frame header (bit-exact, via GetBitContext)
// followed by the compressed header (range-coded forward probability
// updates). Fills the VP9Context header fields, (re)allocates size-dependent
// buffers via update_size(), and returns the total header size in bytes
// (uncompressed + compressed) or a negative AVERROR. *ref is set to the
// reference index when the packet is a show-existing-frame marker.
// NOTE(review): the excerpt is missing many interior lines (else-branches,
// closing braces, error gotos) — visible lines are not contiguous.
438 static int decode_frame_header(AVCodecContext *ctx,
439 const uint8_t *data, int size, int *ref)
441 VP9Context *s = ctx->priv_data;
442 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
444 const uint8_t *data2;
// --- uncompressed (raw-bit) header ---
447 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
448 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
451 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
452 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
453 return AVERROR_INVALIDDATA;
455 s->profile = get_bits1(&s->gb);
456 if (get_bits1(&s->gb)) { // reserved bit
457 av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
458 return AVERROR_INVALIDDATA;
// show-existing-frame: just output a previously decoded reference.
460 if (get_bits1(&s->gb)) {
461 *ref = get_bits(&s->gb, 3);
464 s->last_uses_2pass = s->uses_2pass;
465 s->last_keyframe = s->keyframe;
466 s->keyframe = !get_bits1(&s->gb);
467 last_invisible = s->invisible;
468 s->invisible = !get_bits1(&s->gb);
469 s->errorres = get_bits1(&s->gb);
470 // FIXME disable this upon resolution change
471 s->use_last_frame_mvs = !s->errorres && !last_invisible;
// Keyframe path: sync code, colorspace, full frame dimensions.
473 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
474 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
475 return AVERROR_INVALIDDATA;
477 s->colorspace = get_bits(&s->gb, 3);
478 if (s->colorspace == 7) { // RGB = profile 1
479 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
480 return AVERROR_INVALIDDATA;
482 s->fullrange = get_bits1(&s->gb);
483 // for profile 1, here follows the subsampling bits
484 s->refreshrefmask = 0xff;
485 w = get_bits(&s->gb, 16) + 1;
486 h = get_bits(&s->gb, 16) + 1;
487 if (get_bits1(&s->gb)) // display size
488 skip_bits(&s->gb, 32);
// Non-keyframe path: intra-only or inter frame header.
490 s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
491 s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
493 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
494 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
495 return AVERROR_INVALIDDATA;
497 s->refreshrefmask = get_bits(&s->gb, 8);
498 w = get_bits(&s->gb, 16) + 1;
499 h = get_bits(&s->gb, 16) + 1;
500 if (get_bits1(&s->gb)) // display size
501 skip_bits(&s->gb, 32);
// Inter frame: three active references with per-ref sign bias.
503 s->refreshrefmask = get_bits(&s->gb, 8);
504 s->refidx[0] = get_bits(&s->gb, 3);
505 s->signbias[0] = get_bits1(&s->gb);
506 s->refidx[1] = get_bits(&s->gb, 3);
507 s->signbias[1] = get_bits1(&s->gb);
508 s->refidx[2] = get_bits(&s->gb, 3);
509 s->signbias[2] = get_bits1(&s->gb);
510 if (!s->refs[s->refidx[0]].f->data[0] ||
511 !s->refs[s->refidx[1]].f->data[0] ||
512 !s->refs[s->refidx[2]].f->data[0]) {
513 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
514 return AVERROR_INVALIDDATA;
// Frame size either copied from one of the refs or coded explicitly.
516 if (get_bits1(&s->gb)) {
517 w = s->refs[s->refidx[0]].f->width;
518 h = s->refs[s->refidx[0]].f->height;
519 } else if (get_bits1(&s->gb)) {
520 w = s->refs[s->refidx[1]].f->width;
521 h = s->refs[s->refidx[1]].f->height;
522 } else if (get_bits1(&s->gb)) {
523 w = s->refs[s->refidx[2]].f->width;
524 h = s->refs[s->refidx[2]].f->height;
526 w = get_bits(&s->gb, 16) + 1;
527 h = get_bits(&s->gb, 16) + 1;
529 if (get_bits1(&s->gb)) // display size
530 skip_bits(&s->gb, 32);
531 s->highprecisionmvs = get_bits1(&s->gb);
532 s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// Compound prediction is only possible when the refs disagree in sign bias;
// the fixed/variable compound refs are derived from which pair agrees.
534 s->allowcompinter = s->signbias[0] != s->signbias[1] ||
535 s->signbias[0] != s->signbias[2];
536 if (s->allowcompinter) {
537 if (s->signbias[0] == s->signbias[1]) {
539 s->varcompref[0] = 0;
540 s->varcompref[1] = 1;
541 } else if (s->signbias[0] == s->signbias[2]) {
543 s->varcompref[0] = 0;
544 s->varcompref[1] = 2;
547 s->varcompref[0] = 1;
548 s->varcompref[1] = 2;
553 s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
554 s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
555 s->framectxid = c = get_bits(&s->gb, 2);
557 /* loopfilter header data */
558 s->filter.level = get_bits(&s->gb, 6);
559 sharp = get_bits(&s->gb, 3);
560 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
561 // the old cache values since they are still valid
562 if (s->filter.sharpness != sharp)
563 memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
564 s->filter.sharpness = sharp;
565 if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
566 if (get_bits1(&s->gb)) {
567 for (i = 0; i < 4; i++)
568 if (get_bits1(&s->gb))
569 s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
570 for (i = 0; i < 2; i++)
571 if (get_bits1(&s->gb))
572 s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
575 memset(&s->lf_delta, 0, sizeof(s->lf_delta));
578 /* quantization header data */
579 s->yac_qi = get_bits(&s->gb, 8);
580 s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
581 s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
582 s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
583 s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
584 s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
586 /* segmentation header info */
587 if ((s->segmentation.enabled = get_bits1(&s->gb))) {
588 if ((s->segmentation.update_map = get_bits1(&s->gb))) {
589 for (i = 0; i < 7; i++)
590 s->prob.seg[i] = get_bits1(&s->gb) ?
591 get_bits(&s->gb, 8) : 255;
592 if ((s->segmentation.temporal = get_bits1(&s->gb)))
593 for (i = 0; i < 3; i++)
594 s->prob.segpred[i] = get_bits1(&s->gb) ?
595 get_bits(&s->gb, 8) : 255;
// Per-segment feature data (quantizer, loop filter, reference, skip).
598 if (get_bits1(&s->gb)) {
599 s->segmentation.absolute_vals = get_bits1(&s->gb);
600 for (i = 0; i < 8; i++) {
601 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
602 s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
603 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
604 s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
605 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
606 s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
607 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
611 s->segmentation.feat[0].q_enabled = 0;
612 s->segmentation.feat[0].lf_enabled = 0;
613 s->segmentation.feat[0].skip_enabled = 0;
614 s->segmentation.feat[0].ref_enabled = 0;
617 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
618 for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
619 int qyac, qydc, quvac, quvdc, lflvl, sh;
621 if (s->segmentation.feat[i].q_enabled) {
622 if (s->segmentation.absolute_vals)
623 qyac = s->segmentation.feat[i].q_val;
625 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
629 qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
630 quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
631 quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
632 qyac = av_clip_uintp2(qyac, 8);
634 s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
635 s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
636 s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
637 s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
// Pre-compute per-segment, per-ref, per-mode loop filter levels.
639 sh = s->filter.level >= 32;
640 if (s->segmentation.feat[i].lf_enabled) {
641 if (s->segmentation.absolute_vals)
642 lflvl = s->segmentation.feat[i].lf_val;
644 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
646 lflvl = s->filter.level;
648 s->segmentation.feat[i].lflvl[0][0] =
649 s->segmentation.feat[i].lflvl[0][1] =
650 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
651 for (j = 1; j < 4; j++) {
652 s->segmentation.feat[i].lflvl[j][0] =
653 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
654 s->lf_delta.mode[0]) << sh), 6);
655 s->segmentation.feat[i].lflvl[j][1] =
656 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
657 s->lf_delta.mode[1]) << sh), 6);
// (Re)allocate size-dependent buffers now that w/h are known.
662 if ((res = update_size(ctx, w, h)) < 0) {
663 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
// Tiling: columns bounded so each tile is >= 4 and <= 64 superblocks wide.
666 for (s->tiling.log2_tile_cols = 0;
667 (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
668 s->tiling.log2_tile_cols++) ;
669 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
670 max = FFMAX(0, max - 1);
671 while (max > s->tiling.log2_tile_cols) {
672 if (get_bits1(&s->gb))
673 s->tiling.log2_tile_cols++;
677 s->tiling.log2_tile_rows = decode012(&s->gb);
678 s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
679 if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
680 s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
681 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
682 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
684 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
685 return AVERROR(ENOMEM);
// Reset all probability contexts to defaults on key/error/intra frames.
689 if (s->keyframe || s->errorres || s->intraonly) {
690 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
691 s->prob_ctx[3].p = vp9_default_probs;
692 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
693 sizeof(vp9_default_coef_probs));
694 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
695 sizeof(vp9_default_coef_probs));
696 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
697 sizeof(vp9_default_coef_probs));
698 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
699 sizeof(vp9_default_coef_probs));
// --- compressed (range-coded) header ---
702 // next 16 bits is size of the rest of the header (arith-coded)
703 size2 = get_bits(&s->gb, 16);
704 data2 = align_get_bits(&s->gb);
705 if (size2 > size - (data2 - data)) {
706 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
707 return AVERROR_INVALIDDATA;
709 ff_vp56_init_range_decoder(&s->c, data2, size2);
710 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
711 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
712 return AVERROR_INVALIDDATA;
// Reset adaptation counts (intra frames keep inter counts untouched since
// coef/eob are the first members — see the partial memset).
715 if (s->keyframe || s->intraonly) {
716 memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
718 memset(&s->counts, 0, sizeof(s->counts));
720 // FIXME is it faster to not copy here, but do it down in the fw updates
721 // as explicit copies if the fw update is missing (and skip the copy upon
723 s->prob.p = s->prob_ctx[c].p;
// Transform mode + forward updates of tx split probabilities.
727 s->txfmmode = TX_4X4;
729 s->txfmmode = vp8_rac_get_uint(&s->c, 2);
730 if (s->txfmmode == 3)
731 s->txfmmode += vp8_rac_get(&s->c);
733 if (s->txfmmode == TX_SWITCHABLE) {
734 for (i = 0; i < 2; i++)
735 if (vp56_rac_get_prob_branchy(&s->c, 252))
736 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
737 for (i = 0; i < 2; i++)
738 for (j = 0; j < 2; j++)
739 if (vp56_rac_get_prob_branchy(&s->c, 252))
740 s->prob.p.tx16p[i][j] =
741 update_prob(&s->c, s->prob.p.tx16p[i][j]);
742 for (i = 0; i < 2; i++)
743 for (j = 0; j < 3; j++)
744 if (vp56_rac_get_prob_branchy(&s->c, 252))
745 s->prob.p.tx32p[i][j] =
746 update_prob(&s->c, s->prob.p.tx32p[i][j]);
// Coefficient probability forward updates, per tx size; stops early once
// the coded tx size class is reached.
751 for (i = 0; i < 4; i++) {
752 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
753 if (vp8_rac_get(&s->c)) {
754 for (j = 0; j < 2; j++)
755 for (k = 0; k < 2; k++)
756 for (l = 0; l < 6; l++)
757 for (m = 0; m < 6; m++) {
758 uint8_t *p = s->prob.coef[i][j][k][l][m];
759 uint8_t *r = ref[j][k][l][m];
760 if (m >= 3 && l == 0) // dc only has 3 pt
762 for (n = 0; n < 3; n++) {
763 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
764 p[n] = update_prob(&s->c, r[n]);
// No update coded: copy the saved context probabilities verbatim.
772 for (j = 0; j < 2; j++)
773 for (k = 0; k < 2; k++)
774 for (l = 0; l < 6; l++)
775 for (m = 0; m < 6; m++) {
776 uint8_t *p = s->prob.coef[i][j][k][l][m];
777 uint8_t *r = ref[j][k][l][m];
778 if (m > 3 && l == 0) // dc only has 3 pt
784 if (s->txfmmode == i)
// Mode/reference/filter probability forward updates (inter frames only
// from line 792 on).
789 for (i = 0; i < 3; i++)
790 if (vp56_rac_get_prob_branchy(&s->c, 252))
791 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
792 if (!s->keyframe && !s->intraonly) {
793 for (i = 0; i < 7; i++)
794 for (j = 0; j < 3; j++)
795 if (vp56_rac_get_prob_branchy(&s->c, 252))
796 s->prob.p.mv_mode[i][j] =
797 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
799 if (s->filtermode == FILTER_SWITCHABLE)
800 for (i = 0; i < 4; i++)
801 for (j = 0; j < 2; j++)
802 if (vp56_rac_get_prob_branchy(&s->c, 252))
803 s->prob.p.filter[i][j] =
804 update_prob(&s->c, s->prob.p.filter[i][j]);
806 for (i = 0; i < 4; i++)
807 if (vp56_rac_get_prob_branchy(&s->c, 252))
808 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
810 if (s->allowcompinter) {
811 s->comppredmode = vp8_rac_get(&s->c);
813 s->comppredmode += vp8_rac_get(&s->c);
814 if (s->comppredmode == PRED_SWITCHABLE)
815 for (i = 0; i < 5; i++)
816 if (vp56_rac_get_prob_branchy(&s->c, 252))
818 update_prob(&s->c, s->prob.p.comp[i]);
820 s->comppredmode = PRED_SINGLEREF;
823 if (s->comppredmode != PRED_COMPREF) {
824 for (i = 0; i < 5; i++) {
825 if (vp56_rac_get_prob_branchy(&s->c, 252))
826 s->prob.p.single_ref[i][0] =
827 update_prob(&s->c, s->prob.p.single_ref[i][0]);
828 if (vp56_rac_get_prob_branchy(&s->c, 252))
829 s->prob.p.single_ref[i][1] =
830 update_prob(&s->c, s->prob.p.single_ref[i][1]);
834 if (s->comppredmode != PRED_SINGLEREF) {
835 for (i = 0; i < 5; i++)
836 if (vp56_rac_get_prob_branchy(&s->c, 252))
837 s->prob.p.comp_ref[i] =
838 update_prob(&s->c, s->prob.p.comp_ref[i]);
841 for (i = 0; i < 4; i++)
842 for (j = 0; j < 9; j++)
843 if (vp56_rac_get_prob_branchy(&s->c, 252))
844 s->prob.p.y_mode[i][j] =
845 update_prob(&s->c, s->prob.p.y_mode[i][j]);
847 for (i = 0; i < 4; i++)
848 for (j = 0; j < 4; j++)
849 for (k = 0; k < 3; k++)
850 if (vp56_rac_get_prob_branchy(&s->c, 252))
851 s->prob.p.partition[3 - i][j][k] =
852 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
854 // mv fields don't use the update_prob subexp model for some reason
855 for (i = 0; i < 3; i++)
856 if (vp56_rac_get_prob_branchy(&s->c, 252))
857 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
859 for (i = 0; i < 2; i++) {
860 if (vp56_rac_get_prob_branchy(&s->c, 252))
861 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
863 for (j = 0; j < 10; j++)
864 if (vp56_rac_get_prob_branchy(&s->c, 252))
865 s->prob.p.mv_comp[i].classes[j] =
866 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
868 if (vp56_rac_get_prob_branchy(&s->c, 252))
869 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
871 for (j = 0; j < 10; j++)
872 if (vp56_rac_get_prob_branchy(&s->c, 252))
873 s->prob.p.mv_comp[i].bits[j] =
874 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
877 for (i = 0; i < 2; i++) {
878 for (j = 0; j < 2; j++)
879 for (k = 0; k < 3; k++)
880 if (vp56_rac_get_prob_branchy(&s->c, 252))
881 s->prob.p.mv_comp[i].class0_fp[j][k] =
882 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
884 for (j = 0; j < 3; j++)
885 if (vp56_rac_get_prob_branchy(&s->c, 252))
886 s->prob.p.mv_comp[i].fp[j] =
887 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
890 if (s->highprecisionmvs) {
891 for (i = 0; i < 2; i++) {
892 if (vp56_rac_get_prob_branchy(&s->c, 252))
893 s->prob.p.mv_comp[i].class0_hp =
894 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
896 if (vp56_rac_get_prob_branchy(&s->c, 252))
897 s->prob.p.mv_comp[i].hp =
898 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// Total header size consumed, in bytes.
903 return (data2 - data) + size2;
906 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
909 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
910 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
// Build the motion-vector predictor pmv for reference 'ref' of the current
// block by scanning, in priority order: sub-block MVs already decoded in
// this block (sb >= 0), spatial neighbours (above/left caches, then the
// per-blocksize offset list), the co-located MV in the previous frame, and
// finally neighbour/temporal MVs using a *different* reference (sign-flipped
// when the sign bias differs). z selects which candidate rank to return.
// NOTE(review): the excerpt is missing the RETURN_* macro bodies' tails and
// several closing braces — visible lines are not contiguous.
913 static void find_ref_mvs(VP9Context *s,
914 VP56mv *pmv, int ref, int z, int idx, int sb)
// Neighbour scan offsets (col, row deltas in 8x8 units) per block size.
916 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
917 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
918 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
919 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
920 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
921 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
922 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
923 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
924 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
925 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
926 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
927 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
928 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
929 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
930 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
931 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
932 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
933 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
934 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
935 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
936 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
937 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
938 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
939 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
940 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
941 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
942 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
945 int row = s->row, col = s->col, row7 = s->row7;
946 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
// Sentinel: packed 32-bit MV value that cannot occur for a real candidate.
947 #define INVALID_MV 0x80008000U
948 uint32_t mem = INVALID_MV;
// RETURN_DIRECT_MV: accept an in-block candidate without clamping; first
// distinct candidate fills mem, second distinct one is returned (rank z).
951 #define RETURN_DIRECT_MV(mv) \
953 uint32_t m = AV_RN32A(&mv); \
957 } else if (mem == INVALID_MV) { \
959 } else if (m != mem) { \
// Sub-block candidates: MVs of earlier sub-blocks in this very block.
966 if (sb == 2 || sb == 1) {
967 RETURN_DIRECT_MV(b->mv[0][z]);
968 } else if (sb == 3) {
969 RETURN_DIRECT_MV(b->mv[2][z]);
970 RETURN_DIRECT_MV(b->mv[1][z]);
971 RETURN_DIRECT_MV(b->mv[0][z]);
// RETURN_MV: like RETURN_DIRECT_MV but clamps the candidate to the valid
// MV range first.
974 #define RETURN_MV(mv) \
979 clamp_mv(&tmp, &mv, s); \
980 m = AV_RN32A(&tmp); \
984 } else if (mem == INVALID_MV) { \
986 } else if (m != mem) { \
991 uint32_t m = AV_RN32A(&mv); \
993 clamp_mv(pmv, &mv, s); \
995 } else if (mem == INVALID_MV) { \
997 } else if (m != mem) { \
998 clamp_mv(pmv, &mv, s); \
// Immediate above/left neighbours via the cached per-row/col MV contexts.
1005 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1006 if (mv->ref[0] == ref) {
1007 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1008 } else if (mv->ref[1] == ref) {
1009 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1012 if (col > s->tiling.tile_col_start) {
1013 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1014 if (mv->ref[0] == ref) {
1015 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1016 } else if (mv->ref[1] == ref) {
1017 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1025 // previously coded MVs in this neighbourhood, using same reference frame
1026 for (; i < 8; i++) {
1027 int c = p[i][0] + col, r = p[i][1] + row;
1029 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1030 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1032 if (mv->ref[0] == ref) {
1033 RETURN_MV(mv->mv[0]);
1034 } else if (mv->ref[1] == ref) {
1035 RETURN_MV(mv->mv[1]);
1040 // MV at this position in previous frame, using same reference frame
1041 if (s->use_last_frame_mvs) {
1042 struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
// Two-pass decoding already guarantees the row is done; otherwise wait.
1044 if (!s->last_uses_2pass)
1045 ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1046 if (mv->ref[0] == ref) {
1047 RETURN_MV(mv->mv[0]);
1048 } else if (mv->ref[1] == ref) {
1049 RETURN_MV(mv->mv[1]);
// RETURN_SCALE_MV: candidate from a differently-sign-biased reference is
// negated before being considered.
1053 #define RETURN_SCALE_MV(mv, scale) \
1056 VP56mv mv_temp = { -mv.x, -mv.y }; \
1057 RETURN_MV(mv_temp); \
1063 // previously coded MVs in this neighbourhood, using different reference frame
1064 for (i = 0; i < 8; i++) {
1065 int c = p[i][0] + col, r = p[i][1] + row;
1067 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1068 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1070 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1071 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1073 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1074 // BUG - libvpx has this condition regardless of whether
1075 // we used the first ref MV and pre-scaling
1076 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1077 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1082 // MV at this position in previous frame, using different reference frame
1083 if (s->use_last_frame_mvs) {
1084 struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1086 // no need to await_progress, because we already did that above
1087 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1088 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1090 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1091 // BUG - libvpx has this condition regardless of whether
1092 // we used the first ref MV and pre-scaling
1093 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1094 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1101 #undef RETURN_SCALE_MV
// Decode one signed MV component delta (idx: 0=row/y, 1=col/x) from the
// range coder: sign, magnitude class, then either the class-0 short form
// (integer + fractional + optional half-pel bits) or the long form
// (class-many integer bits + fractional + optional hp bit). Updates the
// adaptation counters in s->counts as it goes; hp enables the extra
// half-pel precision bit. NOTE(review): the excerpt is missing several
// interior lines (magnitude assembly, final scaling) — visible lines are
// not contiguous.
1104 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1106 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1107 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1108 s->prob.p.mv_comp[idx].classes);
1110 s->counts.mv_comp[idx].sign[sign]++;
1111 s->counts.mv_comp[idx].classes[c]++;
// Long form: c integer bits, then fractional and half-pel bits.
1115 for (n = 0, m = 0; m < c; m++) {
1116 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1118 s->counts.mv_comp[idx].bits[m][bit]++;
1121 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1123 s->counts.mv_comp[idx].fp[bit]++;
1125 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1126 s->counts.mv_comp[idx].hp[bit]++;
1130 // bug in libvpx - we count for bw entropy purposes even if the
1132 s->counts.mv_comp[idx].hp[1]++;
// Class-0 short form.
1136 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1137 s->counts.mv_comp[idx].class0[n]++;
1138 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1139 s->prob.p.mv_comp[idx].class0_fp[n]);
1140 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1141 n = (n << 3) | (bit << 1);
1143 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1144 s->counts.mv_comp[idx].class0_hp[bit]++;
1148 // bug in libvpx - we count for bw entropy purposes even if the
1150 s->counts.mv_comp[idx].class0_hp[1]++;
// Magnitude is n+1; apply the decoded sign.
1154 return sign ? -(n + 1) : (n + 1);
1157 static void fill_mv(VP9Context *s,
1158 VP56mv *mv, int mode, int sb)
/* Fill mv[0] (and mv[1] for compound prediction, second half below) for
 * the current block: ZEROMV clears both, otherwise a reference MV is
 * predicted via find_ref_mvs() and, for NEWMV, a residual is read with
 * read_mv_component().  sb is the sub-block index (-1 = whole block).
 * NOTE(review): the branch structure and the b/hp declarations between
 * the visible lines are elided from this view. */
1162 if (mode == ZEROMV) {
1163 memset(mv, 0, sizeof(*mv) * 2);
1167 // FIXME cache this value and reuse for other subblocks
1168 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1169 mode == NEWMV ? -1 : sb);
1170 // FIXME maybe move this code into find_ref_mvs()
/* hp (high-precision residual) is only usable when enabled for the frame
 * and the predicted MV is small (|x|,|y| < 64); otherwise the prediction
 * is rounded (elided code at 1173+) */
1171 if ((mode == NEWMV || sb == -1) &&
1172 !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1186 if (mode == NEWMV) {
1187 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1188 s->prob.p.mv_joint);
1190 s->counts.mv_joint[j]++;
/* the joint code says which components carry a residual */
1191 if (j >= MV_JOINT_V)
1192 mv[0].y += read_mv_component(s, 0, hp);
1194 mv[0].x += read_mv_component(s, 1, hp);
/* second reference (compound prediction) — mirrors the first-ref path */
1198 // FIXME cache this value and reuse for other subblocks
1199 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1200 mode == NEWMV ? -1 : sb);
1201 if ((mode == NEWMV || sb == -1) &&
1202 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1216 if (mode == NEWMV) {
1217 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1218 s->prob.p.mv_joint);
1220 s->counts.mv_joint[j]++;
1221 if (j >= MV_JOINT_V)
1222 mv[1].y += read_mv_component(s, 0, hp);
1224 mv[1].x += read_mv_component(s, 1, hp);
1230 static void decode_mode(AVCodecContext *ctx)
/* Decode all per-block mode information for the current block (s->row,
 * s->col): segment id, skip flag, intra/inter, txfm size, intra modes or
 * references + inter modes + motion vectors, then propagate the decoded
 * state into the above_*/left_* context arrays and the frame-wide MV/ref
 * store used by future blocks and the next frame.
 * NOTE(review): many lines (else branches, declarations, closing braces)
 * are elided from this view; section comments below only describe what
 * the visible statements establish. */
1232 static const uint8_t left_ctx[N_BS_SIZES] = {
1233 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1235 static const uint8_t above_ctx[N_BS_SIZES] = {
1236 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
/* largest transform size allowed for each block size */
1238 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1239 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1240 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1242 VP9Context *s = ctx->priv_data;
1244 int row = s->row, col = s->col, row7 = s->row7;
1245 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
/* block extent in 8x8 units, clipped to the frame edge */
1246 int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1247 int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1248 int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
/* ---- segment id: explicit, key-frame, or temporally predicted ---- */
1250 if (!s->segmentation.enabled) {
1252 } else if (s->keyframe || s->intraonly) {
1253 b->seg_id = s->segmentation.update_map ?
1254 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg) : 0;
1255 } else if (!s->segmentation.update_map ||
1256 (s->segmentation.temporal &&
1257 vp56_rac_get_prob_branchy(&s->c,
1258 s->prob.segpred[s->above_segpred_ctx[col] +
1259 s->left_segpred_ctx[row7]]))) {
/* temporal prediction: take the minimum seg id over the co-located
 * area of last frame's segmentation map */
1261 uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1263 if (!s->last_uses_2pass)
1264 ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1265 for (y = 0; y < h4; y++)
1266 for (x = 0; x < w4; x++)
1267 pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1268 av_assert1(pred < 8);
1271 memset(&s->above_segpred_ctx[col], 1, w4);
1272 memset(&s->left_segpred_ctx[row7], 1, h4);
1274 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1277 memset(&s->above_segpred_ctx[col], 0, w4);
1278 memset(&s->left_segpred_ctx[row7], 0, h4);
1280 if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
1281 uint8_t *segmap = s->frames[CUR_FRAME].segmentation_map;
1283 for (y = 0; y < h4; y++)
1284 memset(&segmap[(y + row) * 8 * s->sb_cols + col], b->seg_id, w4);
/* ---- skip flag: forced by segment feature, or coded with a/l context */
1287 b->skip = s->segmentation.enabled &&
1288 s->segmentation.feat[b->seg_id].skip_enabled;
1290 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1291 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1292 s->counts.skip[c][b->skip]++;
/* ---- intra/inter flag */
1295 if (s->keyframe || s->intraonly) {
1297 } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1298 b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1302 if (have_a && have_l) {
1303 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1306 c = have_a ? 2 * s->above_intra_ctx[col] :
1307 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1309 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1310 s->counts.intra[c][bit]++;
/* ---- transform size: coded only for TX_SWITCHABLE and non-skipped */
1314 if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1318 c = (s->above_skip_ctx[col] ? max_tx :
1319 s->above_txfm_ctx[col]) +
1320 (s->left_skip_ctx[row7] ? max_tx :
1321 s->left_txfm_ctx[row7]) > max_tx;
1323 c = s->above_skip_ctx[col] ? 1 :
1324 (s->above_txfm_ctx[col] * 2 > max_tx);
1326 } else if (have_l) {
1327 c = s->left_skip_ctx[row7] ? 1 :
1328 (s->left_txfm_ctx[row7] * 2 > max_tx);
/* unary-coded tx size, tree depth depends on max_tx */
1334 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1336 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1338 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1340 s->counts.tx32p[c][b->tx]++;
1343 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1345 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1346 s->counts.tx16p[c][b->tx]++;
1349 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1350 s->counts.tx8p[c][b->tx]++;
1357 b->tx = FFMIN(max_tx, s->txfmmode);
/* ---- intra modes on key/intra-only frames: fixed default probs keyed
 * on the above/left mode contexts */
1360 if (s->keyframe || s->intraonly) {
1361 uint8_t *a = &s->above_mode_ctx[col * 2];
1362 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1365 if (b->bs > BS_8x8) {
1366 // FIXME the memory storage intermediates here aren't really
1367 // necessary, they're just there to make the code slightly
1369 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1370 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1371 if (b->bs != BS_8x4) {
1372 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1373 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1374 l[0] = a[1] = b->mode[1];
1376 l[0] = a[1] = b->mode[1] = b->mode[0];
1378 if (b->bs != BS_4x8) {
1379 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1380 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1381 if (b->bs != BS_8x4) {
1382 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1383 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1384 l[1] = a[1] = b->mode[3];
1386 l[1] = a[1] = b->mode[3] = b->mode[2];
1389 b->mode[2] = b->mode[0];
1390 l[1] = a[1] = b->mode[3] = b->mode[1];
1393 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1394 vp9_default_kf_ymode_probs[*a][*l]);
1395 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1396 // FIXME this can probably be optimized
1397 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1398 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1400 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1401 vp9_default_kf_uvmode_probs[b->mode[3]]);
/* ---- intra modes on inter frames: adaptive y_mode/uv_mode probs */
1402 } else if (b->intra) {
1404 if (b->bs > BS_8x8) {
1405 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1406 s->prob.p.y_mode[0]);
1407 s->counts.y_mode[0][b->mode[0]]++;
1408 if (b->bs != BS_8x4) {
1409 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1410 s->prob.p.y_mode[0]);
1411 s->counts.y_mode[0][b->mode[1]]++;
1413 b->mode[1] = b->mode[0];
1415 if (b->bs != BS_4x8) {
1416 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1417 s->prob.p.y_mode[0]);
1418 s->counts.y_mode[0][b->mode[2]]++;
1419 if (b->bs != BS_8x4) {
1420 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1421 s->prob.p.y_mode[0]);
1422 s->counts.y_mode[0][b->mode[3]]++;
1424 b->mode[3] = b->mode[2];
1427 b->mode[2] = b->mode[0];
1428 b->mode[3] = b->mode[1];
1431 static const uint8_t size_group[10] = {
1432 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1434 int sz = size_group[b->bs];
1436 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1437 s->prob.p.y_mode[sz]);
1438 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1439 s->counts.y_mode[sz][b->mode[3]]++;
1441 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1442 s->prob.p.uv_mode[b->mode[3]]);
1443 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
/* ---- inter block: references, comp flag, inter modes, filter, MVs */
1445 static const uint8_t inter_mode_ctx_lut[14][14] = {
1446 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1447 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1448 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1449 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1450 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1451 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1452 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1453 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1454 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1455 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1456 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1457 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1458 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1459 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1462 if (s->segmentation.feat[b->seg_id].ref_enabled) {
1463 av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1465 b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1467 // read comp_pred flag
1468 if (s->comppredmode != PRED_SWITCHABLE) {
1469 b->comp = s->comppredmode == PRED_COMPREF;
1473 // FIXME add intra as ref=0xff (or -1) to make these easier?
1476 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1478 } else if (s->above_comp_ctx[col]) {
1479 c = 2 + (s->left_intra_ctx[row7] ||
1480 s->left_ref_ctx[row7] == s->fixcompref);
1481 } else if (s->left_comp_ctx[row7]) {
1482 c = 2 + (s->above_intra_ctx[col] ||
1483 s->above_ref_ctx[col] == s->fixcompref);
1485 c = (!s->above_intra_ctx[col] &&
1486 s->above_ref_ctx[col] == s->fixcompref) ^
1487 (!s->left_intra_ctx[row7] &&
/* NOTE(review): `row & 7` here vs `row7` everywhere else in this
 * function -- presumably identical values; confirm s->row7 == row & 7 */
1488 s->left_ref_ctx[row & 7] == s->fixcompref);
1491 c = s->above_comp_ctx[col] ? 3 :
1492 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1494 } else if (have_l) {
1495 c = s->left_comp_ctx[row7] ? 3 :
1496 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1500 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1501 s->counts.comp[c][b->comp]++;
1504 // read actual references
1505 // FIXME probably cache a few variables here to prevent repetitive
1506 // memory accesses below
1507 if (b->comp) /* two references */ {
/* the fixed ref goes to the slot given by its sign bias; the variable
 * ref (chosen below from varcompref[]) goes to the other slot */
1508 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1510 b->ref[fix_idx] = s->fixcompref;
1511 // FIXME can this codeblob be replaced by some sort of LUT?
1514 if (s->above_intra_ctx[col]) {
1515 if (s->left_intra_ctx[row7]) {
1518 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1520 } else if (s->left_intra_ctx[row7]) {
1521 c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1523 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1525 if (refl == refa && refa == s->varcompref[1]) {
1527 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1528 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1529 (refl == s->fixcompref && refa == s->varcompref[0])) {
1532 c = (refa == refl) ? 3 : 1;
1534 } else if (!s->left_comp_ctx[row7]) {
1535 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1538 c = (refl == s->varcompref[1] &&
1539 refa != s->varcompref[1]) ? 2 : 4;
1541 } else if (!s->above_comp_ctx[col]) {
1542 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1545 c = (refa == s->varcompref[1] &&
1546 refl != s->varcompref[1]) ? 2 : 4;
1549 c = (refl == refa) ? 4 : 2;
1553 if (s->above_intra_ctx[col]) {
1555 } else if (s->above_comp_ctx[col]) {
1556 c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1558 c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1561 } else if (have_l) {
1562 if (s->left_intra_ctx[row7]) {
1564 } else if (s->left_comp_ctx[row7]) {
1565 c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1567 c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1572 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1573 b->ref[var_idx] = s->varcompref[bit];
1574 s->counts.comp_ref[c][bit]++;
1575 } else /* single reference */ {
/* first single_ref bit: LAST vs (GOLDEN/ALTREF) */
1578 if (have_a && !s->above_intra_ctx[col]) {
1579 if (have_l && !s->left_intra_ctx[row7]) {
1580 if (s->left_comp_ctx[row7]) {
1581 if (s->above_comp_ctx[col]) {
1582 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1583 !s->above_ref_ctx[col]);
1585 c = (3 * !s->above_ref_ctx[col]) +
1586 (!s->fixcompref || !s->left_ref_ctx[row7]);
1588 } else if (s->above_comp_ctx[col]) {
1589 c = (3 * !s->left_ref_ctx[row7]) +
1590 (!s->fixcompref || !s->above_ref_ctx[col]);
1592 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1594 } else if (s->above_intra_ctx[col]) {
1596 } else if (s->above_comp_ctx[col]) {
1597 c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1599 c = 4 * (!s->above_ref_ctx[col]);
1601 } else if (have_l && !s->left_intra_ctx[row7]) {
1602 if (s->left_intra_ctx[row7]) {
1604 } else if (s->left_comp_ctx[row7]) {
1605 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1607 c = 4 * (!s->left_ref_ctx[row7]);
1612 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1613 s->counts.single_ref[c][0][bit]++;
/* second single_ref bit: GOLDEN vs ALTREF */
1617 // FIXME can this codeblob be replaced by some sort of LUT?
1620 if (s->left_intra_ctx[row7]) {
1621 if (s->above_intra_ctx[col]) {
1623 } else if (s->above_comp_ctx[col]) {
1624 c = 1 + 2 * (s->fixcompref == 1 ||
1625 s->above_ref_ctx[col] == 1);
1626 } else if (!s->above_ref_ctx[col]) {
1629 c = 4 * (s->above_ref_ctx[col] == 1);
1631 } else if (s->above_intra_ctx[col]) {
1632 if (s->left_intra_ctx[row7]) {
1634 } else if (s->left_comp_ctx[row7]) {
1635 c = 1 + 2 * (s->fixcompref == 1 ||
1636 s->left_ref_ctx[row7] == 1);
1637 } else if (!s->left_ref_ctx[row7]) {
1640 c = 4 * (s->left_ref_ctx[row7] == 1);
1642 } else if (s->above_comp_ctx[col]) {
1643 if (s->left_comp_ctx[row7]) {
1644 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1645 c = 3 * (s->fixcompref == 1 ||
1646 s->left_ref_ctx[row7] == 1);
1650 } else if (!s->left_ref_ctx[row7]) {
1651 c = 1 + 2 * (s->fixcompref == 1 ||
1652 s->above_ref_ctx[col] == 1);
1654 c = 3 * (s->left_ref_ctx[row7] == 1) +
1655 (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1657 } else if (s->left_comp_ctx[row7]) {
1658 if (!s->above_ref_ctx[col]) {
1659 c = 1 + 2 * (s->fixcompref == 1 ||
1660 s->left_ref_ctx[row7] == 1);
1662 c = 3 * (s->above_ref_ctx[col] == 1) +
1663 (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1665 } else if (!s->above_ref_ctx[col]) {
1666 if (!s->left_ref_ctx[row7]) {
1669 c = 4 * (s->left_ref_ctx[row7] == 1);
1671 } else if (!s->left_ref_ctx[row7]) {
1672 c = 4 * (s->above_ref_ctx[col] == 1);
1674 c = 2 * (s->left_ref_ctx[row7] == 1) +
1675 2 * (s->above_ref_ctx[col] == 1);
1678 if (s->above_intra_ctx[col] ||
1679 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1681 } else if (s->above_comp_ctx[col]) {
1682 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1684 c = 4 * (s->above_ref_ctx[col] == 1);
1687 } else if (have_l) {
1688 if (s->left_intra_ctx[row7] ||
1689 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1691 } else if (s->left_comp_ctx[row7]) {
1692 c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1694 c = 4 * (s->left_ref_ctx[row7] == 1);
1699 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1700 s->counts.single_ref[c][1][bit]++;
1701 b->ref[0] = 1 + bit;
/* ---- inter mode for whole-block (<= 8x8) blocks */
1706 if (b->bs <= BS_8x8) {
1707 if (s->segmentation.feat[b->seg_id].skip_enabled) {
1708 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1710 static const uint8_t off[10] = {
1711 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1714 // FIXME this needs to use the LUT tables from find_ref_mvs
1715 // because not all are -1,0/0,-1
1716 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1717 [s->left_mode_ctx[row7 + off[b->bs]]];
1719 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1720 s->prob.p.mv_mode[c]);
1721 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1722 s->counts.mv_mode[c][b->mode[0] - 10]++;
/* ---- interpolation filter */
1726 if (s->filtermode == FILTER_SWITCHABLE) {
1729 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1730 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1731 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1732 s->left_filter_ctx[row7] : 3;
1734 c = s->above_filter_ctx[col];
1736 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1737 c = s->left_filter_ctx[row7];
1742 b->filter = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1743 s->prob.p.filter[c]);
1744 s->counts.filter[c][b->filter]++;
1746 b->filter = s->filtermode;
/* ---- sub-8x8 blocks: per-subblock inter modes + MVs */
1749 if (b->bs > BS_8x8) {
1750 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1752 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1753 s->prob.p.mv_mode[c]);
1754 s->counts.mv_mode[c][b->mode[0] - 10]++;
1755 fill_mv(s, b->mv[0], b->mode[0], 0);
1757 if (b->bs != BS_8x4) {
1758 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1759 s->prob.p.mv_mode[c]);
1760 s->counts.mv_mode[c][b->mode[1] - 10]++;
1761 fill_mv(s, b->mv[1], b->mode[1], 1);
1763 b->mode[1] = b->mode[0];
1764 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1765 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1768 if (b->bs != BS_4x8) {
1769 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1770 s->prob.p.mv_mode[c]);
1771 s->counts.mv_mode[c][b->mode[2] - 10]++;
1772 fill_mv(s, b->mv[2], b->mode[2], 2);
1774 if (b->bs != BS_8x4) {
1775 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1776 s->prob.p.mv_mode[c]);
1777 s->counts.mv_mode[c][b->mode[3] - 10]++;
1778 fill_mv(s, b->mv[3], b->mode[3], 3);
1780 b->mode[3] = b->mode[2];
1781 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1782 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1785 b->mode[2] = b->mode[0];
1786 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1787 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1788 b->mode[3] = b->mode[1];
1789 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1790 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
/* whole-block MV: decode once, replicate to all four subblock slots */
1793 fill_mv(s, b->mv[0], b->mode[0], -1);
1794 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1795 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1796 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1797 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1798 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1799 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
/* ---- propagate decoded state into the above/left context arrays */
1803 // FIXME this can probably be optimized
1804 memset(&s->above_skip_ctx[col], b->skip, w4);
1805 memset(&s->left_skip_ctx[row7], b->skip, h4);
1806 memset(&s->above_txfm_ctx[col], b->tx, w4);
1807 memset(&s->left_txfm_ctx[row7], b->tx, h4);
1808 memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
1809 memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
1810 if (!s->keyframe && !s->intraonly) {
1811 memset(&s->above_intra_ctx[col], b->intra, w4);
1812 memset(&s->left_intra_ctx[row7], b->intra, h4);
1813 memset(&s->above_comp_ctx[col], b->comp, w4);
1814 memset(&s->left_comp_ctx[row7], b->comp, h4);
1815 memset(&s->above_mode_ctx[col], b->mode[3], w4);
1816 memset(&s->left_mode_ctx[row7], b->mode[3], h4);
1817 if (s->filtermode == FILTER_SWITCHABLE && !b->intra ) {
1818 memset(&s->above_filter_ctx[col], b->filter, w4);
1819 memset(&s->left_filter_ctx[row7], b->filter, h4);
1820 b->filter = vp9_filter_lut[b->filter];
1822 if (b->bs > BS_8x8) {
/* sub-8x8: store edge subblock MVs in the above/left MV contexts */
1823 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1825 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1826 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1827 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1828 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1829 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1830 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1831 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1832 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1834 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1836 for (n = 0; n < w4 * 2; n++) {
1837 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1838 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1840 for (n = 0; n < h4 * 2; n++) {
1841 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1842 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1846 if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
1847 // as a direct check in above branches
1848 int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1850 memset(&s->above_ref_ctx[col], vref, w4);
1851 memset(&s->left_ref_ctx[row7], vref, h4);
/* ---- fill the per-frame MV/ref store consumed by find_ref_mvs() and
 * by the next frame's temporal MV prediction */
1856 for (y = 0; y < h4; y++) {
1857 int x, o = (row + y) * s->sb_cols * 8 + col;
1858 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1861 for (x = 0; x < w4; x++) {
1865 } else if (b->comp) {
1866 for (x = 0; x < w4; x++) {
1867 mv[x].ref[0] = b->ref[0];
1868 mv[x].ref[1] = b->ref[1];
1869 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
1870 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
1873 for (x = 0; x < w4; x++) {
1874 mv[x].ref[0] = b->ref[0];
1876 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
1882 // FIXME remove tx argument, and merge cnt/eob arguments?
1883 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
1884 enum TxfmMode tx, unsigned (*cnt)[6][3],
1885 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
1886 int nnz, const int16_t *scan, const int16_t (*nb)[2],
1887 const int16_t *band_counts, const int16_t *qmul)
/* Decode one block of up to n_coeffs transform coefficients into coef[],
 * dequantizing with qmul[] (index 0 = DC, 1 = AC).  p holds the token
 * probabilities per band/context, cnt/eob the adaptation counters.
 * scan/nb give the coefficient scan order and the neighbour pairs used
 * to derive the per-position context.  NOTE(review): the do-loop opener,
 * sign handling and some branches are elided from this view; the return
 * value is presumably the end-of-block position -- confirm upstream. */
1889 int i = 0, band = 0, band_left = band_counts[band];
1890 uint8_t *tp = p[0][nnz];
/* per-scan-position token magnitudes, read back via nb[] for context */
1891 uint8_t cache[1024];
1896 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
1897 eob[band][nnz][val]++;
1902 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
1903 cnt[band][nnz][0]++;
1905 band_left = band_counts[++band];
/* context for the next position: average of the two neighbours */
1907 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1909 if (++i == n_coeffs)
1910 break; //invalid input; blocks should end with EOB
1915 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
1916 cnt[band][nnz][1]++;
1920 // fill in p[3-10] (model fill) - only once per frame for each pos
1922 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
1924 cnt[band][nnz][2]++;
1925 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
1926 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
1927 cache[rc] = val = 2;
1929 val = 3 + vp56_rac_get_prob(c, tp[5]);
1932 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
/* categories 1-6 use the fixed probabilities from the VP9 spec */
1934 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
1935 val = 5 + vp56_rac_get_prob(c, 159);
1937 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
1938 val += vp56_rac_get_prob(c, 145);
1942 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
1943 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
1944 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
1945 val += (vp56_rac_get_prob(c, 148) << 1);
1946 val += vp56_rac_get_prob(c, 140);
1948 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
1949 val += (vp56_rac_get_prob(c, 155) << 2);
1950 val += (vp56_rac_get_prob(c, 140) << 1);
1951 val += vp56_rac_get_prob(c, 135);
1953 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
1954 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
1955 val += (vp56_rac_get_prob(c, 157) << 3);
1956 val += (vp56_rac_get_prob(c, 141) << 2);
1957 val += (vp56_rac_get_prob(c, 134) << 1);
1958 val += vp56_rac_get_prob(c, 130);
1960 val = 67 + (vp56_rac_get_prob(c, 254) << 13);
1961 val += (vp56_rac_get_prob(c, 254) << 12);
1962 val += (vp56_rac_get_prob(c, 254) << 11);
1963 val += (vp56_rac_get_prob(c, 252) << 10);
1964 val += (vp56_rac_get_prob(c, 249) << 9);
1965 val += (vp56_rac_get_prob(c, 243) << 8);
1966 val += (vp56_rac_get_prob(c, 230) << 7);
1967 val += (vp56_rac_get_prob(c, 196) << 6);
1968 val += (vp56_rac_get_prob(c, 177) << 5);
1969 val += (vp56_rac_get_prob(c, 153) << 4);
1970 val += (vp56_rac_get_prob(c, 140) << 3);
1971 val += (vp56_rac_get_prob(c, 133) << 2);
1972 val += (vp56_rac_get_prob(c, 130) << 1);
1973 val += vp56_rac_get_prob(c, 129);
1978 band_left = band_counts[++band];
/* 32x32 dequant is halved relative to the smaller transforms */
1979 if (tx == TX_32X32) // FIXME slow
1980 coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
1982 coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
1983 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1985 } while (++i < n_coeffs);
1990 static void decode_coeffs(AVCodecContext *ctx)
/* Decode all residual coefficients for the current block: first the luma
 * plane (per-txfm-type scan tables), then both chroma planes (always
 * DCT_DCT scan).  Results go to s->block / s->uvblock and the per-subblock
 * EOBs to s->eob / s->uveob; the above/left nnz context arrays are updated
 * so neighbouring blocks derive correct token contexts.
 * NOTE(review): declarations of b and several loop lines are elided from
 * this view. */
1992 VP9Context *s = ctx->priv_data;
1994 int row = s->row, col = s->col;
/* probability/count tables selected by tx size, plane and intra-ness */
1995 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
1996 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
1997 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
1998 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
1999 int end_x = FFMIN(2 * (s->cols - col), w4);
2000 int end_y = FFMIN(2 * (s->rows - row), h4);
2001 int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
2002 int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
2003 int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
/* lossless uses the WHT variants stored at vp9_scans[4..] */
2004 int tx = 4 * s->lossless + b->tx;
2005 const int16_t * const *yscans = vp9_scans[tx];
2006 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2007 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2008 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2009 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2010 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
/* tokens per band for each tx size (last entry = remainder) */
2011 static const int16_t band_counts[4][8] = {
2012 { 1, 2, 3, 4, 3, 16 - 13 },
2013 { 1, 2, 3, 4, 11, 64 - 21 },
2014 { 1, 2, 3, 4, 11, 256 - 21 },
2015 { 1, 2, 3, 4, 11, 1024 - 21 },
2017 const int16_t *y_band_counts = band_counts[b->tx];
2018 const int16_t *uv_band_counts = band_counts[b->uvtx];
/* ---- luma */
2021 if (b->tx > TX_4X4) { // FIXME slow
2022 for (y = 0; y < end_y; y += step1d)
2023 for (x = 1; x < step1d; x++)
2025 for (x = 0; x < end_x; x += step1d)
2026 for (y = 1; y < step1d; y++)
2029 for (n = 0, y = 0; y < end_y; y += step1d) {
2030 for (x = 0; x < end_x; x += step1d, n += step) {
2031 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
2034 int nnz = a[x] + l[y];
2035 res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
2036 b->tx, c, e, p, nnz, yscans[txtp],
2037 ynbs[txtp], y_band_counts, qmul[0]);
2038 a[x] = l[y] = !!res;
/* EOB > 255 needs 16 bits for tx > 8x8 */
2039 if (b->tx > TX_8X8) {
2040 AV_WN16A(&s->eob[n], res);
2046 if (b->tx > TX_4X4) { // FIXME slow
2047 for (y = 0; y < end_y; y += step1d)
2048 memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
2049 for (x = 0; x < end_x; x += step1d)
2050 memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
/* ---- chroma: switch tables to the uv plane and halve the extent */
2053 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2054 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2055 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2060 for (pl = 0; pl < 2; pl++) {
2061 a = &s->above_uv_nnz_ctx[pl][col];
2062 l = &s->left_uv_nnz_ctx[pl][row & 7];
2063 if (b->uvtx > TX_4X4) { // FIXME slow
2064 for (y = 0; y < end_y; y += uvstep1d)
2065 for (x = 1; x < uvstep1d; x++)
2067 for (x = 0; x < end_x; x += uvstep1d)
2068 for (y = 1; y < uvstep1d; y++)
2071 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2072 for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
2073 int nnz = a[x] + l[y];
2074 res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
2075 16 * uvstep, b->uvtx, c, e, p, nnz,
2076 uvscan, uvnb, uv_band_counts, qmul[1]);
2077 a[x] = l[y] = !!res;
2078 if (b->uvtx > TX_8X8) {
2079 AV_WN16A(&s->uveob[pl][n], res);
2081 s->uveob[pl][n] = res;
2085 if (b->uvtx > TX_4X4) { // FIXME slow
2086 for (y = 0; y < end_y; y += uvstep1d)
2087 memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
2088 for (x = 0; x < end_x; x += uvstep1d)
2089 memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
2094 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2095 uint8_t *dst_edge, ptrdiff_t stride_edge,
2096 uint8_t *dst_inner, ptrdiff_t stride_inner,
2097 uint8_t *l, int col, int x, int w,
2098 int row, int y, enum TxfmMode tx,
/* Prepare the top (*a) and left (l) edge pixel arrays for intra
 * prediction of one tx-sized subblock, converting the requested mode to
 * an edge-availability-aware variant (mode_conv) and extending/filling
 * unavailable edge pixels per the VP9 rules (127/129 defaults, pixel
 * replication past the frame/block edge).  Returns the possibly-converted
 * mode.  dst_edge/dst_inner distinguish the block-edge destination (frame
 * buffer) from the inner working buffer; p (declaration elided) appears
 * to be the plane index used for chroma subsampling shifts.
 * NOTE(review): several lines, including the trailing parameter and some
 * branches, are elided from this view. */
2101 int have_top = row > 0 || y > 0;
2102 int have_left = col > s->tiling.tile_col_start || x > 0;
2103 int have_right = x < w - 1;
/* map each coded mode to its edge-dependent substitute */
2104 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2105 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2106 { DC_127_PRED, VERT_PRED } },
2107 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2108 { HOR_PRED, HOR_PRED } },
2109 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2110 { LEFT_DC_PRED, DC_PRED } },
2111 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2112 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2113 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2114 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2115 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2116 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2117 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2118 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2119 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2120 { DC_127_PRED, VERT_LEFT_PRED } },
2121 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2122 { HOR_UP_PRED, HOR_UP_PRED } },
2123 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2124 { HOR_PRED, TM_VP8_PRED } },
/* which edge pixels each (converted) mode actually reads */
2126 static const struct {
2127 uint8_t needs_left:1;
2128 uint8_t needs_top:1;
2129 uint8_t needs_topleft:1;
2130 uint8_t needs_topright:1;
2131 } edges[N_INTRA_PRED_MODES] = {
2132 [VERT_PRED] = { .needs_top = 1 },
2133 [HOR_PRED] = { .needs_left = 1 },
2134 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2135 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2136 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2137 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2138 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2139 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2140 [HOR_UP_PRED] = { .needs_left = 1 },
2141 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2142 [LEFT_DC_PRED] = { .needs_left = 1 },
2143 [TOP_DC_PRED] = { .needs_top = 1 },
2144 [DC_128_PRED] = { 0 },
2145 [DC_127_PRED] = { 0 },
2146 [DC_129_PRED] = { 0 }
2149 av_assert2(mode >= 0 && mode < 10);
2150 mode = mode_conv[mode][have_left][have_top];
2151 if (edges[mode].needs_top) {
2152 uint8_t *top, *topleft;
2153 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2154 int n_px_need_tr = 0;
2156 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2159 // if top of sb64-row, use s->intra_pred_data[] instead of
2160 // dst[-stride] for intra prediction (it contains pre- instead of
2161 // post-loopfilter data)
2163 top = !(row & 7) && !y ?
2164 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2165 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2167 topleft = !(row & 7) && !y ?
2168 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2169 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2170 &dst_inner[-stride_inner];
/* fast path: all needed top pixels exist and are contiguous */
2174 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2175 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2176 n_px_need + n_px_need_tr <= n_px_have) {
2180 if (n_px_need <= n_px_have) {
2181 memcpy(*a, top, n_px_need);
2183 memcpy(*a, top, n_px_have);
/* replicate the last available pixel across the missing span */
2184 memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2185 n_px_need - n_px_have);
2188 memset(*a, 127, n_px_need);
2190 if (edges[mode].needs_topleft) {
2191 if (have_left && have_top) {
2192 (*a)[-1] = topleft[-1];
2194 (*a)[-1] = have_top ? 129 : 127;
2197 if (tx == TX_4X4 && edges[mode].needs_topright) {
2198 if (have_top && have_right &&
2199 n_px_need + n_px_need_tr <= n_px_have) {
2200 memcpy(&(*a)[4], &top[4], 4);
2202 memset(&(*a)[4], (*a)[3], 4);
2207 if (edges[mode].needs_left) {
2209 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2210 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2211 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
/* gather the left column from the destination one pixel per row */
2213 if (n_px_need <= n_px_have) {
2214 for (i = 0; i < n_px_need; i++)
2215 l[i] = dst[i * stride - 1];
2217 for (i = 0; i < n_px_have; i++)
2218 l[i] = dst[i * stride - 1];
2219 memset(&l[i], l[i - 1], n_px_need - n_px_have);
2222 memset(l, 129, 4 << tx);
/**
 * Reconstruct an intra-coded block.
 *
 * Walks every transform block of the luma plane, then of both chroma
 * planes: builds the above/left edge pixel arrays via check_intra_mode(),
 * runs the intra predictor, and adds the inverse-transform residual
 * (b->skip forces eob to 0, so no residual is added for skipped blocks).
 * y_off/uv_off are byte offsets of this block into the current frame's
 * luma/chroma planes.
 */
2229 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2231 VP9Context *s = ctx->priv_data;
2233 int row = s->row, col = s->col;
2234 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2235 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
// clip the iteration range so we never reconstruct past the visible frame
2236 int end_x = FFMIN(2 * (s->cols - col), w4);
2237 int end_y = FFMIN(2 * (s->rows - row), h4);
2238 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2239 int uvstep1d = 1 << b->uvtx, p;
2240 uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
// luma: iterate transform blocks in raster order; n indexes the
// coefficient/eob arrays in 4x4 units
2242 for (n = 0, y = 0; y < end_y; y += step1d) {
2243 uint8_t *ptr = dst, *ptr_r = dst_r;
2244 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2245 ptr_r += 4 * step1d, n += step) {
2246 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2248 LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2249 uint8_t *a = &a_buf[16], l[32];
2250 enum TxfmType txtp = vp9_intra_txfm_type[mode];
// for tx sizes above 8x8 the eob is stored as an aligned 16-bit value
2251 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
// check_intra_mode() fills a[]/l[] with edge pixels and returns the
// (possibly adjusted) prediction mode
2253 mode = check_intra_mode(s, mode, &a, ptr_r,
2254 s->frames[CUR_FRAME].tf.f->linesize[0],
2255 ptr, s->y_stride, l,
2256 col, x, w4, row, y, b->tx, 0);
2257 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2259 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2260 s->block + 16 * n, eob);
2262 dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2263 dst += 4 * step1d * s->y_stride;
// chroma: same walk with the chroma tx size; chroma residual always
// uses DCT_DCT
2271 step = 1 << (b->uvtx * 2);
2272 for (p = 0; p < 2; p++) {
2273 dst = s->dst[1 + p];
2274 dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2275 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2276 uint8_t *ptr = dst, *ptr_r = dst_r;
2277 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2278 ptr_r += 4 * uvstep1d, n += step) {
2279 int mode = b->uvmode;
2280 LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2281 uint8_t *a = &a_buf[16], l[32];
2282 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2284 mode = check_intra_mode(s, mode, &a, ptr_r,
2285 s->frames[CUR_FRAME].tf.f->linesize[1],
2286 ptr, s->uv_stride, l,
2287 col, x, w4, row, y, b->uvtx, p + 1);
2288 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2290 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2291 s->uvblock[p] + 16 * n, eob);
2293 dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2294 dst += 4 * uvstep1d * s->uv_stride;
/**
 * Single-direction luma motion compensation.
 *
 * Waits (frame threading) until the needed rows of the reference frame
 * are decoded, then runs the subpel MC function; when the block plus the
 * 3/4-pixel filter margin would read outside the reference, the source
 * is first copied through emulated_edge_mc() into edge_emu_buffer.
 * (y, x) is the block position in the reference; (w, h) its dimensions.
 */
2299 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2300 uint8_t *dst, ptrdiff_t dst_stride,
2301 const uint8_t *ref, ptrdiff_t ref_stride,
2302 ThreadFrame *ref_frame,
2303 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2304 int bw, int bh, int w, int h)
2306 int mx = mv->x, my = mv->y, th;
2310 ref += y * ref_stride + x;
2313 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2314 // we use +7 because the last 7 pixels of each sbrow can be changed in
2315 // the longest loopfilter of the next sbrow
2316 th = (y + bh + 4 * !!my + 7) >> 6;
2317 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
// does the filter footprint (3 pixels before, 4 after, only in the
// direction(s) with a subpel component) stay inside the reference frame?
2318 if (x < !!mx * 3 || y < !!my * 3 ||
2319 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2320 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2321 ref - !!my * 3 * ref_stride - !!mx * 3,
2323 bw + !!mx * 7, bh + !!my * 7,
2324 x - !!mx * 3, y - !!my * 3, w, h);
// edge_emu_buffer is addressed with an 80-byte row stride here
2325 ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
// note: luma passes the subpel phase as mx/my << 1, while
// mc_chroma_dir() passes mx/my unscaled
2328 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
/**
 * Single-direction chroma motion compensation (U and V together).
 *
 * Same structure as mc_luma_dir(), but with separate source pointers and
 * strides for the two chroma planes; when the edge-emulation path is
 * taken, each plane gets its own emulated_edge_mc() pass, reusing
 * edge_emu_buffer sequentially (MC for U runs before V's source is
 * overwritten).
 */
2331 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2332 uint8_t *dst_u, uint8_t *dst_v,
2333 ptrdiff_t dst_stride,
2334 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2335 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2336 ThreadFrame *ref_frame,
2337 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2338 int bw, int bh, int w, int h)
2340 int mx = mv->x, my = mv->y, th;
2344 ref_u += y * src_stride_u + x;
2345 ref_v += y * src_stride_v + x;
2348 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2349 // we use +7 because the last 7 pixels of each sbrow can be changed in
2350 // the longest loopfilter of the next sbrow
// >> 5 here vs >> 6 in mc_luma_dir — presumably because chroma rows are
// 2x subsampled relative to luma
2351 th = (y + bh + 4 * !!my + 7) >> 5;
2352 ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2353 if (x < !!mx * 3 || y < !!my * 3 ||
2354 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2355 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2356 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2358 bw + !!mx * 7, bh + !!my * 7,
2359 x - !!mx * 3, y - !!my * 3, w, h);
2360 ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2361 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2363 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2364 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2366 bw + !!mx * 7, bh + !!my * 7,
2367 x - !!mx * 3, y - !!my * 3, w, h);
2368 ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2369 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2371 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2372 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
/**
 * Reconstruct an inter-coded block.
 *
 * Performs luma motion compensation (per-sub-block MVs for block sizes
 * below 8x8, a single MV otherwise) and chroma motion compensation (the
 * chroma MV for sub-8x8 sizes is the rounded average of the four luma
 * MVs), for one or — when b->comp is set — two references, then adds the
 * inverse-transform residual for non-skipped blocks.
 */
2376 static void inter_recon(AVCodecContext *ctx)
// log2 of the MC block width per block size, [0] = luma, [1] = chroma
2378 static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2379 { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2380 { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2382 VP9Context *s = ctx->priv_data;
2384 int row = s->row, col = s->col;
2385 ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]];
2386 AVFrame *ref1 = tref1->f;
// second reference only exists for compound prediction
2387 ThreadFrame *tref2 = b->comp ? &s->refs[s->refidx[b->ref[1]]] : NULL;
2388 AVFrame *ref2 = b->comp ? tref2->f : NULL;
2389 int w = ctx->width, h = ctx->height;
2390 ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
// y inter pred: sizes below 8x8 carry separate MVs per 8x4/4x8/4x4 part
2393 if (b->bs > BS_8x8) {
2394 if (b->bs == BS_8x4) {
2395 mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2396 ref1->data[0], ref1->linesize[0], tref1,
2397 row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
2398 mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2399 s->dst[0] + 4 * ls_y, ls_y,
2400 ref1->data[0], ref1->linesize[0], tref1,
2401 (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
2404 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2405 ref2->data[0], ref2->linesize[0], tref2,
2406 row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
2407 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2408 s->dst[0] + 4 * ls_y, ls_y,
2409 ref2->data[0], ref2->linesize[0], tref2,
2410 (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
2412 } else if (b->bs == BS_4x8) {
2413 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2414 ref1->data[0], ref1->linesize[0], tref1,
2415 row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
2416 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2417 ref1->data[0], ref1->linesize[0], tref1,
2418 row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
2421 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2422 ref2->data[0], ref2->linesize[0], tref2,
2423 row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
2424 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2425 ref2->data[0], ref2->linesize[0], tref2,
2426 row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
2429 av_assert2(b->bs == BS_4x4);
2431 // FIXME if two horizontally adjacent blocks have the same MV,
2432 // do a w8 instead of a w4 call
2433 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2434 ref1->data[0], ref1->linesize[0], tref1,
2435 row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
2436 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2437 ref1->data[0], ref1->linesize[0], tref1,
2438 row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
2439 mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2440 s->dst[0] + 4 * ls_y, ls_y,
2441 ref1->data[0], ref1->linesize[0], tref1,
2442 (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
2443 mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2444 s->dst[0] + 4 * ls_y + 4, ls_y,
2445 ref1->data[0], ref1->linesize[0], tref1,
2446 (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
2449 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2450 ref2->data[0], ref2->linesize[0], tref2,
2451 row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
2452 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2453 ref2->data[0], ref2->linesize[0], tref2,
2454 row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
2455 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2456 s->dst[0] + 4 * ls_y, ls_y,
2457 ref2->data[0], ref2->linesize[0], tref2,
2458 (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
2459 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2460 s->dst[0] + 4 * ls_y + 4, ls_y,
2461 ref2->data[0], ref2->linesize[0], tref2,
2462 (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
// 8x8 and larger: one MV covers the whole luma block
2466 int bwl = bwlog_tab[0][b->bs];
2467 int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2469 mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2470 ref1->data[0], ref1->linesize[0], tref1,
2471 row << 3, col << 3, &b->mv[0][0],bw, bh, w, h);
2474 mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2475 ref2->data[0], ref2->linesize[0], tref2,
2476 row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
// uv inter pred
2481 int bwl = bwlog_tab[1][b->bs];
2482 int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
// sub-8x8: chroma uses the rounded average of the four luma MVs
2487 if (b->bs > BS_8x8) {
2488 mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2489 mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2494 mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2495 s->dst[1], s->dst[2], ls_uv,
2496 ref1->data[1], ref1->linesize[1],
2497 ref1->data[2], ref1->linesize[2], tref1,
2498 row << 2, col << 2, &mvuv, bw, bh, w, h);
2501 if (b->bs > BS_8x8) {
2502 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2503 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2507 mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2508 s->dst[1], s->dst[2], ls_uv,
2509 ref2->data[1], ref2->linesize[1],
2510 ref2->data[2], ref2->linesize[2], tref2,
2511 row << 2, col << 2, &mvuv, bw, bh, w, h);
2516 /* mostly copied intra_recon() */
2518 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2519 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2520 int end_x = FFMIN(2 * (s->cols - col), w4);
2521 int end_y = FFMIN(2 * (s->rows - row), h4);
2522 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2523 int uvstep1d = 1 << b->uvtx, p;
2524 uint8_t *dst = s->dst[0];
// y itxfm add: inter residual always uses DCT_DCT
2527 for (n = 0, y = 0; y < end_y; y += step1d) {
2529 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2530 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2533 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2534 s->block + 16 * n, eob);
2536 dst += 4 * s->y_stride * step1d;
// uv itxfm add
2544 step = 1 << (b->uvtx * 2);
2545 for (p = 0; p < 2; p++) {
2546 dst = s->dst[p + 1];
2547 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2549 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2550 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2553 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2554 s->uvblock[p] + 16 * n, eob);
2556 dst += 4 * uvstep1d * s->uv_stride;
/**
 * Accumulate loopfilter edge masks for one block.
 *
 * ORs per-row bitmasks of the edges to filter into
 * lflvl->mask[is_uv][0 = row edges, 1 = col edges][y][mask_id], where
 * mask_id selects the filter width variant (see loopfilter_sb()).
 * row_and_7/col_and_7 give the block position within its 64x64
 * superblock; w/h are the block dimensions in mask-bit units and
 * col_end/row_end mark frame-edge truncation — TODO confirm exact units
 * for the chroma (is_uv) case.
 */
2562 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2563 int row_and_7, int col_and_7,
2564 int w, int h, int col_end, int row_end,
2565 enum TxfmMode tx, int skip_inter)
2567 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2568 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2569 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2570 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2572 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2573 // edges. This means that for UV, we work on two subsampled blocks at
2574 // a time, and we only use the topleft block's mode information to set
2575 // things like block strength. Thus, for any block size smaller than
2576 // 16x16, ignore the odd portion of the block.
2577 if (tx == TX_4X4 && is_uv) {
2592 if (tx == TX_4X4 && !skip_inter) {
// t = bit for this block's first column; m_col = bits for all w columns
2593 int t = 1 << col_and_7, m_col = (t << w) - t, y;
2594 int m_col_odd = (t << (w - 1)) - t;
2596 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2598 int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2600 for (y = row_and_7; y < h + row_and_7; y++) {
2601 int col_mask_id = 2 - !(y & 7);
2603 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2604 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2605 // for odd lines, if the odd col is not being filtered,
2606 // skip odd row also:
2613 // if a/c are even row/col and b/d are odd, and d is skipped,
2614 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2615 if ((col_end & 1) && (y & 1)) {
2616 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2618 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2622 int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2624 for (y = row_and_7; y < h + row_and_7; y++) {
2625 int col_mask_id = 2 - !(y & 3);
2627 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2628 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2629 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2630 lflvl->mask[is_uv][0][y][3] |= m_col;
2631 lflvl->mask[is_uv][1][y][3] |= m_col;
2635 int y, t = 1 << col_and_7, m_col = (t << w) - t;
2638 int mask_id = (tx == TX_8X8);
2639 int l2 = tx + is_uv - 1, step1d = 1 << l2;
// edge bit pattern per log2 transform size (every 1/2/4/8 columns)
2640 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2641 int m_row = m_col & masks[l2];
2643 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2644 // 8wd loopfilter to prevent going off the visible edge.
2645 if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2646 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2647 int m_row_8 = m_row - m_row_16;
2649 for (y = row_and_7; y < h + row_and_7; y++) {
2650 lflvl->mask[is_uv][0][y][0] |= m_row_16;
2651 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2654 for (y = row_and_7; y < h + row_and_7; y++)
2655 lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2658 if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2659 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2660 lflvl->mask[is_uv][1][y][0] |= m_col;
2661 if (y - row_and_7 == h - 1)
2662 lflvl->mask[is_uv][1][y][1] |= m_col;
2664 for (y = row_and_7; y < h + row_and_7; y += step1d)
2665 lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2667 } else if (tx != TX_4X4) {
2670 mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2671 lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2672 mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2673 for (y = row_and_7; y < h + row_and_7; y++)
2674 lflvl->mask[is_uv][0][y][mask_id] |= t;
2676 int t8 = t & 0x01, t4 = t - t8;
2678 for (y = row_and_7; y < h + row_and_7; y++) {
2679 lflvl->mask[is_uv][0][y][2] |= t4;
2680 lflvl->mask[is_uv][0][y][1] |= t8;
2682 lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2684 int t8 = t & 0x11, t4 = t - t8;
2686 for (y = row_and_7; y < h + row_and_7; y++) {
2687 lflvl->mask[is_uv][0][y][2] |= t4;
2688 lflvl->mask[is_uv][0][y][1] |= t8;
2690 lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
/**
 * Decode and reconstruct one block.
 *
 * Sets the MV clamping range for this position, reconstructs the pixels
 * (intra or inter), using temporary buffers when the block overhangs the
 * frame/stride and copying the visible part back afterwards, and finally
 * computes the loopfilter level and edge masks for the block.
 */
2695 static void decode_b(AVCodecContext *ctx, int row, int col,
2696 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2697 enum BlockLevel bl, enum BlockPartition bp)
2699 VP9Context *s = ctx->priv_data;
2701 enum BlockSize bs = bl * 3 + bp;
2702 int y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2704 AVFrame *f = s->frames[CUR_FRAME].tf.f;
// MV clamping window: 128 plus 64 per block unit on each side —
// NOTE(review): units presumably 1/8-pel, confirm against MV decoding
2710 s->min_mv.x = -(128 + col * 64);
2711 s->min_mv.y = -(128 + row * 64);
2712 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2713 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
// chroma tx size is one step smaller when the block is too small for b->tx
2719 b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
// skipped block: clear the non-zero-coefficient contexts instead of
// decoding a residual
2726 memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
2727 memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
2728 for (pl = 0; pl < 2; pl++) {
2729 memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
2730 memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
// advance coefficient/eob pointers past this block
2735 s->block += w4 * h4 * 64;
2736 s->uvblock[0] += w4 * h4 * 16;
2737 s->uvblock[1] += w4 * h4 * 16;
2738 s->eob += 4 * w4 * h4;
2739 s->uveob[0] += w4 * h4;
2740 s->uveob[1] += w4 * h4;
2746 // emulated overhangs if the stride of the target buffer can't hold. This
2747 // allows to support emu-edge and so on even if we have large block
2749 emu[0] = (col + w4) * 8 > f->linesize[0] ||
2750 (row + h4) > s->rows;
2751 emu[1] = (col + w4) * 4 > f->linesize[1] ||
2752 (row + h4) > s->rows;
// overhanging block: reconstruct into the temporary buffers instead
2754 s->dst[0] = s->tmp_y;
2757 s->dst[0] = f->data[0] + yoff;
2758 s->y_stride = f->linesize[0];
2761 s->dst[1] = s->tmp_uv[0];
2762 s->dst[2] = s->tmp_uv[1];
2765 s->dst[1] = f->data[1] + uvoff;
2766 s->dst[2] = f->data[2] + uvoff;
2767 s->uv_stride = f->linesize[1];
2770 intra_recon(ctx, yoff, uvoff);
// copy the visible part of the emulated luma block back into the frame,
// using progressively narrower MC copy functions (tmp_y stride is 64)
2775 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
2777 for (n = 0; o < w; n++) {
2782 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
2783 s->tmp_y + o, 64, h, 0, 0);
// same write-back for both chroma planes (tmp_uv stride is 32)
2789 int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
2791 for (n = 1; o < w; n++) {
2796 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
2797 s->tmp_uv[0] + o, 32, h, 0, 0);
2798 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
2799 s->tmp_uv[1] + o, 32, h, 0, 0);
2805 // pick filter level and find edges to apply filter to
2806 if (s->filter.level &&
2807 (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
2808 [b->mode[3] != ZEROMV]) > 0) {
2809 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
2810 int skip_inter = !b->intra && b->skip;
2812 for (y = 0; y < h4; y++)
2813 memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
2814 mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
2815 mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
2816 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
2817 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
2818 b->uvtx, skip_inter);
// lazily fill the limit LUTs for this filter level
2820 if (!s->filter.lim_lut[lvl]) {
2821 int sharp = s->filter.sharpness;
2825 limit >>= (sharp + 3) >> 2;
2826 limit = FFMIN(limit, 9 - sharp);
2828 limit = FFMAX(limit, 1);
2830 s->filter.lim_lut[lvl] = limit;
2831 s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
// advance coefficient/eob pointers past this block
2837 s->block += w4 * h4 * 64;
2838 s->uvblock[0] += w4 * h4 * 16;
2839 s->uvblock[1] += w4 * h4 * 16;
2840 s->eob += 4 * w4 * h4;
2841 s->uveob[0] += w4 * h4;
2842 s->uveob[1] += w4 * h4;
/**
 * Recursively parse the partition tree from the bitstream and decode all
 * blocks below this node. At frame edges only the partitions that fit
 * inside the frame are coded, hence the reduced-choice branches below.
 */
2846 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2847 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2849 VP9Context *s = ctx->priv_data;
// partition probability context from the above/left partition bits
2850 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
2851 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
2852 const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
2853 s->prob.p.partition[bl][c];
2854 enum BlockPartition bp;
// half block size at this level, in 8px block units
2855 ptrdiff_t hbs = 4 >> bl;
2856 AVFrame *f = s->frames[CUR_FRAME].tf.f;
2857 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
2860 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2861 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2862 } else if (col + hbs < s->cols) { // FIXME why not <=?
2863 if (row + hbs < s->rows) { // FIXME why not <=?
// fully inside the frame: all four partition types are possible
2864 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2866 case PARTITION_NONE:
2867 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2870 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2871 yoff += hbs * 8 * y_stride;
2872 uvoff += hbs * 4 * uv_stride;
2873 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
2876 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2879 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
2881 case PARTITION_SPLIT:
2882 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2883 decode_sb(ctx, row, col + hbs, lflvl,
2884 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2885 yoff += hbs * 8 * y_stride;
2886 uvoff += hbs * 4 * uv_stride;
2887 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2888 decode_sb(ctx, row + hbs, col + hbs, lflvl,
2889 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
// bottom half is outside the frame: a single branch on p[1] selects
// split vs. a non-split partition
2894 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
2895 bp = PARTITION_SPLIT;
2896 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2897 decode_sb(ctx, row, col + hbs, lflvl,
2898 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2901 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// right half is outside the frame: branch on p[2] instead
2903 } else if (row + hbs < s->rows) { // FIXME why not <=?
2904 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
2905 bp = PARTITION_SPLIT;
2906 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2907 yoff += hbs * 8 * y_stride;
2908 uvoff += hbs * 4 * uv_stride;
2909 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2912 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
// both halves outside: split is implied, no bit is read
2915 bp = PARTITION_SPLIT;
2916 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
// update adaptation counts for the chosen partition
2918 s->counts.partition[bl][c][bp]++;
/**
 * Second-pass variant of decode_sb(): replays the partition structure
 * recorded in the block array (s->b) instead of reading the bitstream,
 * and re-runs decode_b() for reconstruction.
 */
2921 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2922 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2924 VP9Context *s = ctx->priv_data;
// half block size at this level, in 8px block units
2926 ptrdiff_t hbs = 4 >> bl;
2927 AVFrame *f = s->frames[CUR_FRAME].tf.f;
2928 ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
2931 av_assert2(b->bl == BL_8X8);
2932 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
// stored block belongs to this level: replay its partition directly
2933 } else if (s->b->bl == bl) {
2934 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
2935 if (b->bp == PARTITION_H && row + hbs < s->rows) {
2936 yoff += hbs * 8 * y_stride;
2937 uvoff += hbs * 4 * uv_stride;
2938 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
2939 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
2942 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
// otherwise recurse into the four quadrants that fit in the frame
2945 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2946 if (col + hbs < s->cols) { // FIXME why not <=?
2947 if (row + hbs < s->rows) {
2948 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
2949 uvoff + 4 * hbs, bl + 1);
2950 yoff += hbs * 8 * y_stride;
2951 uvoff += hbs * 4 * uv_stride;
2952 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2953 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
2954 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2958 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
2960 } else if (row + hbs < s->rows) {
2961 yoff += hbs * 8 * y_stride;
2962 uvoff += hbs * 4 * uv_stride;
2963 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
/**
 * Apply the in-loop deblocking filter to one 64x64 superblock, driven by
 * the per-edge masks built in mask_edges() and the per-position filter
 * levels in lflvl->level: first the edges between columns, then between
 * rows, for Y, then the same for each chroma plane.
 *
 * Per edge: L is the filter level, H = L >> 4, and E/I come from the
 * precomputed mblim/lim LUTs filled in decode_b(). mask index [0]/[1]/[2]
 * selects the 16/8/4-pixel-wide filters; [3] the inner 4px variant.
 */
2968 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
2969 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
2971 VP9Context *s = ctx->priv_data;
2972 AVFrame *f = s->frames[CUR_FRAME].tf.f;
2973 uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
2974 ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
2977 // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
2978 // if you think of them as acting on a 8x8 block max, we can interleave
2979 // each v/h within the single x loop, but that only works if we work on
2980 // 8 pixel blocks, and we won't always do that (we want at least 16px
2981 // to use SSE2 optimizations, perhaps 32 for AVX2)
2983 // filter edges between columns, Y plane (e.g. block1 | block2)
// two mask rows are handled per iteration so vertically adjacent edges
// can be merged into one wider (16-row or mix2) filter call
2984 for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
2985 uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
2986 uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
2987 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
2988 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
2989 unsigned hm = hm1 | hm2 | hm13 | hm23;
2991 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
2993 int L = *l, H = L >> 4;
2994 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2997 if (hmask1[0] & x) {
2998 if (hmask2[0] & x) {
2999 av_assert2(l[8] == L);
3000 s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3002 s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3004 } else if (hm2 & x) {
// pack the second edge's parameters into the high byte for mix2
3007 E |= s->filter.mblim_lut[L] << 8;
3008 I |= s->filter.lim_lut[L] << 8;
3009 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3011 [0](ptr, ls_y, E, I, H);
3013 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3014 [0](ptr, ls_y, E, I, H);
3017 } else if (hm2 & x) {
3018 int L = l[8], H = L >> 4;
3019 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3022 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3023 [0](ptr + 8 * ls_y, ls_y, E, I, H);
// inner 4px edges (mask index [3]) sit 4 pixels into the 8px column
3027 int L = *l, H = L >> 4;
3028 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3033 E |= s->filter.mblim_lut[L] << 8;
3034 I |= s->filter.lim_lut[L] << 8;
3035 s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3037 s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3039 } else if (hm23 & x) {
3040 int L = l[8], H = L >> 4;
3041 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3043 s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3049 // filter edges between rows, Y plane (e.g. ------)
3051 dst = f->data[0] + yoff;
3053 for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3054 uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3055 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
// two column bits per iteration: horizontally adjacent edges can be
// merged into 16-wide or mix2 calls
3057 for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3060 int L = *l, H = L >> 4;
3061 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3064 if (vmask[0] & (x << 1)) {
3065 av_assert2(l[1] == L);
3066 s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3068 s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3070 } else if (vm & (x << 1)) {
3073 E |= s->filter.mblim_lut[L] << 8;
3074 I |= s->filter.lim_lut[L] << 8;
3075 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3076 [!!(vmask[1] & (x << 1))]
3077 [1](ptr, ls_y, E, I, H);
3079 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3080 [1](ptr, ls_y, E, I, H);
3082 } else if (vm & (x << 1)) {
3083 int L = l[1], H = L >> 4;
3084 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3086 s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3087 [1](ptr + 8, ls_y, E, I, H);
3091 int L = *l, H = L >> 4;
3092 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3094 if (vm3 & (x << 1)) {
3097 E |= s->filter.mblim_lut[L] << 8;
3098 I |= s->filter.lim_lut[L] << 8;
3099 s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3101 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3103 } else if (vm3 & (x << 1)) {
3104 int L = l[1], H = L >> 4;
3105 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3107 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3112 // same principle but for U/V planes
3113 for (p = 0; p < 2; p++) {
3115 dst = f->data[1 + p] + uvoff;
// chroma is subsampled: 4 level rows per iteration, l steps by 16
3116 for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3117 uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3118 uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3119 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3120 unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3122 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3125 int L = *l, H = L >> 4;
3126 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3128 if (hmask1[0] & x) {
3129 if (hmask2[0] & x) {
3130 av_assert2(l[16] == L);
3131 s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3133 s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3135 } else if (hm2 & x) {
3138 E |= s->filter.mblim_lut[L] << 8;
3139 I |= s->filter.lim_lut[L] << 8;
3140 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3142 [0](ptr, ls_uv, E, I, H);
3144 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3145 [0](ptr, ls_uv, E, I, H);
3147 } else if (hm2 & x) {
3148 int L = l[16], H = L >> 4;
3149 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3151 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3152 [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3160 dst = f->data[1 + p] + uvoff;
3161 for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3162 uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3163 unsigned vm = vmask[0] | vmask[1] | vmask[2];
3165 for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3168 int L = *l, H = L >> 4;
3169 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3172 if (vmask[0] & (x << 2)) {
3173 av_assert2(l[2] == L);
3174 s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3176 s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3178 } else if (vm & (x << 2)) {
3181 E |= s->filter.mblim_lut[L] << 8;
3182 I |= s->filter.lim_lut[L] << 8;
3183 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3184 [!!(vmask[1] & (x << 2))]
3185 [1](ptr, ls_uv, E, I, H);
3187 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3188 [1](ptr, ls_uv, E, I, H);
3190 } else if (vm & (x << 2)) {
3191 int L = l[2], H = L >> 4;
3192 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3194 s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3195 [1](ptr + 8, ls_uv, E, I, H);
/**
 * Compute the pixel range covered by tile number idx.
 *
 * The frame's n superblock rows/cols are divided into 2^log2_n tiles;
 * the tile boundaries in superblock units are scaled to pixels (x8),
 * clamped so they never exceed the frame.
 *
 * @param start  receives the first pixel position of the tile
 * @param end    receives the pixel position one past the tile
 * @param idx    tile index
 * @param log2_n log2 of the tile count
 * @param n      total size in superblock units
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first_sb = (idx * n) >> log2_n;
    int next_sb  = ((idx + 1) * n) >> log2_n;

    *start = (first_sb < n ? first_sb : n) << 3;
    *end   = (next_sb  < n ? next_sb  : n) << 3;
}
/**
 * Backward-adapt a single binary probability towards observed counts.
 *
 * ct0/ct1 are the number of times each symbol value was coded; the
 * empirical probability p2 is blended into *p with a weight of
 * update_factor/256, scaled down proportionally when the total count is
 * below max_count.
 */
3213 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3214 int max_count, int update_factor)
3216 unsigned ct = ct0 + ct1, p2, p1;
// empirical probability of the first symbol in 1/256 units, rounded,
// clipped to the valid probability range [1, 255]
3222 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3223 p2 = av_clip(p2, 1, 255);
// low counts reduce the effective update weight
3224 ct = FFMIN(ct, max_count);
3225 update_factor = FASTDIV(update_factor * ct, max_count);
3227 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3228 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3231 static void adapt_probs(VP9Context *s)
3234 prob_context *p = &s->prob_ctx[s->framectxid].p;
3235 int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3238 for (i = 0; i < 4; i++)
3239 for (j = 0; j < 2; j++)
3240 for (k = 0; k < 2; k++)
3241 for (l = 0; l < 6; l++)
3242 for (m = 0; m < 6; m++) {
3243 uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3244 unsigned *e = s->counts.eob[i][j][k][l][m];
3245 unsigned *c = s->counts.coef[i][j][k][l][m];
3247 if (l == 0 && m >= 3) // dc only has 3 pt
3250 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3251 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3252 adapt_prob(&pp[2], c[1], c[2], 24, uf);
3255 if (s->keyframe || s->intraonly) {
3256 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3257 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3258 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3259 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3264 for (i = 0; i < 3; i++)
3265 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3268 for (i = 0; i < 4; i++)
3269 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3272 if (s->comppredmode == PRED_SWITCHABLE) {
3273 for (i = 0; i < 5; i++)
3274 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3278 if (s->comppredmode != PRED_SINGLEREF) {
3279 for (i = 0; i < 5; i++)
3280 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3281 s->counts.comp_ref[i][1], 20, 128);
3284 if (s->comppredmode != PRED_COMPREF) {
3285 for (i = 0; i < 5; i++) {
3286 uint8_t *pp = p->single_ref[i];
3287 unsigned (*c)[2] = s->counts.single_ref[i];
3289 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3290 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3294 // block partitioning
3295 for (i = 0; i < 4; i++)
3296 for (j = 0; j < 4; j++) {
3297 uint8_t *pp = p->partition[i][j];
3298 unsigned *c = s->counts.partition[i][j];
3300 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3301 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3302 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3306 if (s->txfmmode == TX_SWITCHABLE) {
3307 for (i = 0; i < 2; i++) {
3308 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3310 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3311 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3312 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3313 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3314 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3315 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3319 // interpolation filter
3320 if (s->filtermode == FILTER_SWITCHABLE) {
3321 for (i = 0; i < 4; i++) {
3322 uint8_t *pp = p->filter[i];
3323 unsigned *c = s->counts.filter[i];
3325 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3326 adapt_prob(&pp[1], c[1], c[2], 20, 128);
3331 for (i = 0; i < 7; i++) {
3332 uint8_t *pp = p->mv_mode[i];
3333 unsigned *c = s->counts.mv_mode[i];
3335 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3336 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3337 adapt_prob(&pp[2], c[1], c[3], 20, 128);
3342 uint8_t *pp = p->mv_joint;
3343 unsigned *c = s->counts.mv_joint;
3345 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3346 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3347 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3351 for (i = 0; i < 2; i++) {
3353 unsigned *c, (*c2)[2], sum;
3355 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3356 s->counts.mv_comp[i].sign[1], 20, 128);
3358 pp = p->mv_comp[i].classes;
3359 c = s->counts.mv_comp[i].classes;
3360 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3361 adapt_prob(&pp[0], c[0], sum, 20, 128);
3363 adapt_prob(&pp[1], c[1], sum, 20, 128);
3365 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3366 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3368 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3369 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3371 adapt_prob(&pp[6], c[6], sum, 20, 128);
3372 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3373 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3374 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3376 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3377 s->counts.mv_comp[i].class0[1], 20, 128);
3378 pp = p->mv_comp[i].bits;
3379 c2 = s->counts.mv_comp[i].bits;
3380 for (j = 0; j < 10; j++)
3381 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3383 for (j = 0; j < 2; j++) {
3384 pp = p->mv_comp[i].class0_fp[j];
3385 c = s->counts.mv_comp[i].class0_fp[j];
3386 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3387 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3388 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3390 pp = p->mv_comp[i].fp;
3391 c = s->counts.mv_comp[i].fp;
3392 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3393 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3394 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3396 if (s->highprecisionmvs) {
3397 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3398 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3399 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3400 s->counts.mv_comp[i].hp[1], 20, 128);
3405 for (i = 0; i < 4; i++) {
3406 uint8_t *pp = p->y_mode[i];
3407 unsigned *c = s->counts.y_mode[i], sum, s2;
3409 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3410 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3411 sum -= c[TM_VP8_PRED];
3412 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3413 sum -= c[VERT_PRED];
3414 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3415 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3417 adapt_prob(&pp[3], s2, sum, 20, 128);
3419 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3420 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3421 sum -= c[DIAG_DOWN_LEFT_PRED];
3422 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3423 sum -= c[VERT_LEFT_PRED];
3424 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3425 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3429 for (i = 0; i < 10; i++) {
3430 uint8_t *pp = p->uv_mode[i];
3431 unsigned *c = s->counts.uv_mode[i], sum, s2;
3433 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3434 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3435 sum -= c[TM_VP8_PRED];
3436 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3437 sum -= c[VERT_PRED];
3438 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3439 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3441 adapt_prob(&pp[3], s2, sum, 20, 128);
3443 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3444 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3445 sum -= c[DIAG_DOWN_LEFT_PRED];
3446 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3447 sum -= c[VERT_LEFT_PRED];
3448 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3449 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3453 static void free_buffers(VP9Context *s)
3455 av_freep(&s->above_partition_ctx);
3456 av_freep(&s->b_base);
3457 av_freep(&s->block_base);
// Codec close callback: unref any frame data still held, then free the
// AVFrame shells for the two internal frames (CUR/LAST) and all 8
// reference slots (both the active refs[] and the pending next_refs[]).
3460 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3462 VP9Context *s = ctx->priv_data;
// Internal frames: unref buffer data first (only if actually allocated),
// then free the AVFrame container itself.
3465 for (i = 0; i < 2; i++) {
3466 if (s->frames[i].tf.f->data[0])
3467 vp9_unref_frame(ctx, &s->frames[i]);
3468 av_frame_free(&s->frames[i].tf.f);
// Reference slots: release through the frame-threading API since these
// are ThreadFrames shared with other decoding threads.
3470 for (i = 0; i < 8; i++) {
3471 if (s->refs[i].f->data[0])
3472 ff_thread_release_buffer(ctx, &s->refs[i]);
3473 av_frame_free(&s->refs[i].f);
3474 if (s->next_refs[i].f->data[0])
3475 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3476 av_frame_free(&s->next_refs[i].f);
// Main per-packet decode entry point: parses the frame header, rotates the
// internal CUR/LAST frame buffers, decodes all tiles (optionally in two
// passes for frame threading), runs the in-loop filter one superblock row
// at a time, and finally rotates the reference slots.
// NOTE(review): several original lines (braces, early returns) are missing
// from this extract; visible code kept verbatim.
3486 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3487 int *got_frame, AVPacket *pkt)
3489 const uint8_t *data = pkt->data;
3490 int size = pkt->size;
3491 VP9Context *s = ctx->priv_data;
3492 int res, tile_row, tile_col, i, ref, row, col;
3493 ptrdiff_t yoff, uvoff, ls_y, ls_uv;
// res == 0 from the header parser means "show existing frame": no new
// coded data, just output reference frame 'ref' directly.
3496 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3498 } else if (res == 0) {
3499 if (!s->refs[ref].f->data[0]) {
3500 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3501 return AVERROR_INVALIDDATA;
3503 if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
// Frame buffer rotation: previous CUR becomes LAST (used for inter
// prediction on non-keyframes), then a fresh CUR is allocated.
3511 if (s->frames[LAST_FRAME].tf.f->data[0])
3512 vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3513 if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3514 (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3516 if (s->frames[CUR_FRAME].tf.f->data[0])
3517 vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3518 if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3520 f = s->frames[CUR_FRAME].tf.f;
3521 f->key_frame = s->keyframe;
3522 f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3523 ls_y = f->linesize[0];
3524 ls_uv =f->linesize[1];
// Stage the post-decode reference set: each slot flagged in
// refreshrefmask points at the new frame, others carry over unchanged.
3527 for (i = 0; i < 8; i++) {
3528 if (s->next_refs[i].f->data[0])
3529 ff_thread_release_buffer(ctx, &s->next_refs[i]);
3530 if (s->refreshrefmask & (1 << i)) {
3531 res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3533 res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
// Reset the "above" entropy/prediction context rows for the whole frame
// width before tile decoding starts.
3539 // main tile decode loop
3540 memset(s->above_partition_ctx, 0, s->cols);
3541 memset(s->above_skip_ctx, 0, s->cols);
3542 if (s->keyframe || s->intraonly) {
3543 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3545 memset(s->above_mode_ctx, NEARESTMV, s->cols);
3547 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3548 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3549 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3550 memset(s->above_segpred_ctx, 0, s->cols);
// Two-pass mode is used only with frame threading when this frame adapts
// the entropy context and is not in error-resilient/parallel mode.
3551 s->pass = s->uses_2pass =
3552 ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
// In parallel mode the adapted probabilities are committed up front so
// dependent threads can proceed (ff_thread_finish_setup below).
3553 if (s->refreshctx && s->parallelmode) {
3556 for (i = 0; i < 4; i++) {
3557 for (j = 0; j < 2; j++)
3558 for (k = 0; k < 2; k++)
3559 for (l = 0; l < 6; l++)
3560 for (m = 0; m < 6; m++)
3561 memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3562 s->prob.coef[i][j][k][l][m], 3);
3563 if (s->txfmmode == i)
3566 s->prob_ctx[s->framectxid].p = s->prob.p;
3567 ff_thread_finish_setup(ctx);
// Rewind the coefficient/EOB write pointers to the start of their arenas
// for this pass.
3573 s->block = s->block_base;
3574 s->uvblock[0] = s->uvblock_base[0];
3575 s->uvblock[1] = s->uvblock_base[1];
3576 s->eob = s->eob_base;
3577 s->uveob[0] = s->uveob_base[0];
3578 s->uveob[1] = s->uveob_base[1];
// First walk all tiles to initialize one range decoder per tile column.
3580 for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3581 set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3582 tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3584 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
// The last tile has no explicit size prefix; all others carry a 32-bit
// big-endian byte count. (Assignment for the last-tile branch is on a
// stripped line — presumably tile_size = size; verify against upstream.)
3587 if (tile_col == s->tiling.tile_cols - 1 &&
3588 tile_row == s->tiling.tile_rows - 1) {
3591 tile_size = AV_RB32(data);
3595 if (tile_size > size) {
3596 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3597 return AVERROR_INVALIDDATA;
3599 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3600 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3601 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3602 return AVERROR_INVALIDDATA;
// Decode superblock rows: each iteration covers 8 rows of 8x8 blocks
// (one 64-pixel band of luma, 32 of chroma).
3609 for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3610 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3611 struct VP9Filter *lflvl_ptr = s->lflvl;
3612 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3614 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3615 set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3616 tile_col, s->tiling.log2_tile_cols, s->sb_cols);
// Left-edge contexts are reset at the start of every tile column so
// tiles stay independently decodable.
3619 memset(s->left_partition_ctx, 0, 8);
3620 memset(s->left_skip_ctx, 0, 8);
3621 if (s->keyframe || s->intraonly) {
3622 memset(s->left_mode_ctx, DC_PRED, 16);
3624 memset(s->left_mode_ctx, NEARESTMV, 8);
3626 memset(s->left_y_nnz_ctx, 0, 16);
3627 memset(s->left_uv_nnz_ctx, 0, 16);
3628 memset(s->left_segpred_ctx, 0, 8);
// Swap the per-tile range coder state in/out of s->c so the shared
// decode path can use s->c directly.
3630 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3633 for (col = s->tiling.tile_col_start;
3634 col < s->tiling.tile_col_end;
3635 col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3636 // FIXME integrate with lf code (i.e. zero after each
3637 // use, similar to invtxfm coefficients, or similar)
3639 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
// Pass 2 of two-pass decoding replays stored symbols via decode_sb_mem;
// otherwise decode_sb parses from the bitstream. (The branch condition
// is on a stripped line — presumably s->pass == 2.)
3643 decode_sb_mem(ctx, row, col, lflvl_ptr,
3644 yoff2, uvoff2, BL_64X64);
3646 decode_sb(ctx, row, col, lflvl_ptr,
3647 yoff2, uvoff2, BL_64X64);
3651 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3659 // backup pre-loopfilter reconstruction data for intra
3660 // prediction of next row of sb64s
3661 if (row + 8 < s->rows) {
3662 memcpy(s->intra_pred_data[0],
3663 f->data[0] + yoff + 63 * ls_y,
3665 memcpy(s->intra_pred_data[1],
3666 f->data[1] + uvoff + 31 * ls_uv,
3668 memcpy(s->intra_pred_data[2],
3669 f->data[2] + uvoff + 31 * ls_uv,
3673 // loopfilter one row
3674 if (s->filter.level) {
3677 lflvl_ptr = s->lflvl;
3678 for (col = 0; col < s->cols;
3679 col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3680 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3684 // FIXME maybe we can make this more finegrained by running the
3685 // loopfilter per-block instead of after each sbrow
3686 // In fact that would also make intra pred left preparation easier?
// row >> 3 == number of completed superblock rows; lets consumer
// threads start using this frame as a reference progressively.
3687 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
// Non-parallel adaptation happens after pass 1, then setup is finished
// so the next frame thread may start.
3691 if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3693 ff_thread_finish_setup(ctx);
3695 } while (s->pass++ == 1);
3696 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
// Commit the staged reference set: refs[] takes over next_refs[].
3699 for (i = 0; i < 8; i++) {
3700 if (s->refs[i].f->data[0])
3701 ff_thread_release_buffer(ctx, &s->refs[i]);
3702 ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
// Only output the frame if it is a "shown" frame per the header.
3705 if (!s->invisible) {
3706 if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
// Flush callback (e.g. on seek): drop all internal frames and all
// reference-slot buffers, but keep the AVFrame shells allocated (they are
// freed only in vp9_decode_free).
3714 static void vp9_decode_flush(AVCodecContext *ctx)
3716 VP9Context *s = ctx->priv_data;
3719 for (i = 0; i < 2; i++)
3720 vp9_unref_frame(ctx, &s->frames[i]);
3721 for (i = 0; i < 8; i++)
3722 ff_thread_release_buffer(ctx, &s->refs[i]);
// Allocate the AVFrame shells for the 2 internal frames and the 8
// reference slots (both refs[] and next_refs[]). On any allocation
// failure, vp9_decode_free() tears down whatever was already allocated
// (av_frame_free tolerates NULL), so no partial state leaks.
// Shared between normal init and per-thread copy init.
3725 static int init_frames(AVCodecContext *ctx)
3727 VP9Context *s = ctx->priv_data;
3730 for (i = 0; i < 2; i++) {
3731 s->frames[i].tf.f = av_frame_alloc();
3732 if (!s->frames[i].tf.f) {
3733 vp9_decode_free(ctx);
3734 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3735 return AVERROR(ENOMEM);
3738 for (i = 0; i < 8; i++) {
3739 s->refs[i].f = av_frame_alloc();
3740 s->next_refs[i].f = av_frame_alloc();
3741 if (!s->refs[i].f || !s->next_refs[i].f) {
3742 vp9_decode_free(ctx);
3743 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3744 return AVERROR(ENOMEM);
// Codec init callback: fixed 8-bit 4:2:0 output, DSP function tables,
// and frame-shell allocation. allocate_progress enables the
// ff_thread_report_progress() mechanism used during tile decoding.
3751 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3753 VP9Context *s = ctx->priv_data;
3755 ctx->internal->allocate_progress = 1;
3756 ctx->pix_fmt = AV_PIX_FMT_YUV420P;
3757 ff_vp9dsp_init(&s->dsp);
3758 ff_videodsp_init(&s->vdsp, 8);
// -1 marks the sharpness as "unset" so the first frame header forces a
// filter-limit LUT rebuild.
3759 s->filter.sharpness = -1;
3761 return init_frames(ctx);
// Frame-thread worker init: each thread copy only needs its own frame
// shells; everything else is synced via update_thread_context.
3764 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
3766 return init_frames(avctx);
// Frame-threading sync: copy decoding state from the source thread (src)
// into this thread's context (dst) before the next frame is decoded.
// Re-references frames/refs rather than copying pixel data.
// NOTE(review): the tail of this function (including its return) is on
// lines stripped from this extract; visible code kept verbatim.
3769 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
3772 VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
3774 // detect size changes in other threads
3775 if (s->above_partition_ctx &&
3776 (!ssrc->above_partition_ctx || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
// Re-reference the source thread's internal frames.
3780 for (i = 0; i < 2; i++) {
3781 if (s->frames[i].tf.f->data[0])
3782 vp9_unref_frame(dst, &s->frames[i]);
3783 if (ssrc->frames[i].tf.f->data[0]) {
3784 if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
// Our refs[] must reflect the source's *post-frame* reference set, i.e.
// its next_refs[] (the staged rotation from vp9_decode_frame).
3788 for (i = 0; i < 8; i++) {
3789 if (s->refs[i].f->data[0])
3790 ff_thread_release_buffer(dst, &s->refs[i]);
3791 if (ssrc->next_refs[i].f->data[0]) {
3792 if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
// Scalar/table state needed to parse the next frame header.
3797 s->invisible = ssrc->invisible;
3798 s->keyframe = ssrc->keyframe;
3799 s->uses_2pass = ssrc->uses_2pass;
3800 memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
3801 memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
// Segmentation features persist across frames only while enabled.
3802 if (ssrc->segmentation.enabled) {
3803 memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
3804 sizeof(s->segmentation.feat));
3810 AVCodec ff_vp9_decoder = {
3812 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
3813 .type = AVMEDIA_TYPE_VIDEO,
3814 .id = AV_CODEC_ID_VP9,
3815 .priv_data_size = sizeof(VP9Context),
3816 .init = vp9_decode_init,
3817 .close = vp9_decode_free,
3818 .decode = vp9_decode_frame,
3819 .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
3820 .flush = vp9_decode_flush,
3821 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
3822 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),