git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34
/* 24-bit sync code that decode_frame_header() reads and compares against in
 * every keyframe and intra-only frame header; a mismatch rejects the frame. */
#define VP9_SYNCCODE 0x498342
  36
  37 enum CompPredMode {
  38     PRED_SINGLEREF,
  39     PRED_COMPREF,
  40     PRED_SWITCHABLE,
  41 };
  42
  43 enum BlockLevel {
  44     BL_64X64,
  45     BL_32X32,
  46     BL_16X16,
  47     BL_8X8,
  48 };
  49
  50 enum BlockSize {
  51     BS_64x64,
  52     BS_64x32,
  53     BS_32x64,
  54     BS_32x32,
  55     BS_32x16,
  56     BS_16x32,
  57     BS_16x16,
  58     BS_16x8,
  59     BS_8x16,
  60     BS_8x8,
  61     BS_8x4,
  62     BS_4x8,
  63     BS_4x4,
  64     N_BS_SIZES,
  65 };
  66
/* Two motion vectors together with two reference entries.
 * VP9Frame.mv points at an array of these that vp9_alloc_frame() places
 * directly after the segmentation map inside the frame's extradata buffer.
 * NOTE(review): ref[] presumably holds the reference index for each mv[] —
 * confirm against the code that fills it. */
struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};
  71
/* A decoded frame plus its per-frame side data.
 * vp9_alloc_frame() fills all four fields, vp9_ref_frame() duplicates them
 * by taking new references, and vp9_unref_frame() releases tf and extradata. */
typedef struct VP9Frame {
    ThreadFrame tf;                  // picture buffer (ff_thread_get_buffer)
    AVBufferRef *extradata;          // one buffer backing both pointers below
    uint8_t *segmentation_map;       // == extradata->data
    struct VP9mvrefPair *mv;         // == extradata->data + 64 * sb_cols * sb_rows
} VP9Frame;
  78
/* Loop-filter data; update_size() reserves one of these per superblock
 * column (VP9Context.lflvl).
 * NOTE(review): level[] has 8 * 8 entries, presumably one per 8x8 block of
 * a 64x64 superblock — confirm against the code that writes it. */
struct VP9Filter {
    uint8_t level[8 * 8];
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};
  84
  85 typedef struct VP9Block {
  86     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
  87     enum FilterMode filter;
  88     VP56mv mv[4 /* b_idx */][2 /* ref */];
  89     enum BlockSize bs;
  90     enum TxfmMode tx, uvtx;
  91     enum BlockLevel bl;
  92     enum BlockPartition bp;
  93 } VP9Block;
  94
/* Private decoder state (AVCodecContext.priv_data).
 *
 * Do not reorder members casually: decode_frame_header() clears counts.coef
 * and counts.eob with a single memset that starts at counts.coef and spans
 * both arrays, so those two must remain adjacent and in this order. */
typedef struct VP9Context {
    VP9DSPContext dsp;
    VideoDSPContext vdsp;
    GetBitContext gb;           // bit reader for the uncompressed frame header
    VP56RangeCoder c;           // range coder for the compressed header
    VP56RangeCoder *c_b;        // array resized to tiling.tile_cols entries
    unsigned c_b_size;          // allocation size tracked by av_fast_realloc()
    VP9Block *b_base, *b;       // b_base is (re)allocated in update_size()
    int pass, uses_2pass, last_uses_2pass;
    int row, row7, col, col7;
    uint8_t *dst[3];
    ptrdiff_t y_stride, uv_stride;

    // bitstream header
    uint8_t profile;
    uint8_t keyframe, last_keyframe;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;
    uint8_t colorspace;
    uint8_t fullrange;
    uint8_t intraonly;
    uint8_t resetctx;
    uint8_t refreshrefmask;
    uint8_t highprecisionmvs;
    enum FilterMode filtermode;
    uint8_t allowcompinter;
    uint8_t fixcompref;
    uint8_t refreshctx;
    uint8_t parallelmode;
    uint8_t framectxid;         // 2-bit index into prob_ctx[]
    uint8_t refidx[3];          // 3-bit indices into refs[]
    uint8_t signbias[3];
    uint8_t varcompref[2];
    ThreadFrame refs[8], next_refs[8];
    // indices into frames[]
#define CUR_FRAME 0
#define LAST_FRAME 1
    VP9Frame frames[2];

    struct {
        uint8_t level;
        int8_t sharpness;
        // lim_lut is zeroed by the header parser whenever sharpness changes
        uint8_t lim_lut[64];
        uint8_t mblim_lut[64];
    } filter;
    struct {
        uint8_t enabled;
        int8_t mode[2];
        int8_t ref[4];
    } lf_delta;
    uint8_t yac_qi;
    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
    uint8_t lossless;           // set when yac_qi and all three q deltas are 0
    struct {
        uint8_t enabled;
        uint8_t temporal;
        uint8_t absolute_vals;
        uint8_t update_map;
        struct {
            uint8_t q_enabled;
            uint8_t lf_enabled;
            uint8_t ref_enabled;
            uint8_t skip_enabled;
            uint8_t ref_val;
            int16_t q_val;
            int8_t lf_val;
            // derived in decode_frame_header() from the values above:
            int16_t qmul[2][2];     // [0=y, 1=uv][0=dc, 1=ac]
            uint8_t lflvl[4][2];
        } feat[8];
    } segmentation;
    struct {
        unsigned log2_tile_cols, log2_tile_rows;
        unsigned tile_cols, tile_rows;
        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    } tiling;
    // frame size in 64x64 superblocks (sb_*) and in 8x8 units (rows/cols);
    // set by update_size()
    unsigned sb_cols, sb_rows, rows, cols;
    // saved probability contexts, selected by framectxid
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];
    // working probabilities for the current frame
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][11];
        uint8_t seg[7];
        uint8_t segpred[3];
    } prob;
    // symbol counters, zeroed by decode_frame_header()
    struct {
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        // coef and eob must stay adjacent, see the note above the struct
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;
    enum TxfmMode txfmmode;
    enum CompPredMode comppredmode;

    // contextual (left/above) cache
    // All above_* pointers are carved out of one allocation made in
    // update_size(); above_partition_ctx is its base and the only one freed.
    uint8_t left_partition_ctx[8], *above_partition_ctx;
    uint8_t left_mode_ctx[16], *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
    uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
    uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
    uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
    uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
    uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
    uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
    uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
    uint8_t left_filter_ctx[8], *above_filter_ctx;
    VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];

    // whole-frame cache
    // (intra_pred_data[] and lflvl live in the same allocation as above_*)
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];

    // block reconstruction intermediates
    // block_base is one allocation made in update_size(); the uvblock_base[],
    // eob_base and uveob_base[] pointers all point into it.
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    VP56mv min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
} VP9Context;
 239
 240 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
 241     {
 242         { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
 243         { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
 244     }, {
 245         { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
 246         { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
 247     }
 248 };
 249
 250 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 251 {
 252     VP9Context *s = ctx->priv_data;
 253     int ret, sz;
 254
 255     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 256         return ret;
 257     sz = 64 * s->sb_cols * s->sb_rows;
 258     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 259         ff_thread_release_buffer(ctx, &f->tf);
 260         return AVERROR(ENOMEM);
 261     }
 262
 263     f->segmentation_map = f->extradata->data;
 264     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 265
 266     // retain segmentation map if it doesn't update
 267     if (s->segmentation.enabled && !s->segmentation.update_map) {
 268         memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
 269     }
 270
 271     return 0;
 272 }
 273
 274 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 275 {
 276     ff_thread_release_buffer(ctx, &f->tf);
 277     av_buffer_unref(&f->extradata);
 278 }
 279
 280 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 281 {
 282     int res;
 283
 284     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 285         return res;
 286     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 287         vp9_unref_frame(ctx, dst);
 288         return AVERROR(ENOMEM);
 289     }
 290
 291     dst->segmentation_map = src->segmentation_map;
 292     dst->mv = src->mv;
 293
 294     return 0;
 295 }
 296
 297 static int update_size(AVCodecContext *ctx, int w, int h)
 298 {
 299     VP9Context *s = ctx->priv_data;
 300     uint8_t *p;
 301
 302     av_assert0(w > 0 && h > 0);
 303
 304     if (s->above_partition_ctx && w == ctx->width && h == ctx->height)
 305         return 0;
 306
 307     ctx->width  = w;
 308     ctx->height = h;
 309     s->sb_cols  = (w + 63) >> 6;
 310     s->sb_rows  = (h + 63) >> 6;
 311     s->cols     = (w + 7) >> 3;
 312     s->rows     = (h + 7) >> 3;
 313
 314 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
 315     av_freep(&s->above_partition_ctx);
 316     p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 317     if (!p)
 318         return AVERROR(ENOMEM);
 319     assign(s->above_partition_ctx, uint8_t *,              8);
 320     assign(s->above_skip_ctx,      uint8_t *,              8);
 321     assign(s->above_txfm_ctx,      uint8_t *,              8);
 322     assign(s->above_mode_ctx,      uint8_t *,             16);
 323     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 324     assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
 325     assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
 326     assign(s->intra_pred_data[0],  uint8_t *,             64);
 327     assign(s->intra_pred_data[1],  uint8_t *,             32);
 328     assign(s->intra_pred_data[2],  uint8_t *,             32);
 329     assign(s->above_segpred_ctx,   uint8_t *,              8);
 330     assign(s->above_intra_ctx,     uint8_t *,              8);
 331     assign(s->above_comp_ctx,      uint8_t *,              8);
 332     assign(s->above_ref_ctx,       uint8_t *,              8);
 333     assign(s->above_filter_ctx,    uint8_t *,              8);
 334     assign(s->lflvl,               struct VP9Filter *,     1);
 335     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 336 #undef assign
 337
 338     av_free(s->b_base);
 339     av_free(s->block_base);
 340     if (ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode) {
 341         int sbs = s->sb_cols * s->sb_rows;
 342
 343         s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
 344         s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
 345         if (!s->b_base || !s->block_base)
 346             return AVERROR(ENOMEM);
 347         s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
 348         s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
 349         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
 350         s->uveob_base[0] = s->eob_base + 256 * sbs;
 351         s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
 352     } else {
 353         s->b_base = av_malloc(sizeof(VP9Block));
 354         s->block_base = av_mallocz((64 * 64 + 128) * 3);
 355         if (!s->b_base || !s->block_base)
 356             return AVERROR(ENOMEM);
 357         s->uvblock_base[0] = s->block_base + 64 * 64;
 358         s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
 359         s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
 360         s->uveob_base[0] = s->eob_base + 256;
 361         s->uveob_base[1] = s->uveob_base[0] + 64;
 362     }
 363
 364     return 0;
 365 }
 366
 367 // for some reason the sign bit is at the end, not the start, of a bit sequence
 368 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 369 {
 370     int v = get_bits(gb, n);
 371     return get_bits1(gb) ? -v : v;
 372 }
 373
 374 static av_always_inline int inv_recenter_nonneg(int v, int m)
 375 {
 376     return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 377 }
 378
 379 // differential forward probability updates
 380 static int update_prob(VP56RangeCoder *c, int p)
 381 {
 382     static const int inv_map_table[254] = {
 383           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 384         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 385          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 386          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 387          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 388          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 389          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 390          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 391         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 392         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 393         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 394         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 395         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 396         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 397         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 398         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 399         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 400         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 401         252, 253,
 402     };
 403     int d;
 404
 405     /* This code is trying to do a differential probability update. For a
 406      * current probability A in the range [1, 255], the difference to a new
 407      * probability of any value can be expressed differentially as 1-A,255-A
 408      * where some part of this (absolute range) exists both in positive as
 409      * well as the negative part, whereas another part only exists in one
 410      * half. We're trying to code this shared part differentially, i.e.
 411      * times two where the value of the lowest bit specifies the sign, and
 412      * the single part is then coded on top of this. This absolute difference
 413      * then again has a value of [0,254], but a bigger value in this range
 414      * indicates that we're further away from the original value A, so we
 415      * can code this as a VLC code, since higher values are increasingly
 416      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 417      * updates vs. the 'fine, exact' updates further down the range, which
 418      * adds one extra dimension to this differential update model. */
 419
 420     if (!vp8_rac_get(c)) {
 421         d = vp8_rac_get_uint(c, 4) + 0;
 422     } else if (!vp8_rac_get(c)) {
 423         d = vp8_rac_get_uint(c, 4) + 16;
 424     } else if (!vp8_rac_get(c)) {
 425         d = vp8_rac_get_uint(c, 5) + 32;
 426     } else {
 427         d = vp8_rac_get_uint(c, 7);
 428         if (d >= 65)
 429             d = (d << 1) - 65 + vp8_rac_get(c);
 430         d += 64;
 431     }
 432
 433     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 434                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 435 }
 436
 437 static int decode_frame_header(AVCodecContext *ctx,
 438                                const uint8_t *data, int size, int *ref)
 439 {
 440     VP9Context *s = ctx->priv_data;
 441     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 442     int last_invisible;
 443     const uint8_t *data2;
 444
 445     /* general header */
 446     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 447         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 448         return res;
 449     }
 450     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 451         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 452         return AVERROR_INVALIDDATA;
 453     }
 454     s->profile = get_bits1(&s->gb);
 455     if (get_bits1(&s->gb)) { // reserved bit
 456         av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
 457         return AVERROR_INVALIDDATA;
 458     }
 459     if (get_bits1(&s->gb)) {
 460         *ref = get_bits(&s->gb, 3);
 461         return 0;
 462     }
 463     s->last_uses_2pass = s->uses_2pass;
 464     s->last_keyframe  = s->keyframe;
 465     s->keyframe       = !get_bits1(&s->gb);
 466     last_invisible    = s->invisible;
 467     s->invisible      = !get_bits1(&s->gb);
 468     s->errorres       = get_bits1(&s->gb);
 469     // FIXME disable this upon resolution change
 470     s->use_last_frame_mvs = !s->errorres && !last_invisible;
 471     if (s->keyframe) {
 472         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 473             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 474             return AVERROR_INVALIDDATA;
 475         }
 476         s->colorspace = get_bits(&s->gb, 3);
 477         if (s->colorspace == 7) { // RGB = profile 1
 478             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
 479             return AVERROR_INVALIDDATA;
 480         }
 481         s->fullrange  = get_bits1(&s->gb);
 482         // for profile 1, here follows the subsampling bits
 483         s->refreshrefmask = 0xff;
 484         w = get_bits(&s->gb, 16) + 1;
 485         h = get_bits(&s->gb, 16) + 1;
 486         if (get_bits1(&s->gb)) // display size
 487             skip_bits(&s->gb, 32);
 488     } else {
 489         s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
 490         s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
 491         if (s->intraonly) {
 492             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 493                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 494                 return AVERROR_INVALIDDATA;
 495             }
 496             s->refreshrefmask = get_bits(&s->gb, 8);
 497             w = get_bits(&s->gb, 16) + 1;
 498             h = get_bits(&s->gb, 16) + 1;
 499             if (get_bits1(&s->gb)) // display size
 500                 skip_bits(&s->gb, 32);
 501         } else {
 502             s->refreshrefmask = get_bits(&s->gb, 8);
 503             s->refidx[0]      = get_bits(&s->gb, 3);
 504             s->signbias[0]    = get_bits1(&s->gb);
 505             s->refidx[1]      = get_bits(&s->gb, 3);
 506             s->signbias[1]    = get_bits1(&s->gb);
 507             s->refidx[2]      = get_bits(&s->gb, 3);
 508             s->signbias[2]    = get_bits1(&s->gb);
 509             if (!s->refs[s->refidx[0]].f->data[0] ||
 510                 !s->refs[s->refidx[1]].f->data[0] ||
 511                 !s->refs[s->refidx[2]].f->data[0]) {
 512                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 513                 return AVERROR_INVALIDDATA;
 514             }
 515             if (get_bits1(&s->gb)) {
 516                 w = s->refs[s->refidx[0]].f->width;
 517                 h = s->refs[s->refidx[0]].f->height;
 518             } else if (get_bits1(&s->gb)) {
 519                 w = s->refs[s->refidx[1]].f->width;
 520                 h = s->refs[s->refidx[1]].f->height;
 521             } else if (get_bits1(&s->gb)) {
 522                 w = s->refs[s->refidx[2]].f->width;
 523                 h = s->refs[s->refidx[2]].f->height;
 524             } else {
 525                 w = get_bits(&s->gb, 16) + 1;
 526                 h = get_bits(&s->gb, 16) + 1;
 527             }
 528             if (get_bits1(&s->gb)) // display size
 529                 skip_bits(&s->gb, 32);
 530             s->highprecisionmvs = get_bits1(&s->gb);
 531             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 532                                                 get_bits(&s->gb, 2);
 533             s->allowcompinter = s->signbias[0] != s->signbias[1] ||
 534                                 s->signbias[0] != s->signbias[2];
 535             if (s->allowcompinter) {
 536                 if (s->signbias[0] == s->signbias[1]) {
 537                     s->fixcompref    = 2;
 538                     s->varcompref[0] = 0;
 539                     s->varcompref[1] = 1;
 540                 } else if (s->signbias[0] == s->signbias[2]) {
 541                     s->fixcompref    = 1;
 542                     s->varcompref[0] = 0;
 543                     s->varcompref[1] = 2;
 544                 } else {
 545                     s->fixcompref    = 0;
 546                     s->varcompref[0] = 1;
 547                     s->varcompref[1] = 2;
 548                 }
 549             }
 550         }
 551     }
 552     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
 553     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
 554     s->framectxid   = c = get_bits(&s->gb, 2);
 555
 556     /* loopfilter header data */
 557     s->filter.level = get_bits(&s->gb, 6);
 558     sharp = get_bits(&s->gb, 3);
 559     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 560     // the old cache values since they are still valid
 561     if (s->filter.sharpness != sharp)
 562         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
 563     s->filter.sharpness = sharp;
 564     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
 565         if (get_bits1(&s->gb)) {
 566             for (i = 0; i < 4; i++)
 567                 if (get_bits1(&s->gb))
 568                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 569             for (i = 0; i < 2; i++)
 570                 if (get_bits1(&s->gb))
 571                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 572         }
 573     } else {
 574         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 575     }
 576
 577     /* quantization header data */
 578     s->yac_qi      = get_bits(&s->gb, 8);
 579     s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 580     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 581     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 582     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
 583                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 584
 585     /* segmentation header info */
 586     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
 587         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
 588             for (i = 0; i < 7; i++)
 589                 s->prob.seg[i] = get_bits1(&s->gb) ?
 590                                  get_bits(&s->gb, 8) : 255;
 591             if ((s->segmentation.temporal = get_bits1(&s->gb)))
 592                 for (i = 0; i < 3; i++)
 593                     s->prob.segpred[i] = get_bits1(&s->gb) ?
 594                                          get_bits(&s->gb, 8) : 255;
 595         }
 596
 597         if (get_bits1(&s->gb)) {
 598             s->segmentation.absolute_vals = get_bits1(&s->gb);
 599             for (i = 0; i < 8; i++) {
 600                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 601                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 602                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 603                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 604                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 605                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 606                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 607             }
 608         }
 609     } else {
 610         s->segmentation.feat[0].q_enabled    = 0;
 611         s->segmentation.feat[0].lf_enabled   = 0;
 612         s->segmentation.feat[0].skip_enabled = 0;
 613         s->segmentation.feat[0].ref_enabled  = 0;
 614     }
 615
 616     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 617     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
 618         int qyac, qydc, quvac, quvdc, lflvl, sh;
 619
 620         if (s->segmentation.feat[i].q_enabled) {
 621             if (s->segmentation.absolute_vals)
 622                 qyac = s->segmentation.feat[i].q_val;
 623             else
 624                 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
 625         } else {
 626             qyac  = s->yac_qi;
 627         }
 628         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
 629         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
 630         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
 631         qyac  = av_clip_uintp2(qyac, 8);
 632
 633         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
 634         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
 635         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
 636         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
 637
 638         sh = s->filter.level >= 32;
 639         if (s->segmentation.feat[i].lf_enabled) {
 640             if (s->segmentation.absolute_vals)
 641                 lflvl = s->segmentation.feat[i].lf_val;
 642             else
 643                 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
 644         } else {
 645             lflvl  = s->filter.level;
 646         }
 647         s->segmentation.feat[i].lflvl[0][0] =
 648         s->segmentation.feat[i].lflvl[0][1] =
 649             av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
 650         for (j = 1; j < 4; j++) {
 651             s->segmentation.feat[i].lflvl[j][0] =
 652                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 653                                          s->lf_delta.mode[0]) << sh), 6);
 654             s->segmentation.feat[i].lflvl[j][1] =
 655                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 656                                          s->lf_delta.mode[1]) << sh), 6);
 657         }
 658     }
 659
 660     /* tiling info */
 661     if ((res = update_size(ctx, w, h)) < 0) {
 662         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
 663         return res;
 664     }
 665     for (s->tiling.log2_tile_cols = 0;
 666          (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
 667          s->tiling.log2_tile_cols++) ;
 668     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 669     max = FFMAX(0, max - 1);
 670     while (max > s->tiling.log2_tile_cols) {
 671         if (get_bits1(&s->gb))
 672             s->tiling.log2_tile_cols++;
 673         else
 674             break;
 675     }
 676     s->tiling.log2_tile_rows = decode012(&s->gb);
 677     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
 678     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
 679         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
 680         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 681                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
 682         if (!s->c_b) {
 683             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 684             return AVERROR(ENOMEM);
 685         }
 686     }
 687
 688     if (s->keyframe || s->errorres || s->intraonly) {
 689         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 690                            s->prob_ctx[3].p = vp9_default_probs;
 691         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 692                sizeof(vp9_default_coef_probs));
 693         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 694                sizeof(vp9_default_coef_probs));
 695         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 696                sizeof(vp9_default_coef_probs));
 697         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 698                sizeof(vp9_default_coef_probs));
 699     }
 700
 701     // next 16 bits is size of the rest of the header (arith-coded)
 702     size2 = get_bits(&s->gb, 16);
 703     data2 = align_get_bits(&s->gb);
 704     if (size2 > size - (data2 - data)) {
 705         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 706         return AVERROR_INVALIDDATA;
 707     }
 708     ff_vp56_init_range_decoder(&s->c, data2, size2);
 709     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 710         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 711         return AVERROR_INVALIDDATA;
 712     }
 713
 714     if (s->keyframe || s->intraonly) {
 715         memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
 716     } else {
 717         memset(&s->counts, 0, sizeof(s->counts));
 718     }
 719     // FIXME is it faster to not copy here, but do it down in the fw updates
 720     // as explicit copies if the fw update is missing (and skip the copy upon
 721     // fw update)?
 722     s->prob.p = s->prob_ctx[c].p;
 723
 724     // txfm updates
 725     if (s->lossless) {
 726         s->txfmmode = TX_4X4;
 727     } else {
 728         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
 729         if (s->txfmmode == 3)
 730             s->txfmmode += vp8_rac_get(&s->c);
 731
 732         if (s->txfmmode == TX_SWITCHABLE) {
 733             for (i = 0; i < 2; i++)
 734                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 735                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 736             for (i = 0; i < 2; i++)
 737                 for (j = 0; j < 2; j++)
 738                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 739                         s->prob.p.tx16p[i][j] =
 740                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 741             for (i = 0; i < 2; i++)
 742                 for (j = 0; j < 3; j++)
 743                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 744                         s->prob.p.tx32p[i][j] =
 745                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 746         }
 747     }
 748
 749     // coef updates
 750     for (i = 0; i < 4; i++) {
 751         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 752         if (vp8_rac_get(&s->c)) {
 753             for (j = 0; j < 2; j++)
 754                 for (k = 0; k < 2; k++)
 755                     for (l = 0; l < 6; l++)
 756                         for (m = 0; m < 6; m++) {
 757                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 758                             uint8_t *r = ref[j][k][l][m];
 759                             if (m >= 3 && l == 0) // dc only has 3 pt
 760                                 break;
 761                             for (n = 0; n < 3; n++) {
 762                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 763                                     p[n] = update_prob(&s->c, r[n]);
 764                                 } else {
 765                                     p[n] = r[n];
 766                                 }
 767                             }
 768                             p[3] = 0;
 769                         }
 770         } else {
 771             for (j = 0; j < 2; j++)
 772                 for (k = 0; k < 2; k++)
 773                     for (l = 0; l < 6; l++)
 774                         for (m = 0; m < 6; m++) {
 775                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 776                             uint8_t *r = ref[j][k][l][m];
 777                             if (m > 3 && l == 0) // dc only has 3 pt
 778                                 break;
 779                             memcpy(p, r, 3);
 780                             p[3] = 0;
 781                         }
 782         }
 783         if (s->txfmmode == i)
 784             break;
 785     }
 786
 787     // mode updates
 788     for (i = 0; i < 3; i++)
 789         if (vp56_rac_get_prob_branchy(&s->c, 252))
 790             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 791     if (!s->keyframe && !s->intraonly) {
 792         for (i = 0; i < 7; i++)
 793             for (j = 0; j < 3; j++)
 794                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 795                     s->prob.p.mv_mode[i][j] =
 796                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 797
 798         if (s->filtermode == FILTER_SWITCHABLE)
 799             for (i = 0; i < 4; i++)
 800                 for (j = 0; j < 2; j++)
 801                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 802                         s->prob.p.filter[i][j] =
 803                             update_prob(&s->c, s->prob.p.filter[i][j]);
 804
 805         for (i = 0; i < 4; i++)
 806             if (vp56_rac_get_prob_branchy(&s->c, 252))
 807                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 808
 809         if (s->allowcompinter) {
 810             s->comppredmode = vp8_rac_get(&s->c);
 811             if (s->comppredmode)
 812                 s->comppredmode += vp8_rac_get(&s->c);
 813             if (s->comppredmode == PRED_SWITCHABLE)
 814                 for (i = 0; i < 5; i++)
 815                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 816                         s->prob.p.comp[i] =
 817                             update_prob(&s->c, s->prob.p.comp[i]);
 818         } else {
 819             s->comppredmode = PRED_SINGLEREF;
 820         }
 821
 822         if (s->comppredmode != PRED_COMPREF) {
 823             for (i = 0; i < 5; i++) {
 824                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 825                     s->prob.p.single_ref[i][0] =
 826                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
 827                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 828                     s->prob.p.single_ref[i][1] =
 829                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
 830             }
 831         }
 832
 833         if (s->comppredmode != PRED_SINGLEREF) {
 834             for (i = 0; i < 5; i++)
 835                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 836                     s->prob.p.comp_ref[i] =
 837                         update_prob(&s->c, s->prob.p.comp_ref[i]);
 838         }
 839
 840         for (i = 0; i < 4; i++)
 841             for (j = 0; j < 9; j++)
 842                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 843                     s->prob.p.y_mode[i][j] =
 844                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
 845
 846         for (i = 0; i < 4; i++)
 847             for (j = 0; j < 4; j++)
 848                 for (k = 0; k < 3; k++)
 849                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 850                         s->prob.p.partition[3 - i][j][k] =
 851                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 852
 853         // mv fields don't use the update_prob subexp model for some reason
 854         for (i = 0; i < 3; i++)
 855             if (vp56_rac_get_prob_branchy(&s->c, 252))
 856                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 857
 858         for (i = 0; i < 2; i++) {
 859             if (vp56_rac_get_prob_branchy(&s->c, 252))
 860                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 861
 862             for (j = 0; j < 10; j++)
 863                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 864                     s->prob.p.mv_comp[i].classes[j] =
 865                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 866
 867             if (vp56_rac_get_prob_branchy(&s->c, 252))
 868                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 869
 870             for (j = 0; j < 10; j++)
 871                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 872                     s->prob.p.mv_comp[i].bits[j] =
 873                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 874         }
 875
 876         for (i = 0; i < 2; i++) {
 877             for (j = 0; j < 2; j++)
 878                 for (k = 0; k < 3; k++)
 879                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 880                         s->prob.p.mv_comp[i].class0_fp[j][k] =
 881                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 882
 883             for (j = 0; j < 3; j++)
 884                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 885                     s->prob.p.mv_comp[i].fp[j] =
 886                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 887         }
 888
 889         if (s->highprecisionmvs) {
 890             for (i = 0; i < 2; i++) {
 891                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 892                     s->prob.p.mv_comp[i].class0_hp =
 893                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 894
 895                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 896                     s->prob.p.mv_comp[i].hp =
 897                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 898             }
 899         }
 900     }
 901
 902     return (data2 - data) + size2;
 903 }
 904
 905 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
 906                                       VP9Context *s)
 907 {
 908     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
 909     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
 910 }
 911
 912 static void find_ref_mvs(VP9Context *s,
 913                          VP56mv *pmv, int ref, int z, int idx, int sb)
 914 {
 915     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
 916         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
 917                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
 918         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
 919                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
 920         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
 921                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
 922         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
 923                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 924         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
 925                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 926         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
 927                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
 928         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
 929                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 930         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
 931                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
 932         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
 933                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
 934         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 935                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 936         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 937                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 938         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 939                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 940         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 941                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 942     };
 943     VP9Block *b = s->b;
 944     int row = s->row, col = s->col, row7 = s->row7;
 945     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
 946 #define INVALID_MV 0x80008000U
 947     uint32_t mem = INVALID_MV;
 948     int i;
 949
 950 #define RETURN_DIRECT_MV(mv) \
 951     do { \
 952         uint32_t m = AV_RN32A(&mv); \
 953         if (!idx) { \
 954             AV_WN32A(pmv, m); \
 955             return; \
 956         } else if (mem == INVALID_MV) { \
 957             mem = m; \
 958         } else if (m != mem) { \
 959             AV_WN32A(pmv, m); \
 960             return; \
 961         } \
 962     } while (0)
 963
 964     if (sb >= 0) {
 965         if (sb == 2 || sb == 1) {
 966             RETURN_DIRECT_MV(b->mv[0][z]);
 967         } else if (sb == 3) {
 968             RETURN_DIRECT_MV(b->mv[2][z]);
 969             RETURN_DIRECT_MV(b->mv[1][z]);
 970             RETURN_DIRECT_MV(b->mv[0][z]);
 971         }
 972
 973 #define RETURN_MV(mv) \
 974     do { \
 975         if (sb > 0) { \
 976             VP56mv tmp; \
 977             uint32_t m; \
 978             clamp_mv(&tmp, &mv, s); \
 979             m = AV_RN32A(&tmp); \
 980             if (!idx) { \
 981                 AV_WN32A(pmv, m); \
 982                 return; \
 983             } else if (mem == INVALID_MV) { \
 984                 mem = m; \
 985             } else if (m != mem) { \
 986                 AV_WN32A(pmv, m); \
 987                 return; \
 988             } \
 989         } else { \
 990             uint32_t m = AV_RN32A(&mv); \
 991             if (!idx) { \
 992                 clamp_mv(pmv, &mv, s); \
 993                 return; \
 994             } else if (mem == INVALID_MV) { \
 995                 mem = m; \
 996             } else if (m != mem) { \
 997                 clamp_mv(pmv, &mv, s); \
 998                 return; \
 999             } \
1000         } \
1001     } while (0)
1002
1003         if (row > 0) {
1004             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1005             if (mv->ref[0] == ref) {
1006                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1007             } else if (mv->ref[1] == ref) {
1008                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1009             }
1010         }
1011         if (col > s->tiling.tile_col_start) {
1012             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1013             if (mv->ref[0] == ref) {
1014                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1015             } else if (mv->ref[1] == ref) {
1016                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1017             }
1018         }
1019         i = 2;
1020     } else {
1021         i = 0;
1022     }
1023
1024     // previously coded MVs in this neighbourhood, using same reference frame
1025     for (; i < 8; i++) {
1026         int c = p[i][0] + col, r = p[i][1] + row;
1027
1028         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1029             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1030
1031             if (mv->ref[0] == ref) {
1032                 RETURN_MV(mv->mv[0]);
1033             } else if (mv->ref[1] == ref) {
1034                 RETURN_MV(mv->mv[1]);
1035             }
1036         }
1037     }
1038
1039     // MV at this position in previous frame, using same reference frame
1040     if (s->use_last_frame_mvs) {
1041         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1042
1043         if (!s->last_uses_2pass)
1044             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1045         if (mv->ref[0] == ref) {
1046             RETURN_MV(mv->mv[0]);
1047         } else if (mv->ref[1] == ref) {
1048             RETURN_MV(mv->mv[1]);
1049         }
1050     }
1051
1052 #define RETURN_SCALE_MV(mv, scale) \
1053     do { \
1054         if (scale) { \
1055             VP56mv mv_temp = { -mv.x, -mv.y }; \
1056             RETURN_MV(mv_temp); \
1057         } else { \
1058             RETURN_MV(mv); \
1059         } \
1060     } while (0)
1061
1062     // previously coded MVs in this neighbourhood, using different reference frame
1063     for (i = 0; i < 8; i++) {
1064         int c = p[i][0] + col, r = p[i][1] + row;
1065
1066         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1067             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1068
1069             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1070                 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1071             }
1072             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1073                 // BUG - libvpx has this condition regardless of whether
1074                 // we used the first ref MV and pre-scaling
1075                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1076                 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1077             }
1078         }
1079     }
1080
1081     // MV at this position in previous frame, using different reference frame
1082     if (s->use_last_frame_mvs) {
1083         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1084
1085         // no need to await_progress, because we already did that above
1086         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1087             RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1088         }
1089         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1090             // BUG - libvpx has this condition regardless of whether
1091             // we used the first ref MV and pre-scaling
1092             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1093             RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1094         }
1095     }
1096
1097     AV_ZERO32(pmv);
1098 #undef INVALID_MV
1099 #undef RETURN_MV
1100 #undef RETURN_SCALE_MV
1101 }
1102
1103 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1104 {
1105     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1106     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1107                                 s->prob.p.mv_comp[idx].classes);
1108
1109     s->counts.mv_comp[idx].sign[sign]++;
1110     s->counts.mv_comp[idx].classes[c]++;
1111     if (c) {
1112         int m;
1113
1114         for (n = 0, m = 0; m < c; m++) {
1115             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1116             n |= bit << m;
1117             s->counts.mv_comp[idx].bits[m][bit]++;
1118         }
1119         n <<= 3;
1120         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1121         n |= bit << 1;
1122         s->counts.mv_comp[idx].fp[bit]++;
1123         if (hp) {
1124             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1125             s->counts.mv_comp[idx].hp[bit]++;
1126             n |= bit;
1127         } else {
1128             n |= 1;
1129             // bug in libvpx - we count for bw entropy purposes even if the
1130             // bit wasn't coded
1131             s->counts.mv_comp[idx].hp[1]++;
1132         }
1133         n += 8 << c;
1134     } else {
1135         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1136         s->counts.mv_comp[idx].class0[n]++;
1137         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1138                                s->prob.p.mv_comp[idx].class0_fp[n]);
1139         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1140         n = (n << 3) | (bit << 1);
1141         if (hp) {
1142             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1143             s->counts.mv_comp[idx].class0_hp[bit]++;
1144             n |= bit;
1145         } else {
1146             n |= 1;
1147             // bug in libvpx - we count for bw entropy purposes even if the
1148             // bit wasn't coded
1149             s->counts.mv_comp[idx].class0_hp[1]++;
1150         }
1151     }
1152
1153     return sign ? -(n + 1) : (n + 1);
1154 }
1155
1156 static void fill_mv(VP9Context *s,
1157                     VP56mv *mv, int mode, int sb)
1158 {
1159     VP9Block *b = s->b;
1160
1161     if (mode == ZEROMV) {
1162         memset(mv, 0, sizeof(*mv) * 2);
1163     } else {
1164         int hp;
1165
1166         // FIXME cache this value and reuse for other subblocks
1167         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1168                      mode == NEWMV ? -1 : sb);
1169         // FIXME maybe move this code into find_ref_mvs()
1170         if ((mode == NEWMV || sb == -1) &&
1171             !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1172             if (mv[0].y & 1) {
1173                 if (mv[0].y < 0)
1174                     mv[0].y++;
1175                 else
1176                     mv[0].y--;
1177             }
1178             if (mv[0].x & 1) {
1179                 if (mv[0].x < 0)
1180                     mv[0].x++;
1181                 else
1182                     mv[0].x--;
1183             }
1184         }
1185         if (mode == NEWMV) {
1186             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1187                                               s->prob.p.mv_joint);
1188
1189             s->counts.mv_joint[j]++;
1190             if (j >= MV_JOINT_V)
1191                 mv[0].y += read_mv_component(s, 0, hp);
1192             if (j & 1)
1193                 mv[0].x += read_mv_component(s, 1, hp);
1194         }
1195
1196         if (b->comp) {
1197             // FIXME cache this value and reuse for other subblocks
1198             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1199                          mode == NEWMV ? -1 : sb);
1200             if ((mode == NEWMV || sb == -1) &&
1201                 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1202                 if (mv[1].y & 1) {
1203                     if (mv[1].y < 0)
1204                         mv[1].y++;
1205                     else
1206                         mv[1].y--;
1207                 }
1208                 if (mv[1].x & 1) {
1209                     if (mv[1].x < 0)
1210                         mv[1].x++;
1211                     else
1212                         mv[1].x--;
1213                 }
1214             }
1215             if (mode == NEWMV) {
1216                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1217                                                   s->prob.p.mv_joint);
1218
1219                 s->counts.mv_joint[j]++;
1220                 if (j >= MV_JOINT_V)
1221                     mv[1].y += read_mv_component(s, 0, hp);
1222                 if (j & 1)
1223                     mv[1].x += read_mv_component(s, 1, hp);
1224             }
1225         }
1226     }
1227 }
1228
1229 static void decode_mode(AVCodecContext *ctx)
1230 {
1231     static const uint8_t left_ctx[N_BS_SIZES] = {
1232         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1233     };
1234     static const uint8_t above_ctx[N_BS_SIZES] = {
1235         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1236     };
1237     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1238         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1239         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1240     };
1241     VP9Context *s = ctx->priv_data;
1242     VP9Block *b = s->b;
1243     int row = s->row, col = s->col, row7 = s->row7;
1244     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1245     int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1246     int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1247     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1248
1249     if (!s->segmentation.enabled) {
1250         b->seg_id = 0;
1251     } else if (s->keyframe || s->intraonly) {
1252         b->seg_id = s->segmentation.update_map ?
1253             vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg) : 0;
1254     } else if (!s->segmentation.update_map ||
1255                (s->segmentation.temporal &&
1256                 vp56_rac_get_prob_branchy(&s->c,
1257                     s->prob.segpred[s->above_segpred_ctx[col] +
1258                                     s->left_segpred_ctx[row7]]))) {
1259         int pred = 8, x;
1260         uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1261
1262         if (!s->last_uses_2pass)
1263             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1264         for (y = 0; y < h4; y++)
1265             for (x = 0; x < w4; x++)
1266                 pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1267         av_assert1(pred < 8);
1268         b->seg_id = pred;
1269
1270         memset(&s->above_segpred_ctx[col], 1, w4);
1271         memset(&s->left_segpred_ctx[row7], 1, h4);
1272     } else {
1273         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1274                                      s->prob.seg);
1275
1276         memset(&s->above_segpred_ctx[col], 0, w4);
1277         memset(&s->left_segpred_ctx[row7], 0, h4);
1278     }
1279     if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
1280         uint8_t *segmap = s->frames[CUR_FRAME].segmentation_map;
1281
1282         for (y = 0; y < h4; y++)
1283             memset(&segmap[(y + row) * 8 * s->sb_cols + col], b->seg_id, w4);
1284     }
1285
1286     b->skip = s->segmentation.enabled &&
1287         s->segmentation.feat[b->seg_id].skip_enabled;
1288     if (!b->skip) {
1289         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1290         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1291         s->counts.skip[c][b->skip]++;
1292     }
1293
1294     if (s->keyframe || s->intraonly) {
1295         b->intra = 1;
1296     } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1297         b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1298     } else {
1299         int c, bit;
1300
1301         if (have_a && have_l) {
1302             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1303             c += (c == 2);
1304         } else {
1305             c = have_a ? 2 * s->above_intra_ctx[col] :
1306                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1307         }
1308         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1309         s->counts.intra[c][bit]++;
1310         b->intra = !bit;
1311     }
1312
1313     if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1314         int c;
1315         if (have_a) {
1316             if (have_l) {
1317                 c = (s->above_skip_ctx[col] ? max_tx :
1318                      s->above_txfm_ctx[col]) +
1319                     (s->left_skip_ctx[row7] ? max_tx :
1320                      s->left_txfm_ctx[row7]) > max_tx;
1321             } else {
1322                 c = s->above_skip_ctx[col] ? 1 :
1323                     (s->above_txfm_ctx[col] * 2 > max_tx);
1324             }
1325         } else if (have_l) {
1326             c = s->left_skip_ctx[row7] ? 1 :
1327                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1328         } else {
1329             c = 1;
1330         }
1331         switch (max_tx) {
1332         case TX_32X32:
1333             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1334             if (b->tx) {
1335                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1336                 if (b->tx == 2)
1337                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1338             }
1339             s->counts.tx32p[c][b->tx]++;
1340             break;
1341         case TX_16X16:
1342             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1343             if (b->tx)
1344                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1345             s->counts.tx16p[c][b->tx]++;
1346             break;
1347         case TX_8X8:
1348             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1349             s->counts.tx8p[c][b->tx]++;
1350             break;
1351         case TX_4X4:
1352             b->tx = TX_4X4;
1353             break;
1354         }
1355     } else {
1356         b->tx = FFMIN(max_tx, s->txfmmode);
1357     }
1358
1359     if (s->keyframe || s->intraonly) {
1360         uint8_t *a = &s->above_mode_ctx[col * 2];
1361         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1362
1363         b->comp = 0;
1364         if (b->bs > BS_8x8) {
1365             // FIXME the memory storage intermediates here aren't really
1366             // necessary, they're just there to make the code slightly
1367             // simpler for now
1368             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1369                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1370             if (b->bs != BS_8x4) {
1371                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1372                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1373                 l[0] = a[1] = b->mode[1];
1374             } else {
1375                 l[0] = a[1] = b->mode[1] = b->mode[0];
1376             }
1377             if (b->bs != BS_4x8) {
1378                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1379                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1380                 if (b->bs != BS_8x4) {
1381                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1382                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1383                     l[1] = a[1] = b->mode[3];
1384                 } else {
1385                     l[1] = a[1] = b->mode[3] = b->mode[2];
1386                 }
1387             } else {
1388                 b->mode[2] = b->mode[0];
1389                 l[1] = a[1] = b->mode[3] = b->mode[1];
1390             }
1391         } else {
1392             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1393                                           vp9_default_kf_ymode_probs[*a][*l]);
1394             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1395             // FIXME this can probably be optimized
1396             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1397             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1398         }
1399         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1400                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1401     } else if (b->intra) {
1402         b->comp = 0;
1403         if (b->bs > BS_8x8) {
1404             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1405                                           s->prob.p.y_mode[0]);
1406             s->counts.y_mode[0][b->mode[0]]++;
1407             if (b->bs != BS_8x4) {
1408                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1409                                               s->prob.p.y_mode[0]);
1410                 s->counts.y_mode[0][b->mode[1]]++;
1411             } else {
1412                 b->mode[1] = b->mode[0];
1413             }
1414             if (b->bs != BS_4x8) {
1415                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1416                                               s->prob.p.y_mode[0]);
1417                 s->counts.y_mode[0][b->mode[2]]++;
1418                 if (b->bs != BS_8x4) {
1419                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1420                                                   s->prob.p.y_mode[0]);
1421                     s->counts.y_mode[0][b->mode[3]]++;
1422                 } else {
1423                     b->mode[3] = b->mode[2];
1424                 }
1425             } else {
1426                 b->mode[2] = b->mode[0];
1427                 b->mode[3] = b->mode[1];
1428             }
1429         } else {
1430             static const uint8_t size_group[10] = {
1431                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1432             };
1433             int sz = size_group[b->bs];
1434
1435             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1436                                           s->prob.p.y_mode[sz]);
1437             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1438             s->counts.y_mode[sz][b->mode[3]]++;
1439         }
1440         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1441                                      s->prob.p.uv_mode[b->mode[3]]);
1442         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1443     } else {
1444         static const uint8_t inter_mode_ctx_lut[14][14] = {
1445             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1446             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1447             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1448             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1449             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1450             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1451             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1452             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1453             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1454             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1455             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1456             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1457             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1458             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1459         };
1460
1461         if (s->segmentation.feat[b->seg_id].ref_enabled) {
1462             av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1463             b->comp = 0;
1464             b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1465         } else {
1466             // read comp_pred flag
1467             if (s->comppredmode != PRED_SWITCHABLE) {
1468                 b->comp = s->comppredmode == PRED_COMPREF;
1469             } else {
1470                 int c;
1471
1472                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1473                 if (have_a) {
1474                     if (have_l) {
1475                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1476                             c = 4;
1477                         } else if (s->above_comp_ctx[col]) {
1478                             c = 2 + (s->left_intra_ctx[row7] ||
1479                                      s->left_ref_ctx[row7] == s->fixcompref);
1480                         } else if (s->left_comp_ctx[row7]) {
1481                             c = 2 + (s->above_intra_ctx[col] ||
1482                                      s->above_ref_ctx[col] == s->fixcompref);
1483                         } else {
1484                             c = (!s->above_intra_ctx[col] &&
1485                                  s->above_ref_ctx[col] == s->fixcompref) ^
1486                             (!s->left_intra_ctx[row7] &&
1487                              s->left_ref_ctx[row & 7] == s->fixcompref);
1488                         }
1489                     } else {
1490                         c = s->above_comp_ctx[col] ? 3 :
1491                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1492                     }
1493                 } else if (have_l) {
1494                     c = s->left_comp_ctx[row7] ? 3 :
1495                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1496                 } else {
1497                     c = 1;
1498                 }
1499                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1500                 s->counts.comp[c][b->comp]++;
1501             }
1502
1503             // read actual references
1504             // FIXME probably cache a few variables here to prevent repetitive
1505             // memory accesses below
1506             if (b->comp) /* two references */ {
1507                 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1508
1509                 b->ref[fix_idx] = s->fixcompref;
1510                 // FIXME can this codeblob be replaced by some sort of LUT?
1511                 if (have_a) {
1512                     if (have_l) {
1513                         if (s->above_intra_ctx[col]) {
1514                             if (s->left_intra_ctx[row7]) {
1515                                 c = 2;
1516                             } else {
1517                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1518                             }
1519                         } else if (s->left_intra_ctx[row7]) {
1520                             c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1521                         } else {
1522                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1523
1524                             if (refl == refa && refa == s->varcompref[1]) {
1525                                 c = 0;
1526                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1527                                 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1528                                     (refl == s->fixcompref && refa == s->varcompref[0])) {
1529                                     c = 4;
1530                                 } else {
1531                                     c = (refa == refl) ? 3 : 1;
1532                                 }
1533                             } else if (!s->left_comp_ctx[row7]) {
1534                                 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1535                                     c = 1;
1536                                 } else {
1537                                     c = (refl == s->varcompref[1] &&
1538                                          refa != s->varcompref[1]) ? 2 : 4;
1539                                 }
1540                             } else if (!s->above_comp_ctx[col]) {
1541                                 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1542                                     c = 1;
1543                                 } else {
1544                                     c = (refa == s->varcompref[1] &&
1545                                          refl != s->varcompref[1]) ? 2 : 4;
1546                                 }
1547                             } else {
1548                                 c = (refl == refa) ? 4 : 2;
1549                             }
1550                         }
1551                     } else {
1552                         if (s->above_intra_ctx[col]) {
1553                             c = 2;
1554                         } else if (s->above_comp_ctx[col]) {
1555                             c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1556                         } else {
1557                             c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1558                         }
1559                     }
1560                 } else if (have_l) {
1561                     if (s->left_intra_ctx[row7]) {
1562                         c = 2;
1563                     } else if (s->left_comp_ctx[row7]) {
1564                         c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1565                     } else {
1566                         c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1567                     }
1568                 } else {
1569                     c = 2;
1570                 }
1571                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1572                 b->ref[var_idx] = s->varcompref[bit];
1573                 s->counts.comp_ref[c][bit]++;
1574             } else /* single reference */ {
1575                 int bit, c;
1576
1577                 if (have_a && !s->above_intra_ctx[col]) {
1578                     if (have_l && !s->left_intra_ctx[row7]) {
1579                         if (s->left_comp_ctx[row7]) {
1580                             if (s->above_comp_ctx[col]) {
1581                                 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1582                                          !s->above_ref_ctx[col]);
1583                             } else {
1584                                 c = (3 * !s->above_ref_ctx[col]) +
1585                                     (!s->fixcompref || !s->left_ref_ctx[row7]);
1586                             }
1587                         } else if (s->above_comp_ctx[col]) {
1588                             c = (3 * !s->left_ref_ctx[row7]) +
1589                                 (!s->fixcompref || !s->above_ref_ctx[col]);
1590                         } else {
1591                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1592                         }
1593                     } else if (s->above_intra_ctx[col]) {
1594                         c = 2;
1595                     } else if (s->above_comp_ctx[col]) {
1596                         c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1597                     } else {
1598                         c = 4 * (!s->above_ref_ctx[col]);
1599                     }
1600                 } else if (have_l && !s->left_intra_ctx[row7]) {
1601                     if (s->left_intra_ctx[row7]) {
1602                         c = 2;
1603                     } else if (s->left_comp_ctx[row7]) {
1604                         c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1605                     } else {
1606                         c = 4 * (!s->left_ref_ctx[row7]);
1607                     }
1608                 } else {
1609                     c = 2;
1610                 }
1611                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1612                 s->counts.single_ref[c][0][bit]++;
1613                 if (!bit) {
1614                     b->ref[0] = 0;
1615                 } else {
1616                     // FIXME can this codeblob be replaced by some sort of LUT?
1617                     if (have_a) {
1618                         if (have_l) {
1619                             if (s->left_intra_ctx[row7]) {
1620                                 if (s->above_intra_ctx[col]) {
1621                                     c = 2;
1622                                 } else if (s->above_comp_ctx[col]) {
1623                                     c = 1 + 2 * (s->fixcompref == 1 ||
1624                                                  s->above_ref_ctx[col] == 1);
1625                                 } else if (!s->above_ref_ctx[col]) {
1626                                     c = 3;
1627                                 } else {
1628                                     c = 4 * (s->above_ref_ctx[col] == 1);
1629                                 }
1630                             } else if (s->above_intra_ctx[col]) {
1631                                 if (s->left_intra_ctx[row7]) {
1632                                     c = 2;
1633                                 } else if (s->left_comp_ctx[row7]) {
1634                                     c = 1 + 2 * (s->fixcompref == 1 ||
1635                                                  s->left_ref_ctx[row7] == 1);
1636                                 } else if (!s->left_ref_ctx[row7]) {
1637                                     c = 3;
1638                                 } else {
1639                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1640                                 }
1641                             } else if (s->above_comp_ctx[col]) {
1642                                 if (s->left_comp_ctx[row7]) {
1643                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1644                                         c = 3 * (s->fixcompref == 1 ||
1645                                                  s->left_ref_ctx[row7] == 1);
1646                                     } else {
1647                                         c = 2;
1648                                     }
1649                                 } else if (!s->left_ref_ctx[row7]) {
1650                                     c = 1 + 2 * (s->fixcompref == 1 ||
1651                                                  s->above_ref_ctx[col] == 1);
1652                                 } else {
1653                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1654                                     (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1655                                 }
1656                             } else if (s->left_comp_ctx[row7]) {
1657                                 if (!s->above_ref_ctx[col]) {
1658                                     c = 1 + 2 * (s->fixcompref == 1 ||
1659                                                  s->left_ref_ctx[row7] == 1);
1660                                 } else {
1661                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1662                                     (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1663                                 }
1664                             } else if (!s->above_ref_ctx[col]) {
1665                                 if (!s->left_ref_ctx[row7]) {
1666                                     c = 3;
1667                                 } else {
1668                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1669                                 }
1670                             } else if (!s->left_ref_ctx[row7]) {
1671                                 c = 4 * (s->above_ref_ctx[col] == 1);
1672                             } else {
1673                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1674                                 2 * (s->above_ref_ctx[col] == 1);
1675                             }
1676                         } else {
1677                             if (s->above_intra_ctx[col] ||
1678                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1679                                 c = 2;
1680                             } else if (s->above_comp_ctx[col]) {
1681                                 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1682                             } else {
1683                                 c = 4 * (s->above_ref_ctx[col] == 1);
1684                             }
1685                         }
1686                     } else if (have_l) {
1687                         if (s->left_intra_ctx[row7] ||
1688                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1689                             c = 2;
1690                         } else if (s->left_comp_ctx[row7]) {
1691                             c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1692                         } else {
1693                             c = 4 * (s->left_ref_ctx[row7] == 1);
1694                         }
1695                     } else {
1696                         c = 2;
1697                     }
1698                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1699                     s->counts.single_ref[c][1][bit]++;
1700                     b->ref[0] = 1 + bit;
1701                 }
1702             }
1703         }
1704
1705         if (b->bs <= BS_8x8) {
1706             if (s->segmentation.feat[b->seg_id].skip_enabled) {
1707                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1708             } else {
1709                 static const uint8_t off[10] = {
1710                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1711                 };
1712
1713                 // FIXME this needs to use the LUT tables from find_ref_mvs
1714                 // because not all are -1,0/0,-1
1715                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1716                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1717
1718                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1719                                               s->prob.p.mv_mode[c]);
1720                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1721                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1722             }
1723         }
1724
1725         if (s->filtermode == FILTER_SWITCHABLE) {
1726             int c;
1727
1728             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1729                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1730                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1731                         s->left_filter_ctx[row7] : 3;
1732                 } else {
1733                     c = s->above_filter_ctx[col];
1734                 }
1735             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1736                 c = s->left_filter_ctx[row7];
1737             } else {
1738                 c = 3;
1739             }
1740
1741             b->filter = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1742                                          s->prob.p.filter[c]);
1743             s->counts.filter[c][b->filter]++;
1744         } else {
1745             b->filter = s->filtermode;
1746         }
1747
1748         if (b->bs > BS_8x8) {
1749             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1750
1751             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1752                                           s->prob.p.mv_mode[c]);
1753             s->counts.mv_mode[c][b->mode[0] - 10]++;
1754             fill_mv(s, b->mv[0], b->mode[0], 0);
1755
1756             if (b->bs != BS_8x4) {
1757                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1758                                               s->prob.p.mv_mode[c]);
1759                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1760                 fill_mv(s, b->mv[1], b->mode[1], 1);
1761             } else {
1762                 b->mode[1] = b->mode[0];
1763                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1764                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1765             }
1766
1767             if (b->bs != BS_4x8) {
1768                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1769                                               s->prob.p.mv_mode[c]);
1770                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1771                 fill_mv(s, b->mv[2], b->mode[2], 2);
1772
1773                 if (b->bs != BS_8x4) {
1774                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1775                                                   s->prob.p.mv_mode[c]);
1776                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1777                     fill_mv(s, b->mv[3], b->mode[3], 3);
1778                 } else {
1779                     b->mode[3] = b->mode[2];
1780                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1781                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1782                 }
1783             } else {
1784                 b->mode[2] = b->mode[0];
1785                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1786                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1787                 b->mode[3] = b->mode[1];
1788                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1789                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1790             }
1791         } else {
1792             fill_mv(s, b->mv[0], b->mode[0], -1);
1793             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1794             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1795             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1796             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1797             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1798             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1799         }
1800     }
1801
1802     // FIXME this can probably be optimized
1803     memset(&s->above_skip_ctx[col], b->skip, w4);
1804     memset(&s->left_skip_ctx[row7], b->skip, h4);
1805     memset(&s->above_txfm_ctx[col], b->tx, w4);
1806     memset(&s->left_txfm_ctx[row7], b->tx, h4);
1807     memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
1808     memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
1809     if (!s->keyframe && !s->intraonly) {
1810         memset(&s->above_intra_ctx[col], b->intra, w4);
1811         memset(&s->left_intra_ctx[row7], b->intra, h4);
1812         memset(&s->above_comp_ctx[col], b->comp, w4);
1813         memset(&s->left_comp_ctx[row7], b->comp, h4);
1814         memset(&s->above_mode_ctx[col], b->mode[3], w4);
1815         memset(&s->left_mode_ctx[row7], b->mode[3], h4);
1816         if (s->filtermode == FILTER_SWITCHABLE && !b->intra ) {
1817             memset(&s->above_filter_ctx[col], b->filter, w4);
1818             memset(&s->left_filter_ctx[row7], b->filter, h4);
1819             b->filter = vp9_filter_lut[b->filter];
1820         }
1821         if (b->bs > BS_8x8) {
1822             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1823
1824             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1825             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1826             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1827             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1828             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1829             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1830             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1831             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1832         } else {
1833             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1834
1835             for (n = 0; n < w4 * 2; n++) {
1836                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1837                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1838             }
1839             for (n = 0; n < h4 * 2; n++) {
1840                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1841                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1842             }
1843         }
1844
1845         if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
1846                          // as a direct check in above branches
1847             int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1848
1849             memset(&s->above_ref_ctx[col], vref, w4);
1850             memset(&s->left_ref_ctx[row7], vref, h4);
1851         }
1852     }
1853
1854     // FIXME kinda ugly
1855     for (y = 0; y < h4; y++) {
1856         int x, o = (row + y) * s->sb_cols * 8 + col;
1857         struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1858
1859         if (b->intra) {
1860             for (x = 0; x < w4; x++) {
1861                 mv[x].ref[0] =
1862                 mv[x].ref[1] = -1;
1863             }
1864         } else if (b->comp) {
1865             for (x = 0; x < w4; x++) {
1866                 mv[x].ref[0] = b->ref[0];
1867                 mv[x].ref[1] = b->ref[1];
1868                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
1869                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
1870             }
1871         } else {
1872             for (x = 0; x < w4; x++) {
1873                 mv[x].ref[0] = b->ref[0];
1874                 mv[x].ref[1] = -1;
1875                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
1876             }
1877         }
1878     }
1879 }
1880
1881 // FIXME remove tx argument, and merge cnt/eob arguments?
     /*
      * Decode the coefficient tokens of one transform block from the range
      * coder and store the dequantized values.
      *
      * c           range coder to read from
      * coef        output; only the positions of non-zero coefficients are
      *             written (coef[scan[i]]), zero positions are left untouched
      *             (presumably the caller hands in a cleared buffer - TODO
      *             confirm)
      * n_coeffs    maximum number of scan positions to decode
      * tx          only compared against TX_32X32, where the dequantized value
      *             is additionally divided by 2
      * cnt         counters, cnt[band][ctx][0/1/2] is incremented for a
      *             zero / one / larger-than-one token
      * eob         counters, eob[band][ctx][bit] is incremented with the result
      *             of every end-of-block check
      * p           probabilities indexed as p[band][ctx]; entries 0..2 are read
      *             directly, entries 3..10 are filled in on demand from
      *             vp9_model_pareto8[] (this writes into the caller's table)
      * nnz         context for the first token
      * scan        maps scan index i to the coefficient position
      * nb          for scan index i, the two positions whose cache[] values
      *             form the context of the following token
      * band_counts number of scan positions in each band
      * qmul        dequantization factors; qmul[0] is used for i == 0 and
      *             qmul[1] for every other index (presumably DC/AC)
      *
      * Returns the scan index at which decoding stopped: 0 for an empty block,
      * n_coeffs if the block was decoded up to its last position.
      */
1882 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
1883                            enum TxfmMode tx, unsigned (*cnt)[6][3],
1884                            unsigned (*eob)[6][2], uint8_t (*p)[6][11],
1885                            int nnz, const int16_t *scan, const int16_t (*nb)[2],
1886                            const int16_t *band_counts, const int16_t *qmul)
1887 {
1888     int i = 0, band = 0, band_left = band_counts[band];
1889     uint8_t *tp = p[0][nnz];
         // per-position magnitude class (0..5) of the tokens decoded so far;
         // left uninitialized, entries are written at scan[i] before nb[] refers
         // to them (relies on the layout of the scan/nb tables)
1890     uint8_t cache[1024];
1891
1892     do {
1893         int val, rc;
1894
1895         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
1896         eob[band][nnz][val]++;
             // a 0 bit means there are no further coefficients in this block
1897         if (!val)
1898             break;
1899
             // re-entry point after a zero token: no end-of-block check is read
             // directly after a zero
1900     skip_eob:
1901         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
1902             cnt[band][nnz][0]++;
                 // current band exhausted, move on to the next one
1903             if (!--band_left)
1904                 band_left = band_counts[++band];
1905             cache[scan[i]] = 0;
                 // next context: rounded average of the two neighbour classes
1906             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1907             tp = p[band][nnz];
1908             if (++i == n_coeffs)
1909                 break; //invalid input; blocks should end with EOB
1910             goto skip_eob;
1911         }
1912
1913         rc = scan[i];
1914         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
1915             cnt[band][nnz][1]++;
1916             val = 1;
1917             cache[rc] = 1;
1918         } else {
1919             // fill in p[3-10] (model fill) - only once per frame for each pos
                 // tp[3] == 0 is used as the "not filled in yet" marker
1920             if (!tp[3])
1921                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
1922
1923             cnt[band][nnz][2]++;
1924             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
1925                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
1926                     cache[rc] = val = 2;
1927                 } else {
1928                     val = 3 + vp56_rac_get_prob(c, tp[5]);
1929                     cache[rc] = 3;
1930                 }
1931             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
                     // values 5..10; the extra bits below use fixed probabilities
1932                 cache[rc] = 4;
1933                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
1934                     val = 5 + vp56_rac_get_prob(c, 159);
1935                 } else {
1936                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
1937                     val +=      vp56_rac_get_prob(c, 145);
1938                 }
1939             } else { // cat 3-6
                     // values from 11 upwards: base value plus 3, 4, 5 or 14 extra
                     // bits, read most significant bit first
1940                 cache[rc] = 5;
1941                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
1942                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
1943                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
1944                         val +=      (vp56_rac_get_prob(c, 148) << 1);
1945                         val +=       vp56_rac_get_prob(c, 140);
1946                     } else {
1947                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
1948                         val +=      (vp56_rac_get_prob(c, 155) << 2);
1949                         val +=      (vp56_rac_get_prob(c, 140) << 1);
1950                         val +=       vp56_rac_get_prob(c, 135);
1951                     }
1952                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
1953                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
1954                     val +=      (vp56_rac_get_prob(c, 157) << 3);
1955                     val +=      (vp56_rac_get_prob(c, 141) << 2);
1956                     val +=      (vp56_rac_get_prob(c, 134) << 1);
1957                     val +=       vp56_rac_get_prob(c, 130);
1958                 } else {
1959                     val  = 67 + (vp56_rac_get_prob(c, 254) << 13);
1960                     val +=      (vp56_rac_get_prob(c, 254) << 12);
1961                     val +=      (vp56_rac_get_prob(c, 254) << 11);
1962                     val +=      (vp56_rac_get_prob(c, 252) << 10);
1963                     val +=      (vp56_rac_get_prob(c, 249) << 9);
1964                     val +=      (vp56_rac_get_prob(c, 243) << 8);
1965                     val +=      (vp56_rac_get_prob(c, 230) << 7);
1966                     val +=      (vp56_rac_get_prob(c, 196) << 6);
1967                     val +=      (vp56_rac_get_prob(c, 177) << 5);
1968                     val +=      (vp56_rac_get_prob(c, 153) << 4);
1969                     val +=      (vp56_rac_get_prob(c, 140) << 3);
1970                     val +=      (vp56_rac_get_prob(c, 133) << 2);
1971                     val +=      (vp56_rac_get_prob(c, 130) << 1);
1972                     val +=       vp56_rac_get_prob(c, 129);
1973                 }
1974             }
1975         }
             // current band exhausted, move on to the next one
1976         if (!--band_left)
1977             band_left = band_counts[++band];
             // the sign bit is read after the magnitude; the signed value is
             // scaled by qmul[0] for the first scan index and qmul[1] otherwise
1978         if (tx == TX_32X32) // FIXME slow
1979             coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
1980         else
1981             coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
             // next context: rounded average of the two neighbour classes
1982         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1983         tp = p[band][nnz];
1984     } while (++i < n_coeffs);
1985
1986     return i;
1987 }
1988
     /*
      * Decode all residual coefficients of the current block (s->b): first the
      * luma plane, then both chroma planes.
      *
      * For every transform block decode_coeffs_b() is called; its result is
      * stored in s->eob[] (luma) or s->uveob[pl][] (chroma), the coefficients go
      * to s->block / s->uvblock[pl], and the above/left non-zero context arrays
      * are updated in place with a 0/1 flag per transform block.
      *
      * ctx  codec context; ctx->priv_data is used as the VP9Context
      */
1989 static void decode_coeffs(AVCodecContext *ctx)
1990 {
1991     VP9Context *s = ctx->priv_data;
1992     VP9Block *b = s->b;
1993     int row = s->row, col = s->col;
         // luma tables first; p/c/e are re-pointed at the chroma tables below
1994     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
1995     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
1996     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
         // block width/height, doubled from the bwh_tab[1] entry (presumably this
         // yields units of 4 luma samples - TODO confirm against bwh_tab)
1997     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
         // clip to the distance from col/row to s->cols/s->rows, so nothing is
         // decoded beyond that limit (presumably the frame edge)
1998     int end_x = FFMIN(2 * (s->cols - col), w4);
1999     int end_y = FFMIN(2 * (s->rows - row), h4);
         // step1d: width of one transform in context-array entries,
         // step: number of 16-coefficient units covered by one transform
2000     int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
2001     int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
2002     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
         // lossless blocks use a separate set of scan tables, 4 entries further on
2003     int tx = 4 * s->lossless + b->tx;
2004     const int16_t * const *yscans = vp9_scans[tx];
2005     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
         // chroma always uses the DCT_DCT scan of its transform size
2006     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2007     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2008     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2009     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
         // one row per transform size; the six listed entries of a row add up to
         // that size's coefficient count (16, 64, 256, 1024), the two remaining
         // entries of each row are implicitly zero
2010     static const int16_t band_counts[4][8] = {
2011         { 1, 2, 3, 4,  3,   16 - 13 },
2012         { 1, 2, 3, 4, 11,   64 - 21 },
2013         { 1, 2, 3, 4, 11,  256 - 21 },
2014         { 1, 2, 3, 4, 11, 1024 - 21 },
2015     };
2016     const int16_t *y_band_counts = band_counts[b->tx];
2017     const int16_t *uv_band_counts = band_counts[b->uvtx];
2018
2019     /* y tokens */
         // for transforms larger than 4x4, fold the context flags covered by each
         // transform into its first entry, so a[x] / l[y] hold one flag each
2020     if (b->tx > TX_4X4) { // FIXME slow
2021         for (y = 0; y < end_y; y += step1d)
2022             for (x = 1; x < step1d; x++)
2023                 l[y] |= l[y + x];
2024         for (x = 0; x < end_x; x += step1d)
2025             for (y = 1; y < step1d; y++)
2026                 a[x] |= a[x + y];
2027     }
2028     for (n = 0, y = 0; y < end_y; y += step1d) {
2029         for (x = 0; x < end_x; x += step1d, n += step) {
                 // only blocks after BS_8x8 in the enum (8x4, 4x8, 4x4) with 4x4
                 // transforms pick a mode per transform block; all others use
                 // mode[0]
2030             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
2031                                                              b->bs > BS_8x8 ?
2032                                                              n : 0]];
                 // initial token context: above flag plus left flag (0..2)
2033             int nnz = a[x] + l[y];
2034             res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
2035                                   b->tx, c, e, p, nnz, yscans[txtp],
2036                                   ynbs[txtp], y_band_counts, qmul[0]);
2037             a[x] = l[y] = !!res;
                 // res can be as large as 16 * step; for the transforms above
                 // TX_8X8 it is written as a 16-bit value
2038             if (b->tx > TX_8X8) {
2039                 AV_WN16A(&s->eob[n], res);
2040             } else {
2041                 s->eob[n] = res;
2042             }
2043         }
2044     }
         // spread each transform's flag over the remaining entries it covers,
         // without writing past end_y / end_x
2045     if (b->tx > TX_4X4) { // FIXME slow
2046         for (y = 0; y < end_y; y += step1d)
2047             memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
2048         for (x = 0; x < end_x; x += step1d)
2049             memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
2050     }
2051
         /* uv tokens: same procedure with the chroma tables, b->uvtx and qmul[1] */
2052     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2053     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2054     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
         // chroma dimensions are half the luma ones in both directions
2055     w4 >>= 1;
2056     h4 >>= 1;
2057     end_x >>= 1;
2058     end_y >>= 1;
2059     for (pl = 0; pl < 2; pl++) {
2060         a = &s->above_uv_nnz_ctx[pl][col];
2061         l = &s->left_uv_nnz_ctx[pl][row & 7];
             // fold the context flags per transform, as done for luma above
2062         if (b->uvtx > TX_4X4) { // FIXME slow
2063             for (y = 0; y < end_y; y += uvstep1d)
2064                 for (x = 1; x < uvstep1d; x++)
2065                     l[y] |= l[y + x];
2066             for (x = 0; x < end_x; x += uvstep1d)
2067                 for (y = 1; y < uvstep1d; y++)
2068                     a[x] |= a[x + y];
2069         }
2070         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2071             for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
2072                 int nnz = a[x] + l[y];
2073                 res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
2074                                       16 * uvstep, b->uvtx, c, e, p, nnz,
2075                                       uvscan, uvnb, uv_band_counts, qmul[1]);
2076                 a[x] = l[y] = !!res;
2077                 if (b->uvtx > TX_8X8) {
2078                     AV_WN16A(&s->uveob[pl][n], res);
2079                 } else {
2080                     s->uveob[pl][n] = res;
2081                 }
2082             }
2083         }
             // spread the flags back over the covered entries, clipped to end_x/y
2084         if (b->uvtx > TX_4X4) { // FIXME slow
2085             for (y = 0; y < end_y; y += uvstep1d)
2086                 memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
2087             for (x = 0; x < end_x; x += uvstep1d)
2088                 memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
2089         }
2090     }
2091 }
2092
/*
 * Choose the intra predictor that fits the neighbours actually available
 * for one transform block, and gather the edge pixels that predictor reads.
 *
 * The coded mode is remapped through mode_conv[mode][have_left][have_top].
 * For the remapped mode:
 *   - if it reads the top edge, *a is either redirected to point straight at
 *     the existing row above, or the buffer behind *a is filled in
 *     ((*a)[-1] receives the top-left pixel, (*a)[4..7] the top-right of a
 *     4x4 block);
 *   - if it reads the left edge, l[0 .. (4 << tx) - 1] is filled in.
 *
 * s                        decoder context; cols, rows, tiling.tile_col_start
 *                          and intra_pred_data[] are read
 * mode                     coded intra mode, asserted to be in [0, 10)
 * a                        in/out: top-edge buffer pointer; because (*a)[-1]
 *                          may be written, the buffer needs room before it
 * dst_edge, stride_edge    pixel pointer/stride used when the sub-block sits
 *                          on the first row (y == 0) or column (x == 0)
 * dst_inner, stride_inner  pixel pointer/stride used otherwise
 * l                        output buffer for the left edge
 * col, row                 block position (col is scaled by 8 pixels for
 *                          p == 0 and by 4 pixels otherwise)
 * x, y                     sub-block offset inside the block, in 4-pixel units
 * w                        width that x is compared against for have_right
 * tx                       transform size; edge length is 4 << tx pixels
 * p                        plane index; 0 is handled as the full-resolution
 *                          plane, non-zero as a half-resolution one
 *
 * Returns the remapped mode.
 */
2093 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2094                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2095                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2096                                              uint8_t *l, int col, int x, int w,
2097                                              int row, int y, enum TxfmMode tx,
2098                                              int p)
2099 {
         // neighbour availability: nothing above the first row, nothing to the
         // left of the tile's first column, nothing to the right of the last
         // sub-block column
2100     int have_top = row > 0 || y > 0;
2101     int have_left = col > s->tiling.tile_col_start || x > 0;
2102     int have_right = x < w - 1;
         // replacement mode for every coded mode, depending on which of the
         // left/top neighbours exist
2103     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2104         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2105                                    { DC_127_PRED,          VERT_PRED } },
2106         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2107                                    { HOR_PRED,             HOR_PRED } },
2108         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2109                                    { LEFT_DC_PRED,         DC_PRED } },
2110         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2111                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2112         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2113                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2114         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2115                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2116         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2117                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2118         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2119                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2120         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2121                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2122         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2123                                    { HOR_PRED,             TM_VP8_PRED } },
2124     };
         // which edge pixels each (already remapped) mode reads
2125     static const struct {
2126         uint8_t needs_left:1;
2127         uint8_t needs_top:1;
2128         uint8_t needs_topleft:1;
2129         uint8_t needs_topright:1;
2130     } edges[N_INTRA_PRED_MODES] = {
2131         [VERT_PRED]            = { .needs_top  = 1 },
2132         [HOR_PRED]             = { .needs_left = 1 },
2133         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2134         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2135         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2136         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2137         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2138         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2139         [HOR_UP_PRED]          = { .needs_left = 1 },
2140         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2141         [LEFT_DC_PRED]         = { .needs_left = 1 },
2142         [TOP_DC_PRED]          = { .needs_top  = 1 },
2143         [DC_128_PRED]          = { 0 },
2144         [DC_127_PRED]          = { 0 },
2145         [DC_129_PRED]          = { 0 }
2146     };
2147
2148     av_assert2(mode >= 0 && mode < 10);
2149     mode = mode_conv[mode][have_left][have_top];
2150     if (edges[mode].needs_top) {
2151         uint8_t *top, *topleft;
             // n_px_need: length of the top edge; n_px_have: pixels left between
             // this sub-block and the limit derived from s->cols (presumably the
             // frame width in 8-pixel units -- confirm against the context setup)
2152         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2153         int n_px_need_tr = 0;
2154
             // 4x4 blocks may additionally read 4 pixels to the top-right
2155         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2156             n_px_need_tr = 4;
2157
2158         // if top of sb64-row, use s->intra_pred_data[] instead of
2159         // dst[-stride] for intra prediction (it contains pre- instead of
2160         // post-loopfilter data)
2161         if (have_top) {
2162             top = !(row & 7) && !y ?
2163                 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2164                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
                 // topleft is only assigned when both neighbours exist; every
                 // later read of it is guarded by have_left && have_top
2165             if (have_left)
2166                 topleft = !(row & 7) && !y ?
2167                     s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2168                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2169                     &dst_inner[-stride_inner];
2170         }
2171
             // fast path: everything the mode reads already lies contiguously
             // in one buffer, so point *a at it instead of copying
2172         if (have_top &&
2173             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2174             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2175             n_px_need + n_px_need_tr <= n_px_have) {
2176             *a = top;
2177         } else {
                 // slow path: assemble the edge in the caller's buffer
2178             if (have_top) {
2179                 if (n_px_need <= n_px_have) {
2180                     memcpy(*a, top, n_px_need);
2181                 } else {
                         // copy what exists, then repeat the last available pixel
2182                     memcpy(*a, top, n_px_have);
2183                     memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2184                            n_px_need - n_px_have);
2185                 }
2186             } else {
                     // no row above: constant 127
2187                 memset(*a, 127, n_px_need);
2188             }
2189             if (edges[mode].needs_topleft) {
2190                 if (have_left && have_top) {
2191                     (*a)[-1] = topleft[-1];
2192                 } else {
                         // 129 when only the top exists, 127 when there is no top
2193                     (*a)[-1] = have_top ? 129 : 127;
2194                 }
2195             }
2196             if (tx == TX_4X4 && edges[mode].needs_topright) {
2197                 if (have_top && have_right &&
2198                     n_px_need + n_px_need_tr <= n_px_have) {
2199                     memcpy(&(*a)[4], &top[4], 4);
2200                 } else {
                         // top-right unavailable: repeat the last top pixel
2201                     memset(&(*a)[4], (*a)[3], 4);
2202                 }
2203             }
2204         }
2205     }
2206     if (edges[mode].needs_left) {
2207         if (have_left) {
                 // same need/have bookkeeping as for the top edge, but measured
                 // vertically against s->rows
2208             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2209             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2210             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2211
2212             if (n_px_need <= n_px_have) {
2213                 for (i = 0; i < n_px_need; i++)
2214                     l[i] = dst[i * stride - 1];
2215             } else {
                     // read the column that exists, then repeat its last pixel
2216                 for (i = 0; i < n_px_have; i++)
2217                     l[i] = dst[i * stride - 1];
2218                 memset(&l[i], l[i - 1], n_px_need - n_px_have);
2219             }
2220         } else {
                 // no column to the left: constant 129
2221             memset(l, 129, 4 << tx);
2222         }
2223     }
2224
2225     return mode;
2226 }
2227
2228 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2229 {
2230     VP9Context *s = ctx->priv_data;
2231     VP9Block *b = s->b;
2232     int row = s->row, col = s->col;
2233     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2234     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2235     int end_x = FFMIN(2 * (s->cols - col), w4);
2236     int end_y = FFMIN(2 * (s->rows - row), h4);
2237     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2238     int uvstep1d = 1 << b->uvtx, p;
2239     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2240
2241     for (n = 0, y = 0; y < end_y; y += step1d) {
2242         uint8_t *ptr = dst, *ptr_r = dst_r;
2243         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2244                                ptr_r += 4 * step1d, n += step) {
2245             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2246                                y * 2 + x : 0];
2247             LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2248             uint8_t *a = &a_buf[16], l[32];
2249             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2250             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2251
2252             mode = check_intra_mode(s, mode, &a, ptr_r,
2253                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2254                                     ptr, s->y_stride, l,
2255                                     col, x, w4, row, y, b->tx, 0);
2256             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2257             if (eob)
2258                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2259                                            s->block + 16 * n, eob);
2260         }
2261         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2262         dst   += 4 * step1d * s->y_stride;
2263     }
2264
2265     // U/V
2266     h4 >>= 1;
2267     w4 >>= 1;
2268     end_x >>= 1;
2269     end_y >>= 1;
2270     step = 1 << (b->uvtx * 2);
2271     for (p = 0; p < 2; p++) {
2272         dst   = s->dst[1 + p];
2273         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2274         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2275             uint8_t *ptr = dst, *ptr_r = dst_r;
2276             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2277                                    ptr_r += 4 * uvstep1d, n += step) {
2278                 int mode = b->uvmode;
2279                 LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2280                 uint8_t *a = &a_buf[16], l[32];
2281                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2282
2283                 mode = check_intra_mode(s, mode, &a, ptr_r,
2284                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2285                                         ptr, s->uv_stride, l,
2286                                         col, x, w4, row, y, b->uvtx, p + 1);
2287                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2288                 if (eob)
2289                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2290                                                     s->uvblock[p] + 16 * n, eob);
2291             }
2292             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2293             dst   += 4 * uvstep1d * s->uv_stride;
2294         }
2295     }
2296 }
2297
2298 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2299                                          uint8_t *dst, ptrdiff_t dst_stride,
2300                                          const uint8_t *ref, ptrdiff_t ref_stride,
2301                                          ThreadFrame *ref_frame,
2302                                          ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2303                                          int bw, int bh, int w, int h)
2304 {
2305     int mx = mv->x, my = mv->y, th;
2306
2307     y += my >> 3;
2308     x += mx >> 3;
2309     ref += y * ref_stride + x;
2310     mx &= 7;
2311     my &= 7;
2312     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2313     // we use +7 because the last 7 pixels of each sbrow can be changed in
2314     // the longest loopfilter of the next sbrow
2315     th = (y + bh + 4 * !!my + 7) >> 6;
2316     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2317     if (x < !!mx * 3 || y < !!my * 3 ||
2318         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2319         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2320                                  ref - !!my * 3 * ref_stride - !!mx * 3,
2321                                  80, ref_stride,
2322                                  bw + !!mx * 7, bh + !!my * 7,
2323                                  x - !!mx * 3, y - !!my * 3, w, h);
2324         ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2325         ref_stride = 80;
2326     }
2327     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2328 }
2329
2330 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2331                                            uint8_t *dst_u, uint8_t *dst_v,
2332                                            ptrdiff_t dst_stride,
2333                                            const uint8_t *ref_u, ptrdiff_t src_stride_u,
2334                                            const uint8_t *ref_v, ptrdiff_t src_stride_v,
2335                                            ThreadFrame *ref_frame,
2336                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2337                                            int bw, int bh, int w, int h)
2338 {
2339     int mx = mv->x, my = mv->y, th;
2340
2341     y += my >> 4;
2342     x += mx >> 4;
2343     ref_u += y * src_stride_u + x;
2344     ref_v += y * src_stride_v + x;
2345     mx &= 15;
2346     my &= 15;
2347     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2348     // we use +7 because the last 7 pixels of each sbrow can be changed in
2349     // the longest loopfilter of the next sbrow
2350     th = (y + bh + 4 * !!my + 7) >> 5;
2351     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2352     if (x < !!mx * 3 || y < !!my * 3 ||
2353         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2354         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2355                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2356                                  80, src_stride_u,
2357                                  bw + !!mx * 7, bh + !!my * 7,
2358                                  x - !!mx * 3, y - !!my * 3, w, h);
2359         ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2360         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2361
2362         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2363                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2364                                  80, src_stride_v,
2365                                  bw + !!mx * 7, bh + !!my * 7,
2366                                  x - !!mx * 3, y - !!my * 3, w, h);
2367         ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2368         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2369     } else {
2370         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2371         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2372     }
2373 }
2374
2375 static void inter_recon(AVCodecContext *ctx)
2376 {
2377     static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2378         { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2379         { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2380     };
2381     VP9Context *s = ctx->priv_data;
2382     VP9Block *b = s->b;
2383     int row = s->row, col = s->col;
2384     ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]];
2385     AVFrame *ref1 = tref1->f;
2386     ThreadFrame *tref2 = b->comp ? &s->refs[s->refidx[b->ref[1]]] : NULL;
2387     AVFrame *ref2 = b->comp ? tref2->f : NULL;
2388     int w = ctx->width, h = ctx->height;
2389     ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2390
2391     // y inter pred
2392     if (b->bs > BS_8x8) {
2393         if (b->bs == BS_8x4) {
2394             mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2395                         ref1->data[0], ref1->linesize[0], tref1,
2396                         row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
2397             mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2398                         s->dst[0] + 4 * ls_y, ls_y,
2399                         ref1->data[0], ref1->linesize[0], tref1,
2400                         (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
2401
2402             if (b->comp) {
2403                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2404                             ref2->data[0], ref2->linesize[0], tref2,
2405                             row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
2406                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2407                             s->dst[0] + 4 * ls_y, ls_y,
2408                             ref2->data[0], ref2->linesize[0], tref2,
2409                             (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
2410             }
2411         } else if (b->bs == BS_4x8) {
2412             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2413                         ref1->data[0], ref1->linesize[0], tref1,
2414                         row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
2415             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2416                         ref1->data[0], ref1->linesize[0], tref1,
2417                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
2418
2419             if (b->comp) {
2420                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2421                             ref2->data[0], ref2->linesize[0], tref2,
2422                             row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
2423                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2424                             ref2->data[0], ref2->linesize[0], tref2,
2425                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
2426             }
2427         } else {
2428             av_assert2(b->bs == BS_4x4);
2429
2430             // FIXME if two horizontally adjacent blocks have the same MV,
2431             // do a w8 instead of a w4 call
2432             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2433                         ref1->data[0], ref1->linesize[0], tref1,
2434                         row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
2435             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2436                         ref1->data[0], ref1->linesize[0], tref1,
2437                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
2438             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2439                         s->dst[0] + 4 * ls_y, ls_y,
2440                         ref1->data[0], ref1->linesize[0], tref1,
2441                         (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
2442             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2443                         s->dst[0] + 4 * ls_y + 4, ls_y,
2444                         ref1->data[0], ref1->linesize[0], tref1,
2445                         (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
2446
2447             if (b->comp) {
2448                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2449                             ref2->data[0], ref2->linesize[0], tref2,
2450                             row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
2451                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2452                             ref2->data[0], ref2->linesize[0], tref2,
2453                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
2454                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2455                             s->dst[0] + 4 * ls_y, ls_y,
2456                             ref2->data[0], ref2->linesize[0], tref2,
2457                             (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
2458                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2459                             s->dst[0] + 4 * ls_y + 4, ls_y,
2460                             ref2->data[0], ref2->linesize[0], tref2,
2461                             (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
2462             }
2463         }
2464     } else {
2465         int bwl = bwlog_tab[0][b->bs];
2466         int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2467
2468         mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2469                     ref1->data[0], ref1->linesize[0], tref1,
2470                     row << 3, col << 3, &b->mv[0][0],bw, bh, w, h);
2471
2472         if (b->comp)
2473             mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2474                         ref2->data[0], ref2->linesize[0], tref2,
2475                         row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
2476     }
2477
2478     // uv inter pred
2479     {
2480         int bwl = bwlog_tab[1][b->bs];
2481         int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2482         VP56mv mvuv;
2483
2484         w = (w + 1) >> 1;
2485         h = (h + 1) >> 1;
2486         if (b->bs > BS_8x8) {
2487             mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2488             mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2489         } else {
2490             mvuv = b->mv[0][0];
2491         }
2492
2493         mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2494                       s->dst[1], s->dst[2], ls_uv,
2495                       ref1->data[1], ref1->linesize[1],
2496                       ref1->data[2], ref1->linesize[2], tref1,
2497                       row << 2, col << 2, &mvuv, bw, bh, w, h);
2498
2499         if (b->comp) {
2500             if (b->bs > BS_8x8) {
2501                 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2502                 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2503             } else {
2504                 mvuv = b->mv[0][1];
2505             }
2506             mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2507                           s->dst[1], s->dst[2], ls_uv,
2508                           ref2->data[1], ref2->linesize[1],
2509                           ref2->data[2], ref2->linesize[2], tref2,
2510                           row << 2, col << 2, &mvuv, bw, bh, w, h);
2511         }
2512     }
2513
2514     if (!b->skip) {
2515         /* mostly copied intra_reconn() */
2516
2517         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2518         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2519         int end_x = FFMIN(2 * (s->cols - col), w4);
2520         int end_y = FFMIN(2 * (s->rows - row), h4);
2521         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2522         int uvstep1d = 1 << b->uvtx, p;
2523         uint8_t *dst = s->dst[0];
2524
2525         // y itxfm add
2526         for (n = 0, y = 0; y < end_y; y += step1d) {
2527             uint8_t *ptr = dst;
2528             for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2529                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2530
2531                 if (eob)
2532                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2533                                                   s->block + 16 * n, eob);
2534             }
2535             dst += 4 * s->y_stride * step1d;
2536         }
2537
2538         // uv itxfm add
2539         h4 >>= 1;
2540         w4 >>= 1;
2541         end_x >>= 1;
2542         end_y >>= 1;
2543         step = 1 << (b->uvtx * 2);
2544         for (p = 0; p < 2; p++) {
2545             dst = s->dst[p + 1];
2546             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2547                 uint8_t *ptr = dst;
2548                 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2549                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2550
2551                     if (eob)
2552                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2553                                                         s->uvblock[p] + 16 * n, eob);
2554                 }
2555                 dst += 4 * uvstep1d * s->uv_stride;
2556             }
2557         }
2558     }
2559 }
2560
/*
 * OR the loopfilter edge bits of one block into lflvl->mask[is_uv].
 *
 * Entries are addressed as mask[is_uv][d][y][id], with d in {0, 1} and y a
 * row inside the current 8-row group. Inside an entry, bit col_and_7 is the
 * block's first column: t is that single bit, and m_col = (t << w) - t is
 * the run of w bits covering the whole block width.
 * NOTE(review): judging by the m_row_16 / m_row_8 / m_row_4 names and the
 * "8-px wide" / "4-px wide" remarks below, id presumably selects the filter
 * width (0: 16, 1: 8, 2: 4); id 3 is only written for luma 4x4 -- confirm
 * against the VP9Filter declaration.
 *
 * lflvl               filter state whose mask[][][][] is updated
 * is_uv               first mask index; also enables the chroma-only paths
 * row_and_7           first mask row touched
 * col_and_7           bit position of the block's first column
 * w, h                number of columns / rows the block covers in the mask
 * col_end, row_end    tested for zero when widening 1-wide/1-high 4x4 chroma
 *                     blocks; col_end is also tested for oddness
 * tx                  transform size of the block
 * skip_inter          non-zero: only mark the block's first row in d == 1
 *                     and its first column in d == 0 (the "else" branches)
 */
2561 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2562                                         int row_and_7, int col_and_7,
2563                                         int w, int h, int col_end, int row_end,
2564                                         enum TxfmMode tx, int skip_inter)
2565 {
2566     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2567     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2568     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2569     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2570
2571     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2572     // edges. This means that for UV, we work on two subsampled blocks at
2573     // a time, and we only use the topleft block's mode information to set
2574     // things like block strength. Thus, for any block size smaller than
2575     // 16x16, ignore the odd portion of the block.
2576     if (tx == TX_4X4 && is_uv) {
2577         if (h == 1) {
                 // odd row: already covered by the even row above it
2578             if (row_and_7 & 1)
2579                 return;
                 // widen to the pair of rows when row_end is zero
2580             if (!row_end)
2581                 h += 1;
2582         }
2583         if (w == 1) {
2584             if (col_and_7 & 1)
2585                 return;
2586             if (!col_end)
2587                 w += 1;
2588         }
2589     }
2590
2591     if (tx == TX_4X4 && !skip_inter) {
             // m_col_odd: same as m_col but without the block's last column
2592         int t = 1 << col_and_7, m_col = (t << w) - t, y;
2593         int m_col_odd = (t << (w - 1)) - t;
2594
2595         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2596         if (is_uv) {
                 // only bit 0 goes to id 1, every other column to id 2
2597             int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2598
2599             for (y = row_and_7; y < h + row_and_7; y++) {
                     // id 1 on row 0 of the group, id 2 on all other rows
2600                 int col_mask_id = 2 - !(y & 7);
2601
2602                 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2603                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2604                 // for odd lines, if the odd col is not being filtered,
2605                 // skip odd row also:
2606                 // .---. <-- a
2607                 // |   |
2608                 // |___| <-- b
2609                 // ^   ^
2610                 // c   d
2611                 //
2612                 // if a/c are even row/col and b/d are odd, and d is skipped,
2613                 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2614                 if ((col_end & 1) && (y & 1)) {
2615                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2616                 } else {
2617                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2618                 }
2619             }
2620         } else {
                 // bits 0 and 4 go to id 1, every other column to id 2
2621             int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2622
2623             for (y = row_and_7; y < h + row_and_7; y++) {
                     // id 1 on every fourth row of the group, id 2 otherwise
2624                 int col_mask_id = 2 - !(y & 3);
2625
2626                 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2627                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2628                 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2629                 lflvl->mask[is_uv][0][y][3] |= m_col;
2630                 lflvl->mask[is_uv][1][y][3] |= m_col;
2631             }
2632         }
2633     } else {
2634         int y, t = 1 << col_and_7, m_col = (t << w) - t;
2635
2636         if (!skip_inter) {
                 // here tx > TX_4X4; l2 picks how sparse the marked columns
                 // are (every column, every 2nd, every 4th, or bit 0 only)
2637             int mask_id = (tx == TX_8X8);
2638             int l2 = tx + is_uv - 1, step1d = 1 << l2;
2639             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2640             int m_row = m_col & masks[l2];
2641
2642             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2643             // 8wd loopfilter to prevent going off the visible edge.
                 // (w ^ (w - 1)) == 1 holds exactly when w is odd
2644             if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2645                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2646                 int m_row_8 = m_row - m_row_16;
2647
2648                 for (y = row_and_7; y < h + row_and_7; y++) {
2649                     lflvl->mask[is_uv][0][y][0] |= m_row_16;
2650                     lflvl->mask[is_uv][0][y][1] |= m_row_8;
2651                 }
2652             } else {
2653                 for (y = row_and_7; y < h + row_and_7; y++)
2654                     lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2655             }
2656
                 // same treatment vertically: with an odd h the last row is
                 // written to id 1 instead of id 0
2657             if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2658                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2659                     lflvl->mask[is_uv][1][y][0] |= m_col;
2660                 if (y - row_and_7 == h - 1)
2661                     lflvl->mask[is_uv][1][y][1] |= m_col;
2662             } else {
2663                 for (y = row_and_7; y < h + row_and_7; y += step1d)
2664                     lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2665             }
2666         } else if (tx != TX_4X4) {
                 // skip_inter with a larger transform: mark only the first row
                 // (d == 1) and the first column (d == 0) of the block
2667             int mask_id;
2668
2669             mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2670             lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2671             mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2672             for (y = row_and_7; y < h + row_and_7; y++)
2673                 lflvl->mask[is_uv][0][y][mask_id] |= t;
2674         } else if (is_uv) {
                 // skip_inter, 4x4, chroma: first column goes to id 1 only when
                 // it is bit 0, otherwise to id 2
2675             int t8 = t & 0x01, t4 = t - t8;
2676
2677             for (y = row_and_7; y < h + row_and_7; y++) {
2678                 lflvl->mask[is_uv][0][y][2] |= t4;
2679                 lflvl->mask[is_uv][0][y][1] |= t8;
2680             }
2681             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2682         } else {
                 // skip_inter, 4x4, luma: as above with bits 0 and 4 / every
                 // fourth row selecting id 1
2683             int t8 = t & 0x11, t4 = t - t8;
2684
2685             for (y = row_and_7; y < h + row_and_7; y++) {
2686                 lflvl->mask[is_uv][0][y][2] |= t4;
2687                 lflvl->mask[is_uv][0][y][1] |= t8;
2688             }
2689             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2690         }
2691     }
2692 }
2693
2694 static void decode_b(AVCodecContext *ctx, int row, int col,
2695                      struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2696                      enum BlockLevel bl, enum BlockPartition bp)
2697 {
2698     VP9Context *s = ctx->priv_data;
2699     VP9Block *b = s->b;
2700     enum BlockSize bs = bl * 3 + bp;
2701     int y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2702     int emu[2];
2703     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2704
2705     s->row = row;
2706     s->row7 = row & 7;
2707     s->col = col;
2708     s->col7 = col & 7;
2709     s->min_mv.x = -(128 + col * 64);
2710     s->min_mv.y = -(128 + row * 64);
2711     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2712     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2713     if (s->pass < 2) {
2714         b->bs = bs;
2715         b->bl = bl;
2716         b->bp = bp;
2717         decode_mode(ctx);
2718         b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2719
2720         if (!b->skip) {
2721             decode_coeffs(ctx);
2722         } else {
2723             int pl;
2724
2725             memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
2726             memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
2727             for (pl = 0; pl < 2; pl++) {
2728                 memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
2729                 memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
2730             }
2731         }
2732         if (s->pass == 1) {
2733             s->b++;
2734             s->block += w4 * h4 * 64;
2735             s->uvblock[0] += w4 * h4 * 16;
2736             s->uvblock[1] += w4 * h4 * 16;
2737             s->eob += 4 * w4 * h4;
2738             s->uveob[0] += w4 * h4;
2739             s->uveob[1] += w4 * h4;
2740
2741             return;
2742         }
2743     }
2744
2745     // emulated overhangs if the stride of the target buffer can't hold. This
2746     // allows to support emu-edge and so on even if we have large block
2747     // overhangs
2748     emu[0] = (col + w4) * 8 > f->linesize[0] ||
2749              (row + h4) > s->rows + 2 * !(ctx->flags & CODEC_FLAG_EMU_EDGE);
2750     emu[1] = (col + w4) * 4 > f->linesize[1] ||
2751              (row + h4) > s->rows + 2 * !(ctx->flags & CODEC_FLAG_EMU_EDGE);
2752     if (emu[0]) {
2753         s->dst[0] = s->tmp_y;
2754         s->y_stride = 64;
2755     } else {
2756         s->dst[0] = f->data[0] + yoff;
2757         s->y_stride = f->linesize[0];
2758     }
2759     if (emu[1]) {
2760         s->dst[1] = s->tmp_uv[0];
2761         s->dst[2] = s->tmp_uv[1];
2762         s->uv_stride = 32;
2763     } else {
2764         s->dst[1] = f->data[1] + uvoff;
2765         s->dst[2] = f->data[2] + uvoff;
2766         s->uv_stride = f->linesize[1];
2767     }
2768     if (b->intra) {
2769         intra_recon(ctx, yoff, uvoff);
2770     } else {
2771         inter_recon(ctx);
2772     }
2773     if (emu[0]) {
2774         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
2775
2776         for (n = 0; o < w; n++) {
2777             int bw = 64 >> n;
2778
2779             av_assert2(n <= 4);
2780             if (w & bw) {
2781                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
2782                                          s->tmp_y + o, 64, h, 0, 0);
2783                 o += bw;
2784             }
2785         }
2786     }
2787     if (emu[1]) {
2788         int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
2789
2790         for (n = 1; o < w; n++) {
2791             int bw = 64 >> n;
2792
2793             av_assert2(n <= 4);
2794             if (w & bw) {
2795                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
2796                                          s->tmp_uv[0] + o, 32, h, 0, 0);
2797                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
2798                                          s->tmp_uv[1] + o, 32, h, 0, 0);
2799                 o += bw;
2800             }
2801         }
2802     }
2803
2804     // pick filter level and find edges to apply filter to
2805     if (s->filter.level &&
2806         (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
2807                                                     [b->mode[3] != ZEROMV]) > 0) {
2808         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
2809         int skip_inter = !b->intra && b->skip;
2810
2811         for (y = 0; y < h4; y++)
2812             memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
2813         mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
2814         mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
2815                    s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
2816                    s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
2817                    b->uvtx, skip_inter);
2818
2819         if (!s->filter.lim_lut[lvl]) {
2820             int sharp = s->filter.sharpness;
2821             int limit = lvl;
2822
2823             if (sharp > 0) {
2824                 limit >>= (sharp + 3) >> 2;
2825                 limit = FFMIN(limit, 9 - sharp);
2826             }
2827             limit = FFMAX(limit, 1);
2828
2829             s->filter.lim_lut[lvl] = limit;
2830             s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
2831         }
2832     }
2833
2834     if (s->pass == 2) {
2835         s->b++;
2836         s->block += w4 * h4 * 64;
2837         s->uvblock[0] += w4 * h4 * 16;
2838         s->uvblock[1] += w4 * h4 * 16;
2839         s->eob += 4 * w4 * h4;
2840         s->uveob[0] += w4 * h4;
2841         s->uveob[1] += w4 * h4;
2842     }
2843 }
2844
2845 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2846                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2847 {
2848     VP9Context *s = ctx->priv_data;
2849     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
2850             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
2851     const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
2852                                      s->prob.p.partition[bl][c];
2853     enum BlockPartition bp;
2854     ptrdiff_t hbs = 4 >> bl;
2855     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2856     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
2857
2858     if (bl == BL_8X8) {
2859         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2860         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2861     } else if (col + hbs < s->cols) { // FIXME why not <=?
2862         if (row + hbs < s->rows) { // FIXME why not <=?
2863             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2864             switch (bp) {
2865             case PARTITION_NONE:
2866                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2867                 break;
2868             case PARTITION_H:
2869                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2870                 yoff  += hbs * 8 * y_stride;
2871                 uvoff += hbs * 4 * uv_stride;
2872                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
2873                 break;
2874             case PARTITION_V:
2875                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2876                 yoff  += hbs * 8;
2877                 uvoff += hbs * 4;
2878                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
2879                 break;
2880             case PARTITION_SPLIT:
2881                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2882                 decode_sb(ctx, row, col + hbs, lflvl,
2883                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2884                 yoff  += hbs * 8 * y_stride;
2885                 uvoff += hbs * 4 * uv_stride;
2886                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2887                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
2888                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2889                 break;
2890             default:
2891                 av_assert0(0);
2892             }
2893         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
2894             bp = PARTITION_SPLIT;
2895             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2896             decode_sb(ctx, row, col + hbs, lflvl,
2897                       yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2898         } else {
2899             bp = PARTITION_H;
2900             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2901         }
2902     } else if (row + hbs < s->rows) { // FIXME why not <=?
2903         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
2904             bp = PARTITION_SPLIT;
2905             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2906             yoff  += hbs * 8 * y_stride;
2907             uvoff += hbs * 4 * uv_stride;
2908             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2909         } else {
2910             bp = PARTITION_V;
2911             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2912         }
2913     } else {
2914         bp = PARTITION_SPLIT;
2915         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2916     }
2917     s->counts.partition[bl][c][bp]++;
2918 }
2919
2920 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2921                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2922 {
2923     VP9Context *s = ctx->priv_data;
2924     VP9Block *b = s->b;
2925     ptrdiff_t hbs = 4 >> bl;
2926     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2927     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
2928
2929     if (bl == BL_8X8) {
2930         av_assert2(b->bl == BL_8X8);
2931         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
2932     } else if (s->b->bl == bl) {
2933         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
2934         if (b->bp == PARTITION_H && row + hbs < s->rows) {
2935             yoff  += hbs * 8 * y_stride;
2936             uvoff += hbs * 4 * uv_stride;
2937             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
2938         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
2939             yoff  += hbs * 8;
2940             uvoff += hbs * 4;
2941             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
2942         }
2943     } else {
2944         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2945         if (col + hbs < s->cols) { // FIXME why not <=?
2946             if (row + hbs < s->rows) {
2947                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
2948                               uvoff + 4 * hbs, bl + 1);
2949                 yoff  += hbs * 8 * y_stride;
2950                 uvoff += hbs * 4 * uv_stride;
2951                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2952                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
2953                                     yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2954             } else {
2955                 yoff  += hbs * 8;
2956                 uvoff += hbs * 4;
2957                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
2958             }
2959         } else if (row + hbs < s->rows) {
2960             yoff  += hbs * 8 * y_stride;
2961             uvoff += hbs * 4 * uv_stride;
2962             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
2963         }
2964     }
2965 }
2966
/**
 * Apply the loop filter to one superblock of the current frame.
 *
 * Walks the edge masks in lflvl->mask and, for every flagged edge, calls one
 * of the s->dsp.loop_filter_* functions. The E and I arguments are looked up
 * in s->filter.mblim_lut / s->filter.lim_lut by the filter level stored in
 * lflvl->level; H is that level shifted right by 4. Luma is handled first
 * (edges between columns, then edges between rows), followed by the same two
 * passes for each of the two chroma planes.
 *
 * @param ctx   codec context; ctx->priv_data is the VP9Context
 * @param lflvl filter levels and edge masks for this superblock
 * @param row   row position of the superblock; when 0, the edges along the
 *              top of the superblock are left unfiltered
 * @param col   column position of the superblock; when 0, the edges along
 *              its left side are left unfiltered
 * @param yoff  byte offset of the superblock inside f->data[0]
 * @param uvoff byte offset of the superblock inside f->data[1] / f->data[2]
 */
2967 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
2968                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
2969 {
2970     VP9Context *s = ctx->priv_data;
2971     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2972     uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
2973     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
2974     int y, x, p;
2975
2976     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
2977     // if you think of them as acting on a 8x8 block max, we can interleave
2978     // each v/h within the single x loop, but that only works if we work on
2979     // 8 pixel blocks, and we won't always do that (we want at least 16px
2980     // to use SSE2 optimizations, perhaps 32 for AVX2)
2981
2982     // filter edges between columns, Y plane (e.g. block1 | block2)
         // Two mask rows (16 luma lines) are handled per iteration so that
         // edges flagged in both rows can share a single filter call.
2983     for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
2984         uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
2985         uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
2986         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
2987         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
2988         unsigned hm = hm1 | hm2 | hm13 | hm23;
2989
             // x is a one-bit mask selecting the column; stop once no mask
             // bit at or above x is set
2990         for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
2991             if (hm1 & x) {
2992                 int L = *l, H = L >> 4;
2993                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2994
                     // skip the first column's edge when col is 0
2995                 if (col || x > 1) {
2996                     if (hmask1[0] & x) {
2997                         if (hmask2[0] & x) {
2998                             av_assert2(l[8] == L);
2999                             s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3000                         } else {
3001                             s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3002                         }
3003                     } else if (hm2 & x) {
                             // both rows have an edge here: pack the second
                             // row's parameters into bits 8 and up and filter
                             // both rows with one mix2 call
3004                         L = l[8];
3005                         H |= (L >> 4) << 8;
3006                         E |= s->filter.mblim_lut[L] << 8;
3007                         I |= s->filter.lim_lut[L] << 8;
3008                         s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3009                                                [!!(hmask2[1] & x)]
3010                                                [0](ptr, ls_y, E, I, H);
3011                     } else {
3012                         s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3013                                             [0](ptr, ls_y, E, I, H);
3014                     }
3015                 }
3016             } else if (hm2 & x) {
                     // only the second row has an edge in this column
3017                 int L = l[8], H = L >> 4;
3018                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3019
3020                 if (col || x > 1) {
3021                     s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3022                                         [0](ptr + 8 * ls_y, ls_y, E, I, H);
3023                 }
3024             }
                 // mask index 3: edge located 4 pixels to the right of ptr
3025             if (hm13 & x) {
3026                 int L = *l, H = L >> 4;
3027                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3028
3029                 if (hm23 & x) {
3030                     L = l[8];
3031                     H |= (L >> 4) << 8;
3032                     E |= s->filter.mblim_lut[L] << 8;
3033                     I |= s->filter.lim_lut[L] << 8;
3034                     s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3035                 } else {
3036                     s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3037                 }
3038             } else if (hm23 & x) {
3039                 int L = l[8], H = L >> 4;
3040                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3041
3042                 s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3043             }
3044         }
3045     }
3046
3047     //                                          block1
3048     // filter edges between rows, Y plane (e.g. ------)
3049     //                                          block2
3050     dst = f->data[0] + yoff;
3051     lvl = lflvl->level;
3052     for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3053         uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3054         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3055
             // two mask bits (16 pixels) are consumed per iteration: bit x
             // and its right-hand neighbour x << 1
3056         for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
                 // skip the first row's edge when row is 0
3057             if (row || y) {
3058                 if (vm & x) {
3059                     int L = *l, H = L >> 4;
3060                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3061
3062                     if (vmask[0] & x) {
3063                         if (vmask[0] & (x << 1)) {
3064                             av_assert2(l[1] == L);
3065                             s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3066                         } else {
3067                             s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3068                         }
3069                     } else if (vm & (x << 1)) {
3070                         L = l[1];
3071                         H |= (L >> 4) << 8;
3072                         E |= s->filter.mblim_lut[L] << 8;
3073                         I |= s->filter.lim_lut[L] << 8;
3074                         s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3075                                                [!!(vmask[1] & (x << 1))]
3076                                                [1](ptr, ls_y, E, I, H);
3077                     } else {
3078                         s->dsp.loop_filter_8[!!(vmask[1] & x)]
3079                                             [1](ptr, ls_y, E, I, H);
3080                     }
3081                 } else if (vm & (x << 1)) {
3082                     int L = l[1], H = L >> 4;
3083                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3084
3085                     s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3086                                         [1](ptr + 8, ls_y, E, I, H);
3087                 }
3088             }
                 // mask index 3: edge located 4 lines below ptr; these are
                 // filtered regardless of row
3089             if (vm3 & x) {
3090                 int L = *l, H = L >> 4;
3091                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3092
3093                 if (vm3 & (x << 1)) {
3094                     L = l[1];
3095                     H |= (L >> 4) << 8;
3096                     E |= s->filter.mblim_lut[L] << 8;
3097                     I |= s->filter.lim_lut[L] << 8;
3098                     s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3099                 } else {
3100                     s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3101                 }
3102             } else if (vm3 & (x << 1)) {
3103                 int L = l[1], H = L >> 4;
3104                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3105
3106                 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3107             }
3108         }
3109     }
3110
3111     // same principle but for U/V planes
         // Both chroma planes read the same masks (lflvl->mask[1]) and the
         // same level array; only the destination plane differs.
3112     for (p = 0; p < 2; p++) {
3113         lvl = lflvl->level;
3114         dst = f->data[1 + p] + uvoff;
             // edges between columns: mask rows y and y + 2 are paired
3115         for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3116             uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3117             uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3118             unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3119             unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3120
3121             for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3122                 if (col || x > 1) {
3123                     if (hm1 & x) {
3124                         int L = *l, H = L >> 4;
3125                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3126
3127                         if (hmask1[0] & x) {
3128                             if (hmask2[0] & x) {
3129                                 av_assert2(l[16] == L);
3130                                 s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3131                             } else {
3132                                 s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3133                             }
3134                         } else if (hm2 & x) {
3135                             L = l[16];
3136                             H |= (L >> 4) << 8;
3137                             E |= s->filter.mblim_lut[L] << 8;
3138                             I |= s->filter.lim_lut[L] << 8;
3139                             s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3140                                                    [!!(hmask2[1] & x)]
3141                                                    [0](ptr, ls_uv, E, I, H);
3142                         } else {
3143                             s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3144                                                 [0](ptr, ls_uv, E, I, H);
3145                         }
3146                     } else if (hm2 & x) {
3147                         int L = l[16], H = L >> 4;
3148                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3149
3150                         s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3151                                             [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3152                     }
3153                 }
                     // advance the level pointer by two entries after every
                     // odd-numbered mask bit
3154                 if (x & 0xAA)
3155                     l += 2;
3156             }
3157         }
3158         lvl = lflvl->level;
3159         dst = f->data[1 + p] + uvoff;
             // edges between rows
3160         for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3161             uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3162             unsigned vm = vmask[0] | vmask[1] | vmask[2];
3163
                 // bit x and bit x << 2 are handled together per iteration
3164             for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3165                 if (row || y) {
3166                     if (vm & x) {
3167                         int L = *l, H = L >> 4;
3168                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3169
3170                         if (vmask[0] & x) {
3171                             if (vmask[0] & (x << 2)) {
3172                                 av_assert2(l[2] == L);
3173                                 s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3174                             } else {
3175                                 s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3176                             }
3177                         } else if (vm & (x << 2)) {
3178                             L = l[2];
3179                             H |= (L >> 4) << 8;
3180                             E |= s->filter.mblim_lut[L] << 8;
3181                             I |= s->filter.lim_lut[L] << 8;
3182                             s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3183                                                    [!!(vmask[1] & (x << 2))]
3184                                                    [1](ptr, ls_uv, E, I, H);
3185                         } else {
3186                             s->dsp.loop_filter_8[!!(vmask[1] & x)]
3187                                                 [1](ptr, ls_uv, E, I, H);
3188                         }
3189                     } else if (vm & (x << 2)) {
3190                         int L = l[2], H = L >> 4;
3191                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3192
3193                         s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3194                                             [1](ptr + 8, ls_uv, E, I, H);
3195                     }
3196                 }
3197             }
                 // the level pointer moves down by 16 entries after every
                 // second mask row
3198             if (y & 1)
3199                 lvl += 16;
3200         }
3201     }
3202 }
3203
/**
 * Compute the extent of tile number idx along one dimension.
 *
 * The n units of that dimension are divided into (1 << log2_n) tiles; the
 * tile boundaries are clamped to n and then scaled by 8 before being stored.
 *
 * @param start  receives the first position covered by the tile
 * @param end    receives the position one past the end of the tile
 * @param idx    index of the tile
 * @param log2_n log2 of the number of tiles along this dimension
 * @param n      total size of the dimension, in the unscaled unit
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    const int first = (n * idx)       >> log2_n;
    const int last  = (n * (idx + 1)) >> log2_n;
    const int first_clamped = FFMIN(first, n);
    const int last_clamped  = FFMIN(last,  n);

    *start = first_clamped << 3;
    *end   = last_clamped  << 3;
}
3211
3212 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3213                                         int max_count, int update_factor)
3214 {
3215     unsigned ct = ct0 + ct1, p2, p1;
3216
3217     if (!ct)
3218         return;
3219
3220     p1 = *p;
3221     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3222     p2 = av_clip(p2, 1, 255);
3223     ct = FFMIN(ct, max_count);
3224     update_factor = FASTDIV(update_factor * ct, max_count);
3225
3226     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3227     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3228 }
3229
3230 static void adapt_probs(VP9Context *s)
3231 {
3232     int i, j, k, l, m;
3233     prob_context *p = &s->prob_ctx[s->framectxid].p;
3234     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3235
3236     // coefficients
3237     for (i = 0; i < 4; i++)
3238         for (j = 0; j < 2; j++)
3239             for (k = 0; k < 2; k++)
3240                 for (l = 0; l < 6; l++)
3241                     for (m = 0; m < 6; m++) {
3242                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3243                         unsigned *e = s->counts.eob[i][j][k][l][m];
3244                         unsigned *c = s->counts.coef[i][j][k][l][m];
3245
3246                         if (l == 0 && m >= 3) // dc only has 3 pt
3247                             break;
3248
3249                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3250                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3251                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3252                     }
3253
3254     if (s->keyframe || s->intraonly) {
3255         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3256         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3257         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3258         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3259         return;
3260     }
3261
3262     // skip flag
3263     for (i = 0; i < 3; i++)
3264         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3265
3266     // intra/inter flag
3267     for (i = 0; i < 4; i++)
3268         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3269
3270     // comppred flag
3271     if (s->comppredmode == PRED_SWITCHABLE) {
3272       for (i = 0; i < 5; i++)
3273           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3274     }
3275
3276     // reference frames
3277     if (s->comppredmode != PRED_SINGLEREF) {
3278       for (i = 0; i < 5; i++)
3279           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3280                      s->counts.comp_ref[i][1], 20, 128);
3281     }
3282
3283     if (s->comppredmode != PRED_COMPREF) {
3284       for (i = 0; i < 5; i++) {
3285           uint8_t *pp = p->single_ref[i];
3286           unsigned (*c)[2] = s->counts.single_ref[i];
3287
3288           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3289           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3290       }
3291     }
3292
3293     // block partitioning
3294     for (i = 0; i < 4; i++)
3295         for (j = 0; j < 4; j++) {
3296             uint8_t *pp = p->partition[i][j];
3297             unsigned *c = s->counts.partition[i][j];
3298
3299             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3300             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3301             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3302         }
3303
3304     // tx size
3305     if (s->txfmmode == TX_SWITCHABLE) {
3306       for (i = 0; i < 2; i++) {
3307           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3308
3309           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3310           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3311           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3312           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3313           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3314           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3315       }
3316     }
3317
3318     // interpolation filter
3319     if (s->filtermode == FILTER_SWITCHABLE) {
3320         for (i = 0; i < 4; i++) {
3321             uint8_t *pp = p->filter[i];
3322             unsigned *c = s->counts.filter[i];
3323
3324             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3325             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3326         }
3327     }
3328
3329     // inter modes
3330     for (i = 0; i < 7; i++) {
3331         uint8_t *pp = p->mv_mode[i];
3332         unsigned *c = s->counts.mv_mode[i];
3333
3334         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3335         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3336         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3337     }
3338
3339     // mv joints
3340     {
3341         uint8_t *pp = p->mv_joint;
3342         unsigned *c = s->counts.mv_joint;
3343
3344         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3345         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3346         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3347     }
3348
3349     // mv components
3350     for (i = 0; i < 2; i++) {
3351         uint8_t *pp;
3352         unsigned *c, (*c2)[2], sum;
3353
3354         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3355                    s->counts.mv_comp[i].sign[1], 20, 128);
3356
3357         pp = p->mv_comp[i].classes;
3358         c = s->counts.mv_comp[i].classes;
3359         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3360         adapt_prob(&pp[0], c[0], sum, 20, 128);
3361         sum -= c[1];
3362         adapt_prob(&pp[1], c[1], sum, 20, 128);
3363         sum -= c[2] + c[3];
3364         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3365         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3366         sum -= c[4] + c[5];
3367         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3368         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3369         sum -= c[6];
3370         adapt_prob(&pp[6], c[6], sum, 20, 128);
3371         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3372         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3373         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3374
3375         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3376                    s->counts.mv_comp[i].class0[1], 20, 128);
3377         pp = p->mv_comp[i].bits;
3378         c2 = s->counts.mv_comp[i].bits;
3379         for (j = 0; j < 10; j++)
3380             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3381
3382         for (j = 0; j < 2; j++) {
3383             pp = p->mv_comp[i].class0_fp[j];
3384             c = s->counts.mv_comp[i].class0_fp[j];
3385             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3386             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3387             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3388         }
3389         pp = p->mv_comp[i].fp;
3390         c = s->counts.mv_comp[i].fp;
3391         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3392         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3393         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3394
3395         if (s->highprecisionmvs) {
3396             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3397                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3398             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3399                        s->counts.mv_comp[i].hp[1], 20, 128);
3400         }
3401     }
3402
3403     // y intra modes
3404     for (i = 0; i < 4; i++) {
3405         uint8_t *pp = p->y_mode[i];
3406         unsigned *c = s->counts.y_mode[i], sum, s2;
3407
3408         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3409         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3410         sum -= c[TM_VP8_PRED];
3411         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3412         sum -= c[VERT_PRED];
3413         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3414         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3415         sum -= s2;
3416         adapt_prob(&pp[3], s2, sum, 20, 128);
3417         s2 -= c[HOR_PRED];
3418         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3419         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3420         sum -= c[DIAG_DOWN_LEFT_PRED];
3421         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3422         sum -= c[VERT_LEFT_PRED];
3423         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3424         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3425     }
3426
3427     // uv intra modes
3428     for (i = 0; i < 10; i++) {
3429         uint8_t *pp = p->uv_mode[i];
3430         unsigned *c = s->counts.uv_mode[i], sum, s2;
3431
3432         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3433         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3434         sum -= c[TM_VP8_PRED];
3435         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3436         sum -= c[VERT_PRED];
3437         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3438         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3439         sum -= s2;
3440         adapt_prob(&pp[3], s2, sum, 20, 128);
3441         s2 -= c[HOR_PRED];
3442         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3443         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3444         sum -= c[DIAG_DOWN_LEFT_PRED];
3445         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3446         sum -= c[VERT_LEFT_PRED];
3447         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3448         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3449     }
3450 }
3451
3452 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3453 {
3454     VP9Context *s = ctx->priv_data;
3455     int i;
3456
3457     for (i = 0; i < 2; i++) {
3458         if (s->frames[i].tf.f->data[0])
3459             vp9_unref_frame(ctx, &s->frames[i]);
3460         av_frame_free(&s->frames[i].tf.f);
3461     }
3462     for (i = 0; i < 8; i++) {
3463         if (s->refs[i].f->data[0])
3464             ff_thread_release_buffer(ctx, &s->refs[i]);
3465         av_frame_free(&s->refs[i].f);
3466         if (s->next_refs[i].f->data[0])
3467             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3468         av_frame_free(&s->next_refs[i].f);
3469     }
3470     av_freep(&s->above_partition_ctx);
3471     av_freep(&s->c_b);
3472     s->c_b_size = 0;
3473     av_freep(&s->b_base);
3474     av_freep(&s->block_base);
3475
3476     return 0;
3477 }
3478
3479
3480 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3481                             int *got_frame, AVPacket *pkt)
3482 {
3483     const uint8_t *data = pkt->data;
3484     int size = pkt->size;
3485     VP9Context *s = ctx->priv_data;
3486     int res, tile_row, tile_col, i, ref, row, col;
3487     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3488     AVFrame *f;
3489
3490     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3491         return res;
3492     } else if (res == 0) {
3493         if (!s->refs[ref].f->data[0]) {
3494             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3495             return AVERROR_INVALIDDATA;
3496         }
3497         if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3498             return res;
3499         *got_frame = 1;
3500         return 0;
3501     }
3502     data += res;
3503     size -= res;
3504
3505     if (s->frames[LAST_FRAME].tf.f->data[0])
3506         vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3507     if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3508         (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3509         return res;
3510     if (s->frames[CUR_FRAME].tf.f->data[0])
3511         vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3512     if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3513         return res;
3514     f = s->frames[CUR_FRAME].tf.f;
3515     f->key_frame = s->keyframe;
3516     f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3517     ls_y = f->linesize[0];
3518     ls_uv =f->linesize[1];
3519
3520     // ref frame setup
3521     for (i = 0; i < 8; i++) {
3522         if (s->next_refs[i].f->data[0])
3523             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3524         if (s->refreshrefmask & (1 << i)) {
3525             res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3526         } else {
3527             res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3528         }
3529         if (res < 0)
3530             return res;
3531     }
3532
3533     // main tile decode loop
3534     memset(s->above_partition_ctx, 0, s->cols);
3535     memset(s->above_skip_ctx, 0, s->cols);
3536     if (s->keyframe || s->intraonly) {
3537         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3538     } else {
3539         memset(s->above_mode_ctx, NEARESTMV, s->cols);
3540     }
3541     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3542     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3543     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3544     memset(s->above_segpred_ctx, 0, s->cols);
3545     s->pass = s->uses_2pass =
3546         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3547     if (s->refreshctx && s->parallelmode) {
3548         int j, k, l, m;
3549
3550         for (i = 0; i < 4; i++)
3551             for (j = 0; j < 2; j++)
3552                 for (k = 0; k < 2; k++)
3553                     for (l = 0; l < 6; l++)
3554                         for (m = 0; m < 6; m++)
3555                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3556                                    s->prob.coef[i][j][k][l][m], 3);
3557         s->prob_ctx[s->framectxid].p = s->prob.p;
3558         ff_thread_finish_setup(ctx);
3559     }
3560
3561     do {
3562         yoff = uvoff = 0;
3563         s->b = s->b_base;
3564         s->block = s->block_base;
3565         s->uvblock[0] = s->uvblock_base[0];
3566         s->uvblock[1] = s->uvblock_base[1];
3567         s->eob = s->eob_base;
3568         s->uveob[0] = s->uveob_base[0];
3569         s->uveob[1] = s->uveob_base[1];
3570
3571         for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3572             set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3573                             tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3574             if (s->pass != 2) {
3575                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3576                     unsigned tile_size;
3577
3578                     if (tile_col == s->tiling.tile_cols - 1 &&
3579                         tile_row == s->tiling.tile_rows - 1) {
3580                         tile_size = size;
3581                     } else {
3582                         tile_size = AV_RB32(data);
3583                         data += 4;
3584                         size -= 4;
3585                     }
3586                     if (tile_size > size)
3587                         return AVERROR_INVALIDDATA;
3588                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3589                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) // marker bit
3590                         return AVERROR_INVALIDDATA;
3591                     data += tile_size;
3592                     size -= tile_size;
3593                 }
3594             }
3595
3596             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3597                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3598                 struct VP9Filter *lflvl_ptr = s->lflvl;
3599                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3600
3601                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3602                     set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3603                                     tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3604
3605                     if (s->pass != 2) {
3606                         memset(s->left_partition_ctx, 0, 8);
3607                         memset(s->left_skip_ctx, 0, 8);
3608                         if (s->keyframe || s->intraonly) {
3609                             memset(s->left_mode_ctx, DC_PRED, 16);
3610                         } else {
3611                             memset(s->left_mode_ctx, NEARESTMV, 8);
3612                         }
3613                         memset(s->left_y_nnz_ctx, 0, 16);
3614                         memset(s->left_uv_nnz_ctx, 0, 16);
3615                         memset(s->left_segpred_ctx, 0, 8);
3616
3617                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3618                     }
3619
3620                     for (col = s->tiling.tile_col_start;
3621                          col < s->tiling.tile_col_end;
3622                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3623                         // FIXME integrate with lf code (i.e. zero after each
3624                         // use, similar to invtxfm coefficients, or similar)
3625                         if (s->pass != 1) {
3626                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3627                         }
3628
3629                         if (s->pass == 2) {
3630                             decode_sb_mem(ctx, row, col, lflvl_ptr,
3631                                           yoff2, uvoff2, BL_64X64);
3632                         } else {
3633                             decode_sb(ctx, row, col, lflvl_ptr,
3634                                       yoff2, uvoff2, BL_64X64);
3635                         }
3636                     }
3637                     if (s->pass != 2) {
3638                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3639                     }
3640                 }
3641
3642                 if (s->pass == 1) {
3643                     continue;
3644                 }
3645
3646                 // backup pre-loopfilter reconstruction data for intra
3647                 // prediction of next row of sb64s
3648                 if (row + 8 < s->rows) {
3649                     memcpy(s->intra_pred_data[0],
3650                            f->data[0] + yoff + 63 * ls_y,
3651                            8 * s->cols);
3652                     memcpy(s->intra_pred_data[1],
3653                            f->data[1] + uvoff + 31 * ls_uv,
3654                            4 * s->cols);
3655                     memcpy(s->intra_pred_data[2],
3656                            f->data[2] + uvoff + 31 * ls_uv,
3657                            4 * s->cols);
3658                 }
3659
3660                 // loopfilter one row
3661                 if (s->filter.level) {
3662                     yoff2 = yoff;
3663                     uvoff2 = uvoff;
3664                     lflvl_ptr = s->lflvl;
3665                     for (col = 0; col < s->cols;
3666                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3667                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3668                     }
3669                 }
3670
3671                 // FIXME maybe we can make this more finegrained by running the
3672                 // loopfilter per-block instead of after each sbrow
3673                 // In fact that would also make intra pred left preparation easier?
3674                 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3675             }
3676         }
3677
3678         if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3679             adapt_probs(s);
3680             ff_thread_finish_setup(ctx);
3681         }
3682     } while (s->pass++ == 1);
3683     ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3684
3685     // ref frame setup
3686     for (i = 0; i < 8; i++) {
3687         if (s->refs[i].f->data[0])
3688             ff_thread_release_buffer(ctx, &s->refs[i]);
3689         ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3690     }
3691
3692     if (!s->invisible) {
3693         if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3694             return res;
3695         *got_frame = 1;
3696     }
3697
3698     return 0;
3699 }
3700
3701 static void vp9_decode_flush(AVCodecContext *ctx)
3702 {
3703     VP9Context *s = ctx->priv_data;
3704     int i;
3705
3706     for (i = 0; i < 2; i++)
3707         vp9_unref_frame(ctx, &s->frames[i]);
3708     for (i = 0; i < 8; i++)
3709         ff_thread_release_buffer(ctx, &s->refs[i]);
3710 }
3711
3712 static int init_frames(AVCodecContext *ctx)
3713 {
3714     VP9Context *s = ctx->priv_data;
3715     int i;
3716
3717     for (i = 0; i < 2; i++) {
3718         s->frames[i].tf.f = av_frame_alloc();
3719         if (!s->frames[i].tf.f) {
3720             vp9_decode_free(ctx);
3721             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3722             return AVERROR(ENOMEM);
3723         }
3724     }
3725     for (i = 0; i < 8; i++) {
3726         s->refs[i].f = av_frame_alloc();
3727         s->next_refs[i].f = av_frame_alloc();
3728         if (!s->refs[i].f || !s->next_refs[i].f) {
3729             vp9_decode_free(ctx);
3730             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3731             return AVERROR(ENOMEM);
3732         }
3733     }
3734
3735     return 0;
3736 }
3737
3738 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3739 {
3740     VP9Context *s = ctx->priv_data;
3741
3742     ctx->internal->allocate_progress = 1;
3743     ctx->pix_fmt = AV_PIX_FMT_YUV420P;
3744     ff_vp9dsp_init(&s->dsp);
3745     ff_videodsp_init(&s->vdsp, 8);
3746     s->filter.sharpness = -1;
3747
3748     return init_frames(ctx);
3749 }
3750
3751 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
3752 {
3753     return init_frames(avctx);
3754 }
3755
3756 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
3757 {
3758     int i, res;
3759     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
3760
3761     // FIXME scalability, size, etc.
3762
3763     for (i = 0; i < 2; i++) {
3764         if (s->frames[i].tf.f->data[0])
3765             vp9_unref_frame(dst, &s->frames[i]);
3766         if (ssrc->frames[i].tf.f->data[0]) {
3767             if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
3768                 return res;
3769         }
3770     }
3771     for (i = 0; i < 8; i++) {
3772         if (s->refs[i].f->data[0])
3773             ff_thread_release_buffer(dst, &s->refs[i]);
3774         if (ssrc->next_refs[i].f->data[0]) {
3775             if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
3776                 return res;
3777         }
3778     }
3779
3780     s->invisible = ssrc->invisible;
3781     s->keyframe = ssrc->keyframe;
3782     s->uses_2pass = ssrc->uses_2pass;
3783     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
3784     memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
3785     if (ssrc->segmentation.enabled) {
3786         memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
3787                sizeof(s->segmentation.feat));
3788     }
3789
3790     return 0;
3791 }
3792
3793 AVCodec ff_vp9_decoder = {
3794     .name                  = "vp9",
3795     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
3796     .type                  = AVMEDIA_TYPE_VIDEO,
3797     .id                    = AV_CODEC_ID_VP9,
3798     .priv_data_size        = sizeof(VP9Context),
3799     .init                  = vp9_decode_init,
3800     .close                 = vp9_decode_free,
3801     .decode                = vp9_decode_frame,
3802     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
3803     .flush                 = vp9_decode_flush,
3804     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
3805     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
3806 };