2 * VP9 compatible video decoder
4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "libavutil/avassert.h"
34 #define VP9_SYNCCODE 0x498342
73 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
74 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
77 typedef struct VP9Block {
78 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
79 enum FilterMode filter;
80 VP56mv mv[4 /* b_idx */][2 /* ref */];
82 enum TxfmMode tx, uvtx;
84 int row, row7, col, col7;
86 ptrdiff_t y_stride, uv_stride;
89 typedef struct VP9Context {
100 uint8_t keyframe, last_keyframe;
101 uint8_t invisible, last_invisible;
102 uint8_t use_last_frame_mvs;
108 uint8_t refreshrefmask;
109 uint8_t highprecisionmvs;
110 enum FilterMode filtermode;
111 uint8_t allowcompinter;
114 uint8_t parallelmode;
118 uint8_t varcompref[2];
119 AVFrame *refs[8], *f, *fb[10];
125 uint8_t mblim_lut[64];
133 int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
138 uint8_t absolute_vals;
144 uint8_t skip_enabled;
153 unsigned log2_tile_cols, log2_tile_rows;
154 unsigned tile_cols, tile_rows;
155 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
157 unsigned sb_cols, sb_rows, rows, cols;
160 uint8_t coef[4][2][2][6][6][3];
164 uint8_t coef[4][2][2][6][6][11];
169 unsigned y_mode[4][10];
170 unsigned uv_mode[10][10];
171 unsigned filter[4][3];
172 unsigned mv_mode[7][4];
173 unsigned intra[4][2];
175 unsigned single_ref[5][2][2];
176 unsigned comp_ref[5][2];
177 unsigned tx32p[2][4];
178 unsigned tx16p[2][3];
181 unsigned mv_joint[4];
184 unsigned classes[11];
186 unsigned bits[10][2];
187 unsigned class0_fp[2][4];
189 unsigned class0_hp[2];
192 unsigned partition[4][4][4];
193 unsigned coef[4][2][2][6][6][3];
194 unsigned eob[4][2][2][6][6][2];
196 enum TxfmMode txfmmode;
197 enum CompPredMode comppredmode;
199 // contextual (left/above) cache
200 uint8_t left_partition_ctx[8], *above_partition_ctx;
201 uint8_t left_mode_ctx[16], *above_mode_ctx;
202 // FIXME maybe merge some of the below in a flags field?
203 uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
204 uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
205 uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
206 uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
207 uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
208 uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
209 uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
210 uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
211 uint8_t left_filter_ctx[8], *above_filter_ctx;
212 VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
215 uint8_t *intra_pred_data[3];
216 uint8_t *segmentation_map;
217 struct VP9mvrefPair *mv[2];
218 struct VP9Filter *lflvl;
219 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
221 // block reconstruction intermediates
222 DECLARE_ALIGNED(32, int16_t, block)[4096];
223 DECLARE_ALIGNED(32, int16_t, uvblock)[2][1024];
225 uint8_t uveob[2][64];
226 VP56mv min_mv, max_mv;
227 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
228 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
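/* Block dimensions per block size, indexed as bwh_tab[unit][bs][0 = width,
 * 1 = height]: unit 0 counts 4-pixel steps, unit 1 counts 8-pixel steps
 * (sub-8x8 sizes round up to one 8-pixel step). This reading is derived from
 * how the table is used in decode_mode() and decode_coeffs() below. */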
231 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
233 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
234 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
236 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
237 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
241 static int update_size(AVCodecContext *ctx, int w, int h)
243 VP9Context *s = ctx->priv_data;
246 if (s->above_partition_ctx && w == ctx->width && h == ctx->height)
251 s->sb_cols = (w + 63) >> 6;
252 s->sb_rows = (h + 63) >> 6;
253 s->cols = (w + 7) >> 3;
254 s->rows = (h + 7) >> 3;
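/* The per-column context buffers below share a single allocation: assign()
 * carves out a typed array of (sb_cols * n) entries and advances the cursor
 * p past it. */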
256 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
257 av_free(s->above_partition_ctx);
258 p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx) +
259 64 * s->sb_rows * (1 + sizeof(*s->mv[0]) * 2)));
261 return AVERROR(ENOMEM);
262 assign(s->above_partition_ctx, uint8_t *, 8);
263 assign(s->above_skip_ctx, uint8_t *, 8);
264 assign(s->above_txfm_ctx, uint8_t *, 8);
265 assign(s->above_mode_ctx, uint8_t *, 16);
266 assign(s->above_y_nnz_ctx, uint8_t *, 16);
267 assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
268 assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
269 assign(s->intra_pred_data[0], uint8_t *, 64);
270 assign(s->intra_pred_data[1], uint8_t *, 32);
271 assign(s->intra_pred_data[2], uint8_t *, 32);
272 assign(s->above_segpred_ctx, uint8_t *, 8);
273 assign(s->above_intra_ctx, uint8_t *, 8);
274 assign(s->above_comp_ctx, uint8_t *, 8);
275 assign(s->above_ref_ctx, uint8_t *, 8);
276 assign(s->above_filter_ctx, uint8_t *, 8);
277 assign(s->lflvl, struct VP9Filter *, 1);
278 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
279 assign(s->segmentation_map, uint8_t *, 64 * s->sb_rows);
280 assign(s->mv[0], struct VP9mvrefPair *, 64 * s->sb_rows);
281 assign(s->mv[1], struct VP9mvrefPair *, 64 * s->sb_rows);
287 // for some reason the sign bit is at the end, not the start, of a bit sequence
288 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
290 int v = get_bits(gb, n);
291 return get_bits1(gb) ? -v : v;
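/* Illustrative example: with n = 4, magnitude bits 0101 (= 5) followed by a
 * sign bit of 1 decode to -5, while a sign bit of 0 gives +5. */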
294 static av_always_inline int inv_recenter_nonneg(int v, int m)
296 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
299 // differential forward probability updates
300 static int update_prob(VP56RangeCoder *c, int p)
302 static const int inv_map_table[254] = {
303 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
304 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
305 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
306 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
307 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
308 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
309 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
310 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
311 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
312 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
313 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
314 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
315 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
316 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
317 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
318 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
319 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
320 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
325 /* This code is trying to do a differential probability update. For a
326 * current probability A in the range [1, 255], the difference to any new
327 * probability lies in the range [1-A, 255-A]. Part of that absolute range
328 * can be reached from both the positive and the negative side, whereas the
329 * rest can only be reached from one side. The shared part is coded
330 * differentially, i.e.
331 * times two where the value of the lowest bit specifies the sign, and
332 * the single part is then coded on top of this. This absolute difference
333 * then again has a value of [0,254], but a bigger value in this range
334 * indicates that we're further away from the original value A, so we
335 * can code this as a VLC code, since higher values are increasingly
336 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
337 * updates vs. the 'fine, exact' updates further down the range, which
338 * adds one extra dimension to this differential update model. */
340 if (!vp8_rac_get(c)) {
341 d = vp8_rac_get_uint(c, 4) + 0;
342 } else if (!vp8_rac_get(c)) {
343 d = vp8_rac_get_uint(c, 4) + 16;
344 } else if (!vp8_rac_get(c)) {
345 d = vp8_rac_get_uint(c, 5) + 32;
347 d = vp8_rac_get_uint(c, 7);
349 d = (d << 1) - 65 + vp8_rac_get(c);
353 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
354 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
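/* Worked example (illustrative values): for p = 30 and a decoded delta index
 * d = 1, inv_map_table[1] = 20; since p <= 128 the result is
 * 1 + inv_recenter_nonneg(20, 29) = 1 + (29 + 10) = 40. */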
357 static int decode_frame_header(AVCodecContext *ctx,
358 const uint8_t *data, int size, int *ref)
360 VP9Context *s = ctx->priv_data;
361 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
362 const uint8_t *data2;
365 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
366 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
369 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
370 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
371 return AVERROR_INVALIDDATA;
373 s->profile = get_bits1(&s->gb);
374 if (get_bits1(&s->gb)) { // reserved bit
375 av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
376 return AVERROR_INVALIDDATA;
378 if (get_bits1(&s->gb)) {
379 *ref = get_bits(&s->gb, 3);
382 s->last_keyframe = s->keyframe;
383 s->keyframe = !get_bits1(&s->gb);
384 s->last_invisible = s->invisible;
385 s->invisible = !get_bits1(&s->gb);
386 s->errorres = get_bits1(&s->gb);
387 // FIXME disable this upon resolution change
388 s->use_last_frame_mvs = !s->errorres && !s->last_invisible;
390 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
391 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
392 return AVERROR_INVALIDDATA;
394 s->colorspace = get_bits(&s->gb, 3);
395 if (s->colorspace == 7) { // RGB = profile 1
396 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
397 return AVERROR_INVALIDDATA;
399 s->fullrange = get_bits1(&s->gb);
400 // for profile 1, here follows the subsampling bits
401 s->refreshrefmask = 0xff;
402 w = get_bits(&s->gb, 16) + 1;
403 h = get_bits(&s->gb, 16) + 1;
404 if (get_bits1(&s->gb)) // display size
405 skip_bits(&s->gb, 32);
407 s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
408 s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
410 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
411 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
412 return AVERROR_INVALIDDATA;
414 s->refreshrefmask = get_bits(&s->gb, 8);
415 w = get_bits(&s->gb, 16) + 1;
416 h = get_bits(&s->gb, 16) + 1;
417 if (get_bits1(&s->gb)) // display size
418 skip_bits(&s->gb, 32);
420 s->refreshrefmask = get_bits(&s->gb, 8);
421 s->refidx[0] = get_bits(&s->gb, 3);
422 s->signbias[0] = get_bits1(&s->gb);
423 s->refidx[1] = get_bits(&s->gb, 3);
424 s->signbias[1] = get_bits1(&s->gb);
425 s->refidx[2] = get_bits(&s->gb, 3);
426 s->signbias[2] = get_bits1(&s->gb);
427 if (!s->refs[s->refidx[0]] || !s->refs[s->refidx[1]] ||
428 !s->refs[s->refidx[2]]) {
429 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
430 return AVERROR_INVALIDDATA;
432 if (get_bits1(&s->gb)) {
433 w = s->refs[s->refidx[0]]->width;
434 h = s->refs[s->refidx[0]]->height;
435 } else if (get_bits1(&s->gb)) {
436 w = s->refs[s->refidx[1]]->width;
437 h = s->refs[s->refidx[1]]->height;
438 } else if (get_bits1(&s->gb)) {
439 w = s->refs[s->refidx[2]]->width;
440 h = s->refs[s->refidx[2]]->height;
442 w = get_bits(&s->gb, 16) + 1;
443 h = get_bits(&s->gb, 16) + 1;
445 if (get_bits1(&s->gb)) // display size
446 skip_bits(&s->gb, 32);
447 s->highprecisionmvs = get_bits1(&s->gb);
448 s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
450 s->allowcompinter = s->signbias[0] != s->signbias[1] ||
451 s->signbias[0] != s->signbias[2];
452 if (s->allowcompinter) {
453 if (s->signbias[0] == s->signbias[1]) {
455 s->varcompref[0] = 0;
456 s->varcompref[1] = 1;
457 } else if (s->signbias[0] == s->signbias[2]) {
459 s->varcompref[0] = 0;
460 s->varcompref[1] = 2;
463 s->varcompref[0] = 1;
464 s->varcompref[1] = 2;
469 s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
470 s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
471 s->framectxid = c = get_bits(&s->gb, 2);
473 /* loopfilter header data */
474 s->filter.level = get_bits(&s->gb, 6);
475 sharp = get_bits(&s->gb, 3);
476 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
477 // the old cache values since they are still valid
478 if (s->filter.sharpness != sharp)
479 memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
480 s->filter.sharpness = sharp;
481 if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
482 if (get_bits1(&s->gb)) {
483 for (i = 0; i < 4; i++)
484 if (get_bits1(&s->gb))
485 s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
486 for (i = 0; i < 2; i++)
487 if (get_bits1(&s->gb))
488 s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
491 memset(&s->lf_delta, 0, sizeof(s->lf_delta));
494 /* quantization header data */
495 s->yac_qi = get_bits(&s->gb, 8);
496 s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
497 s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
498 s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
499 s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
500 s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
502 /* segmentation header info */
503 if ((s->segmentation.enabled = get_bits1(&s->gb))) {
504 if ((s->segmentation.update_map = get_bits1(&s->gb))) {
505 for (i = 0; i < 7; i++)
506 s->prob.seg[i] = get_bits1(&s->gb) ?
507 get_bits(&s->gb, 8) : 255;
508 if ((s->segmentation.temporal = get_bits1(&s->gb)))
509 for (i = 0; i < 3; i++)
510 s->prob.segpred[i] = get_bits1(&s->gb) ?
511 get_bits(&s->gb, 8) : 255;
514 if (get_bits1(&s->gb)) {
515 s->segmentation.absolute_vals = get_bits1(&s->gb);
516 for (i = 0; i < 8; i++) {
517 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
518 s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
519 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
520 s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
521 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
522 s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
523 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
527 s->segmentation.feat[0].q_enabled = 0;
528 s->segmentation.feat[0].lf_enabled = 0;
529 s->segmentation.feat[0].skip_enabled = 0;
530 s->segmentation.feat[0].ref_enabled = 0;
533 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
534 for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
535 int qyac, qydc, quvac, quvdc, lflvl, sh;
537 if (s->segmentation.feat[i].q_enabled) {
538 if (s->segmentation.absolute_vals)
539 qyac = s->segmentation.feat[i].q_val;
541 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
545 qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
546 quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
547 quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
548 qyac = av_clip_uintp2(qyac, 8);
550 s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
551 s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
552 s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
553 s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
555 sh = s->filter.level >= 32;
556 if (s->segmentation.feat[i].lf_enabled) {
557 if (s->segmentation.absolute_vals)
558 lflvl = s->segmentation.feat[i].lf_val;
560 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
562 lflvl = s->filter.level;
564 s->segmentation.feat[i].lflvl[0][0] =
565 s->segmentation.feat[i].lflvl[0][1] =
566 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
567 for (j = 1; j < 4; j++) {
568 s->segmentation.feat[i].lflvl[j][0] =
569 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
570 s->lf_delta.mode[0]) << sh), 6);
571 s->segmentation.feat[i].lflvl[j][1] =
572 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
573 s->lf_delta.mode[1]) << sh), 6);
578 if ((res = update_size(ctx, w, h)) < 0) {
579 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
582 for (s->tiling.log2_tile_cols = 0;
583 (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
584 s->tiling.log2_tile_cols++) ;
585 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
586 max = FFMAX(0, max - 1);
587 while (max > s->tiling.log2_tile_cols) {
588 if (get_bits1(&s->gb))
589 s->tiling.log2_tile_cols++;
593 s->tiling.log2_tile_rows = decode012(&s->gb);
594 s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
595 if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
596 s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
597 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
598 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
600 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
601 return AVERROR(ENOMEM);
605 if (s->keyframe || s->errorres || s->intraonly) {
606 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
607 s->prob_ctx[3].p = vp9_default_probs;
608 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
609 sizeof(vp9_default_coef_probs));
610 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
611 sizeof(vp9_default_coef_probs));
612 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
613 sizeof(vp9_default_coef_probs));
614 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
615 sizeof(vp9_default_coef_probs));
618 // next 16 bits is size of the rest of the header (arith-coded)
619 size2 = get_bits(&s->gb, 16);
620 data2 = align_get_bits(&s->gb);
621 if (size2 > size - (data2 - data)) {
622 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
623 return AVERROR_INVALIDDATA;
625 ff_vp56_init_range_decoder(&s->c, data2, size2);
626 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
627 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
628 return AVERROR_INVALIDDATA;
631 if (s->keyframe || s->intraonly) {
632 memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
634 memset(&s->counts, 0, sizeof(s->counts));
636 // FIXME is it faster to not copy here, but do it down in the fw updates
637 // as explicit copies if the fw update is missing (and skip the copy upon
639 s->prob.p = s->prob_ctx[c].p;
643 s->txfmmode = TX_4X4;
645 s->txfmmode = vp8_rac_get_uint(&s->c, 2);
646 if (s->txfmmode == 3)
647 s->txfmmode += vp8_rac_get(&s->c);
649 if (s->txfmmode == TX_SWITCHABLE) {
650 for (i = 0; i < 2; i++)
651 if (vp56_rac_get_prob_branchy(&s->c, 252))
652 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
653 for (i = 0; i < 2; i++)
654 for (j = 0; j < 2; j++)
655 if (vp56_rac_get_prob_branchy(&s->c, 252))
656 s->prob.p.tx16p[i][j] =
657 update_prob(&s->c, s->prob.p.tx16p[i][j]);
658 for (i = 0; i < 2; i++)
659 for (j = 0; j < 3; j++)
660 if (vp56_rac_get_prob_branchy(&s->c, 252))
661 s->prob.p.tx32p[i][j] =
662 update_prob(&s->c, s->prob.p.tx32p[i][j]);
667 for (i = 0; i < 4; i++) {
668 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
669 if (vp8_rac_get(&s->c)) {
670 for (j = 0; j < 2; j++)
671 for (k = 0; k < 2; k++)
672 for (l = 0; l < 6; l++)
673 for (m = 0; m < 6; m++) {
674 uint8_t *p = s->prob.coef[i][j][k][l][m];
675 uint8_t *r = ref[j][k][l][m];
676 if (m >= 3 && l == 0) // dc only has 3 pt
678 for (n = 0; n < 3; n++) {
679 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
680 p[n] = update_prob(&s->c, r[n]);
688 for (j = 0; j < 2; j++)
689 for (k = 0; k < 2; k++)
690 for (l = 0; l < 6; l++)
691 for (m = 0; m < 6; m++) {
692 uint8_t *p = s->prob.coef[i][j][k][l][m];
693 uint8_t *r = ref[j][k][l][m];
694 if (m > 3 && l == 0) // dc only has 3 pt
700 if (s->txfmmode == i)
705 for (i = 0; i < 3; i++)
706 if (vp56_rac_get_prob_branchy(&s->c, 252))
707 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
708 if (!s->keyframe && !s->intraonly) {
709 for (i = 0; i < 7; i++)
710 for (j = 0; j < 3; j++)
711 if (vp56_rac_get_prob_branchy(&s->c, 252))
712 s->prob.p.mv_mode[i][j] =
713 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
715 if (s->filtermode == FILTER_SWITCHABLE)
716 for (i = 0; i < 4; i++)
717 for (j = 0; j < 2; j++)
718 if (vp56_rac_get_prob_branchy(&s->c, 252))
719 s->prob.p.filter[i][j] =
720 update_prob(&s->c, s->prob.p.filter[i][j]);
722 for (i = 0; i < 4; i++)
723 if (vp56_rac_get_prob_branchy(&s->c, 252))
724 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
726 if (s->allowcompinter) {
727 s->comppredmode = vp8_rac_get(&s->c);
729 s->comppredmode += vp8_rac_get(&s->c);
730 if (s->comppredmode == PRED_SWITCHABLE)
731 for (i = 0; i < 5; i++)
732 if (vp56_rac_get_prob_branchy(&s->c, 252))
734 update_prob(&s->c, s->prob.p.comp[i]);
736 s->comppredmode = PRED_SINGLEREF;
739 if (s->comppredmode != PRED_COMPREF) {
740 for (i = 0; i < 5; i++) {
741 if (vp56_rac_get_prob_branchy(&s->c, 252))
742 s->prob.p.single_ref[i][0] =
743 update_prob(&s->c, s->prob.p.single_ref[i][0]);
744 if (vp56_rac_get_prob_branchy(&s->c, 252))
745 s->prob.p.single_ref[i][1] =
746 update_prob(&s->c, s->prob.p.single_ref[i][1]);
750 if (s->comppredmode != PRED_SINGLEREF) {
751 for (i = 0; i < 5; i++)
752 if (vp56_rac_get_prob_branchy(&s->c, 252))
753 s->prob.p.comp_ref[i] =
754 update_prob(&s->c, s->prob.p.comp_ref[i]);
757 for (i = 0; i < 4; i++)
758 for (j = 0; j < 9; j++)
759 if (vp56_rac_get_prob_branchy(&s->c, 252))
760 s->prob.p.y_mode[i][j] =
761 update_prob(&s->c, s->prob.p.y_mode[i][j]);
763 for (i = 0; i < 4; i++)
764 for (j = 0; j < 4; j++)
765 for (k = 0; k < 3; k++)
766 if (vp56_rac_get_prob_branchy(&s->c, 252))
767 s->prob.p.partition[3 - i][j][k] =
768 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
770 // mv fields don't use the update_prob subexp model for some reason
771 for (i = 0; i < 3; i++)
772 if (vp56_rac_get_prob_branchy(&s->c, 252))
773 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
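/* When the update flag fires, a raw 7-bit value v is read and the new
 * probability is (v << 1) | 1, i.e. always odd and in [1, 255]; the same
 * encoding is used for all MV probabilities below. */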
775 for (i = 0; i < 2; i++) {
776 if (vp56_rac_get_prob_branchy(&s->c, 252))
777 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
779 for (j = 0; j < 10; j++)
780 if (vp56_rac_get_prob_branchy(&s->c, 252))
781 s->prob.p.mv_comp[i].classes[j] =
782 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
784 if (vp56_rac_get_prob_branchy(&s->c, 252))
785 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
787 for (j = 0; j < 10; j++)
788 if (vp56_rac_get_prob_branchy(&s->c, 252))
789 s->prob.p.mv_comp[i].bits[j] =
790 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
793 for (i = 0; i < 2; i++) {
794 for (j = 0; j < 2; j++)
795 for (k = 0; k < 3; k++)
796 if (vp56_rac_get_prob_branchy(&s->c, 252))
797 s->prob.p.mv_comp[i].class0_fp[j][k] =
798 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
800 for (j = 0; j < 3; j++)
801 if (vp56_rac_get_prob_branchy(&s->c, 252))
802 s->prob.p.mv_comp[i].fp[j] =
803 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
806 if (s->highprecisionmvs) {
807 for (i = 0; i < 2; i++) {
808 if (vp56_rac_get_prob_branchy(&s->c, 252))
809 s->prob.p.mv_comp[i].class0_hp =
810 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
812 if (vp56_rac_get_prob_branchy(&s->c, 252))
813 s->prob.p.mv_comp[i].hp =
814 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
819 return (data2 - data) + size2;
822 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
825 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
826 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
829 static void find_ref_mvs(VP9Context *s,
830 VP56mv *pmv, int ref, int z, int idx, int sb)
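/* Candidate neighbour positions per block size: each entry is a
 * { column, row } offset in 8x8-block units relative to the current block,
 * scanned in the order listed. */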
832 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
833 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
834 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
835 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
836 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
837 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
838 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
839 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
840 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
841 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
842 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
843 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
844 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
845 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
846 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
847 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
848 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
849 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
850 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
851 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
852 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
853 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
854 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
855 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
856 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
857 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
858 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
860 VP9Block *const b = &s->b;
861 int row = b->row, col = b->col, row7 = b->row7;
862 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
863 #define INVALID_MV 0x80008000U
864 uint32_t mem = INVALID_MV;
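/* Candidate MVs are handled as packed 32-bit words (x and y as two int16_t)
 * so they can be compared and copied in one load; INVALID_MV packs
 * x = y = -32768 and is used as a "nothing found yet" sentinel. */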
867 #define RETURN_DIRECT_MV(mv) \
869 uint32_t m = AV_RN32A(&mv); \
873 } else if (mem == INVALID_MV) { \
875 } else if (m != mem) { \
882 if (sb == 2 || sb == 1) {
883 RETURN_DIRECT_MV(b->mv[0][z]);
884 } else if (sb == 3) {
885 RETURN_DIRECT_MV(b->mv[2][z]);
886 RETURN_DIRECT_MV(b->mv[1][z]);
887 RETURN_DIRECT_MV(b->mv[0][z]);
890 #define RETURN_MV(mv) \
895 clamp_mv(&tmp, &mv, s); \
896 m = AV_RN32A(&tmp); \
900 } else if (mem == INVALID_MV) { \
902 } else if (m != mem) { \
907 uint32_t m = AV_RN32A(&mv); \
909 clamp_mv(pmv, &mv, s); \
911 } else if (mem == INVALID_MV) { \
913 } else if (m != mem) { \
914 clamp_mv(pmv, &mv, s); \
921 struct VP9mvrefPair *mv = &s->mv[0][(row - 1) * s->sb_cols * 8 + col];
922 if (mv->ref[0] == ref) {
923 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
924 } else if (mv->ref[1] == ref) {
925 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
928 if (col > s->tiling.tile_col_start) {
929 struct VP9mvrefPair *mv = &s->mv[0][row * s->sb_cols * 8 + col - 1];
930 if (mv->ref[0] == ref) {
931 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
932 } else if (mv->ref[1] == ref) {
933 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
941 // previously coded MVs in this neighbourhood, using same reference frame
943 int c = p[i][0] + col, r = p[i][1] + row;
945 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
946 struct VP9mvrefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
948 if (mv->ref[0] == ref) {
949 RETURN_MV(mv->mv[0]);
950 } else if (mv->ref[1] == ref) {
951 RETURN_MV(mv->mv[1]);
956 // MV at this position in previous frame, using same reference frame
957 if (s->use_last_frame_mvs) {
958 struct VP9mvrefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
960 if (mv->ref[0] == ref) {
961 RETURN_MV(mv->mv[0]);
962 } else if (mv->ref[1] == ref) {
963 RETURN_MV(mv->mv[1]);
967 #define RETURN_SCALE_MV(mv, scale) \
970 VP56mv mv_temp = { -mv.x, -mv.y }; \
971 RETURN_MV(mv_temp); \
977 // previously coded MVs in this neighbourhood, using different reference frame
978 for (i = 0; i < 8; i++) {
979 int c = p[i][0] + col, r = p[i][1] + row;
981 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
982 struct VP9mvrefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
984 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
985 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
987 if (mv->ref[1] != ref && mv->ref[1] >= 0) {
988 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
993 // MV at this position in previous frame, using different reference frame
994 if (s->use_last_frame_mvs) {
995 struct VP9mvrefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
997 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
998 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1000 if (mv->ref[1] != ref && mv->ref[1] >= 0) {
1001 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1008 #undef RETURN_SCALE_MV
1011 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1013 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1014 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1015 s->prob.p.mv_comp[idx].classes);
1017 s->counts.mv_comp[idx].sign[sign]++;
1018 s->counts.mv_comp[idx].classes[c]++;
1022 for (n = 0, m = 0; m < c; m++) {
1023 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1025 s->counts.mv_comp[idx].bits[m][bit]++;
1028 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1030 s->counts.mv_comp[idx].fp[bit]++;
1032 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1033 s->counts.mv_comp[idx].hp[bit]++;
1037 // bug in libvpx - we count for bw entropy purposes even if the
1039 s->counts.mv_comp[idx].hp[1]++;
1043 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1044 s->counts.mv_comp[idx].class0[n]++;
1045 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1046 s->prob.p.mv_comp[idx].class0_fp[n]);
1047 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1048 n = (n << 3) | (bit << 1);
1050 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1051 s->counts.mv_comp[idx].class0_hp[bit]++;
1055 // bug in libvpx - we count for bw entropy purposes even if the
1057 s->counts.mv_comp[idx].class0_hp[1]++;
1061 return sign ? -(n + 1) : (n + 1);
1064 static void fill_mv(VP9Context *s,
1065 VP56mv *mv, int mode, int sb)
1067 VP9Block *const b = &s->b;
1069 if (mode == ZEROMV) {
1070 memset(mv, 0, sizeof(*mv) * 2);
1074 // FIXME cache this value and reuse for other subblocks
1075 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1076 mode == NEWMV ? -1 : sb);
1077 // FIXME maybe move this code into find_ref_mvs()
1078 if ((mode == NEWMV || sb == -1) &&
1079 !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1093 if (mode == NEWMV) {
1094 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1095 s->prob.p.mv_joint);
1097 s->counts.mv_joint[j]++;
1098 if (j >= MV_JOINT_V)
1099 mv[0].y += read_mv_component(s, 0, hp);
1101 mv[0].x += read_mv_component(s, 1, hp);
1105 // FIXME cache this value and reuse for other subblocks
1106 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1107 mode == NEWMV ? -1 : sb);
1108 if ((mode == NEWMV || sb == -1) &&
1109 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1123 if (mode == NEWMV) {
1124 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1125 s->prob.p.mv_joint);
1127 s->counts.mv_joint[j]++;
1128 if (j >= MV_JOINT_V)
1129 mv[1].y += read_mv_component(s, 0, hp);
1131 mv[1].x += read_mv_component(s, 1, hp);
1137 static void decode_mode(AVCodecContext *ctx)
1139 static const uint8_t left_ctx[N_BS_SIZES] = {
1140 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1142 static const uint8_t above_ctx[N_BS_SIZES] = {
1143 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1145 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1146 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1147 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1149 VP9Context *s = ctx->priv_data;
1150 VP9Block *const b = &s->b;
1151 int row = b->row, col = b->col, row7 = b->row7;
1152 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1153 int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1154 int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1155 int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1157 if (!s->segmentation.enabled) {
1159 } else if (s->keyframe || s->intraonly) {
1160 b->seg_id = s->segmentation.update_map ?
1161 vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg) : 0;
1162 } else if (!s->segmentation.update_map ||
1163 (s->segmentation.temporal &&
1164 vp56_rac_get_prob_branchy(&s->c,
1165 s->prob.segpred[s->above_segpred_ctx[col] +
1166 s->left_segpred_ctx[row7]]))) {
1169 for (y = 0; y < h4; y++)
1170 for (x = 0; x < w4; x++)
1171 pred = FFMIN(pred, s->segmentation_map[(y + row) * 8 * s->sb_cols + x + col]);
1174 memset(&s->above_segpred_ctx[col], 1, w4);
1175 memset(&s->left_segpred_ctx[row7], 1, h4);
1177 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1180 memset(&s->above_segpred_ctx[col], 0, w4);
1181 memset(&s->left_segpred_ctx[row7], 0, h4);
1183 if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
1184 for (y = 0; y < h4; y++)
1185 memset(&s->segmentation_map[(y + row) * 8 * s->sb_cols + col],
1189 b->skip = s->segmentation.enabled &&
1190 s->segmentation.feat[b->seg_id].skip_enabled;
1192 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1193 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1194 s->counts.skip[c][b->skip]++;
1197 if (s->keyframe || s->intraonly) {
1199 } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1200 b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1204 if (have_a && have_l) {
1205 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1208 c = have_a ? 2 * s->above_intra_ctx[col] :
1209 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1211 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1212 s->counts.intra[c][bit]++;
1216 if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1220 c = (s->above_skip_ctx[col] ? max_tx :
1221 s->above_txfm_ctx[col]) +
1222 (s->left_skip_ctx[row7] ? max_tx :
1223 s->left_txfm_ctx[row7]) > max_tx;
1225 c = s->above_skip_ctx[col] ? 1 :
1226 (s->above_txfm_ctx[col] * 2 > max_tx);
1228 } else if (have_l) {
1229 c = s->left_skip_ctx[row7] ? 1 :
1230 (s->left_txfm_ctx[row7] * 2 > max_tx);
1236 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1238 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1240 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1242 s->counts.tx32p[c][b->tx]++;
1245 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1247 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1248 s->counts.tx16p[c][b->tx]++;
1251 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1252 s->counts.tx8p[c][b->tx]++;
1259 b->tx = FFMIN(max_tx, s->txfmmode);
1262 if (s->keyframe || s->intraonly) {
1263 uint8_t *a = &s->above_mode_ctx[col * 2];
1264 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1267 if (b->bs > BS_8x8) {
1268 // FIXME the memory storage intermediates here aren't really
1269 // necessary, they're just there to make the code slightly
1271 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1272 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1273 if (b->bs != BS_8x4) {
1274 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1275 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1276 l[0] = a[1] = b->mode[1];
1278 l[0] = a[1] = b->mode[1] = b->mode[0];
1280 if (b->bs != BS_4x8) {
1281 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1282 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1283 if (b->bs != BS_8x4) {
1284 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1285 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1286 l[1] = a[1] = b->mode[3];
1288 l[1] = a[1] = b->mode[3] = b->mode[2];
1291 b->mode[2] = b->mode[0];
1292 l[1] = a[1] = b->mode[3] = b->mode[1];
1295 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1296 vp9_default_kf_ymode_probs[*a][*l]);
1297 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1298 // FIXME this can probably be optimized
1299 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1300 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1302 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1303 vp9_default_kf_uvmode_probs[b->mode[3]]);
1304 } else if (b->intra) {
1306 if (b->bs > BS_8x8) {
1307 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1308 s->prob.p.y_mode[0]);
1309 s->counts.y_mode[0][b->mode[0]]++;
1310 if (b->bs != BS_8x4) {
1311 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1312 s->prob.p.y_mode[0]);
1313 s->counts.y_mode[0][b->mode[1]]++;
1315 b->mode[1] = b->mode[0];
1317 if (b->bs != BS_4x8) {
1318 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1319 s->prob.p.y_mode[0]);
1320 s->counts.y_mode[0][b->mode[2]]++;
1321 if (b->bs != BS_8x4) {
1322 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1323 s->prob.p.y_mode[0]);
1324 s->counts.y_mode[0][b->mode[3]]++;
1326 b->mode[3] = b->mode[2];
1329 b->mode[2] = b->mode[0];
1330 b->mode[3] = b->mode[1];
1333 static const uint8_t size_group[10] = {
1334 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1336 int sz = size_group[b->bs];
1338 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1339 s->prob.p.y_mode[sz]);
1340 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1341 s->counts.y_mode[sz][b->mode[3]]++;
1343 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1344 s->prob.p.uv_mode[b->mode[3]]);
1345 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1347 static const uint8_t inter_mode_ctx_lut[14][14] = {
1348 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1349 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1350 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1351 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1352 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1353 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1354 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1355 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1356 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1357 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1358 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1359 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1360 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1361 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
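/* inter_mode_ctx_lut is indexed as [above_mode_ctx][left_mode_ctx] and yields
 * the probability context used with vp9_inter_mode_tree below. */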
1364 if (s->segmentation.feat[b->seg_id].ref_enabled) {
1365 av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1367 b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1369 // read comp_pred flag
1370 if (s->comppredmode != PRED_SWITCHABLE) {
1371 b->comp = s->comppredmode == PRED_COMPREF;
1375 // FIXME add intra as ref=0xff (or -1) to make these easier?
1378 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1380 } else if (s->above_comp_ctx[col]) {
1381 c = 2 + (s->left_intra_ctx[row7] ||
1382 s->left_ref_ctx[row7] == s->fixcompref);
1383 } else if (s->left_comp_ctx[row7]) {
1384 c = 2 + (s->above_intra_ctx[col] ||
1385 s->above_ref_ctx[col] == s->fixcompref);
1387 c = (!s->above_intra_ctx[col] &&
1388 s->above_ref_ctx[col] == s->fixcompref) ^
1389 (!s->left_intra_ctx[row7] &&
1390 s->left_ref_ctx[row & 7] == s->fixcompref);
1393 c = s->above_comp_ctx[col] ? 3 :
1394 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1396 } else if (have_l) {
1397 c = s->left_comp_ctx[row7] ? 3 :
1398 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1402 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1403 s->counts.comp[c][b->comp]++;
1406 // read actual references
1407 // FIXME probably cache a few variables here to prevent repetitive
1408 // memory accesses below
1409 if (b->comp) /* two references */ {
1410 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1412 b->ref[fix_idx] = s->fixcompref;
1413 // FIXME can this codeblob be replaced by some sort of LUT?
1416 if (s->above_intra_ctx[col]) {
1417 if (s->left_intra_ctx[row7]) {
1420 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1422 } else if (s->left_intra_ctx[row7]) {
1423 c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1425 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1427 if (refl == refa && refa == s->varcompref[1]) {
1429 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1430 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1431 (refl == s->fixcompref && refa == s->varcompref[0])) {
1434 c = (refa == refl) ? 3 : 1;
1436 } else if (!s->left_comp_ctx[row7]) {
1437 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1440 c = (refl == s->varcompref[1] &&
1441 refa != s->varcompref[1]) ? 2 : 4;
1443 } else if (!s->above_comp_ctx[col]) {
1444 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1447 c = (refa == s->varcompref[1] &&
1448 refl != s->varcompref[1]) ? 2 : 4;
1451 c = (refl == refa) ? 4 : 2;
1455 if (s->above_intra_ctx[col]) {
1457 } else if (s->above_comp_ctx[col]) {
1458 c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1460 c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1463 } else if (have_l) {
1464 if (s->left_intra_ctx[row7]) {
1466 } else if (s->left_comp_ctx[row7]) {
1467 c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1469 c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1474 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1475 b->ref[var_idx] = s->varcompref[bit];
1476 s->counts.comp_ref[c][bit]++;
1477 } else /* single reference */ {
1480 if (have_a && !s->above_intra_ctx[col]) {
1481 if (have_l && !s->left_intra_ctx[row7]) {
1482 if (s->left_comp_ctx[row7]) {
1483 if (s->above_comp_ctx[col]) {
1484 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1485 !s->above_ref_ctx[col]);
1487 c = (3 * !s->above_ref_ctx[col]) +
1488 (!s->fixcompref || !s->left_ref_ctx[row7]);
1490 } else if (s->above_comp_ctx[col]) {
1491 c = (3 * !s->left_ref_ctx[row7]) +
1492 (!s->fixcompref || !s->above_ref_ctx[col]);
1494 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1496 } else if (s->above_intra_ctx[col]) {
1498 } else if (s->above_comp_ctx[col]) {
1499 c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1501 c = 4 * (!s->above_ref_ctx[col]);
1503 } else if (have_l && !s->left_intra_ctx[row7]) {
1504 if (s->left_intra_ctx[row7]) {
1506 } else if (s->left_comp_ctx[row7]) {
1507 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1509 c = 4 * (!s->left_ref_ctx[row7]);
1514 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1515 s->counts.single_ref[c][0][bit]++;
1519 // FIXME can this codeblob be replaced by some sort of LUT?
1522 if (s->left_intra_ctx[row7]) {
1523 if (s->above_intra_ctx[col]) {
1525 } else if (s->above_comp_ctx[col]) {
1526 c = 1 + 2 * (s->fixcompref == 1 ||
1527 s->above_ref_ctx[col] == 1);
1528 } else if (!s->above_ref_ctx[col]) {
1531 c = 4 * (s->above_ref_ctx[col] == 1);
1533 } else if (s->above_intra_ctx[col]) {
1534 if (s->left_intra_ctx[row7]) {
1536 } else if (s->left_comp_ctx[row7]) {
1537 c = 1 + 2 * (s->fixcompref == 1 ||
1538 s->left_ref_ctx[row7] == 1);
1539 } else if (!s->left_ref_ctx[row7]) {
1542 c = 4 * (s->left_ref_ctx[row7] == 1);
1544 } else if (s->above_comp_ctx[col]) {
1545 if (s->left_comp_ctx[row7]) {
1546 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1547 c = 3 * (s->fixcompref == 1 ||
1548 s->left_ref_ctx[row7] == 1);
1552 } else if (!s->left_ref_ctx[row7]) {
1553 c = 1 + 2 * (s->fixcompref == 1 ||
1554 s->above_ref_ctx[col] == 1);
1556 c = 3 * (s->left_ref_ctx[row7] == 1) +
1557 (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1559 } else if (s->left_comp_ctx[row7]) {
1560 if (!s->above_ref_ctx[col]) {
1561 c = 1 + 2 * (s->fixcompref == 1 ||
1562 s->left_ref_ctx[row7] == 1);
1564 c = 3 * (s->above_ref_ctx[col] == 1) +
1565 (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1567 } else if (!s->above_ref_ctx[col]) {
1568 if (!s->left_ref_ctx[row7]) {
1571 c = 4 * (s->left_ref_ctx[row7] == 1);
1573 } else if (!s->left_ref_ctx[row7]) {
1574 c = 4 * (s->above_ref_ctx[col] == 1);
1576 c = 2 * (s->left_ref_ctx[row7] == 1) +
1577 2 * (s->above_ref_ctx[col] == 1);
1580 if (s->above_intra_ctx[col] ||
1581 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1583 } else if (s->above_comp_ctx[col]) {
1584 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1586 c = 4 * (s->above_ref_ctx[col] == 1);
1589 } else if (have_l) {
1590 if (s->left_intra_ctx[row7] ||
1591 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1593 } else if (s->left_comp_ctx[row7]) {
1594 c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1596 c = 4 * (s->left_ref_ctx[row7] == 1);
1601 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1602 s->counts.single_ref[c][1][bit]++;
1603 b->ref[0] = 1 + bit;
1608 if (b->bs <= BS_8x8) {
1609 if (s->segmentation.feat[b->seg_id].skip_enabled) {
1610 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1612 static const uint8_t off[10] = {
1613 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1616 // FIXME this needs to use the LUT tables from find_ref_mvs
1617 // because not all are -1,0/0,-1
1618 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1619 [s->left_mode_ctx[row7 + off[b->bs]]];
1621 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1622 s->prob.p.mv_mode[c]);
1623 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1624 s->counts.mv_mode[c][b->mode[0] - 10]++;
1628 if (s->filtermode == FILTER_SWITCHABLE) {
1631 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1632 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1633 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1634 s->left_filter_ctx[row7] : 3;
1636 c = s->above_filter_ctx[col];
1638 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1639 c = s->left_filter_ctx[row7];
1644 b->filter = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1645 s->prob.p.filter[c]);
1646 s->counts.filter[c][b->filter]++;
1648 b->filter = s->filtermode;
1651 if (b->bs > BS_8x8) {
1652 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1654 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1655 s->prob.p.mv_mode[c]);
1656 s->counts.mv_mode[c][b->mode[0] - 10]++;
1657 fill_mv(s, b->mv[0], b->mode[0], 0);
1659 if (b->bs != BS_8x4) {
1660 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1661 s->prob.p.mv_mode[c]);
1662 s->counts.mv_mode[c][b->mode[1] - 10]++;
1663 fill_mv(s, b->mv[1], b->mode[1], 1);
1665 b->mode[1] = b->mode[0];
1666 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1667 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1670 if (b->bs != BS_4x8) {
1671 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1672 s->prob.p.mv_mode[c]);
1673 s->counts.mv_mode[c][b->mode[2] - 10]++;
1674 fill_mv(s, b->mv[2], b->mode[2], 2);
1676 if (b->bs != BS_8x4) {
1677 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1678 s->prob.p.mv_mode[c]);
1679 s->counts.mv_mode[c][b->mode[3] - 10]++;
1680 fill_mv(s, b->mv[3], b->mode[3], 3);
1682 b->mode[3] = b->mode[2];
1683 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1684 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1687 b->mode[2] = b->mode[0];
1688 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1689 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1690 b->mode[3] = b->mode[1];
1691 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1692 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1695 fill_mv(s, b->mv[0], b->mode[0], -1);
1696 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1697 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1698 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1699 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1700 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1701 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1705 // FIXME this can probably be optimized
1706 memset(&s->above_skip_ctx[col], b->skip, w4);
1707 memset(&s->left_skip_ctx[row7], b->skip, h4);
1708 memset(&s->above_txfm_ctx[col], b->tx, w4);
1709 memset(&s->left_txfm_ctx[row7], b->tx, h4);
1710 memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
1711 memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
1712 if (!s->keyframe && !s->intraonly) {
1713 memset(&s->above_intra_ctx[col], b->intra, w4);
1714 memset(&s->left_intra_ctx[row7], b->intra, h4);
1715 memset(&s->above_comp_ctx[col], b->comp, w4);
1716 memset(&s->left_comp_ctx[row7], b->comp, h4);
1717 memset(&s->above_mode_ctx[col], b->mode[3], w4);
1718 memset(&s->left_mode_ctx[row7], b->mode[3], h4);
1719 if (s->filtermode == FILTER_SWITCHABLE && !b->intra) {
1720 memset(&s->above_filter_ctx[col], b->filter, w4);
1721 memset(&s->left_filter_ctx[row7], b->filter, h4);
1722 b->filter = vp9_filter_lut[b->filter];
1724 if (b->bs > BS_8x8) {
1725 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1727 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1728 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1729 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1730 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1731 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1732 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1733 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1734 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1736 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1738 for (n = 0; n < w4 * 2; n++) {
1739 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1740 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1742 for (n = 0; n < h4 * 2; n++) {
1743 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1744 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1748 if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
1749 // as a direct check in above branches
1750 int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1752 memset(&s->above_ref_ctx[col], vref, w4);
1753 memset(&s->left_ref_ctx[row7], vref, h4);
1758 for (y = 0; y < h4; y++) {
1759 int x, o = (row + y) * s->sb_cols * 8 + col;
1762 for (x = 0; x < w4; x++) {
1763 s->mv[0][o + x].ref[0] =
1764 s->mv[0][o + x].ref[1] = -1;
1766 } else if (b->comp) {
1767 for (x = 0; x < w4; x++) {
1768 s->mv[0][o + x].ref[0] = b->ref[0];
1769 s->mv[0][o + x].ref[1] = b->ref[1];
1770 AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
1771 AV_COPY32(&s->mv[0][o + x].mv[1], &b->mv[3][1]);
1774 for (x = 0; x < w4; x++) {
1775 s->mv[0][o + x].ref[0] = b->ref[0];
1776 s->mv[0][o + x].ref[1] = -1;
1777 AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
1783 // FIXME remove tx argument, and merge cnt/eob arguments?
1784 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
1785 enum TxfmMode tx, unsigned (*cnt)[6][3],
1786 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
1787 int nnz, const int16_t *scan, const int16_t (*nb)[2],
1788 const int16_t *band_counts, const int16_t *qmul)
1790 int i = 0, band = 0, band_left = band_counts[band];
1791 uint8_t *tp = p[0][nnz];
1792 uint8_t cache[1024];
1797 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
1798 eob[band][nnz][val]++;
1803 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
1804 cnt[band][nnz][0]++;
1806 band_left = band_counts[++band];
1808 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1810 if (++i == n_coeffs)
1811 break; //invalid input; blocks should end with EOB
1816 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
1817 cnt[band][nnz][1]++;
1821 // fill in p[3-10] (model fill) - only once per frame for each pos
1823 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
1825 cnt[band][nnz][2]++;
1826 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
1827 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
1828 cache[rc] = val = 2;
1830 val = 3 + vp56_rac_get_prob(c, tp[5]);
1833 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
1835 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
1836 val = 5 + vp56_rac_get_prob(c, 159);
1838 val = 7 + (vp56_rac_get_prob(c, 165) << 1) +
1839 vp56_rac_get_prob(c, 145);
1843 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
1844 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
1845 val = 11 + (vp56_rac_get_prob(c, 173) << 2) +
1846 (vp56_rac_get_prob(c, 148) << 1) +
1847 vp56_rac_get_prob(c, 140);
1849 val = 19 + (vp56_rac_get_prob(c, 176) << 3) +
1850 (vp56_rac_get_prob(c, 155) << 2) +
1851 (vp56_rac_get_prob(c, 140) << 1) +
1852 vp56_rac_get_prob(c, 135);
1854 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
1855 val = 35 + (vp56_rac_get_prob(c, 180) << 4) +
1856 (vp56_rac_get_prob(c, 157) << 3) +
1857 (vp56_rac_get_prob(c, 141) << 2) +
1858 (vp56_rac_get_prob(c, 134) << 1) +
1859 vp56_rac_get_prob(c, 130);
1861 val = 67 + (vp56_rac_get_prob(c, 254) << 13) +
1862 (vp56_rac_get_prob(c, 254) << 12) +
1863 (vp56_rac_get_prob(c, 254) << 11) +
1864 (vp56_rac_get_prob(c, 252) << 10) +
1865 (vp56_rac_get_prob(c, 249) << 9) +
1866 (vp56_rac_get_prob(c, 243) << 8) +
1867 (vp56_rac_get_prob(c, 230) << 7) +
1868 (vp56_rac_get_prob(c, 196) << 6) +
1869 (vp56_rac_get_prob(c, 177) << 5) +
1870 (vp56_rac_get_prob(c, 153) << 4) +
1871 (vp56_rac_get_prob(c, 140) << 3) +
1872 (vp56_rac_get_prob(c, 133) << 2) +
1873 (vp56_rac_get_prob(c, 130) << 1) +
1874 vp56_rac_get_prob(c, 129);
1879 band_left = band_counts[++band];
1880 if (tx == TX_32X32) // FIXME slow
1881 coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
1883 coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
1884 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
1886 } while (++i < n_coeffs);
1891 static int decode_coeffs(AVCodecContext *ctx)
1893 VP9Context *s = ctx->priv_data;
1894 VP9Block *const b = &s->b;
1895 int row = b->row, col = b->col;
1896 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
1897 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
1898 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
1899 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
1900 int end_x = FFMIN(2 * (s->cols - col), w4);
1901 int end_y = FFMIN(2 * (s->rows - row), h4);
1902 int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
1903 int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
1904 int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
1905 int tx = 4 * s->lossless + b->tx;
1906 const int16_t **yscans = vp9_scans[tx];
1907 const int16_t (**ynbs)[2] = vp9_scans_nb[tx];
1908 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
1909 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
1910 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
1911 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
1912 static const int16_t band_counts[4][6] = {
1913 { 1, 2, 3, 4, 3, 16 - 13 },
1914 { 1, 2, 3, 4, 11, 64 - 21 },
1915 { 1, 2, 3, 4, 11, 256 - 21 },
1916 { 1, 2, 3, 4, 11, 1024 - 21 },
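/* Coefficients per probability band (6 bands) for each transform size; each
 * row sums to the block's total coefficient count: 16, 64, 256 and 1024. */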
1918 const int16_t *y_band_counts = band_counts[b->tx];
1919 const int16_t *uv_band_counts = band_counts[b->uvtx];
1922 if (b->tx > TX_4X4) { // FIXME slow
1923 for (y = 0; y < end_y; y += step1d)
1924 for (x = 1; x < step1d; x++)
1926 for (x = 0; x < end_x; x += step1d)
1927 for (y = 1; y < step1d; y++)
1930 for (n = 0, y = 0; y < end_y; y += step1d) {
1931 for (x = 0; x < end_x; x += step1d, n += step) {
1932 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
1935 int nnz = a[x] + l[y];
1936 if ((res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
1937 b->tx, c, e, p, nnz, yscans[txtp],
1938 ynbs[txtp], y_band_counts, qmul[0])) < 0)
1940 a[x] = l[y] = !!res;
1941 if (b->tx > TX_8X8) {
1942 AV_WN16A(&s->eob[n], res);
1948 if (b->tx > TX_4X4) { // FIXME slow
1949 for (y = 0; y < end_y; y += step1d)
1950 memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
1951 for (x = 0; x < end_x; x += step1d)
1952 memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
1955 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
1956 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
1957 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
1962 for (pl = 0; pl < 2; pl++) {
1963 a = &s->above_uv_nnz_ctx[pl][col];
1964 l = &s->left_uv_nnz_ctx[pl][row & 7];
1965 if (b->uvtx > TX_4X4) { // FIXME slow
1966 for (y = 0; y < end_y; y += uvstep1d)
1967 for (x = 1; x < uvstep1d; x++)
1969 for (x = 0; x < end_x; x += uvstep1d)
1970 for (y = 1; y < uvstep1d; y++)
1973 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
1974 for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
1975 int nnz = a[x] + l[y];
1976 if ((res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
1977 16 * uvstep, b->uvtx, c, e, p, nnz,
1978 uvscan, uvnb, uv_band_counts,
1981 a[x] = l[y] = !!res;
1982 if (b->uvtx > TX_8X8) {
1983 AV_WN16A(&s->uveob[pl][n], res);
1985 s->uveob[pl][n] = res;
1989 if (b->uvtx > TX_4X4) { // FIXME slow
1990 for (y = 0; y < end_y; y += uvstep1d)
1991 memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
1992 for (x = 0; x < end_x; x += uvstep1d)
1993 memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
2000 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2001 uint8_t *dst_edge, ptrdiff_t stride_edge,
2002 uint8_t *dst_inner, ptrdiff_t stride_inner,
2003 uint8_t *l, int col, int x, int w,
2004 int row, int y, enum TxfmMode tx,
2007 int have_top = row > 0 || y > 0;
2008 int have_left = col > s->tiling.tile_col_start || x > 0;
2009 int have_right = x < w - 1;
2010 static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2011 [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2012 { DC_127_PRED, VERT_PRED } },
2013 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2014 { HOR_PRED, HOR_PRED } },
2015 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2016 { LEFT_DC_PRED, DC_PRED } },
2017 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2018 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2019 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2020 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2021 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2022 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2023 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2024 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2025 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2026 { DC_127_PRED, VERT_LEFT_PRED } },
2027 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2028 { HOR_UP_PRED, HOR_UP_PRED } },
2029 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2030 { HOR_PRED, TM_VP8_PRED } },
2032 static const struct {
2033 uint8_t needs_left:1;
2034 uint8_t needs_top:1;
2035 uint8_t needs_topleft:1;
2036 uint8_t needs_topright:1;
2037 } edges[N_INTRA_PRED_MODES] = {
2038 [VERT_PRED] = { .needs_top = 1 },
2039 [HOR_PRED] = { .needs_left = 1 },
2040 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2041 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2042 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2043 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2044 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2045 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2046 [HOR_UP_PRED] = { .needs_left = 1 },
2047 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2048 [LEFT_DC_PRED] = { .needs_left = 1 },
2049 [TOP_DC_PRED] = { .needs_top = 1 },
2050 [DC_128_PRED] = { 0 },
2051 [DC_127_PRED] = { 0 },
2052 [DC_129_PRED] = { 0 }
2055 av_assert2(mode >= 0 && mode < 10);
2056 mode = mode_conv[mode][have_left][have_top];
2057 if (edges[mode].needs_top) {
2058 uint8_t *top, *topleft;
2059 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2060 int n_px_need_tr = 0;
2062 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2065 // if top of sb64-row, use s->intra_pred_data[] instead of
2066 // dst[-stride] for intra prediction (it contains pre- instead of
2067 // post-loopfilter data)
2069 top = !(row & 7) && !y ?
2070 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2071 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2073 topleft = !(row & 7) && !y ?
2074 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2075 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2076 &dst_inner[-stride_inner];
2080 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2081 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2082 n_px_need + n_px_need_tr <= n_px_have) {
2086 if (n_px_need <= n_px_have) {
2087 memcpy(*a, top, n_px_need);
2089 memcpy(*a, top, n_px_have);
2090 memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2091 n_px_need - n_px_have);
2094 memset(*a, 127, n_px_need);
2096 if (edges[mode].needs_topleft) {
2097 if (have_left && have_top) {
2098 (*a)[-1] = topleft[-1];
2100 (*a)[-1] = have_top ? 129 : 127;
2103 if (tx == TX_4X4 && edges[mode].needs_topright) {
2104 if (have_top && have_right &&
2105 n_px_need + n_px_need_tr <= n_px_have) {
2106 memcpy(&(*a)[4], &top[4], 4);
2108 memset(&(*a)[4], (*a)[3], 4);
2113 if (edges[mode].needs_left) {
2115 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2116 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2117 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2119 if (n_px_need <= n_px_have) {
2120 for (i = 0; i < n_px_need; i++)
2121 l[i] = dst[i * stride - 1];
2123 for (i = 0; i < n_px_have; i++)
2124 l[i] = dst[i * stride - 1];
2125 memset(&l[i], l[i - 1], n_px_need - n_px_have);
2128 memset(l, 129, 4 << tx);
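/* Illustrative sketch (not part of the decoder): the mode_conv[] and edges[]
 * tables above implement the VP9 rule that a predictor falls back to a DC
 * variant when the pixels it needs are unavailable, and that a missing edge
 * is synthesized with a fixed value (127 above, 129 left, 128 when both are
 * missing). The DC_PRED row of mode_conv[], written out as control flow with
 * a hypothetical helper name, would read:
 *
 *     static int dc_pred_fallback(int have_left, int have_top)
 *     {
 *         if (have_left && have_top) return DC_PRED;      // normal case
 *         if (have_left)             return LEFT_DC_PRED; // average left only
 *         if (have_top)              return TOP_DC_PRED;  // average top only
 *         return DC_128_PRED;                             // constant 128 fill
 *     }
 *
 * The directional modes are handled the same way via the DC_127/DC_129
 * entries encoded in mode_conv[] above. */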
2135 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2137 VP9Context *s = ctx->priv_data;
2138 VP9Block *const b = &s->b;
2139 int row = b->row, col = b->col;
2140 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2141 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2142 int end_x = FFMIN(2 * (s->cols - col), w4);
2143 int end_y = FFMIN(2 * (s->rows - row), h4);
2144 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2145 int uvstep1d = 1 << b->uvtx, p;
2146 uint8_t *dst = b->dst[0], *dst_r = s->f->data[0] + y_off;
2148 for (n = 0, y = 0; y < end_y; y += step1d) {
2149 uint8_t *ptr = dst, *ptr_r = dst_r;
2150 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2151 ptr_r += 4 * step1d, n += step) {
2152 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2154 LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2155 uint8_t *a = &a_buf[16], l[32];
2156 enum TxfmType txtp = vp9_intra_txfm_type[mode];
2157 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2159 mode = check_intra_mode(s, mode, &a, ptr_r, s->f->linesize[0],
2160 ptr, b->y_stride, l,
2161 col, x, w4, row, y, b->tx, 0);
2162 s->dsp.intra_pred[b->tx][mode](ptr, b->y_stride, l, a);
2164 s->dsp.itxfm_add[tx][txtp](ptr, b->y_stride,
2165 s->block + 16 * n, eob);
2167 dst_r += 4 * s->f->linesize[0] * step1d;
2168 dst += 4 * b->y_stride * step1d;
2176 step = 1 << (b->uvtx * 2);
2177 for (p = 0; p < 2; p++) {
2178 dst = b->dst[1 + p];
2179 dst_r = s->f->data[1 + p] + uv_off;
2180 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2181 uint8_t *ptr = dst, *ptr_r = dst_r;
2182 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2183 ptr_r += 4 * uvstep1d, n += step) {
2184 int mode = b->uvmode;
2185 LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2186 uint8_t *a = &a_buf[16], l[32];
2187 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2189 mode = check_intra_mode(s, mode, &a, ptr_r, s->f->linesize[1],
2190 ptr, b->uv_stride, l,
2191 col, x, w4, row, y, b->uvtx, p + 1);
2192 s->dsp.intra_pred[b->uvtx][mode](ptr, b->uv_stride, l, a);
2194 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
2195 s->uvblock[p] + 16 * n, eob);
2197 dst_r += 4 * uvstep1d * s->f->linesize[1];
2198 dst += 4 * uvstep1d * b->uv_stride;
2203 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2204 uint8_t *dst, ptrdiff_t dst_stride,
2205 const uint8_t *ref, ptrdiff_t ref_stride,
2206 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2207 int bw, int bh, int w, int h)
2209 int mx = mv->x, my = mv->y;
2213 ref += y * ref_stride + x;
2216 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2217 if (x < !!mx * 3 || y < !!my * 3 ||
2218 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2219 s->vdsp.emulated_edge_mc(s->edge_emu_buffer, 80,
2220 ref - !!my * 3 * ref_stride - !!mx * 3,
2222 bw + !!mx * 7, bh + !!my * 7,
2223 x - !!mx * 3, y - !!my * 3, w, h);
2224 ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2227 mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2230 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2231 uint8_t *dst_u, uint8_t *dst_v,
2232 ptrdiff_t dst_stride,
2233 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2234 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2235 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2236 int bw, int bh, int w, int h)
2238 int mx = mv->x, my = mv->y;
2242 ref_u += y * src_stride_u + x;
2243 ref_v += y * src_stride_v + x;
2246 // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2247 if (x < !!mx * 3 || y < !!my * 3 ||
2248 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2249 s->vdsp.emulated_edge_mc(s->edge_emu_buffer, 80,
2250 ref_u - !!my * 3 * src_stride_u - !!mx * 3, src_stride_u,
2251 bw + !!mx * 7, bh + !!my * 7,
2252 x - !!mx * 3, y - !!my * 3, w, h);
2253 ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2254 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2256 s->vdsp.emulated_edge_mc(s->edge_emu_buffer, 80,
2257 ref_v - !!my * 3 * src_stride_v - !!mx * 3, src_stride_v,
2258 bw + !!mx * 7, bh + !!my * 7,
2259 x - !!mx * 3, y - !!my * 3, w, h);
2260 ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2261 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2263 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2264 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
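/* Illustrative note (assumptions, not decoder code): the bounds check in
 * mc_luma_dir()/mc_chroma_dir() accounts for the subpel filters, which read
 * 3 pixels before and 4 pixels after the nominal block in each dimension
 * whenever the corresponding MV component has a fractional part. Assuming a
 * 16x16 luma block whose integer MV part lands at x = 1, y = 20 in a 640x480
 * reference, with mx != 0 and my == 0 left as the fractional remainder:
 *
 *     x < !!mx * 3      -> 1 < 3, the left taps would under-run the frame,
 *                          so the emulated-edge path is taken
 *     width fetched     -> bw + !!mx * 7 = 23 pixels, starting at x - 3 = -2
 *     height fetched    -> bh + !!my * 7 = 16 rows (no vertical taps needed)
 *
 * emulated_edge_mc() replicates the frame border into edge_emu_buffer
 * (stride 80), and the MC function is then run on that buffer instead of the
 * reference frame. */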
2268 static void inter_recon(AVCodecContext *ctx)
2270 static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2271 { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2272 { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2274 VP9Context *s = ctx->priv_data;
2275 VP9Block *const b = &s->b;
2276 int row = b->row, col = b->col;
2277 AVFrame *ref1 = s->refs[s->refidx[b->ref[0]]];
2278 AVFrame *ref2 = b->comp ? s->refs[s->refidx[b->ref[1]]] : NULL;
2279 int w = ctx->width, h = ctx->height;
2280 ptrdiff_t ls_y = b->y_stride, ls_uv = b->uv_stride;
2283 if (b->bs > BS_8x8) {
2284 if (b->bs == BS_8x4) {
2285 mc_luma_dir(s, s->dsp.mc[3][b->filter][0], b->dst[0], ls_y,
2286 ref1->data[0], ref1->linesize[0],
2287 row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
2288 mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2289 b->dst[0] + 4 * ls_y, ls_y,
2290 ref1->data[0], ref1->linesize[0],
2291 (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
2294 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], b->dst[0], ls_y,
2295 ref2->data[0], ref2->linesize[0],
2296 row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
2297 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2298 b->dst[0] + 4 * ls_y, ls_y,
2299 ref2->data[0], ref2->linesize[0],
2300 (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
2302 } else if (b->bs == BS_4x8) {
2303 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
2304 ref1->data[0], ref1->linesize[0],
2305 row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
2306 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
2307 ref1->data[0], ref1->linesize[0],
2308 row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
2311 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
2312 ref2->data[0], ref2->linesize[0],
2313 row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
2314 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
2315 ref2->data[0], ref2->linesize[0],
2316 row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
2319 av_assert2(b->bs == BS_4x4);
2321 // FIXME if two horizontally adjacent blocks have the same MV,
2322 // do a w8 instead of a w4 call
2323 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
2324 ref1->data[0], ref1->linesize[0],
2325 row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
2326 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
2327 ref1->data[0], ref1->linesize[0],
2328 row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
2329 mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2330 b->dst[0] + 4 * ls_y, ls_y,
2331 ref1->data[0], ref1->linesize[0],
2332 (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
2333 mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2334 b->dst[0] + 4 * ls_y + 4, ls_y,
2335 ref1->data[0], ref1->linesize[0],
2336 (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
2339 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
2340 ref2->data[0], ref2->linesize[0],
2341 row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
2342 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
2343 ref2->data[0], ref2->linesize[0],
2344 row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
2345 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2346 b->dst[0] + 4 * ls_y, ls_y,
2347 ref2->data[0], ref2->linesize[0],
2348 (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
2349 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2350 b->dst[0] + 4 * ls_y + 4, ls_y,
2351 ref2->data[0], ref2->linesize[0],
2352 (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
2356 int bwl = bwlog_tab[0][b->bs];
2357 int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2359 mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], b->dst[0], ls_y,
2360 ref1->data[0], ref1->linesize[0],
2361 row << 3, col << 3, &b->mv[0][0],bw, bh, w, h);
2364 mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], b->dst[0], ls_y,
2365 ref2->data[0], ref2->linesize[0],
2366 row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
2371 int bwl = bwlog_tab[1][b->bs];
2372 int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2377 if (b->bs > BS_8x8) {
2378 mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2379 mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2384 mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2385 b->dst[1], b->dst[2], ls_uv,
2386 ref1->data[1], ref1->linesize[1],
2387 ref1->data[2], ref1->linesize[2],
2388 row << 2, col << 2, &mvuv, bw, bh, w, h);
2391 if (b->bs > BS_8x8) {
2392 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2393 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2397 mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2398 b->dst[1], b->dst[2], ls_uv,
2399 ref2->data[1], ref2->linesize[1],
2400 ref2->data[2], ref2->linesize[2],
2401 row << 2, col << 2, &mvuv, bw, bh, w, h);
2406 /* mostly copied intra_recon() */
2408 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2409 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2410 int end_x = FFMIN(2 * (s->cols - col), w4);
2411 int end_y = FFMIN(2 * (s->rows - row), h4);
2412 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2413 int uvstep1d = 1 << b->uvtx, p;
2414 uint8_t *dst = b->dst[0];
2417 for (n = 0, y = 0; y < end_y; y += step1d) {
2419 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2420 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2423 s->dsp.itxfm_add[tx][DCT_DCT](ptr, b->y_stride,
2424 s->block + 16 * n, eob);
2426 dst += 4 * b->y_stride * step1d;
2434 step = 1 << (b->uvtx * 2);
2435 for (p = 0; p < 2; p++) {
2436 dst = b->dst[p + 1];
2437 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2439 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2440 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2443 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
2444 s->uvblock[p] + 16 * n, eob);
2446 dst += 4 * uvstep1d * b->uv_stride;
2452 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2453 int row_and_7, int col_and_7,
2454 int w, int h, int col_end, int row_end,
2455 enum TxfmMode tx, int skip_inter)
2457 // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2458 // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2459 // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2460 // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2462 // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2463 // edges. This means that for UV, we work on two subsampled blocks at
2464 // a time, and we only use the topleft block's mode information to set
2465 // things like block strength. Thus, for any block size smaller than
2466 // 16x16, ignore the odd portion of the block.
2467 if (tx == TX_4X4 && is_uv) {
2482 if (tx == TX_4X4 && !skip_inter) {
2483 int t = 1 << col_and_7, m_col = (t << w) - t, y;
2484 int m_col_odd = (t << (w - 1)) - t;
2486 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2488 int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2490 for (y = row_and_7; y < h + row_and_7; y++) {
2491 int col_mask_id = 2 - !(y & 7);
2493 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2494 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2495 // for odd lines, if the odd col is not being filtered,
2496 // skip odd row also:
2503 // if a/c are even row/col and b/d are odd, and d is skipped,
2504 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2505 if ((col_end & 1) && (y & 1)) {
2506 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2508 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2512 int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2514 for (y = row_and_7; y < h + row_and_7; y++) {
2515 int col_mask_id = 2 - !(y & 3);
2517 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2518 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2519 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2520 lflvl->mask[is_uv][0][y][3] |= m_col;
2521 lflvl->mask[is_uv][1][y][3] |= m_col;
2525 int y, t = 1 << col_and_7, m_col = (t << w) - t;
2528 int mask_id = (tx == TX_8X8);
2529 int l2 = tx + is_uv - 1, step1d = 1 << l2;
2530 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2531 int m_row = m_col & masks[l2];
2533 // at odd UV col/row edges of tx16/tx32 blocks, force the 8-wide
2534 // loopfilter to prevent going off the visible edge.
2535 if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2536 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2537 int m_row_8 = m_row - m_row_16;
2539 for (y = row_and_7; y < h + row_and_7; y++) {
2540 lflvl->mask[is_uv][0][y][0] |= m_row_16;
2541 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2544 for (y = row_and_7; y < h + row_and_7; y++)
2545 lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2548 if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2549 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2550 lflvl->mask[is_uv][1][y][0] |= m_col;
2551 if (y - row_and_7 == h - 1)
2552 lflvl->mask[is_uv][1][y][1] |= m_col;
2554 for (y = row_and_7; y < h + row_and_7; y += step1d)
2555 lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2557 } else if (tx != TX_4X4) {
2560 mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2561 lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2562 mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2563 for (y = row_and_7; y < h + row_and_7; y++)
2564 lflvl->mask[is_uv][0][y][mask_id] |= t;
2566 int t8 = t & 0x01, t4 = t - t8;
2568 for (y = row_and_7; y < h + row_and_7; y++) {
2569 lflvl->mask[is_uv][0][y][2] |= t4;
2570 lflvl->mask[is_uv][0][y][1] |= t8;
2572 lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2574 int t8 = t & 0x11, t4 = t - t8;
2576 for (y = row_and_7; y < h + row_and_7; y++) {
2577 lflvl->mask[is_uv][0][y][2] |= t4;
2578 lflvl->mask[is_uv][0][y][1] |= t8;
2580 lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
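/* Worked example (not decoder code) of the bitmask construction used in
 * mask_edges(): each bit of a mask byte stands for one 8-pixel column inside
 * the 64x64 superblock, so with col_and_7 = 2 and w = 4 (a 32-pixel-wide
 * block starting at the third 8-pixel column):
 *
 *     t     = 1 << col_and_7 = 0x04
 *     m_col = (t << w) - t   = 0x40 - 0x04 = 0x3c   (bits 2..5 set)
 *
 * i.e. the block covers columns 2, 3, 4 and 5 of the superblock, and those
 * bits are OR-ed into lflvl->mask[][][y][] for every 8-pixel row y that the
 * block spans. */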
2585 static int decode_b(AVCodecContext *ctx, int row, int col,
2586 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2587 enum BlockLevel bl, enum BlockPartition bp)
2589 VP9Context *s = ctx->priv_data;
2590 VP9Block *const b = &s->b;
2591 enum BlockSize bs = bl * 3 + bp;
2592 int res, y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2599 s->min_mv.x = -(128 + col * 64);
2600 s->min_mv.y = -(128 + row * 64);
2601 s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2602 s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2605 b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2608 if ((res = decode_coeffs(ctx)) < 0)
2613 memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
2614 memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
2615 for (pl = 0; pl < 2; pl++) {
2616 memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
2617 memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
2621 // emulate overhangs if the stride of the target buffer can't hold them. This
2622 // allows us to support emu-edge and so on even with large block overhangs
2624 emu[0] = (col + w4) * 8 > s->f->linesize[0] ||
2625 (row + h4) > s->rows + 2 * !(ctx->flags & CODEC_FLAG_EMU_EDGE);
2626 emu[1] = (col + w4) * 4 > s->f->linesize[1] ||
2627 (row + h4) > s->rows + 2 * !(ctx->flags & CODEC_FLAG_EMU_EDGE);
2629 b->dst[0] = s->tmp_y;
2632 b->dst[0] = s->f->data[0] + yoff;
2633 b->y_stride = s->f->linesize[0];
2636 b->dst[1] = s->tmp_uv[0];
2637 b->dst[2] = s->tmp_uv[1];
2640 b->dst[1] = s->f->data[1] + uvoff;
2641 b->dst[2] = s->f->data[2] + uvoff;
2642 b->uv_stride = s->f->linesize[1];
2645 intra_recon(ctx, yoff, uvoff);
2650 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
2652 for (n = 0; o < w; n++) {
2657 s->dsp.mc[n][0][0][0][0](s->f->data[0] + yoff + o, s->f->linesize[0],
2658 s->tmp_y + o, 64, h, 0, 0);
2664 int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
2666 for (n = 1; o < w; n++) {
2671 s->dsp.mc[n][0][0][0][0](s->f->data[1] + uvoff + o, s->f->linesize[1],
2672 s->tmp_uv[0] + o, 32, h, 0, 0);
2673 s->dsp.mc[n][0][0][0][0](s->f->data[2] + uvoff + o, s->f->linesize[2],
2674 s->tmp_uv[1] + o, 32, h, 0, 0);
2680 // pick filter level and find edges to apply filter to
2681 if (s->filter.level &&
2682 (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
2683 [b->mode[3] != ZEROMV]) > 0) {
2684 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
2685 int skip_inter = !b->intra && b->skip;
2687 for (y = 0; y < h4; y++)
2688 memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
2689 mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
2690 mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
2691 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
2692 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
2693 b->uvtx, skip_inter);
2695 if (!s->filter.lim_lut[lvl]) {
2696 int sharp = s->filter.sharpness;
2700 limit >>= (sharp + 3) >> 2;
2701 limit = FFMIN(limit, 9 - sharp);
2703 limit = FFMAX(limit, 1);
2705 s->filter.lim_lut[lvl] = limit;
2706 s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
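/* Worked example (not decoder code) for the loop-filter limit LUT filled in
 * above, assuming limit starts at lvl, with lvl = 32 and
 * s->filter.sharpness = 4 (so the sharpness branch applies):
 *
 *     limit = 32 >> ((4 + 3) >> 2) = 32 >> 1 = 16
 *     limit = FFMIN(16, 9 - 4)     = 5
 *     limit = FFMAX(5, 1)          = 5
 *     lim_lut[32]   = 5
 *     mblim_lut[32] = 2 * (32 + 2) + 5 = 73
 *
 * The LUT entries are filled lazily, the first time a given filter level is
 * actually used. */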
2713 static int decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
2714 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
2716 VP9Context *s = ctx->priv_data;
2717 int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
2718 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1), res;
2719 const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
2720 s->prob.p.partition[bl][c];
2721 enum BlockPartition bp;
2722 ptrdiff_t hbs = 4 >> bl;
2725 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2726 res = decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2727 } else if (col + hbs < s->cols) {
2728 if (row + hbs < s->rows) {
2729 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
2731 case PARTITION_NONE:
2732 res = decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2735 if (!(res = decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp))) {
2736 yoff += hbs * 8 * s->f->linesize[0];
2737 uvoff += hbs * 4 * s->f->linesize[1];
2738 res = decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
2742 if (!(res = decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp))) {
2745 res = decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
2748 case PARTITION_SPLIT:
2749 if (!(res = decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1))) {
2750 if (!(res = decode_sb(ctx, row, col + hbs, lflvl,
2751 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1))) {
2752 yoff += hbs * 8 * s->f->linesize[0];
2753 uvoff += hbs * 4 * s->f->linesize[1];
2754 if (!(res = decode_sb(ctx, row + hbs, col, lflvl,
2755 yoff, uvoff, bl + 1)))
2756 res = decode_sb(ctx, row + hbs, col + hbs, lflvl,
2757 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2762 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
2763 bp = PARTITION_SPLIT;
2764 if (!(res = decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1)))
2765 res = decode_sb(ctx, row, col + hbs, lflvl,
2766 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
2769 res = decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2771 } else if (row + hbs < s->rows) {
2772 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
2773 bp = PARTITION_SPLIT;
2774 if (!(res = decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1))) {
2775 yoff += hbs * 8 * s->f->linesize[0];
2776 uvoff += hbs * 4 * s->f->linesize[1];
2777 res = decode_sb(ctx, row + hbs, col, lflvl,
2778 yoff, uvoff, bl + 1);
2782 res = decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
2785 bp = PARTITION_SPLIT;
2786 res = decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
2788 s->counts.partition[bl][c][bp]++;
2793 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
2794 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
2796 VP9Context *s = ctx->priv_data;
2797 uint8_t *dst = s->f->data[0] + yoff, *lvl = lflvl->level;
2798 ptrdiff_t ls_y = s->f->linesize[0], ls_uv = s->f->linesize[1];
2801 // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
2802 // if you think of them as acting on a 8x8 block max, we can interleave
2803 // each v/h within the single x loop, but that only works if we work on
2804 // 8 pixel blocks, and we won't always do that (we want at least 16px
2805 // to use SSE2 optimizations, perhaps 32 for AVX2)
2807 // filter edges between columns, Y plane (e.g. block1 | block2)
2808 for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
2809 uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
2810 uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
2811 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
2812 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
2813 unsigned hm = hm1 | hm2 | hm13 | hm23;
2815 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
2817 int L = *l, H = L >> 4;
2818 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2821 if (hmask1[0] & x) {
2822 if (hmask2[0] & x) {
2823 av_assert2(l[8] == L);
2824 s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
2826 s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
2828 } else if (hm2 & x) {
2831 E |= s->filter.mblim_lut[L] << 8;
2832 I |= s->filter.lim_lut[L] << 8;
2833 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
2835 [0](ptr, ls_y, E, I, H);
2837 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
2838 [0](ptr, ls_y, E, I, H);
2841 } else if (hm2 & x) {
2842 int L = l[8], H = L >> 4;
2843 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2846 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
2847 [0](ptr + 8 * ls_y, ls_y, E, I, H);
2851 int L = *l, H = L >> 4;
2852 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2857 E |= s->filter.mblim_lut[L] << 8;
2858 I |= s->filter.lim_lut[L] << 8;
2859 s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
2861 s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
2863 } else if (hm23 & x) {
2864 int L = l[8], H = L >> 4;
2865 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2867 s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
2873 // filter edges between rows, Y plane (e.g. ------)
2875 dst = s->f->data[0] + yoff;
2877 for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
2878 uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
2879 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
2881 for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
2884 int L = *l, H = L >> 4;
2885 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2888 if (vmask[0] & (x << 1)) {
2889 av_assert2(l[1] == L);
2890 s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
2892 s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
2894 } else if (vm & (x << 1)) {
2897 E |= s->filter.mblim_lut[L] << 8;
2898 I |= s->filter.lim_lut[L] << 8;
2899 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
2900 [!!(vmask[1] & (x << 1))]
2901 [1](ptr, ls_y, E, I, H);
2903 s->dsp.loop_filter_8[!!(vmask[1] & x)]
2904 [1](ptr, ls_y, E, I, H);
2906 } else if (vm & (x << 1)) {
2907 int L = l[1], H = L >> 4;
2908 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2910 s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
2911 [1](ptr + 8, ls_y, E, I, H);
2915 int L = *l, H = L >> 4;
2916 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2918 if (vm3 & (x << 1)) {
2921 E |= s->filter.mblim_lut[L] << 8;
2922 I |= s->filter.lim_lut[L] << 8;
2923 s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
2925 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
2927 } else if (vm3 & (x << 1)) {
2928 int L = l[1], H = L >> 4;
2929 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2931 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
2936 // same principle but for U/V planes
2937 for (p = 0; p < 2; p++) {
2939 dst = s->f->data[1 + p] + uvoff;
2940 for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
2941 uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
2942 uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
2943 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
2944 unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
2946 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
2949 int L = *l, H = L >> 4;
2950 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2952 if (hmask1[0] & x) {
2953 if (hmask2[0] & x) {
2954 av_assert2(l[16] == L);
2955 s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
2957 s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
2959 } else if (hm2 & x) {
2962 E |= s->filter.mblim_lut[L] << 8;
2963 I |= s->filter.lim_lut[L] << 8;
2964 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
2966 [0](ptr, ls_uv, E, I, H);
2968 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
2969 [0](ptr, ls_uv, E, I, H);
2971 } else if (hm2 & x) {
2972 int L = l[16], H = L >> 4;
2973 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2975 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
2976 [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
2984 dst = s->f->data[1 + p] + uvoff;
2985 for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
2986 uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
2987 unsigned vm = vmask[0] | vmask[1] | vmask[2];
2989 for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
2992 int L = *l, H = L >> 4;
2993 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
2996 if (vmask[0] & (x << 2)) {
2997 av_assert2(l[2] == L);
2998 s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3000 s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3002 } else if (vm & (x << 2)) {
3005 E |= s->filter.mblim_lut[L] << 8;
3006 I |= s->filter.lim_lut[L] << 8;
3007 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3008 [!!(vmask[1] & (x << 2))]
3009 [1](ptr, ls_uv, E, I, H);
3011 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3012 [1](ptr, ls_uv, E, I, H);
3014 } else if (vm & (x << 2)) {
3015 int L = l[2], H = L >> 4;
3016 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3018 s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3019 [1](ptr + 8, ls_uv, E, I, H);
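/* Illustrative note (not decoder code): in the Y horizontal pass of
 * loopfilter_sb(), each bit of the hmask bytes corresponds to one 8-pixel
 * column, so the walk
 *
 *     for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++)
 *
 * advances one column per bit and stops once no higher bit remains set. For
 * example, hm = 0x0b (bits 0, 1 and 3) filters the column edges at pixel
 * offsets 0, 8 and 24 within the superblock; offset 16 is visited but left
 * untouched, and after bit 3 the condition hm & ~(0x10 - 1) == 0 ends the
 * scan. */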
3029 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3031 int sb_start = ( idx * n) >> log2_n;
3032 int sb_end = ((idx + 1) * n) >> log2_n;
3033 *start = FFMIN(sb_start, n) << 3;
3034 *end = FFMIN(sb_end, n) << 3;
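/* Worked example (not decoder code) for set_tile_offset(): with log2_n = 1
 * (two tiles) and n = 11 superblocks, the tile boundaries in 8-pixel block
 * units come out as
 *
 *     idx = 0:  sb_start = (0 * 11) >> 1 = 0   ->  *start = 0  << 3 = 0
 *               sb_end   = (1 * 11) >> 1 = 5   ->  *end   = 5  << 3 = 40
 *     idx = 1:  sb_start = (1 * 11) >> 1 = 5   ->  *start = 5  << 3 = 40
 *               sb_end   = (2 * 11) >> 1 = 11  ->  *end   = 11 << 3 = 88
 *
 * so the 11 superblocks are split 5/6 between the two tiles with no gap and
 * no overlap. */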
3037 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3038 int max_count, int update_factor)
3040 unsigned ct = ct0 + ct1, p2, p1;
3046 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3047 p2 = av_clip(p2, 1, 255);
3048 ct = FFMIN(ct, max_count);
3049 update_factor = FASTDIV(update_factor * ct, max_count);
3051 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3052 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
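/* Worked example (not decoder code) of adapt_prob(): assume the stored
 * probability is *p = 160 and the frame counted ct0 = 30 and ct1 = 10
 * events, with max_count = 20 and update_factor = 128 (the common case):
 *
 *     ct            = 40
 *     p2            = ((30 << 8) + 20) / 40 = 192   (empirical probability)
 *     ct            = FFMIN(40, 20)         = 20
 *     update_factor = 128 * 20 / 20         = 128
 *     *p            = 160 + (((192 - 160) * 128 + 128) >> 8) = 160 + 16 = 176
 *
 * i.e. the stored probability moves halfway towards the per-frame estimate,
 * with the step scaled down whenever fewer than max_count symbols were
 * counted. */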
3055 static void adapt_probs(VP9Context *s)
3058 prob_context *p = &s->prob_ctx[s->framectxid].p;
3059 int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3062 for (i = 0; i < 4; i++)
3063 for (j = 0; j < 2; j++)
3064 for (k = 0; k < 2; k++)
3065 for (l = 0; l < 6; l++)
3066 for (m = 0; m < 6; m++) {
3067 uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3068 unsigned *e = s->counts.eob[i][j][k][l][m];
3069 unsigned *c = s->counts.coef[i][j][k][l][m];
3071 if (l == 0 && m >= 3) // dc only has 3 pt
3074 adapt_prob(&pp[0], e[0], e[1], 24, uf);
3075 adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3076 adapt_prob(&pp[2], c[1], c[2], 24, uf);
3079 if (s->keyframe || s->intraonly) {
3080 memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3081 memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3082 memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3083 memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3088 for (i = 0; i < 3; i++)
3089 adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3092 for (i = 0; i < 4; i++)
3093 adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3096 if (s->comppredmode == PRED_SWITCHABLE) {
3097 for (i = 0; i < 5; i++)
3098 adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3102 if (s->comppredmode != PRED_SINGLEREF) {
3103 for (i = 0; i < 5; i++)
3104 adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3105 s->counts.comp_ref[i][1], 20, 128);
3108 if (s->comppredmode != PRED_COMPREF) {
3109 for (i = 0; i < 5; i++) {
3110 uint8_t *pp = p->single_ref[i];
3111 unsigned (*c)[2] = s->counts.single_ref[i];
3113 adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3114 adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3118 // block partitioning
3119 for (i = 0; i < 4; i++)
3120 for (j = 0; j < 4; j++) {
3121 uint8_t *pp = p->partition[i][j];
3122 unsigned *c = s->counts.partition[i][j];
3124 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3125 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3126 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3130 if (s->txfmmode == TX_SWITCHABLE) {
3131 for (i = 0; i < 2; i++) {
3132 unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3134 adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3135 adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3136 adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3137 adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3138 adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3139 adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3143 // interpolation filter
3144 if (s->filtermode == FILTER_SWITCHABLE) {
3145 for (i = 0; i < 4; i++) {
3146 uint8_t *pp = p->filter[i];
3147 unsigned *c = s->counts.filter[i];
3149 adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3150 adapt_prob(&pp[1], c[1], c[2], 20, 128);
3155 for (i = 0; i < 7; i++) {
3156 uint8_t *pp = p->mv_mode[i];
3157 unsigned *c = s->counts.mv_mode[i];
3159 adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3160 adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3161 adapt_prob(&pp[2], c[1], c[3], 20, 128);
3166 uint8_t *pp = p->mv_joint;
3167 unsigned *c = s->counts.mv_joint;
3169 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3170 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3171 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3175 for (i = 0; i < 2; i++) {
3177 unsigned *c, (*c2)[2], sum;
3179 adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3180 s->counts.mv_comp[i].sign[1], 20, 128);
3182 pp = p->mv_comp[i].classes;
3183 c = s->counts.mv_comp[i].classes;
3184 sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3185 adapt_prob(&pp[0], c[0], sum, 20, 128);
3187 adapt_prob(&pp[1], c[1], sum, 20, 128);
3189 adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3190 adapt_prob(&pp[3], c[2], c[3], 20, 128);
3192 adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3193 adapt_prob(&pp[5], c[4], c[5], 20, 128);
3195 adapt_prob(&pp[6], c[6], sum, 20, 128);
3196 adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3197 adapt_prob(&pp[8], c[7], c[8], 20, 128);
3198 adapt_prob(&pp[9], c[9], c[10], 20, 128);
3200 adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3201 s->counts.mv_comp[i].class0[1], 20, 128);
3202 pp = p->mv_comp[i].bits;
3203 c2 = s->counts.mv_comp[i].bits;
3204 for (j = 0; j < 10; j++)
3205 adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3207 for (j = 0; j < 2; j++) {
3208 pp = p->mv_comp[i].class0_fp[j];
3209 c = s->counts.mv_comp[i].class0_fp[j];
3210 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3211 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3212 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3214 pp = p->mv_comp[i].fp;
3215 c = s->counts.mv_comp[i].fp;
3216 adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3217 adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3218 adapt_prob(&pp[2], c[2], c[3], 20, 128);
3220 if (s->highprecisionmvs) {
3221 adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3222 s->counts.mv_comp[i].class0_hp[1], 20, 128);
3223 adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3224 s->counts.mv_comp[i].hp[1], 20, 128);
3229 for (i = 0; i < 4; i++) {
3230 uint8_t *pp = p->y_mode[i];
3231 unsigned *c = s->counts.y_mode[i], sum, s2;
3233 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3234 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3235 sum -= c[TM_VP8_PRED];
3236 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3237 sum -= c[VERT_PRED];
3238 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3239 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3241 adapt_prob(&pp[3], s2, sum, 20, 128);
3243 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3244 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3245 sum -= c[DIAG_DOWN_LEFT_PRED];
3246 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3247 sum -= c[VERT_LEFT_PRED];
3248 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3249 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3253 for (i = 0; i < 10; i++) {
3254 uint8_t *pp = p->uv_mode[i];
3255 unsigned *c = s->counts.uv_mode[i], sum, s2;
3257 sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3258 adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3259 sum -= c[TM_VP8_PRED];
3260 adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3261 sum -= c[VERT_PRED];
3262 adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3263 s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3265 adapt_prob(&pp[3], s2, sum, 20, 128);
3267 adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3268 adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3269 sum -= c[DIAG_DOWN_LEFT_PRED];
3270 adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3271 sum -= c[VERT_LEFT_PRED];
3272 adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3273 adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3277 static int vp9_decode_frame(AVCodecContext *ctx, void *out_pic,
3278 int *got_frame, const uint8_t *data, int size)
3280 VP9Context *s = ctx->priv_data;
3281 int res, tile_row, tile_col, i, ref, row, col;
3282 ptrdiff_t yoff = 0, uvoff = 0;
3283 //AVFrame *prev_frame = s->f; // for segmentation map
3285 if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3287 } else if (res == 0) {
3288 if (!s->refs[ref]) {
3289 av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3290 return AVERROR_INVALIDDATA;
3292 if ((res = av_frame_ref(out_pic, s->refs[ref])) < 0)
3300 // discard old references
3301 for (i = 0; i < 10; i++) {
3302 AVFrame *f = s->fb[i];
3303 if (f->data[0] && f != s->f &&
3304 f != s->refs[0] && f != s->refs[1] &&
3305 f != s->refs[2] && f != s->refs[3] &&
3306 f != s->refs[4] && f != s->refs[5] &&
3307 f != s->refs[6] && f != s->refs[7])
3311 // find unused reference
3312 for (i = 0; i < 10; i++)
3313 if (!s->fb[i]->data[0])
3316 if ((res = ff_get_buffer(ctx, s->f,
3317 s->refreshrefmask ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
3319 s->f->key_frame = s->keyframe;
3320 s->f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3322 // main tile decode loop
3323 memset(s->above_partition_ctx, 0, s->cols);
3324 memset(s->above_skip_ctx, 0, s->cols);
3325 if (s->keyframe || s->intraonly) {
3326 memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3328 memset(s->above_mode_ctx, NEARESTMV, s->cols);
3330 memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3331 memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3332 memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3333 memset(s->above_segpred_ctx, 0, s->cols);
3334 for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3335 set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3336 tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3337 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3340 if (tile_col == s->tiling.tile_cols - 1 &&
3341 tile_row == s->tiling.tile_rows - 1) {
3344 tile_size = AV_RB32(data);
3348 if (tile_size > size)
3349 return AVERROR_INVALIDDATA;
3350 ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3351 if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) // marker bit
3352 return AVERROR_INVALIDDATA;
3357 for (row = s->tiling.tile_row_start;
3358 row < s->tiling.tile_row_end;
3359 row += 8, yoff += s->f->linesize[0] * 64,
3360 uvoff += s->f->linesize[1] * 32) {
3361 struct VP9Filter *lflvl_ptr = s->lflvl;
3362 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3364 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3365 set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3366 tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3368 memset(s->left_partition_ctx, 0, 8);
3369 memset(s->left_skip_ctx, 0, 8);
3370 if (s->keyframe || s->intraonly) {
3371 memset(s->left_mode_ctx, DC_PRED, 16);
3373 memset(s->left_mode_ctx, NEARESTMV, 8);
3375 memset(s->left_y_nnz_ctx, 0, 16);
3376 memset(s->left_uv_nnz_ctx, 0, 16);
3377 memset(s->left_segpred_ctx, 0, 8);
3379 memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3380 for (col = s->tiling.tile_col_start;
3381 col < s->tiling.tile_col_end;
3382 col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3383 // FIXME integrate with lf code (i.e. zero after each
3384 // use, similar to invtxfm coefficients, or similar)
3385 memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3387 if ((res = decode_sb(ctx, row, col, lflvl_ptr,
3388 yoff2, uvoff2, BL_64X64)) < 0)
3391 memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3394 // backup pre-loopfilter reconstruction data for intra
3395 // prediction of next row of sb64s
3396 if (row + 8 < s->rows) {
3397 memcpy(s->intra_pred_data[0],
3398 s->f->data[0] + yoff + 63 * s->f->linesize[0],
3400 memcpy(s->intra_pred_data[1],
3401 s->f->data[1] + uvoff + 31 * s->f->linesize[1],
3403 memcpy(s->intra_pred_data[2],
3404 s->f->data[2] + uvoff + 31 * s->f->linesize[2],
3408 // loopfilter one row
3409 if (s->filter.level) {
3412 lflvl_ptr = s->lflvl;
3413 for (col = 0; col < s->cols;
3414 col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3415 loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3421 // backward adaptivity (or, in parallel decoding mode, forward adaptivity:
3422 // probability maintenance between frames)
3423 if (s->refreshctx) {
3424 if (s->parallelmode) {
3427 for (i = 0; i < 4; i++)
3428 for (j = 0; j < 2; j++)
3429 for (k = 0; k < 2; k++)
3430 for (l = 0; l < 6; l++)
3431 for (m = 0; m < 6; m++)
3432 memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3433 s->prob.coef[i][j][k][l][m], 3);
3434 s->prob_ctx[s->framectxid].p = s->prob.p;
3439 FFSWAP(struct VP9mvrefPair *, s->mv[0], s->mv[1]);
3442 for (i = 0; i < 8; i++)
3443 if (s->refreshrefmask & (1 << i))
3446 if (!s->invisible) {
3447 if ((res = av_frame_ref(out_pic, s->f)) < 0)
3455 static int vp9_decode_packet(AVCodecContext *avctx, void *out_pic,
3456 int *got_frame, AVPacket *avpkt)
3458 const uint8_t *data = avpkt->data;
3459 int size = avpkt->size, marker, res;
3461 // read superframe index - this is a collection of individual frames that
3462 // together lead to one visible frame
3463 av_assert1(size > 0); // without CODEC_CAP_DELAY, this is implied
3464 marker = data[size - 1];
3465 if ((marker & 0xe0) == 0xc0) {
3466 int nbytes = 1 + ((marker >> 3) & 0x3);
3467 int n_frames = 1 + (marker & 0x7), idx_sz = 2 + n_frames * nbytes;
3469 if (size >= idx_sz && data[size - idx_sz] == marker) {
3470 const uint8_t *idx = data + size + 1 - idx_sz;
3472 #define case_n(a, rd) \
3474 while (n_frames--) { \
3478 av_log(avctx, AV_LOG_ERROR, \
3479 "Superframe packet size too big: %d > %d\n", \
3481 return AVERROR_INVALIDDATA; \
3483 res = vp9_decode_frame(avctx, out_pic, got_frame, \
3492 case_n(2, AV_RL16(idx));
3493 case_n(3, AV_RL24(idx));
3494 case_n(4, AV_RL32(idx));
3499 // if we get here, there was no valid superframe index, i.e. this is just
3500 // one whole single frame - decode it as such from the complete input buf
3501 if ((res = vp9_decode_frame(avctx, out_pic, got_frame, data, size)) < 0)
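/* Illustrative sketch (assumptions, not decoder code): the superframe index
 * handled above sits at the end of the packet and is framed by a marker byte
 * repeated at both ends of the index. A minimal stand-alone check of just
 * the index header, using hypothetical names, could look like:
 *
 *     static int parse_superframe_header(const uint8_t *data, int size,
 *                                        int *n_frames, int *nbytes)
 *     {
 *         int marker, idx_sz;
 *         if (size < 1)
 *             return 0;
 *         marker = data[size - 1];
 *         if ((marker & 0xe0) != 0xc0)
 *             return 0;                          // no superframe index
 *         *nbytes   = 1 + ((marker >> 3) & 0x3); // bytes per size field
 *         *n_frames = 1 + (marker & 0x7);        // number of sub-frames
 *         idx_sz    = 2 + *n_frames * *nbytes;   // marker + sizes + marker
 *         return size >= idx_sz && data[size - idx_sz] == marker;
 *     }
 *
 * The per-frame sizes themselves are stored little-endian in nbytes-wide
 * fields, which is what the case_n() macro above reads via AV_RL16/24/32. */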
3506 static void vp9_decode_flush(AVCodecContext *ctx)
3508 VP9Context *s = ctx->priv_data;
3511 for (i = 0; i < 10; i++)
3512 if (s->fb[i]->data[0])
3513 av_frame_unref(s->fb[i]);
3514 for (i = 0; i < 8; i++)
3519 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3521 VP9Context *s = ctx->priv_data;
3524 ctx->pix_fmt = AV_PIX_FMT_YUV420P;
3525 ff_vp9dsp_init(&s->dsp);
3526 ff_videodsp_init(&s->vdsp, 8);
3527 for (i = 0; i < 10; i++) {
3528 s->fb[i] = av_frame_alloc();
3530 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3531 return AVERROR(ENOMEM);
3534 s->filter.sharpness = -1;
3539 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3541 VP9Context *s = ctx->priv_data;
3544 for (i = 0; i < 10; i++) {
3545 if (s->fb[i]->data[0])
3546 av_frame_unref(s->fb[i]);
3547 av_frame_free(&s->fb[i]);
3549 av_freep(&s->above_partition_ctx);
3550 s->above_skip_ctx = s->above_txfm_ctx = s->above_mode_ctx = NULL;
3551 s->above_y_nnz_ctx = s->above_uv_nnz_ctx[0] = s->above_uv_nnz_ctx[1] = NULL;
3552 s->intra_pred_data[0] = s->intra_pred_data[1] = s->intra_pred_data[2] = NULL;
3553 s->above_segpred_ctx = s->above_intra_ctx = s->above_comp_ctx = NULL;
3554 s->above_ref_ctx = s->above_filter_ctx = NULL;
3555 s->above_mv_ctx = NULL;
3556 s->segmentation_map = NULL;
3557 s->mv[0] = s->mv[1] = NULL;
3565 AVCodec ff_vp9_decoder = {
3567 .type = AVMEDIA_TYPE_VIDEO,
3568 .id = AV_CODEC_ID_VP9,
3569 .priv_data_size = sizeof(VP9Context),
3570 .init = vp9_decode_init,
3571 .close = vp9_decode_free,
3572 .decode = vp9_decode_packet,
3573 .capabilities = CODEC_CAP_DR1,
3574 .flush = vp9_decode_flush,
3575 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),