git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34
  /* 24-bit sync code; decode_frame_header() rejects keyframe and intra-only
   * headers whose sync field differs from this value. */
  #define VP9_SYNCCODE 0x498342
  36
  /* Compound-prediction mode stored in VP9Context.comppredmode.
   * The numeric values are the implicit ones of the original declaration. */
  enum CompPredMode {
      PRED_SINGLEREF  = 0,
      PRED_COMPREF    = 1,
      PRED_SWITCHABLE = 2,
  };
  42
  /* Block levels, listed from the largest square (64x64) down to the
   * smallest (8x8). Values count up from zero in that order. */
  enum BlockLevel {
      BL_64X64 = 0,
      BL_32X32 = 1,
      BL_16X16 = 2,
      BL_8X8   = 3,
  };
  49
  /* Block sizes, named BS_<width>x<height>, ordered from 64x64 down to 4x4.
   * The values are used as array indices (see bwh_tab), so their order must
   * not change; N_BS_SIZES is the number of real entries. */
  enum BlockSize {
      BS_64x64   = 0,
      BS_64x32   = 1,
      BS_32x64   = 2,
      BS_32x32   = 3,
      BS_32x16   = 4,
      BS_16x32   = 5,
      BS_16x16   = 6,
      BS_16x8    = 7,
      BS_8x16    = 8,
      BS_8x8     = 9,
      BS_8x4     = 10,
      BS_4x8     = 11,
      BS_4x4     = 12,
      N_BS_SIZES = 13,
  };
  66
  67 struct VP9mvrefPair {
  68     VP56mv mv[2];
  69     int8_t ref[2];
  70 };
  71
  72 typedef struct VP9Frame {
  73     ThreadFrame tf;
  74     AVBufferRef *extradata;
  75     uint8_t *segmentation_map;
  76     struct VP9mvrefPair *mv;
  77 } VP9Frame;
  78
  /* Loop-filter description; update_size() allocates one of these per
   * superblock column (s->lflvl). */
  struct VP9Filter {
      /* 8x8 grid of filter levels */
      uint8_t level[8 * 8];
      /* Edge masks; each stored byte is a bitmask with one bit per column. */
      uint8_t mask[2 /* 0=y, 1=uv */]
                  [2 /* 0=col, 1=row */]
                  [8 /* rows */]
                  [4 /* 0=16, 1=8, 2=4, 3=inner4 */];
  };
  84
  85 typedef struct VP9Block {
  86     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
  87     enum FilterMode filter;
  88     VP56mv mv[4 /* b_idx */][2 /* ref */];
  89     enum BlockSize bs;
  90     enum TxfmMode tx, uvtx;
  91     enum BlockLevel bl;
  92     enum BlockPartition bp;
  93 } VP9Block;
  94
  95 typedef struct VP9Context {
  96     VP9DSPContext dsp;
  97     VideoDSPContext vdsp;
  98     GetBitContext gb;
  99     VP56RangeCoder c;
 100     VP56RangeCoder *c_b;
 101     unsigned c_b_size;
 102     VP9Block *b_base, *b;
 103     int pass, uses_2pass, last_uses_2pass;
 104     int row, row7, col, col7;
 105     uint8_t *dst[3];
 106     ptrdiff_t y_stride, uv_stride;
 107
 108     // bitstream header
 109     uint8_t profile;
 110     uint8_t keyframe, last_keyframe;
 111     uint8_t invisible;
 112     uint8_t use_last_frame_mvs;
 113     uint8_t errorres;
 114     uint8_t colorspace;
 115     uint8_t fullrange;
 116     uint8_t intraonly;
 117     uint8_t resetctx;
 118     uint8_t refreshrefmask;
 119     uint8_t highprecisionmvs;
 120     enum FilterMode filtermode;
 121     uint8_t allowcompinter;
 122     uint8_t fixcompref;
 123     uint8_t refreshctx;
 124     uint8_t parallelmode;
 125     uint8_t framectxid;
 126     uint8_t refidx[3];
 127     uint8_t signbias[3];
 128     uint8_t varcompref[2];
 129     ThreadFrame refs[8], next_refs[8];
 130 #define CUR_FRAME 0
 131 #define LAST_FRAME 1
 132     VP9Frame frames[2];
 133
 134     struct {
 135         uint8_t level;
 136         int8_t sharpness;
 137         uint8_t lim_lut[64];
 138         uint8_t mblim_lut[64];
 139     } filter;
 140     struct {
 141         uint8_t enabled;
 142         int8_t mode[2];
 143         int8_t ref[4];
 144     } lf_delta;
 145     uint8_t yac_qi;
 146     int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
 147     uint8_t lossless;
 148     struct {
 149         uint8_t enabled;
 150         uint8_t temporal;
 151         uint8_t absolute_vals;
 152         uint8_t update_map;
 153         struct {
 154             uint8_t q_enabled;
 155             uint8_t lf_enabled;
 156             uint8_t ref_enabled;
 157             uint8_t skip_enabled;
 158             uint8_t ref_val;
 159             int16_t q_val;
 160             int8_t lf_val;
 161             int16_t qmul[2][2];
 162             uint8_t lflvl[4][2];
 163         } feat[8];
 164     } segmentation;
 165     struct {
 166         unsigned log2_tile_cols, log2_tile_rows;
 167         unsigned tile_cols, tile_rows;
 168         unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
 169     } tiling;
 170     unsigned sb_cols, sb_rows, rows, cols;
 171     struct {
 172         prob_context p;
 173         uint8_t coef[4][2][2][6][6][3];
 174     } prob_ctx[4];
 175     struct {
 176         prob_context p;
 177         uint8_t coef[4][2][2][6][6][11];
 178         uint8_t seg[7];
 179         uint8_t segpred[3];
 180     } prob;
 181     struct {
 182         unsigned y_mode[4][10];
 183         unsigned uv_mode[10][10];
 184         unsigned filter[4][3];
 185         unsigned mv_mode[7][4];
 186         unsigned intra[4][2];
 187         unsigned comp[5][2];
 188         unsigned single_ref[5][2][2];
 189         unsigned comp_ref[5][2];
 190         unsigned tx32p[2][4];
 191         unsigned tx16p[2][3];
 192         unsigned tx8p[2][2];
 193         unsigned skip[3][2];
 194         unsigned mv_joint[4];
 195         struct {
 196             unsigned sign[2];
 197             unsigned classes[11];
 198             unsigned class0[2];
 199             unsigned bits[10][2];
 200             unsigned class0_fp[2][4];
 201             unsigned fp[4];
 202             unsigned class0_hp[2];
 203             unsigned hp[2];
 204         } mv_comp[2];
 205         unsigned partition[4][4][4];
 206         unsigned coef[4][2][2][6][6][3];
 207         unsigned eob[4][2][2][6][6][2];
 208     } counts;
 209     enum TxfmMode txfmmode;
 210     enum CompPredMode comppredmode;
 211
 212     // contextual (left/above) cache
 213     DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
 214     DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
 215     DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
 216     DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
 217     DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
 218     DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
 219     DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
 220     DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
 221     DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
 222     DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
 223     DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
 224     DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
 225     uint8_t *above_partition_ctx;
 226     uint8_t *above_mode_ctx;
 227     // FIXME maybe merge some of the below in a flags field?
 228     uint8_t *above_y_nnz_ctx;
 229     uint8_t *above_uv_nnz_ctx[2];
 230     uint8_t *above_skip_ctx; // 1bit
 231     uint8_t *above_txfm_ctx; // 2bit
 232     uint8_t *above_segpred_ctx; // 1bit
 233     uint8_t *above_intra_ctx; // 1bit
 234     uint8_t *above_comp_ctx; // 1bit
 235     uint8_t *above_ref_ctx; // 2bit
 236     uint8_t *above_filter_ctx;
 237     VP56mv (*above_mv_ctx)[2];
 238
 239     // whole-frame cache
 240     uint8_t *intra_pred_data[3];
 241     struct VP9Filter *lflvl;
 242     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
 243
 244     // block reconstruction intermediates
 245     int block_alloc_using_2pass;
 246     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
 247     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
 248     struct { int x, y; } min_mv, max_mv;
 249     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
 250     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
 251 } VP9Context;
 252
 253 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
 254     {
 255         { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
 256         { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
 257     }, {
 258         { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
 259         { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
 260     }
 261 };
 262
 263 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 264 {
 265     VP9Context *s = ctx->priv_data;
 266     int ret, sz;
 267
 268     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 269         return ret;
 270     sz = 64 * s->sb_cols * s->sb_rows;
 271     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 272         ff_thread_release_buffer(ctx, &f->tf);
 273         return AVERROR(ENOMEM);
 274     }
 275
 276     f->segmentation_map = f->extradata->data;
 277     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 278
 279     // retain segmentation map if it doesn't update
 280     if (s->segmentation.enabled && !s->segmentation.update_map &&
 281         !s->intraonly && !s->keyframe) {
 282         memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
 283     }
 284
 285     return 0;
 286 }
 287
 288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 289 {
 290     ff_thread_release_buffer(ctx, &f->tf);
 291     av_buffer_unref(&f->extradata);
 292 }
 293
 294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 295 {
 296     int res;
 297
 298     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 299         return res;
 300     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 301         vp9_unref_frame(ctx, dst);
 302         return AVERROR(ENOMEM);
 303     }
 304
 305     dst->segmentation_map = src->segmentation_map;
 306     dst->mv = src->mv;
 307
 308     return 0;
 309 }
 310
 311 static int update_size(AVCodecContext *ctx, int w, int h)
 312 {
 313     VP9Context *s = ctx->priv_data;
 314     uint8_t *p;
 315
 316     av_assert0(w > 0 && h > 0);
 317
 318     if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
 319         return 0;
 320
 321     ctx->width  = w;
 322     ctx->height = h;
 323     s->sb_cols  = (w + 63) >> 6;
 324     s->sb_rows  = (h + 63) >> 6;
 325     s->cols     = (w + 7) >> 3;
 326     s->rows     = (h + 7) >> 3;
 327
 328 #define assign(var, type, n) var = (type) p; p += s->sb_cols * n * sizeof(*var)
 329     av_freep(&s->intra_pred_data[0]);
 330     p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 331     if (!p)
 332         return AVERROR(ENOMEM);
 333     assign(s->intra_pred_data[0],  uint8_t *,             64);
 334     assign(s->intra_pred_data[1],  uint8_t *,             32);
 335     assign(s->intra_pred_data[2],  uint8_t *,             32);
 336     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 337     assign(s->above_mode_ctx,      uint8_t *,             16);
 338     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 339     assign(s->above_partition_ctx, uint8_t *,              8);
 340     assign(s->above_skip_ctx,      uint8_t *,              8);
 341     assign(s->above_txfm_ctx,      uint8_t *,              8);
 342     assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
 343     assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
 344     assign(s->above_segpred_ctx,   uint8_t *,              8);
 345     assign(s->above_intra_ctx,     uint8_t *,              8);
 346     assign(s->above_comp_ctx,      uint8_t *,              8);
 347     assign(s->above_ref_ctx,       uint8_t *,              8);
 348     assign(s->above_filter_ctx,    uint8_t *,              8);
 349     assign(s->lflvl,               struct VP9Filter *,     1);
 350 #undef assign
 351
 352     // these will be re-allocated a little later
 353     av_freep(&s->b_base);
 354     av_freep(&s->block_base);
 355
 356     return 0;
 357 }
 358
 359 static int update_block_buffers(AVCodecContext *ctx)
 360 {
 361     VP9Context *s = ctx->priv_data;
 362
 363     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
 364         return 0;
 365
 366     av_free(s->b_base);
 367     av_free(s->block_base);
 368     if (s->uses_2pass) {
 369         int sbs = s->sb_cols * s->sb_rows;
 370
 371         s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
 372         s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
 373         if (!s->b_base || !s->block_base)
 374             return AVERROR(ENOMEM);
 375         s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
 376         s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
 377         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
 378         s->uveob_base[0] = s->eob_base + 256 * sbs;
 379         s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
 380     } else {
 381         s->b_base = av_malloc(sizeof(VP9Block));
 382         s->block_base = av_mallocz((64 * 64 + 128) * 3);
 383         if (!s->b_base || !s->block_base)
 384             return AVERROR(ENOMEM);
 385         s->uvblock_base[0] = s->block_base + 64 * 64;
 386         s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
 387         s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
 388         s->uveob_base[0] = s->eob_base + 256;
 389         s->uveob_base[1] = s->uveob_base[0] + 64;
 390     }
 391     s->block_alloc_using_2pass = s->uses_2pass;
 392
 393     return 0;
 394 }
 395
 396 // for some reason the sign bit is at the end, not the start, of a bit sequence
 397 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 398 {
 399     int v = get_bits(gb, n);
 400     return get_bits1(gb) ? -v : v;
 401 }
 402
 /**
  * Map a non-negative code v back to a value centred on m.
  *
  * Codes up to 2*m alternate around m: odd codes step below m, even codes
  * step above it (v = 0 gives m itself). Larger codes are returned as-is.
  *
  * @param v code to map
  * @param m centre value
  * @return the recentred value
  */
 static av_always_inline int inv_recenter_nonneg(int v, int m)
 {
     if (v > 2 * m)
         return v;
     if (v & 1)
         return m - ((v + 1) >> 1);
     return m + (v >> 1);
 }
 407
 408 // differential forward probability updates
 409 static int update_prob(VP56RangeCoder *c, int p)
 410 {
 411     static const int inv_map_table[254] = {
 412           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 413         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 414          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 415          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 416          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 417          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 418          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 419          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 420         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 421         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 422         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 423         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 424         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 425         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 426         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 427         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 428         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 429         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 430         252, 253,
 431     };
 432     int d;
 433
 434     /* This code is trying to do a differential probability update. For a
 435      * current probability A in the range [1, 255], the difference to a new
 436      * probability of any value can be expressed differentially as 1-A,255-A
 437      * where some part of this (absolute range) exists both in positive as
 438      * well as the negative part, whereas another part only exists in one
 439      * half. We're trying to code this shared part differentially, i.e.
 440      * times two where the value of the lowest bit specifies the sign, and
 441      * the single part is then coded on top of this. This absolute difference
 442      * then again has a value of [0,254], but a bigger value in this range
 443      * indicates that we're further away from the original value A, so we
 444      * can code this as a VLC code, since higher values are increasingly
 445      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 446      * updates vs. the 'fine, exact' updates further down the range, which
 447      * adds one extra dimension to this differential update model. */
 448
 449     if (!vp8_rac_get(c)) {
 450         d = vp8_rac_get_uint(c, 4) + 0;
 451     } else if (!vp8_rac_get(c)) {
 452         d = vp8_rac_get_uint(c, 4) + 16;
 453     } else if (!vp8_rac_get(c)) {
 454         d = vp8_rac_get_uint(c, 5) + 32;
 455     } else {
 456         d = vp8_rac_get_uint(c, 7);
 457         if (d >= 65)
 458             d = (d << 1) - 65 + vp8_rac_get(c);
 459         d += 64;
 460     }
 461
 462     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 463                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 464 }
 465
 466 static int decode_frame_header(AVCodecContext *ctx,
 467                                const uint8_t *data, int size, int *ref)
 468 {
 469     VP9Context *s = ctx->priv_data;
 470     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 471     int last_invisible;
 472     const uint8_t *data2;
 473
 474     /* general header */
 475     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 476         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 477         return res;
 478     }
 479     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 480         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 481         return AVERROR_INVALIDDATA;
 482     }
 483     s->profile = get_bits1(&s->gb);
 484     if (get_bits1(&s->gb)) { // reserved bit
 485         av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
 486         return AVERROR_INVALIDDATA;
 487     }
 488     if (get_bits1(&s->gb)) {
 489         *ref = get_bits(&s->gb, 3);
 490         return 0;
 491     }
 492     s->last_uses_2pass = s->uses_2pass;
 493     s->last_keyframe  = s->keyframe;
 494     s->keyframe       = !get_bits1(&s->gb);
 495     last_invisible    = s->invisible;
 496     s->invisible      = !get_bits1(&s->gb);
 497     s->errorres       = get_bits1(&s->gb);
 498     s->use_last_frame_mvs = !s->errorres && !last_invisible;
 499     if (s->keyframe) {
 500         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 501             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 502             return AVERROR_INVALIDDATA;
 503         }
 504         s->colorspace = get_bits(&s->gb, 3);
 505         if (s->colorspace == 7) { // RGB = profile 1
 506             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
 507             return AVERROR_INVALIDDATA;
 508         }
 509         s->fullrange  = get_bits1(&s->gb);
 510         // for profile 1, here follows the subsampling bits
 511         s->refreshrefmask = 0xff;
 512         w = get_bits(&s->gb, 16) + 1;
 513         h = get_bits(&s->gb, 16) + 1;
 514         if (get_bits1(&s->gb)) // display size
 515             skip_bits(&s->gb, 32);
 516     } else {
 517         s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
 518         s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
 519         if (s->intraonly) {
 520             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 521                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 522                 return AVERROR_INVALIDDATA;
 523             }
 524             s->refreshrefmask = get_bits(&s->gb, 8);
 525             w = get_bits(&s->gb, 16) + 1;
 526             h = get_bits(&s->gb, 16) + 1;
 527             if (get_bits1(&s->gb)) // display size
 528                 skip_bits(&s->gb, 32);
 529         } else {
 530             s->refreshrefmask = get_bits(&s->gb, 8);
 531             s->refidx[0]      = get_bits(&s->gb, 3);
 532             s->signbias[0]    = get_bits1(&s->gb);
 533             s->refidx[1]      = get_bits(&s->gb, 3);
 534             s->signbias[1]    = get_bits1(&s->gb);
 535             s->refidx[2]      = get_bits(&s->gb, 3);
 536             s->signbias[2]    = get_bits1(&s->gb);
 537             if (!s->refs[s->refidx[0]].f->data[0] ||
 538                 !s->refs[s->refidx[1]].f->data[0] ||
 539                 !s->refs[s->refidx[2]].f->data[0]) {
 540                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 541                 return AVERROR_INVALIDDATA;
 542             }
 543             if (get_bits1(&s->gb)) {
 544                 w = s->refs[s->refidx[0]].f->width;
 545                 h = s->refs[s->refidx[0]].f->height;
 546             } else if (get_bits1(&s->gb)) {
 547                 w = s->refs[s->refidx[1]].f->width;
 548                 h = s->refs[s->refidx[1]].f->height;
 549             } else if (get_bits1(&s->gb)) {
 550                 w = s->refs[s->refidx[2]].f->width;
 551                 h = s->refs[s->refidx[2]].f->height;
 552             } else {
 553                 w = get_bits(&s->gb, 16) + 1;
 554                 h = get_bits(&s->gb, 16) + 1;
 555             }
 556             // Note that in this code, "CUR_FRAME" is actually before we
 557             // have formally allocated a frame, and thus actually represents
 558             // the _last_ frame
 559             s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
 560                                      s->frames[CUR_FRAME].tf.f->height == h;
 561             if (get_bits1(&s->gb)) // display size
 562                 skip_bits(&s->gb, 32);
 563             s->highprecisionmvs = get_bits1(&s->gb);
 564             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 565                                                 get_bits(&s->gb, 2);
 566             s->allowcompinter = s->signbias[0] != s->signbias[1] ||
 567                                 s->signbias[0] != s->signbias[2];
 568             if (s->allowcompinter) {
 569                 if (s->signbias[0] == s->signbias[1]) {
 570                     s->fixcompref    = 2;
 571                     s->varcompref[0] = 0;
 572                     s->varcompref[1] = 1;
 573                 } else if (s->signbias[0] == s->signbias[2]) {
 574                     s->fixcompref    = 1;
 575                     s->varcompref[0] = 0;
 576                     s->varcompref[1] = 2;
 577                 } else {
 578                     s->fixcompref    = 0;
 579                     s->varcompref[0] = 1;
 580                     s->varcompref[1] = 2;
 581                 }
 582             }
 583         }
 584     }
 585     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
 586     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
 587     s->framectxid   = c = get_bits(&s->gb, 2);
 588
 589     /* loopfilter header data */
 590     s->filter.level = get_bits(&s->gb, 6);
 591     sharp = get_bits(&s->gb, 3);
 592     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 593     // the old cache values since they are still valid
 594     if (s->filter.sharpness != sharp)
 595         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
 596     s->filter.sharpness = sharp;
 597     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
 598         if (get_bits1(&s->gb)) {
 599             for (i = 0; i < 4; i++)
 600                 if (get_bits1(&s->gb))
 601                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 602             for (i = 0; i < 2; i++)
 603                 if (get_bits1(&s->gb))
 604                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 605         }
 606     } else {
 607         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 608     }
 609
 610     /* quantization header data */
 611     s->yac_qi      = get_bits(&s->gb, 8);
 612     s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 613     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 614     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 615     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
 616                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 617
 618     /* segmentation header info */
 619     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
 620         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
 621             for (i = 0; i < 7; i++)
 622                 s->prob.seg[i] = get_bits1(&s->gb) ?
 623                                  get_bits(&s->gb, 8) : 255;
 624             if ((s->segmentation.temporal = get_bits1(&s->gb))) {
 625                 for (i = 0; i < 3; i++)
 626                     s->prob.segpred[i] = get_bits1(&s->gb) ?
 627                                          get_bits(&s->gb, 8) : 255;
 628             }
 629         }
 630         if ((!s->segmentation.update_map || s->segmentation.temporal) &&
 631             (w != s->frames[CUR_FRAME].tf.f->width ||
 632              h != s->frames[CUR_FRAME].tf.f->height)) {
 633             av_log(ctx, AV_LOG_ERROR,
 634                    "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
 635                    s->segmentation.temporal, s->segmentation.update_map);
 636             return AVERROR_INVALIDDATA;
 637         }
 638
 639         if (get_bits1(&s->gb)) {
 640             s->segmentation.absolute_vals = get_bits1(&s->gb);
 641             for (i = 0; i < 8; i++) {
 642                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 643                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 644                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 645                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 646                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 647                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 648                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 649             }
 650         }
 651     } else {
 652         s->segmentation.feat[0].q_enabled    = 0;
 653         s->segmentation.feat[0].lf_enabled   = 0;
 654         s->segmentation.feat[0].skip_enabled = 0;
 655         s->segmentation.feat[0].ref_enabled  = 0;
 656     }
 657
 658     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 659     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
 660         int qyac, qydc, quvac, quvdc, lflvl, sh;
 661
 662         if (s->segmentation.feat[i].q_enabled) {
 663             if (s->segmentation.absolute_vals)
 664                 qyac = s->segmentation.feat[i].q_val;
 665             else
 666                 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
 667         } else {
 668             qyac  = s->yac_qi;
 669         }
 670         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
 671         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
 672         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
 673         qyac  = av_clip_uintp2(qyac, 8);
 674
 675         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
 676         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
 677         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
 678         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
 679
 680         sh = s->filter.level >= 32;
 681         if (s->segmentation.feat[i].lf_enabled) {
 682             if (s->segmentation.absolute_vals)
 683                 lflvl = s->segmentation.feat[i].lf_val;
 684             else
 685                 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
 686         } else {
 687             lflvl  = s->filter.level;
 688         }
 689         s->segmentation.feat[i].lflvl[0][0] =
 690         s->segmentation.feat[i].lflvl[0][1] =
 691             av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
 692         for (j = 1; j < 4; j++) {
 693             s->segmentation.feat[i].lflvl[j][0] =
 694                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 695                                          s->lf_delta.mode[0]) << sh), 6);
 696             s->segmentation.feat[i].lflvl[j][1] =
 697                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 698                                          s->lf_delta.mode[1]) << sh), 6);
 699         }
 700     }
 701
 702     /* tiling info */
 703     if ((res = update_size(ctx, w, h)) < 0) {
 704         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
 705         return res;
 706     }
 707     for (s->tiling.log2_tile_cols = 0;
 708          (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
 709          s->tiling.log2_tile_cols++) ;
 710     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 711     max = FFMAX(0, max - 1);
 712     while (max > s->tiling.log2_tile_cols) {
 713         if (get_bits1(&s->gb))
 714             s->tiling.log2_tile_cols++;
 715         else
 716             break;
 717     }
 718     s->tiling.log2_tile_rows = decode012(&s->gb);
 719     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
 720     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
 721         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
 722         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 723                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
 724         if (!s->c_b) {
 725             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 726             return AVERROR(ENOMEM);
 727         }
 728     }
 729
 730     if (s->keyframe || s->errorres || s->intraonly) {
 731         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 732                            s->prob_ctx[3].p = vp9_default_probs;
 733         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 734                sizeof(vp9_default_coef_probs));
 735         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 736                sizeof(vp9_default_coef_probs));
 737         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 738                sizeof(vp9_default_coef_probs));
 739         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 740                sizeof(vp9_default_coef_probs));
 741     }
 742
 743     // next 16 bits is size of the rest of the header (arith-coded)
 744     size2 = get_bits(&s->gb, 16);
 745     data2 = align_get_bits(&s->gb);
 746     if (size2 > size - (data2 - data)) {
 747         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 748         return AVERROR_INVALIDDATA;
 749     }
 750     ff_vp56_init_range_decoder(&s->c, data2, size2);
 751     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 752         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 753         return AVERROR_INVALIDDATA;
 754     }
 755
 756     if (s->keyframe || s->intraonly) {
 757         memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
 758     } else {
 759         memset(&s->counts, 0, sizeof(s->counts));
 760     }
 761     // FIXME is it faster to not copy here, but do it down in the fw updates
 762     // as explicit copies if the fw update is missing (and skip the copy upon
 763     // fw update)?
 764     s->prob.p = s->prob_ctx[c].p;
 765
 766     // txfm updates
 767     if (s->lossless) {
 768         s->txfmmode = TX_4X4;
 769     } else {
 770         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
 771         if (s->txfmmode == 3)
 772             s->txfmmode += vp8_rac_get(&s->c);
 773
 774         if (s->txfmmode == TX_SWITCHABLE) {
 775             for (i = 0; i < 2; i++)
 776                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 777                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 778             for (i = 0; i < 2; i++)
 779                 for (j = 0; j < 2; j++)
 780                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 781                         s->prob.p.tx16p[i][j] =
 782                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 783             for (i = 0; i < 2; i++)
 784                 for (j = 0; j < 3; j++)
 785                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 786                         s->prob.p.tx32p[i][j] =
 787                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 788         }
 789     }
 790
 791     // coef updates
 792     for (i = 0; i < 4; i++) {
 793         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 794         if (vp8_rac_get(&s->c)) {
 795             for (j = 0; j < 2; j++)
 796                 for (k = 0; k < 2; k++)
 797                     for (l = 0; l < 6; l++)
 798                         for (m = 0; m < 6; m++) {
 799                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 800                             uint8_t *r = ref[j][k][l][m];
 801                             if (m >= 3 && l == 0) // dc only has 3 pt
 802                                 break;
 803                             for (n = 0; n < 3; n++) {
 804                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 805                                     p[n] = update_prob(&s->c, r[n]);
 806                                 } else {
 807                                     p[n] = r[n];
 808                                 }
 809                             }
 810                             p[3] = 0;
 811                         }
 812         } else {
 813             for (j = 0; j < 2; j++)
 814                 for (k = 0; k < 2; k++)
 815                     for (l = 0; l < 6; l++)
 816                         for (m = 0; m < 6; m++) {
 817                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 818                             uint8_t *r = ref[j][k][l][m];
 819                             if (m > 3 && l == 0) // dc only has 3 pt
 820                                 break;
 821                             memcpy(p, r, 3);
 822                             p[3] = 0;
 823                         }
 824         }
 825         if (s->txfmmode == i)
 826             break;
 827     }
 828
 829     // mode updates
 830     for (i = 0; i < 3; i++)
 831         if (vp56_rac_get_prob_branchy(&s->c, 252))
 832             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 833     if (!s->keyframe && !s->intraonly) {
 834         for (i = 0; i < 7; i++)
 835             for (j = 0; j < 3; j++)
 836                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 837                     s->prob.p.mv_mode[i][j] =
 838                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 839
 840         if (s->filtermode == FILTER_SWITCHABLE)
 841             for (i = 0; i < 4; i++)
 842                 for (j = 0; j < 2; j++)
 843                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 844                         s->prob.p.filter[i][j] =
 845                             update_prob(&s->c, s->prob.p.filter[i][j]);
 846
 847         for (i = 0; i < 4; i++)
 848             if (vp56_rac_get_prob_branchy(&s->c, 252))
 849                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 850
 851         if (s->allowcompinter) {
 852             s->comppredmode = vp8_rac_get(&s->c);
 853             if (s->comppredmode)
 854                 s->comppredmode += vp8_rac_get(&s->c);
 855             if (s->comppredmode == PRED_SWITCHABLE)
 856                 for (i = 0; i < 5; i++)
 857                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 858                         s->prob.p.comp[i] =
 859                             update_prob(&s->c, s->prob.p.comp[i]);
 860         } else {
 861             s->comppredmode = PRED_SINGLEREF;
 862         }
 863
 864         if (s->comppredmode != PRED_COMPREF) {
 865             for (i = 0; i < 5; i++) {
 866                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 867                     s->prob.p.single_ref[i][0] =
 868                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
 869                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 870                     s->prob.p.single_ref[i][1] =
 871                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
 872             }
 873         }
 874
 875         if (s->comppredmode != PRED_SINGLEREF) {
 876             for (i = 0; i < 5; i++)
 877                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 878                     s->prob.p.comp_ref[i] =
 879                         update_prob(&s->c, s->prob.p.comp_ref[i]);
 880         }
 881
 882         for (i = 0; i < 4; i++)
 883             for (j = 0; j < 9; j++)
 884                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 885                     s->prob.p.y_mode[i][j] =
 886                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
 887
 888         for (i = 0; i < 4; i++)
 889             for (j = 0; j < 4; j++)
 890                 for (k = 0; k < 3; k++)
 891                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 892                         s->prob.p.partition[3 - i][j][k] =
 893                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 894
 895         // mv fields don't use the update_prob subexp model for some reason
 896         for (i = 0; i < 3; i++)
 897             if (vp56_rac_get_prob_branchy(&s->c, 252))
 898                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 899
 900         for (i = 0; i < 2; i++) {
 901             if (vp56_rac_get_prob_branchy(&s->c, 252))
 902                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 903
 904             for (j = 0; j < 10; j++)
 905                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 906                     s->prob.p.mv_comp[i].classes[j] =
 907                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 908
 909             if (vp56_rac_get_prob_branchy(&s->c, 252))
 910                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 911
 912             for (j = 0; j < 10; j++)
 913                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 914                     s->prob.p.mv_comp[i].bits[j] =
 915                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 916         }
 917
 918         for (i = 0; i < 2; i++) {
 919             for (j = 0; j < 2; j++)
 920                 for (k = 0; k < 3; k++)
 921                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 922                         s->prob.p.mv_comp[i].class0_fp[j][k] =
 923                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 924
 925             for (j = 0; j < 3; j++)
 926                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 927                     s->prob.p.mv_comp[i].fp[j] =
 928                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 929         }
 930
 931         if (s->highprecisionmvs) {
 932             for (i = 0; i < 2; i++) {
 933                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 934                     s->prob.p.mv_comp[i].class0_hp =
 935                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 936
 937                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 938                     s->prob.p.mv_comp[i].hp =
 939                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 940             }
 941         }
 942     }
 943
 944     return (data2 - data) + size2;
 945 }
 946
 947 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
 948                                       VP9Context *s)
 949 {
 950     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
 951     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
 952 }
 953
 954 static void find_ref_mvs(VP9Context *s,
 955                          VP56mv *pmv, int ref, int z, int idx, int sb)
 956 {
 957     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
 958         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
 959                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
 960         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
 961                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
 962         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
 963                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
 964         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
 965                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 966         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
 967                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 968         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
 969                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
 970         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
 971                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 972         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
 973                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
 974         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
 975                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
 976         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 977                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 978         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 979                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 980         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 981                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 982         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 983                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 984     };
 985     VP9Block *b = s->b;
 986     int row = s->row, col = s->col, row7 = s->row7;
 987     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
 988 #define INVALID_MV 0x80008000U
 989     uint32_t mem = INVALID_MV;
 990     int i;
 991
 992 #define RETURN_DIRECT_MV(mv) \
 993     do { \
 994         uint32_t m = AV_RN32A(&mv); \
 995         if (!idx) { \
 996             AV_WN32A(pmv, m); \
 997             return; \
 998         } else if (mem == INVALID_MV) { \
 999             mem = m; \
1000         } else if (m != mem) { \
1001             AV_WN32A(pmv, m); \
1002             return; \
1003         } \
1004     } while (0)
1005
1006     if (sb >= 0) {
1007         if (sb == 2 || sb == 1) {
1008             RETURN_DIRECT_MV(b->mv[0][z]);
1009         } else if (sb == 3) {
1010             RETURN_DIRECT_MV(b->mv[2][z]);
1011             RETURN_DIRECT_MV(b->mv[1][z]);
1012             RETURN_DIRECT_MV(b->mv[0][z]);
1013         }
1014
1015 #define RETURN_MV(mv) \
1016     do { \
1017         if (sb > 0) { \
1018             VP56mv tmp; \
1019             uint32_t m; \
1020             clamp_mv(&tmp, &mv, s); \
1021             m = AV_RN32A(&tmp); \
1022             if (!idx) { \
1023                 AV_WN32A(pmv, m); \
1024                 return; \
1025             } else if (mem == INVALID_MV) { \
1026                 mem = m; \
1027             } else if (m != mem) { \
1028                 AV_WN32A(pmv, m); \
1029                 return; \
1030             } \
1031         } else { \
1032             uint32_t m = AV_RN32A(&mv); \
1033             if (!idx) { \
1034                 clamp_mv(pmv, &mv, s); \
1035                 return; \
1036             } else if (mem == INVALID_MV) { \
1037                 mem = m; \
1038             } else if (m != mem) { \
1039                 clamp_mv(pmv, &mv, s); \
1040                 return; \
1041             } \
1042         } \
1043     } while (0)
1044
1045         if (row > 0) {
1046             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1047             if (mv->ref[0] == ref) {
1048                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1049             } else if (mv->ref[1] == ref) {
1050                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1051             }
1052         }
1053         if (col > s->tiling.tile_col_start) {
1054             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1055             if (mv->ref[0] == ref) {
1056                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1057             } else if (mv->ref[1] == ref) {
1058                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1059             }
1060         }
1061         i = 2;
1062     } else {
1063         i = 0;
1064     }
1065
1066     // previously coded MVs in this neighbourhood, using same reference frame
1067     for (; i < 8; i++) {
1068         int c = p[i][0] + col, r = p[i][1] + row;
1069
1070         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1071             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1072
1073             if (mv->ref[0] == ref) {
1074                 RETURN_MV(mv->mv[0]);
1075             } else if (mv->ref[1] == ref) {
1076                 RETURN_MV(mv->mv[1]);
1077             }
1078         }
1079     }
1080
1081     // MV at this position in previous frame, using same reference frame
1082     if (s->use_last_frame_mvs) {
1083         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1084
1085         if (!s->last_uses_2pass)
1086             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1087         if (mv->ref[0] == ref) {
1088             RETURN_MV(mv->mv[0]);
1089         } else if (mv->ref[1] == ref) {
1090             RETURN_MV(mv->mv[1]);
1091         }
1092     }
1093
1094 #define RETURN_SCALE_MV(mv, scale) \
1095     do { \
1096         if (scale) { \
1097             VP56mv mv_temp = { -mv.x, -mv.y }; \
1098             RETURN_MV(mv_temp); \
1099         } else { \
1100             RETURN_MV(mv); \
1101         } \
1102     } while (0)
1103
1104     // previously coded MVs in this neighbourhood, using different reference frame
1105     for (i = 0; i < 8; i++) {
1106         int c = p[i][0] + col, r = p[i][1] + row;
1107
1108         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1109             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1110
1111             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1112                 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1113             }
1114             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1115                 // BUG - libvpx has this condition regardless of whether
1116                 // we used the first ref MV and pre-scaling
1117                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1118                 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1119             }
1120         }
1121     }
1122
1123     // MV at this position in previous frame, using different reference frame
1124     if (s->use_last_frame_mvs) {
1125         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1126
1127         // no need to await_progress, because we already did that above
1128         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1129             RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1130         }
1131         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1132             // BUG - libvpx has this condition regardless of whether
1133             // we used the first ref MV and pre-scaling
1134             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1135             RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1136         }
1137     }
1138
1139     AV_ZERO32(pmv);
1140 #undef INVALID_MV
1141 #undef RETURN_MV
1142 #undef RETURN_SCALE_MV
1143 }
1144
1145 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1146 {
1147     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1148     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1149                                 s->prob.p.mv_comp[idx].classes);
1150
1151     s->counts.mv_comp[idx].sign[sign]++;
1152     s->counts.mv_comp[idx].classes[c]++;
1153     if (c) {
1154         int m;
1155
1156         for (n = 0, m = 0; m < c; m++) {
1157             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1158             n |= bit << m;
1159             s->counts.mv_comp[idx].bits[m][bit]++;
1160         }
1161         n <<= 3;
1162         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1163         n |= bit << 1;
1164         s->counts.mv_comp[idx].fp[bit]++;
1165         if (hp) {
1166             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1167             s->counts.mv_comp[idx].hp[bit]++;
1168             n |= bit;
1169         } else {
1170             n |= 1;
1171             // bug in libvpx - we count for bw entropy purposes even if the
1172             // bit wasn't coded
1173             s->counts.mv_comp[idx].hp[1]++;
1174         }
1175         n += 8 << c;
1176     } else {
1177         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1178         s->counts.mv_comp[idx].class0[n]++;
1179         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1180                                s->prob.p.mv_comp[idx].class0_fp[n]);
1181         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1182         n = (n << 3) | (bit << 1);
1183         if (hp) {
1184             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1185             s->counts.mv_comp[idx].class0_hp[bit]++;
1186             n |= bit;
1187         } else {
1188             n |= 1;
1189             // bug in libvpx - we count for bw entropy purposes even if the
1190             // bit wasn't coded
1191             s->counts.mv_comp[idx].class0_hp[1]++;
1192         }
1193     }
1194
1195     return sign ? -(n + 1) : (n + 1);
1196 }
1197
1198 static void fill_mv(VP9Context *s,
1199                     VP56mv *mv, int mode, int sb)
1200 {
1201     VP9Block *b = s->b;
1202
1203     if (mode == ZEROMV) {
1204         AV_ZERO64(mv);
1205     } else {
1206         int hp;
1207
1208         // FIXME cache this value and reuse for other subblocks
1209         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1210                      mode == NEWMV ? -1 : sb);
1211         // FIXME maybe move this code into find_ref_mvs()
1212         if ((mode == NEWMV || sb == -1) &&
1213             !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1214             if (mv[0].y & 1) {
1215                 if (mv[0].y < 0)
1216                     mv[0].y++;
1217                 else
1218                     mv[0].y--;
1219             }
1220             if (mv[0].x & 1) {
1221                 if (mv[0].x < 0)
1222                     mv[0].x++;
1223                 else
1224                     mv[0].x--;
1225             }
1226         }
1227         if (mode == NEWMV) {
1228             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1229                                               s->prob.p.mv_joint);
1230
1231             s->counts.mv_joint[j]++;
1232             if (j >= MV_JOINT_V)
1233                 mv[0].y += read_mv_component(s, 0, hp);
1234             if (j & 1)
1235                 mv[0].x += read_mv_component(s, 1, hp);
1236         }
1237
1238         if (b->comp) {
1239             // FIXME cache this value and reuse for other subblocks
1240             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1241                          mode == NEWMV ? -1 : sb);
1242             if ((mode == NEWMV || sb == -1) &&
1243                 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1244                 if (mv[1].y & 1) {
1245                     if (mv[1].y < 0)
1246                         mv[1].y++;
1247                     else
1248                         mv[1].y--;
1249                 }
1250                 if (mv[1].x & 1) {
1251                     if (mv[1].x < 0)
1252                         mv[1].x++;
1253                     else
1254                         mv[1].x--;
1255                 }
1256             }
1257             if (mode == NEWMV) {
1258                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1259                                                   s->prob.p.mv_joint);
1260
1261                 s->counts.mv_joint[j]++;
1262                 if (j >= MV_JOINT_V)
1263                     mv[1].y += read_mv_component(s, 0, hp);
1264                 if (j & 1)
1265                     mv[1].x += read_mv_component(s, 1, hp);
1266             }
1267         }
1268     }
1269 }
1270
1271 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1272                                        ptrdiff_t stride, int v)
1273 {
1274     switch (w) {
1275     case 1:
1276         do {
1277             *ptr = v;
1278             ptr += stride;
1279         } while (--h);
1280         break;
1281     case 2: {
1282         int v16 = v * 0x0101;
1283         do {
1284             AV_WN16A(ptr, v16);
1285             ptr += stride;
1286         } while (--h);
1287         break;
1288     }
1289     case 4: {
1290         uint32_t v32 = v * 0x01010101;
1291         do {
1292             AV_WN32A(ptr, v32);
1293             ptr += stride;
1294         } while (--h);
1295         break;
1296     }
1297     case 8: {
1298 #if HAVE_FAST_64BIT
1299         uint64_t v64 = v * 0x0101010101010101ULL;
1300         do {
1301             AV_WN64A(ptr, v64);
1302             ptr += stride;
1303         } while (--h);
1304 #else
1305         uint32_t v32 = v * 0x01010101;
1306         do {
1307             AV_WN32A(ptr,     v32);
1308             AV_WN32A(ptr + 4, v32);
1309             ptr += stride;
1310         } while (--h);
1311 #endif
1312         break;
1313     }
1314     }
1315 }
1316
1317 static void decode_mode(AVCodecContext *ctx)
1318 {
1319     static const uint8_t left_ctx[N_BS_SIZES] = {
1320         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1321     };
1322     static const uint8_t above_ctx[N_BS_SIZES] = {
1323         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1324     };
1325     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1326         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1327         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1328     };
1329     VP9Context *s = ctx->priv_data;
1330     VP9Block *b = s->b;
1331     int row = s->row, col = s->col, row7 = s->row7;
1332     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1333     int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1334     int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1335     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1336     int vref, filter_id;
1337
1338     if (!s->segmentation.enabled) {
1339         b->seg_id = 0;
1340     } else if (s->keyframe || s->intraonly) {
1341         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1342     } else if (!s->segmentation.update_map ||
1343                (s->segmentation.temporal &&
1344                 vp56_rac_get_prob_branchy(&s->c,
1345                     s->prob.segpred[s->above_segpred_ctx[col] +
1346                                     s->left_segpred_ctx[row7]]))) {
1347         int pred = 8, x;
1348         uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1349
1350         if (!s->last_uses_2pass)
1351             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1352         for (y = 0; y < h4; y++)
1353             for (x = 0; x < w4; x++)
1354                 pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1355         av_assert1(pred < 8);
1356         b->seg_id = pred;
1357
1358         memset(&s->above_segpred_ctx[col], 1, w4);
1359         memset(&s->left_segpred_ctx[row7], 1, h4);
1360     } else {
1361         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1362                                      s->prob.seg);
1363
1364         memset(&s->above_segpred_ctx[col], 0, w4);
1365         memset(&s->left_segpred_ctx[row7], 0, h4);
1366     }
1367     if (s->segmentation.enabled &&
1368         (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1369         setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1370                   w4, h4, 8 * s->sb_cols, b->seg_id);
1371     }
1372
1373     b->skip = s->segmentation.enabled &&
1374         s->segmentation.feat[b->seg_id].skip_enabled;
1375     if (!b->skip) {
1376         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1377         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1378         s->counts.skip[c][b->skip]++;
1379     }
1380
1381     if (s->keyframe || s->intraonly) {
1382         b->intra = 1;
1383     } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1384         b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1385     } else {
1386         int c, bit;
1387
1388         if (have_a && have_l) {
1389             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1390             c += (c == 2);
1391         } else {
1392             c = have_a ? 2 * s->above_intra_ctx[col] :
1393                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1394         }
1395         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1396         s->counts.intra[c][bit]++;
1397         b->intra = !bit;
1398     }
1399
1400     if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1401         int c;
1402         if (have_a) {
1403             if (have_l) {
1404                 c = (s->above_skip_ctx[col] ? max_tx :
1405                      s->above_txfm_ctx[col]) +
1406                     (s->left_skip_ctx[row7] ? max_tx :
1407                      s->left_txfm_ctx[row7]) > max_tx;
1408             } else {
1409                 c = s->above_skip_ctx[col] ? 1 :
1410                     (s->above_txfm_ctx[col] * 2 > max_tx);
1411             }
1412         } else if (have_l) {
1413             c = s->left_skip_ctx[row7] ? 1 :
1414                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1415         } else {
1416             c = 1;
1417         }
1418         switch (max_tx) {
1419         case TX_32X32:
1420             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1421             if (b->tx) {
1422                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1423                 if (b->tx == 2)
1424                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1425             }
1426             s->counts.tx32p[c][b->tx]++;
1427             break;
1428         case TX_16X16:
1429             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1430             if (b->tx)
1431                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1432             s->counts.tx16p[c][b->tx]++;
1433             break;
1434         case TX_8X8:
1435             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1436             s->counts.tx8p[c][b->tx]++;
1437             break;
1438         case TX_4X4:
1439             b->tx = TX_4X4;
1440             break;
1441         }
1442     } else {
1443         b->tx = FFMIN(max_tx, s->txfmmode);
1444     }
1445
1446     if (s->keyframe || s->intraonly) {
1447         uint8_t *a = &s->above_mode_ctx[col * 2];
1448         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1449
1450         b->comp = 0;
1451         if (b->bs > BS_8x8) {
1452             // FIXME the memory storage intermediates here aren't really
1453             // necessary, they're just there to make the code slightly
1454             // simpler for now
1455             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1456                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1457             if (b->bs != BS_8x4) {
1458                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1459                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1460                 l[0] = a[1] = b->mode[1];
1461             } else {
1462                 l[0] = a[1] = b->mode[1] = b->mode[0];
1463             }
1464             if (b->bs != BS_4x8) {
1465                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1466                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1467                 if (b->bs != BS_8x4) {
1468                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1469                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1470                     l[1] = a[1] = b->mode[3];
1471                 } else {
1472                     l[1] = a[1] = b->mode[3] = b->mode[2];
1473                 }
1474             } else {
1475                 b->mode[2] = b->mode[0];
1476                 l[1] = a[1] = b->mode[3] = b->mode[1];
1477             }
1478         } else {
1479             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1480                                           vp9_default_kf_ymode_probs[*a][*l]);
1481             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1482             // FIXME this can probably be optimized
1483             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1484             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1485         }
1486         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1487                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1488     } else if (b->intra) {
1489         b->comp = 0;
1490         if (b->bs > BS_8x8) {
1491             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1492                                           s->prob.p.y_mode[0]);
1493             s->counts.y_mode[0][b->mode[0]]++;
1494             if (b->bs != BS_8x4) {
1495                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1496                                               s->prob.p.y_mode[0]);
1497                 s->counts.y_mode[0][b->mode[1]]++;
1498             } else {
1499                 b->mode[1] = b->mode[0];
1500             }
1501             if (b->bs != BS_4x8) {
1502                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1503                                               s->prob.p.y_mode[0]);
1504                 s->counts.y_mode[0][b->mode[2]]++;
1505                 if (b->bs != BS_8x4) {
1506                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507                                                   s->prob.p.y_mode[0]);
1508                     s->counts.y_mode[0][b->mode[3]]++;
1509                 } else {
1510                     b->mode[3] = b->mode[2];
1511                 }
1512             } else {
1513                 b->mode[2] = b->mode[0];
1514                 b->mode[3] = b->mode[1];
1515             }
1516         } else {
1517             static const uint8_t size_group[10] = {
1518                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1519             };
1520             int sz = size_group[b->bs];
1521
1522             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1523                                           s->prob.p.y_mode[sz]);
1524             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1525             s->counts.y_mode[sz][b->mode[3]]++;
1526         }
1527         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1528                                      s->prob.p.uv_mode[b->mode[3]]);
1529         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1530     } else {
1531         static const uint8_t inter_mode_ctx_lut[14][14] = {
1532             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1533             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1534             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1535             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1536             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1537             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1543             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1544             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1545             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1546         };
1547
1548         if (s->segmentation.feat[b->seg_id].ref_enabled) {
1549             av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1550             b->comp = 0;
1551             b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1552         } else {
1553             // read comp_pred flag
1554             if (s->comppredmode != PRED_SWITCHABLE) {
1555                 b->comp = s->comppredmode == PRED_COMPREF;
1556             } else {
1557                 int c;
1558
1559                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1560                 if (have_a) {
1561                     if (have_l) {
1562                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1563                             c = 4;
1564                         } else if (s->above_comp_ctx[col]) {
1565                             c = 2 + (s->left_intra_ctx[row7] ||
1566                                      s->left_ref_ctx[row7] == s->fixcompref);
1567                         } else if (s->left_comp_ctx[row7]) {
1568                             c = 2 + (s->above_intra_ctx[col] ||
1569                                      s->above_ref_ctx[col] == s->fixcompref);
1570                         } else {
1571                             c = (!s->above_intra_ctx[col] &&
1572                                  s->above_ref_ctx[col] == s->fixcompref) ^
1573                             (!s->left_intra_ctx[row7] &&
1574                              s->left_ref_ctx[row & 7] == s->fixcompref);
1575                         }
1576                     } else {
1577                         c = s->above_comp_ctx[col] ? 3 :
1578                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1579                     }
1580                 } else if (have_l) {
1581                     c = s->left_comp_ctx[row7] ? 3 :
1582                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1583                 } else {
1584                     c = 1;
1585                 }
1586                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1587                 s->counts.comp[c][b->comp]++;
1588             }
1589
1590             // read actual references
1591             // FIXME probably cache a few variables here to prevent repetitive
1592             // memory accesses below
1593             if (b->comp) /* two references */ {
1594                 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1595
1596                 b->ref[fix_idx] = s->fixcompref;
1597                 // FIXME can this codeblob be replaced by some sort of LUT?
1598                 if (have_a) {
1599                     if (have_l) {
1600                         if (s->above_intra_ctx[col]) {
1601                             if (s->left_intra_ctx[row7]) {
1602                                 c = 2;
1603                             } else {
1604                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1605                             }
1606                         } else if (s->left_intra_ctx[row7]) {
1607                             c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1608                         } else {
1609                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1610
1611                             if (refl == refa && refa == s->varcompref[1]) {
1612                                 c = 0;
1613                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1614                                 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1615                                     (refl == s->fixcompref && refa == s->varcompref[0])) {
1616                                     c = 4;
1617                                 } else {
1618                                     c = (refa == refl) ? 3 : 1;
1619                                 }
1620                             } else if (!s->left_comp_ctx[row7]) {
1621                                 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1622                                     c = 1;
1623                                 } else {
1624                                     c = (refl == s->varcompref[1] &&
1625                                          refa != s->varcompref[1]) ? 2 : 4;
1626                                 }
1627                             } else if (!s->above_comp_ctx[col]) {
1628                                 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1629                                     c = 1;
1630                                 } else {
1631                                     c = (refa == s->varcompref[1] &&
1632                                          refl != s->varcompref[1]) ? 2 : 4;
1633                                 }
1634                             } else {
1635                                 c = (refl == refa) ? 4 : 2;
1636                             }
1637                         }
1638                     } else {
1639                         if (s->above_intra_ctx[col]) {
1640                             c = 2;
1641                         } else if (s->above_comp_ctx[col]) {
1642                             c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1643                         } else {
1644                             c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1645                         }
1646                     }
1647                 } else if (have_l) {
1648                     if (s->left_intra_ctx[row7]) {
1649                         c = 2;
1650                     } else if (s->left_comp_ctx[row7]) {
1651                         c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1652                     } else {
1653                         c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1654                     }
1655                 } else {
1656                     c = 2;
1657                 }
1658                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1659                 b->ref[var_idx] = s->varcompref[bit];
1660                 s->counts.comp_ref[c][bit]++;
1661             } else /* single reference */ {
1662                 int bit, c;
1663
1664                 if (have_a && !s->above_intra_ctx[col]) {
1665                     if (have_l && !s->left_intra_ctx[row7]) {
1666                         if (s->left_comp_ctx[row7]) {
1667                             if (s->above_comp_ctx[col]) {
1668                                 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1669                                          !s->above_ref_ctx[col]);
1670                             } else {
1671                                 c = (3 * !s->above_ref_ctx[col]) +
1672                                     (!s->fixcompref || !s->left_ref_ctx[row7]);
1673                             }
1674                         } else if (s->above_comp_ctx[col]) {
1675                             c = (3 * !s->left_ref_ctx[row7]) +
1676                                 (!s->fixcompref || !s->above_ref_ctx[col]);
1677                         } else {
1678                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1679                         }
1680                     } else if (s->above_intra_ctx[col]) {
1681                         c = 2;
1682                     } else if (s->above_comp_ctx[col]) {
1683                         c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1684                     } else {
1685                         c = 4 * (!s->above_ref_ctx[col]);
1686                     }
1687                 } else if (have_l && !s->left_intra_ctx[row7]) {
1688                     if (s->left_intra_ctx[row7]) {
1689                         c = 2;
1690                     } else if (s->left_comp_ctx[row7]) {
1691                         c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1692                     } else {
1693                         c = 4 * (!s->left_ref_ctx[row7]);
1694                     }
1695                 } else {
1696                     c = 2;
1697                 }
1698                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1699                 s->counts.single_ref[c][0][bit]++;
1700                 if (!bit) {
1701                     b->ref[0] = 0;
1702                 } else {
1703                     // FIXME can this codeblob be replaced by some sort of LUT?
1704                     if (have_a) {
1705                         if (have_l) {
1706                             if (s->left_intra_ctx[row7]) {
1707                                 if (s->above_intra_ctx[col]) {
1708                                     c = 2;
1709                                 } else if (s->above_comp_ctx[col]) {
1710                                     c = 1 + 2 * (s->fixcompref == 1 ||
1711                                                  s->above_ref_ctx[col] == 1);
1712                                 } else if (!s->above_ref_ctx[col]) {
1713                                     c = 3;
1714                                 } else {
1715                                     c = 4 * (s->above_ref_ctx[col] == 1);
1716                                 }
1717                             } else if (s->above_intra_ctx[col]) {
1718                                 if (s->left_intra_ctx[row7]) {
1719                                     c = 2;
1720                                 } else if (s->left_comp_ctx[row7]) {
1721                                     c = 1 + 2 * (s->fixcompref == 1 ||
1722                                                  s->left_ref_ctx[row7] == 1);
1723                                 } else if (!s->left_ref_ctx[row7]) {
1724                                     c = 3;
1725                                 } else {
1726                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1727                                 }
1728                             } else if (s->above_comp_ctx[col]) {
1729                                 if (s->left_comp_ctx[row7]) {
1730                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1731                                         c = 3 * (s->fixcompref == 1 ||
1732                                                  s->left_ref_ctx[row7] == 1);
1733                                     } else {
1734                                         c = 2;
1735                                     }
1736                                 } else if (!s->left_ref_ctx[row7]) {
1737                                     c = 1 + 2 * (s->fixcompref == 1 ||
1738                                                  s->above_ref_ctx[col] == 1);
1739                                 } else {
1740                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1741                                     (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1742                                 }
1743                             } else if (s->left_comp_ctx[row7]) {
1744                                 if (!s->above_ref_ctx[col]) {
1745                                     c = 1 + 2 * (s->fixcompref == 1 ||
1746                                                  s->left_ref_ctx[row7] == 1);
1747                                 } else {
1748                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1749                                     (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1750                                 }
1751                             } else if (!s->above_ref_ctx[col]) {
1752                                 if (!s->left_ref_ctx[row7]) {
1753                                     c = 3;
1754                                 } else {
1755                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1756                                 }
1757                             } else if (!s->left_ref_ctx[row7]) {
1758                                 c = 4 * (s->above_ref_ctx[col] == 1);
1759                             } else {
1760                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1761                                 2 * (s->above_ref_ctx[col] == 1);
1762                             }
1763                         } else {
1764                             if (s->above_intra_ctx[col] ||
1765                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1766                                 c = 2;
1767                             } else if (s->above_comp_ctx[col]) {
1768                                 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1769                             } else {
1770                                 c = 4 * (s->above_ref_ctx[col] == 1);
1771                             }
1772                         }
1773                     } else if (have_l) {
1774                         if (s->left_intra_ctx[row7] ||
1775                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1776                             c = 2;
1777                         } else if (s->left_comp_ctx[row7]) {
1778                             c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1779                         } else {
1780                             c = 4 * (s->left_ref_ctx[row7] == 1);
1781                         }
1782                     } else {
1783                         c = 2;
1784                     }
1785                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1786                     s->counts.single_ref[c][1][bit]++;
1787                     b->ref[0] = 1 + bit;
1788                 }
1789             }
1790         }
1791
1792         if (b->bs <= BS_8x8) {
1793             if (s->segmentation.feat[b->seg_id].skip_enabled) {
1794                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1795             } else {
1796                 static const uint8_t off[10] = {
1797                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1798                 };
1799
1800                 // FIXME this needs to use the LUT tables from find_ref_mvs
1801                 // because not all are -1,0/0,-1
1802                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1803                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1804
1805                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1806                                               s->prob.p.mv_mode[c]);
1807                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1808                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1809             }
1810         }
1811
1812         if (s->filtermode == FILTER_SWITCHABLE) {
1813             int c;
1814
1815             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1816                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1817                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1818                         s->left_filter_ctx[row7] : 3;
1819                 } else {
1820                     c = s->above_filter_ctx[col];
1821                 }
1822             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1823                 c = s->left_filter_ctx[row7];
1824             } else {
1825                 c = 3;
1826             }
1827
1828             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1829                                          s->prob.p.filter[c]);
1830             s->counts.filter[c][filter_id]++;
1831             b->filter = vp9_filter_lut[filter_id];
1832         } else {
1833             b->filter = s->filtermode;
1834         }
1835
1836         if (b->bs > BS_8x8) {
1837             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1838
1839             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1840                                           s->prob.p.mv_mode[c]);
1841             s->counts.mv_mode[c][b->mode[0] - 10]++;
1842             fill_mv(s, b->mv[0], b->mode[0], 0);
1843
1844             if (b->bs != BS_8x4) {
1845                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1846                                               s->prob.p.mv_mode[c]);
1847                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1848                 fill_mv(s, b->mv[1], b->mode[1], 1);
1849             } else {
1850                 b->mode[1] = b->mode[0];
1851                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1852                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1853             }
1854
1855             if (b->bs != BS_4x8) {
1856                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1857                                               s->prob.p.mv_mode[c]);
1858                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1859                 fill_mv(s, b->mv[2], b->mode[2], 2);
1860
1861                 if (b->bs != BS_8x4) {
1862                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1863                                                   s->prob.p.mv_mode[c]);
1864                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1865                     fill_mv(s, b->mv[3], b->mode[3], 3);
1866                 } else {
1867                     b->mode[3] = b->mode[2];
1868                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1869                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1870                 }
1871             } else {
1872                 b->mode[2] = b->mode[0];
1873                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1874                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1875                 b->mode[3] = b->mode[1];
1876                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1877                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1878             }
1879         } else {
1880             fill_mv(s, b->mv[0], b->mode[0], -1);
1881             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1882             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1883             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1884             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1885             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1886             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1887         }
1888
1889         vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1890     }
1891
1892 #if HAVE_FAST_64BIT
1893 #define SPLAT_CTX(var, val, n) \
1894     switch (n) { \
1895     case 1:  var = val;                                    break; \
1896     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
1897     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
1898     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
1899     case 16: { \
1900         uint64_t v64 = val * 0x0101010101010101ULL; \
1901         AV_WN64A(              &var,     v64); \
1902         AV_WN64A(&((uint8_t *) &var)[8], v64); \
1903         break; \
1904     } \
1905     }
1906 #else
1907 #define SPLAT_CTX(var, val, n) \
1908     switch (n) { \
1909     case 1:  var = val;                         break; \
1910     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
1911     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
1912     case 8: { \
1913         uint32_t v32 = val * 0x01010101; \
1914         AV_WN32A(              &var,     v32); \
1915         AV_WN32A(&((uint8_t *) &var)[4], v32); \
1916         break; \
1917     } \
1918     case 16: { \
1919         uint32_t v32 = val * 0x01010101; \
1920         AV_WN32A(              &var,      v32); \
1921         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
1922         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
1923         AV_WN32A(&((uint8_t *) &var)[12], v32); \
1924         break; \
1925     } \
1926     }
1927 #endif
1928
1929     switch (bwh_tab[1][b->bs][0]) {
1930 #define SET_CTXS(dir, off, n) \
1931     do { \
1932         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
1933         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
1934         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1935         if (!s->keyframe && !s->intraonly) { \
1936             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
1937             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
1938             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
1939             if (!b->intra) { \
1940                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1941                 if (s->filtermode == FILTER_SWITCHABLE) { \
1942                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1943                 } \
1944             } \
1945         } \
1946     } while (0)
1947     case 1: SET_CTXS(above, col, 1); break;
1948     case 2: SET_CTXS(above, col, 2); break;
1949     case 4: SET_CTXS(above, col, 4); break;
1950     case 8: SET_CTXS(above, col, 8); break;
1951     }
1952     switch (bwh_tab[1][b->bs][1]) {
1953     case 1: SET_CTXS(left, row7, 1); break;
1954     case 2: SET_CTXS(left, row7, 2); break;
1955     case 4: SET_CTXS(left, row7, 4); break;
1956     case 8: SET_CTXS(left, row7, 8); break;
1957     }
1958 #undef SPLAT_CTX
1959 #undef SET_CTXS
1960
1961     if (!s->keyframe && !s->intraonly) {
1962         if (b->bs > BS_8x8) {
1963             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1964
1965             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1966             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1967             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1968             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1969             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1970             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1971             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1972             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1973         } else {
1974             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1975
1976             for (n = 0; n < w4 * 2; n++) {
1977                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1978                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1979             }
1980             for (n = 0; n < h4 * 2; n++) {
1981                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1982                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1983             }
1984         }
1985     }
1986
1987     // FIXME kinda ugly
1988     for (y = 0; y < h4; y++) {
1989         int x, o = (row + y) * s->sb_cols * 8 + col;
1990         struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1991
1992         if (b->intra) {
1993             for (x = 0; x < w4; x++) {
1994                 mv[x].ref[0] =
1995                 mv[x].ref[1] = -1;
1996             }
1997         } else if (b->comp) {
1998             for (x = 0; x < w4; x++) {
1999                 mv[x].ref[0] = b->ref[0];
2000                 mv[x].ref[1] = b->ref[1];
2001                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2002                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2003             }
2004         } else {
2005             for (x = 0; x < w4; x++) {
2006                 mv[x].ref[0] = b->ref[0];
2007                 mv[x].ref[1] = -1;
2008                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2009             }
2010         }
2011     }
2012 }
2013
2014 // FIXME remove tx argument, and merge cnt/eob arguments?
2015 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2016                            enum TxfmMode tx, unsigned (*cnt)[6][3],
2017                            unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2018                            int nnz, const int16_t *scan, const int16_t (*nb)[2],
2019                            const int16_t *band_counts, const int16_t *qmul)
2020 {
2021     int i = 0, band = 0, band_left = band_counts[band];
2022     uint8_t *tp = p[0][nnz];
2023     uint8_t cache[1024];
2024
2025     do {
2026         int val, rc;
2027
2028         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2029         eob[band][nnz][val]++;
2030         if (!val)
2031             break;
2032
2033     skip_eob:
2034         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2035             cnt[band][nnz][0]++;
2036             if (!--band_left)
2037                 band_left = band_counts[++band];
2038             cache[scan[i]] = 0;
2039             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2040             tp = p[band][nnz];
2041             if (++i == n_coeffs)
2042                 break; //invalid input; blocks should end with EOB
2043             goto skip_eob;
2044         }
2045
2046         rc = scan[i];
2047         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2048             cnt[band][nnz][1]++;
2049             val = 1;
2050             cache[rc] = 1;
2051         } else {
2052             // fill in p[3-10] (model fill) - only once per frame for each pos
2053             if (!tp[3])
2054                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2055
2056             cnt[band][nnz][2]++;
2057             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2058                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2059                     cache[rc] = val = 2;
2060                 } else {
2061                     val = 3 + vp56_rac_get_prob(c, tp[5]);
2062                     cache[rc] = 3;
2063                 }
2064             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2065                 cache[rc] = 4;
2066                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2067                     val = 5 + vp56_rac_get_prob(c, 159);
2068                 } else {
2069                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
2070                     val +=      vp56_rac_get_prob(c, 145);
2071                 }
2072             } else { // cat 3-6
2073                 cache[rc] = 5;
2074                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2075                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2076                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
2077                         val +=      (vp56_rac_get_prob(c, 148) << 1);
2078                         val +=       vp56_rac_get_prob(c, 140);
2079                     } else {
2080                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
2081                         val +=      (vp56_rac_get_prob(c, 155) << 2);
2082                         val +=      (vp56_rac_get_prob(c, 140) << 1);
2083                         val +=       vp56_rac_get_prob(c, 135);
2084                     }
2085                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2086                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
2087                     val +=      (vp56_rac_get_prob(c, 157) << 3);
2088                     val +=      (vp56_rac_get_prob(c, 141) << 2);
2089                     val +=      (vp56_rac_get_prob(c, 134) << 1);
2090                     val +=       vp56_rac_get_prob(c, 130);
2091                 } else {
2092                     val  = 67 + (vp56_rac_get_prob(c, 254) << 13);
2093                     val +=      (vp56_rac_get_prob(c, 254) << 12);
2094                     val +=      (vp56_rac_get_prob(c, 254) << 11);
2095                     val +=      (vp56_rac_get_prob(c, 252) << 10);
2096                     val +=      (vp56_rac_get_prob(c, 249) << 9);
2097                     val +=      (vp56_rac_get_prob(c, 243) << 8);
2098                     val +=      (vp56_rac_get_prob(c, 230) << 7);
2099                     val +=      (vp56_rac_get_prob(c, 196) << 6);
2100                     val +=      (vp56_rac_get_prob(c, 177) << 5);
2101                     val +=      (vp56_rac_get_prob(c, 153) << 4);
2102                     val +=      (vp56_rac_get_prob(c, 140) << 3);
2103                     val +=      (vp56_rac_get_prob(c, 133) << 2);
2104                     val +=      (vp56_rac_get_prob(c, 130) << 1);
2105                     val +=       vp56_rac_get_prob(c, 129);
2106                 }
2107             }
2108         }
2109         if (!--band_left)
2110             band_left = band_counts[++band];
2111         if (tx == TX_32X32) // FIXME slow
2112             coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2113         else
2114             coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2115         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2116         tp = p[band][nnz];
2117     } while (++i < n_coeffs);
2118
2119     return i;
2120 }
2121
2122 static void decode_coeffs(AVCodecContext *ctx)
2123 {
2124     VP9Context *s = ctx->priv_data;
2125     VP9Block *b = s->b;
2126     int row = s->row, col = s->col;
2127     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2128     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2129     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2130     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2131     int end_x = FFMIN(2 * (s->cols - col), w4);
2132     int end_y = FFMIN(2 * (s->rows - row), h4);
2133     int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
2134     int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
2135     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2136     int tx = 4 * s->lossless + b->tx;
2137     const int16_t * const *yscans = vp9_scans[tx];
2138     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2139     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2140     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2141     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2142     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2143     static const int16_t band_counts[4][8] = {
2144         { 1, 2, 3, 4,  3,   16 - 13 },
2145         { 1, 2, 3, 4, 11,   64 - 21 },
2146         { 1, 2, 3, 4, 11,  256 - 21 },
2147         { 1, 2, 3, 4, 11, 1024 - 21 },
2148     };
2149     const int16_t *y_band_counts = band_counts[b->tx];
2150     const int16_t *uv_band_counts = band_counts[b->uvtx];
2151
2152 #define MERGE(la, end, step, rd) \
2153     for (n = 0; n < end; n += step) \
2154         la[n] = !!rd(&la[n])
2155 #define MERGE_CTX(step, rd) \
2156     do { \
2157         MERGE(l, end_y, step, rd); \
2158         MERGE(a, end_x, step, rd); \
2159     } while (0)
2160
2161     /* y tokens */
2162     switch (b->tx) {
2163     case TX_8X8:   MERGE_CTX(2, AV_RN16A); break;
2164     case TX_16X16: MERGE_CTX(4, AV_RN32A); break;
2165     case TX_32X32: MERGE_CTX(8, AV_RN64A); break;
2166     }
2167     for (n = 0, y = 0; y < end_y; y += step1d) {
2168         for (x = 0; x < end_x; x += step1d, n += step) {
2169             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
2170                                                              b->bs > BS_8x8 ?
2171                                                              n : 0]];
2172             int nnz = a[x] + l[y];
2173             res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
2174                                   b->tx, c, e, p, nnz, yscans[txtp],
2175                                   ynbs[txtp], y_band_counts, qmul[0]);
2176             a[x] = l[y] = !!res;
2177             if (b->tx > TX_8X8) {
2178                 AV_WN16A(&s->eob[n], res);
2179             } else {
2180                 s->eob[n] = res;
2181             }
2182         }
2183     }
2184 #define SPLAT(la, end, step, cond) \
2185     if (step == 2) { \
2186         for (n = 1; n < end; n += step) \
2187             la[n] = la[n - 1]; \
2188     } else if (step == 4) { \
2189         if (cond) { \
2190             for (n = 0; n < end; n += step) \
2191                 AV_WN32A(&la[n], la[n] * 0x01010101); \
2192         } else { \
2193             for (n = 0; n < end; n += step) \
2194                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2195         } \
2196     } else /* step == 8 */ { \
2197         if (cond) { \
2198             if (HAVE_FAST_64BIT) { \
2199                 for (n = 0; n < end; n += step) \
2200                     AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2201             } else { \
2202                 for (n = 0; n < end; n += step) { \
2203                     uint32_t v32 = la[n] * 0x01010101; \
2204                     AV_WN32A(&la[n],     v32); \
2205                     AV_WN32A(&la[n + 4], v32); \
2206                 } \
2207             } \
2208         } else { \
2209             for (n = 0; n < end; n += step) \
2210                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2211         } \
2212     }
2213 #define SPLAT_CTX(step) \
2214     do { \
2215         SPLAT(a, end_x, step, end_x == w4); \
2216         SPLAT(l, end_y, step, end_y == h4); \
2217     } while (0)
2218     switch (b->tx) {
2219     case TX_8X8:   SPLAT_CTX(2); break;
2220     case TX_16X16: SPLAT_CTX(4); break;
2221     case TX_32X32: SPLAT_CTX(8); break;
2222     }
2223
2224     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2225     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2226     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2227     w4 >>= 1;
2228     h4 >>= 1;
2229     end_x >>= 1;
2230     end_y >>= 1;
2231     for (pl = 0; pl < 2; pl++) {
2232         a = &s->above_uv_nnz_ctx[pl][col];
2233         l = &s->left_uv_nnz_ctx[pl][row & 7];
2234         switch (b->uvtx) {
2235         case TX_8X8:   MERGE_CTX(2, AV_RN16A); break;
2236         case TX_16X16: MERGE_CTX(4, AV_RN32A); break;
2237         case TX_32X32: MERGE_CTX(8, AV_RN64A); break;
2238         }
2239         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2240             for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
2241                 int nnz = a[x] + l[y];
2242                 res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
2243                                       16 * uvstep, b->uvtx, c, e, p, nnz,
2244                                       uvscan, uvnb, uv_band_counts, qmul[1]);
2245                 a[x] = l[y] = !!res;
2246                 if (b->uvtx > TX_8X8) {
2247                     AV_WN16A(&s->uveob[pl][n], res);
2248                 } else {
2249                     s->uveob[pl][n] = res;
2250                 }
2251             }
2252         }
2253         switch (b->uvtx) {
2254         case TX_8X8:   SPLAT_CTX(2); break;
2255         case TX_16X16: SPLAT_CTX(4); break;
2256         case TX_32X32: SPLAT_CTX(8); break;
2257         }
2258     }
2259 }
2260
2261 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2262                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2263                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2264                                              uint8_t *l, int col, int x, int w,
2265                                              int row, int y, enum TxfmMode tx,
2266                                              int p)
2267 {
2268     int have_top = row > 0 || y > 0;
2269     int have_left = col > s->tiling.tile_col_start || x > 0;
2270     int have_right = x < w - 1;
2271     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2272         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2273                                    { DC_127_PRED,          VERT_PRED } },
2274         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2275                                    { HOR_PRED,             HOR_PRED } },
2276         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2277                                    { LEFT_DC_PRED,         DC_PRED } },
2278         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2279                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2280         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2281                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2282         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2283                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2284         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2285                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2286         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2287                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2288         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2289                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2290         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2291                                    { HOR_PRED,             TM_VP8_PRED } },
2292     };
2293     static const struct {
2294         uint8_t needs_left:1;
2295         uint8_t needs_top:1;
2296         uint8_t needs_topleft:1;
2297         uint8_t needs_topright:1;
2298     } edges[N_INTRA_PRED_MODES] = {
2299         [VERT_PRED]            = { .needs_top  = 1 },
2300         [HOR_PRED]             = { .needs_left = 1 },
2301         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2302         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2303         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2304         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2305         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2306         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2307         [HOR_UP_PRED]          = { .needs_left = 1 },
2308         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2309         [LEFT_DC_PRED]         = { .needs_left = 1 },
2310         [TOP_DC_PRED]          = { .needs_top  = 1 },
2311         [DC_128_PRED]          = { 0 },
2312         [DC_127_PRED]          = { 0 },
2313         [DC_129_PRED]          = { 0 }
2314     };
2315
2316     av_assert2(mode >= 0 && mode < 10);
2317     mode = mode_conv[mode][have_left][have_top];
2318     if (edges[mode].needs_top) {
2319         uint8_t *top, *topleft;
2320         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2321         int n_px_need_tr = 0;
2322
2323         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2324             n_px_need_tr = 4;
2325
2326         // if top of sb64-row, use s->intra_pred_data[] instead of
2327         // dst[-stride] for intra prediction (it contains pre- instead of
2328         // post-loopfilter data)
2329         if (have_top) {
2330             top = !(row & 7) && !y ?
2331                 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2332                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2333             if (have_left)
2334                 topleft = !(row & 7) && !y ?
2335                     s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2336                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2337                     &dst_inner[-stride_inner];
2338         }
2339
2340         if (have_top &&
2341             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2342             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2343             n_px_need + n_px_need_tr <= n_px_have) {
2344             *a = top;
2345         } else {
2346             if (have_top) {
2347                 if (n_px_need <= n_px_have) {
2348                     memcpy(*a, top, n_px_need);
2349                 } else {
2350                     memcpy(*a, top, n_px_have);
2351                     memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2352                            n_px_need - n_px_have);
2353                 }
2354             } else {
2355                 memset(*a, 127, n_px_need);
2356             }
2357             if (edges[mode].needs_topleft) {
2358                 if (have_left && have_top) {
2359                     (*a)[-1] = topleft[-1];
2360                 } else {
2361                     (*a)[-1] = have_top ? 129 : 127;
2362                 }
2363             }
2364             if (tx == TX_4X4 && edges[mode].needs_topright) {
2365                 if (have_top && have_right &&
2366                     n_px_need + n_px_need_tr <= n_px_have) {
2367                     memcpy(&(*a)[4], &top[4], 4);
2368                 } else {
2369                     memset(&(*a)[4], (*a)[3], 4);
2370                 }
2371             }
2372         }
2373     }
2374     if (edges[mode].needs_left) {
2375         if (have_left) {
2376             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2377             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2378             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2379
2380             if (n_px_need <= n_px_have) {
2381                 for (i = 0; i < n_px_need; i++)
2382                     l[n_px_need - 1 - i] = dst[i * stride - 1];
2383             } else {
2384                 for (i = 0; i < n_px_have; i++)
2385                     l[n_px_need - 1 - i] = dst[i * stride - 1];
2386                 memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2387             }
2388         } else {
2389             memset(l, 129, 4 << tx);
2390         }
2391     }
2392
2393     return mode;
2394 }
2395
2396 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2397 {
2398     VP9Context *s = ctx->priv_data;
2399     VP9Block *b = s->b;
2400     int row = s->row, col = s->col;
2401     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2402     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2403     int end_x = FFMIN(2 * (s->cols - col), w4);
2404     int end_y = FFMIN(2 * (s->rows - row), h4);
2405     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2406     int uvstep1d = 1 << b->uvtx, p;
2407     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2408     LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
2409     LOCAL_ALIGNED_16(uint8_t, l, [32]);
2410
2411     for (n = 0, y = 0; y < end_y; y += step1d) {
2412         uint8_t *ptr = dst, *ptr_r = dst_r;
2413         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2414                                ptr_r += 4 * step1d, n += step) {
2415             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2416                                y * 2 + x : 0];
2417             uint8_t *a = &a_buf[16];
2418             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2419             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2420
2421             mode = check_intra_mode(s, mode, &a, ptr_r,
2422                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2423                                     ptr, s->y_stride, l,
2424                                     col, x, w4, row, y, b->tx, 0);
2425             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2426             if (eob)
2427                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2428                                            s->block + 16 * n, eob);
2429         }
2430         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2431         dst   += 4 * step1d * s->y_stride;
2432     }
2433
2434     // U/V
2435     h4 >>= 1;
2436     w4 >>= 1;
2437     end_x >>= 1;
2438     end_y >>= 1;
2439     step = 1 << (b->uvtx * 2);
2440     for (p = 0; p < 2; p++) {
2441         dst   = s->dst[1 + p];
2442         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2443         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2444             uint8_t *ptr = dst, *ptr_r = dst_r;
2445             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2446                                    ptr_r += 4 * uvstep1d, n += step) {
2447                 int mode = b->uvmode;
2448                 uint8_t *a = &a_buf[16];
2449                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2450
2451                 mode = check_intra_mode(s, mode, &a, ptr_r,
2452                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2453                                         ptr, s->uv_stride, l,
2454                                         col, x, w4, row, y, b->uvtx, p + 1);
2455                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2456                 if (eob)
2457                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2458                                                     s->uvblock[p] + 16 * n, eob);
2459             }
2460             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2461             dst   += 4 * uvstep1d * s->uv_stride;
2462         }
2463     }
2464 }
2465
2466 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2467                                          uint8_t *dst, ptrdiff_t dst_stride,
2468                                          const uint8_t *ref, ptrdiff_t ref_stride,
2469                                          ThreadFrame *ref_frame,
2470                                          ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2471                                          int bw, int bh, int w, int h)
2472 {
2473     int mx = mv->x, my = mv->y, th;
2474
2475     y += my >> 3;
2476     x += mx >> 3;
2477     ref += y * ref_stride + x;
2478     mx &= 7;
2479     my &= 7;
2480     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2481     // we use +7 because the last 7 pixels of each sbrow can be changed in
2482     // the longest loopfilter of the next sbrow
2483     th = (y + bh + 4 * !!my + 7) >> 6;
2484     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2485     if (x < !!mx * 3 || y < !!my * 3 ||
2486         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2487         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2488                                  ref - !!my * 3 * ref_stride - !!mx * 3,
2489                                  80, ref_stride,
2490                                  bw + !!mx * 7, bh + !!my * 7,
2491                                  x - !!mx * 3, y - !!my * 3, w, h);
2492         ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2493         ref_stride = 80;
2494     }
2495     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2496 }
2497
2498 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2499                                            uint8_t *dst_u, uint8_t *dst_v,
2500                                            ptrdiff_t dst_stride,
2501                                            const uint8_t *ref_u, ptrdiff_t src_stride_u,
2502                                            const uint8_t *ref_v, ptrdiff_t src_stride_v,
2503                                            ThreadFrame *ref_frame,
2504                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2505                                            int bw, int bh, int w, int h)
2506 {
2507     int mx = mv->x, my = mv->y, th;
2508
2509     y += my >> 4;
2510     x += mx >> 4;
2511     ref_u += y * src_stride_u + x;
2512     ref_v += y * src_stride_v + x;
2513     mx &= 15;
2514     my &= 15;
2515     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2516     // we use +7 because the last 7 pixels of each sbrow can be changed in
2517     // the longest loopfilter of the next sbrow
2518     th = (y + bh + 4 * !!my + 7) >> 5;
2519     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2520     if (x < !!mx * 3 || y < !!my * 3 ||
2521         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2522         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2523                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2524                                  80, src_stride_u,
2525                                  bw + !!mx * 7, bh + !!my * 7,
2526                                  x - !!mx * 3, y - !!my * 3, w, h);
2527         ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2528         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2529
2530         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2531                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2532                                  80, src_stride_v,
2533                                  bw + !!mx * 7, bh + !!my * 7,
2534                                  x - !!mx * 3, y - !!my * 3, w, h);
2535         ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2536         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2537     } else {
2538         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2539         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2540     }
2541 }
2542
2543 static void inter_recon(AVCodecContext *ctx)
2544 {
2545     static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2546         { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2547         { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2548     };
2549     VP9Context *s = ctx->priv_data;
2550     VP9Block *b = s->b;
2551     int row = s->row, col = s->col;
2552     ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2553     AVFrame *ref1 = tref1->f, *ref2;
2554     int w1 = ref1->width, h1 = ref1->height, w2, h2;
2555     ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2556
2557     if (b->comp) {
2558         tref2 = &s->refs[s->refidx[b->ref[1]]];
2559         ref2 = tref2->f;
2560         w2 = ref2->width;
2561         h2 = ref2->height;
2562     }
2563
2564     // y inter pred
2565     if (b->bs > BS_8x8) {
2566         if (b->bs == BS_8x4) {
2567             mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2568                         ref1->data[0], ref1->linesize[0], tref1,
2569                         row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2570             mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2571                         s->dst[0] + 4 * ls_y, ls_y,
2572                         ref1->data[0], ref1->linesize[0], tref1,
2573                         (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2574
2575             if (b->comp) {
2576                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2577                             ref2->data[0], ref2->linesize[0], tref2,
2578                             row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2579                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2580                             s->dst[0] + 4 * ls_y, ls_y,
2581                             ref2->data[0], ref2->linesize[0], tref2,
2582                             (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2583             }
2584         } else if (b->bs == BS_4x8) {
2585             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2586                         ref1->data[0], ref1->linesize[0], tref1,
2587                         row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2588             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2589                         ref1->data[0], ref1->linesize[0], tref1,
2590                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2591
2592             if (b->comp) {
2593                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2594                             ref2->data[0], ref2->linesize[0], tref2,
2595                             row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2596                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2597                             ref2->data[0], ref2->linesize[0], tref2,
2598                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2599             }
2600         } else {
2601             av_assert2(b->bs == BS_4x4);
2602
2603             // FIXME if two horizontally adjacent blocks have the same MV,
2604             // do a w8 instead of a w4 call
2605             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2606                         ref1->data[0], ref1->linesize[0], tref1,
2607                         row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2608             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2609                         ref1->data[0], ref1->linesize[0], tref1,
2610                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2611             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2612                         s->dst[0] + 4 * ls_y, ls_y,
2613                         ref1->data[0], ref1->linesize[0], tref1,
2614                         (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2615             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2616                         s->dst[0] + 4 * ls_y + 4, ls_y,
2617                         ref1->data[0], ref1->linesize[0], tref1,
2618                         (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2619
2620             if (b->comp) {
2621                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2622                             ref2->data[0], ref2->linesize[0], tref2,
2623                             row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2624                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2625                             ref2->data[0], ref2->linesize[0], tref2,
2626                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2627                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2628                             s->dst[0] + 4 * ls_y, ls_y,
2629                             ref2->data[0], ref2->linesize[0], tref2,
2630                             (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2631                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2632                             s->dst[0] + 4 * ls_y + 4, ls_y,
2633                             ref2->data[0], ref2->linesize[0], tref2,
2634                             (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2635             }
2636         }
2637     } else {
2638         int bwl = bwlog_tab[0][b->bs];
2639         int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2640
2641         mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2642                     ref1->data[0], ref1->linesize[0], tref1,
2643                     row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2644
2645         if (b->comp)
2646             mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2647                         ref2->data[0], ref2->linesize[0], tref2,
2648                         row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2649     }
2650
2651     // uv inter pred
2652     {
2653         int bwl = bwlog_tab[1][b->bs];
2654         int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2655         VP56mv mvuv;
2656
2657         w1 = (w1 + 1) >> 1;
2658         h1 = (h1 + 1) >> 1;
2659         if (b->comp) {
2660             w2 = (w2 + 1) >> 1;
2661             h2 = (h2 + 1) >> 1;
2662         }
2663         if (b->bs > BS_8x8) {
2664             mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2665             mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2666         } else {
2667             mvuv = b->mv[0][0];
2668         }
2669
2670         mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2671                       s->dst[1], s->dst[2], ls_uv,
2672                       ref1->data[1], ref1->linesize[1],
2673                       ref1->data[2], ref1->linesize[2], tref1,
2674                       row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2675
2676         if (b->comp) {
2677             if (b->bs > BS_8x8) {
2678                 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2679                 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2680             } else {
2681                 mvuv = b->mv[0][1];
2682             }
2683             mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2684                           s->dst[1], s->dst[2], ls_uv,
2685                           ref2->data[1], ref2->linesize[1],
2686                           ref2->data[2], ref2->linesize[2], tref2,
2687                           row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2688         }
2689     }
2690
2691     if (!b->skip) {
2692         /* mostly copied intra_reconn() */
2693
2694         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2695         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2696         int end_x = FFMIN(2 * (s->cols - col), w4);
2697         int end_y = FFMIN(2 * (s->rows - row), h4);
2698         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2699         int uvstep1d = 1 << b->uvtx, p;
2700         uint8_t *dst = s->dst[0];
2701
2702         // y itxfm add
2703         for (n = 0, y = 0; y < end_y; y += step1d) {
2704             uint8_t *ptr = dst;
2705             for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2706                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2707
2708                 if (eob)
2709                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2710                                                   s->block + 16 * n, eob);
2711             }
2712             dst += 4 * s->y_stride * step1d;
2713         }
2714
2715         // uv itxfm add
2716         h4 >>= 1;
2717         w4 >>= 1;
2718         end_x >>= 1;
2719         end_y >>= 1;
2720         step = 1 << (b->uvtx * 2);
2721         for (p = 0; p < 2; p++) {
2722             dst = s->dst[p + 1];
2723             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2724                 uint8_t *ptr = dst;
2725                 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2726                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2727
2728                     if (eob)
2729                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2730                                                         s->uvblock[p] + 16 * n, eob);
2731                 }
2732                 dst += 4 * uvstep1d * s->uv_stride;
2733             }
2734         }
2735     }
2736 }
2737
2738 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2739                                         int row_and_7, int col_and_7,
2740                                         int w, int h, int col_end, int row_end,
2741                                         enum TxfmMode tx, int skip_inter)
2742 {
2743     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2744     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2745     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2746     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2747
2748     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2749     // edges. This means that for UV, we work on two subsampled blocks at
2750     // a time, and we only use the topleft block's mode information to set
2751     // things like block strength. Thus, for any block size smaller than
2752     // 16x16, ignore the odd portion of the block.
2753     if (tx == TX_4X4 && is_uv) {
2754         if (h == 1) {
2755             if (row_and_7 & 1)
2756                 return;
2757             if (!row_end)
2758                 h += 1;
2759         }
2760         if (w == 1) {
2761             if (col_and_7 & 1)
2762                 return;
2763             if (!col_end)
2764                 w += 1;
2765         }
2766     }
2767
2768     if (tx == TX_4X4 && !skip_inter) {
2769         int t = 1 << col_and_7, m_col = (t << w) - t, y;
2770         int m_col_odd = (t << (w - 1)) - t;
2771
2772         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2773         if (is_uv) {
2774             int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2775
2776             for (y = row_and_7; y < h + row_and_7; y++) {
2777                 int col_mask_id = 2 - !(y & 7);
2778
2779                 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2780                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2781                 // for odd lines, if the odd col is not being filtered,
2782                 // skip odd row also:
2783                 // .---. <-- a
2784                 // |   |
2785                 // |___| <-- b
2786                 // ^   ^
2787                 // c   d
2788                 //
2789                 // if a/c are even row/col and b/d are odd, and d is skipped,
2790                 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2791                 if ((col_end & 1) && (y & 1)) {
2792                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2793                 } else {
2794                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2795                 }
2796             }
2797         } else {
2798             int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2799
2800             for (y = row_and_7; y < h + row_and_7; y++) {
2801                 int col_mask_id = 2 - !(y & 3);
2802
2803                 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2804                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2805                 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2806                 lflvl->mask[is_uv][0][y][3] |= m_col;
2807                 lflvl->mask[is_uv][1][y][3] |= m_col;
2808             }
2809         }
2810     } else {
2811         int y, t = 1 << col_and_7, m_col = (t << w) - t;
2812
2813         if (!skip_inter) {
2814             int mask_id = (tx == TX_8X8);
2815             int l2 = tx + is_uv - 1, step1d = 1 << l2;
2816             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2817             int m_row = m_col & masks[l2];
2818
2819             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2820             // 8wd loopfilter to prevent going off the visible edge.
2821             if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2822                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2823                 int m_row_8 = m_row - m_row_16;
2824
2825                 for (y = row_and_7; y < h + row_and_7; y++) {
2826                     lflvl->mask[is_uv][0][y][0] |= m_row_16;
2827                     lflvl->mask[is_uv][0][y][1] |= m_row_8;
2828                 }
2829             } else {
2830                 for (y = row_and_7; y < h + row_and_7; y++)
2831                     lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2832             }
2833
2834             if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2835                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2836                     lflvl->mask[is_uv][1][y][0] |= m_col;
2837                 if (y - row_and_7 == h - 1)
2838                     lflvl->mask[is_uv][1][y][1] |= m_col;
2839             } else {
2840                 for (y = row_and_7; y < h + row_and_7; y += step1d)
2841                     lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2842             }
2843         } else if (tx != TX_4X4) {
2844             int mask_id;
2845
2846             mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2847             lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2848             mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2849             for (y = row_and_7; y < h + row_and_7; y++)
2850                 lflvl->mask[is_uv][0][y][mask_id] |= t;
2851         } else if (is_uv) {
2852             int t8 = t & 0x01, t4 = t - t8;
2853
2854             for (y = row_and_7; y < h + row_and_7; y++) {
2855                 lflvl->mask[is_uv][0][y][2] |= t4;
2856                 lflvl->mask[is_uv][0][y][1] |= t8;
2857             }
2858             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2859         } else {
2860             int t8 = t & 0x11, t4 = t - t8;
2861
2862             for (y = row_and_7; y < h + row_and_7; y++) {
2863                 lflvl->mask[is_uv][0][y][2] |= t4;
2864                 lflvl->mask[is_uv][0][y][1] |= t8;
2865             }
2866             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2867         }
2868     }
2869 }
2870
/**
 * Decode one block: parse its mode info and coefficients, reconstruct it
 * into the current frame and record its loopfilter level and edge masks.
 *
 * Which of these steps run depends on s->pass:
 *  - s->pass < 2:  mode info is parsed with decode_mode() and, unless the
 *                  block is flagged as skipped, coefficients are parsed
 *                  with decode_coeffs();
 *  - s->pass == 1: the function returns right after parsing, having only
 *                  advanced the per-block storage pointers in s;
 *  - otherwise:    the block is reconstructed and the loopfilter data is
 *                  collected; for s->pass == 2 the per-block storage
 *                  pointers are advanced at the very end.
 *
 * @param ctx   codec context; ctx->priv_data is the VP9Context
 * @param row   block row; it is multiplied by 8 when compared against luma
 *              quantities below, so it is apparently counted in units of
 *              8 luma pixels
 * @param col   block column, same units as row
 * @param lflvl loopfilter levels and edge masks that this block updates
 * @param yoff  offset of the block from f->data[0] (luma plane)
 * @param uvoff offset of the block from f->data[1] and f->data[2]
 * @param bl    block level of this block
 * @param bp    partition type of this block at that level
 */
static void decode_b(AVCodecContext *ctx, int row, int col,
                     struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
                     enum BlockLevel bl, enum BlockPartition bp)
{
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    // combine block level and partition type into a BlockSize index
    enum BlockSize bs = bl * 3 + bp;
    // block width/height from bwh_tab[1]; used in the same units as row/col
    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
    // emu[0]/emu[1]: reconstruct luma/chroma into a scratch buffer first
    int emu[2];
    AVFrame *f = s->frames[CUR_FRAME].tf.f;

    // publish the block position and the limits derived from it in the
    // context; min_mv/max_mv are the distances to the frame edges plus a
    // fixed margin of 128 (presumably motion vector clamping limits; the
    // consumer is not visible here)
    s->row = row;
    s->row7 = row & 7;
    s->col = col;
    s->col7 = col & 7;
    s->min_mv.x = -(128 + col * 64);
    s->min_mv.y = -(128 + row * 64);
    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
    if (s->pass < 2) {
        b->bs = bs;
        b->bl = bl;
        b->bp = bp;
        decode_mode(ctx);
        // the chroma transform is one size smaller than the luma transform
        // whenever the luma transform spans the full block width or height
        b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));

        if (!b->skip) {
            decode_coeffs(ctx);
        } else {
            int row7 = s->row7;

// Zero n consecutive entries starting at v with one store of matching width
// (the AV_ZERO* helpers are assumed to clear that many bits).
#define SPLAT_ZERO_CTX(v, n) \
    switch (n) { \
    case 1:  v = 0;          break; \
    case 2:  AV_ZERO16(&v);  break; \
    case 4:  AV_ZERO32(&v);  break; \
    case 8:  AV_ZERO64(&v);  break; \
    case 16: AV_ZERO128(&v); break; \
    }
// Zero the context of all three planes: the luma array is indexed and sized
// at twice the chroma resolution.
#define SPLAT_ZERO_YUV(dir, var, off, n) \
    do { \
        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
        SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
        SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
    } while (0)

            // a skipped block has no coefficients: clear the above/left
            // nnz contexts over the area the block covers
            switch (w4) {
            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
            }
            switch (h4) {
            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
            }
        }
        if (s->pass == 1) {
            // parse-only pass: step to the next block's storage and stop
            // before any reconstruction
            s->b++;
            s->block += w4 * h4 * 64;
            s->uvblock[0] += w4 * h4 * 16;
            s->uvblock[1] += w4 * h4 * 16;
            s->eob += 4 * w4 * h4;
            s->uveob[0] += w4 * h4;
            s->uveob[1] += w4 * h4;

            return;
        }
    }

    // Emulate overhangs: if the block extends past what one line of the
    // target buffer can hold, or past the last block row, reconstruct into
    // a scratch buffer instead and copy the visible part back afterwards.
    // This keeps large block overhangs from writing outside the frame.
    emu[0] = (col + w4) * 8 > f->linesize[0] ||
             (row + h4) > s->rows;
    emu[1] = (col + w4) * 4 > f->linesize[1] ||
             (row + h4) > s->rows;
    if (emu[0]) {
        s->dst[0] = s->tmp_y;
        s->y_stride = 64;
    } else {
        s->dst[0] = f->data[0] + yoff;
        s->y_stride = f->linesize[0];
    }
    if (emu[1]) {
        s->dst[1] = s->tmp_uv[0];
        s->dst[2] = s->tmp_uv[1];
        s->uv_stride = 32;
    } else {
        s->dst[1] = f->data[1] + uvoff;
        s->dst[2] = f->data[2] + uvoff;
        s->uv_stride = f->linesize[1];
    }
    if (b->intra) {
        intra_recon(ctx, yoff, uvoff);
    } else {
        inter_recon(ctx);
    }
    if (emu[0]) {
        // visible luma area of the block, in pixels
        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;

        // copy the visible area back in column strips, one strip per set
        // bit of w, from the widest (64 >> 0) to the narrowest; mc[n] is
        // called with zero mx/my arguments, i.e. presumably as a plain copy
        // of a (64 >> n) pixel wide strip (confirm against the dsp init)
        for (n = 0; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
                                         s->tmp_y + o, 64, h, 0, 0);
                o += bw;
            }
        }
    }
    if (emu[1]) {
        // visible chroma area of the block, in pixels
        int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;

        // same strip-wise copy for both chroma planes; starts at n = 1
        // because the chroma scratch buffers are only 32 pixels wide
        for (n = 1; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
                                         s->tmp_uv[0] + o, 32, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
                                         s->tmp_uv[1] + o, 32, h, 0, 0);
                o += bw;
            }
        }
    }

    // pick filter level and find edges to apply filter to; the level is
    // looked up per segment, per reference (index 0 for intra blocks) and
    // per "mode is not ZEROMV" flag, and only a level > 0 enables filtering
    if (s->filter.level &&
        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                    [b->mode[3] != ZEROMV]) > 0) {
        // block extent clipped to the frame, in the same units as row/col
        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;

        // store the level for every position the block covers, then mark
        // the edges to filter for luma (0) and chroma (1); the two extra
        // chroma arguments are non-zero only for a block touching the
        // right/bottom frame edge when s->cols/s->rows is odd
        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
        mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
        mask_edges(lflvl, 1, row7, col7, x_end, y_end,
                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                   b->uvtx, skip_inter);

        // lazily fill the limit tables for this level; a zero lim_lut entry
        // means "not computed yet" since a computed limit is always >= 1
        if (!s->filter.lim_lut[lvl]) {
            int sharp = s->filter.sharpness;
            int limit = lvl;

            if (sharp > 0) {
                limit >>= (sharp + 3) >> 2;
                limit = FFMIN(limit, 9 - sharp);
            }
            limit = FFMAX(limit, 1);

            s->filter.lim_lut[lvl] = limit;
            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
        }
    }

    if (s->pass == 2) {
        // reconstruction-only pass: step to the next block's stored data
        s->b++;
        s->block += w4 * h4 * 64;
        s->uvblock[0] += w4 * h4 * 16;
        s->uvblock[1] += w4 * h4 * 16;
        s->eob += 4 * w4 * h4;
        s->uveob[0] += w4 * h4;
        s->uveob[1] += w4 * h4;
    }
}
3041
3042 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3043                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3044 {
3045     VP9Context *s = ctx->priv_data;
3046     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3047             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3048     const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3049                                      s->prob.p.partition[bl][c];
3050     enum BlockPartition bp;
3051     ptrdiff_t hbs = 4 >> bl;
3052     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3053     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3054
3055     if (bl == BL_8X8) {
3056         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3057         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3058     } else if (col + hbs < s->cols) { // FIXME why not <=?
3059         if (row + hbs < s->rows) { // FIXME why not <=?
3060             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3061             switch (bp) {
3062             case PARTITION_NONE:
3063                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3064                 break;
3065             case PARTITION_H:
3066                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3067                 yoff  += hbs * 8 * y_stride;
3068                 uvoff += hbs * 4 * uv_stride;
3069                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3070                 break;
3071             case PARTITION_V:
3072                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3073                 yoff  += hbs * 8;
3074                 uvoff += hbs * 4;
3075                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3076                 break;
3077             case PARTITION_SPLIT:
3078                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3079                 decode_sb(ctx, row, col + hbs, lflvl,
3080                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3081                 yoff  += hbs * 8 * y_stride;
3082                 uvoff += hbs * 4 * uv_stride;
3083                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3084                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3085                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3086                 break;
3087             default:
3088                 av_assert0(0);
3089             }
3090         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3091             bp = PARTITION_SPLIT;
3092             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3093             decode_sb(ctx, row, col + hbs, lflvl,
3094                       yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3095         } else {
3096             bp = PARTITION_H;
3097             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3098         }
3099     } else if (row + hbs < s->rows) { // FIXME why not <=?
3100         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3101             bp = PARTITION_SPLIT;
3102             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3103             yoff  += hbs * 8 * y_stride;
3104             uvoff += hbs * 4 * uv_stride;
3105             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3106         } else {
3107             bp = PARTITION_V;
3108             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3109         }
3110     } else {
3111         bp = PARTITION_SPLIT;
3112         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3113     }
3114     s->counts.partition[bl][c][bp]++;
3115 }
3116
3117 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3118                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3119 {
3120     VP9Context *s = ctx->priv_data;
3121     VP9Block *b = s->b;
3122     ptrdiff_t hbs = 4 >> bl;
3123     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3124     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3125
3126     if (bl == BL_8X8) {
3127         av_assert2(b->bl == BL_8X8);
3128         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3129     } else if (s->b->bl == bl) {
3130         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3131         if (b->bp == PARTITION_H && row + hbs < s->rows) {
3132             yoff  += hbs * 8 * y_stride;
3133             uvoff += hbs * 4 * uv_stride;
3134             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3135         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3136             yoff  += hbs * 8;
3137             uvoff += hbs * 4;
3138             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3139         }
3140     } else {
3141         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3142         if (col + hbs < s->cols) { // FIXME why not <=?
3143             if (row + hbs < s->rows) {
3144                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3145                               uvoff + 4 * hbs, bl + 1);
3146                 yoff  += hbs * 8 * y_stride;
3147                 uvoff += hbs * 4 * uv_stride;
3148                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3149                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3150                                     yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3151             } else {
3152                 yoff  += hbs * 8;
3153                 uvoff += hbs * 4;
3154                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3155             }
3156         } else if (row + hbs < s->rows) {
3157             yoff  += hbs * 8 * y_stride;
3158             uvoff += hbs * 4 * uv_stride;
3159             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3160         }
3161     }
3162 }
3163
/**
 * Run the loopfilter over one 64x64 luma area (and the matching 32x32
 * chroma areas) of the current frame, using the filter levels and edge
 * masks collected in lflvl.
 *
 * Layout of the data as it is used here:
 *  - lflvl->level holds one filter level per 8x8 luma unit, 8 per row;
 *  - lflvl->mask[plane][dir][y][id] is a bit mask with one bit per column
 *    of row y; plane is 0 for luma and 1 for chroma, dir is 0 for edges
 *    between columns and 1 for edges between rows;
 *  - id 0 selects the widest filter (loop_filter_16, or loop_filter_8[2]
 *    when the neighbouring unit does not also have bit 0 set), id 1 and 2
 *    select loop_filter_8[1] and loop_filter_8[0], and id 3 (luma only)
 *    marks an extra edge 4 pixels into the unit.
 *
 * Each filter call gets three parameters derived from the unit's level L:
 * E = mblim_lut[L], I = lim_lut[L] and H = L >> 4. When two neighbouring
 * units are filtered by one loop_filter_mix2 call, the second unit's
 * parameters are packed into bits 8 and up of E, I and H.
 *
 * @param ctx   codec context; ctx->priv_data is the VP9Context
 * @param lflvl filter levels and edge masks for this area
 * @param row   if zero, edges on the first row (y == 0) are not filtered
 * @param col   if zero, edges on the first column (x == 1) are not filtered
 * @param yoff  offset of the area from f->data[0]
 * @param uvoff offset of the area from f->data[1] and f->data[2]
 */
static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
{
    VP9Context *s = ctx->priv_data;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
    int y, x, p;

    // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
    // if you think of them as acting on a 8x8 block max, we can interleave
    // each v/h within the single x loop, but that only works if we work on
    // 8 pixel blocks, and we won't always do that (we want at least 16px
    // to use SSE2 optimizations, perhaps 32 for AVX2)

    // filter edges between columns, Y plane (e.g. block1 | block2)
    // Two 8-pixel rows are handled per iteration (hmask1 = row y, hmask2 =
    // row y + 1) so that one call can cover 16 pixels of the same edge.
    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
        // hm1/hm2: any edge on the unit boundary in the first/second row;
        // hm13/hm23: inner edges (4 pixels in) of the first/second row
        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
        unsigned hm = hm1 | hm2 | hm13 | hm23;

        // walk the column bits; stop once no bit at or above x is set
        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
            if (hm1 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                // skip the first column's boundary edge when col == 0
                if (col || x > 1) {
                    if (hmask1[0] & x) {
                        if (hmask2[0] & x) {
                            // both rows use the widest filter: one call
                            // over 16 pixels, which needs equal levels
                            av_assert2(l[8] == L);
                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
                        }
                    } else if (hm2 & x) {
                        // both rows have an edge: filter them together,
                        // second row's parameters go into bits 8 and up
                        L = l[8];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                               [!!(hmask2[1] & x)]
                                               [0](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                            [0](ptr, ls_y, E, I, H);
                    }
                }
            } else if (hm2 & x) {
                // only the second row has an edge in this column
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (col || x > 1) {
                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
                }
            }
            // inner edges, 4 pixels to the right of the unit boundary;
            // these are filtered in the first column as well
            if (hm13 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (hm23 & x) {
                    L = l[8];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
                }
            } else if (hm23 & x) {
                int L = l[8], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
            }
        }
    }

    //                                          block1
    // filter edges between rows, Y plane (e.g. ------)
    //                                          block2
    // One 8-pixel row per iteration; two neighbouring columns (bits x and
    // x << 1) are handled per step so that one call can cover 16 pixels.
    dst = f->data[0] + yoff;
    lvl = lflvl->level;
    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
        // vm: edges on the unit boundary; vm3: inner edges (4 pixels down)
        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];

        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
            // skip the first row's boundary edge when row == 0
            if (row || y) {
                if (vm & x) {
                    int L = *l, H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    if (vmask[0] & x) {
                        if (vmask[0] & (x << 1)) {
                            // both columns use the widest filter
                            av_assert2(l[1] == L);
                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
                        }
                    } else if (vm & (x << 1)) {
                        // both columns have an edge: filter them together
                        L = l[1];
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                               [!!(vmask[1] & (x << 1))]
                                               [1](ptr, ls_y, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                            [1](ptr, ls_y, E, I, H);
                    }
                } else if (vm & (x << 1)) {
                    // only the second column has an edge
                    int L = l[1], H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
                                        [1](ptr + 8, ls_y, E, I, H);
                }
            }
            // inner edges, 4 pixels below the unit boundary; these are
            // filtered in the first row as well
            if (vm3 & x) {
                int L = *l, H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                if (vm3 & (x << 1)) {
                    L = l[1];
                    H |= (L >> 4) << 8;
                    E |= s->filter.mblim_lut[L] << 8;
                    I |= s->filter.lim_lut[L] << 8;
                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
                } else {
                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
                }
            } else if (vm3 & (x << 1)) {
                int L = l[1], H = L >> 4;
                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
            }
        }
    }

    // same principle but for U/V planes
    // The chroma masks are stepped in 4-pixel units, while the level array
    // is still the per-luma-unit one, hence the different strides below.
    for (p = 0; p < 2; p++) {
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        // edges between columns: mask rows y and y + 2 are paired and the
        // level of the second one is found 16 entries further on
        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;

            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
                // skip the first column's boundary edge when col == 0
                if (col || x > 1) {
                    if (hm1 & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (hmask1[0] & x) {
                            if (hmask2[0] & x) {
                                av_assert2(l[16] == L);
                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
                            }
                        } else if (hm2 & x) {
                            L = l[16];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                                   [!!(hmask2[1] & x)]
                                                   [0](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                                [0](ptr, ls_uv, E, I, H);
                        }
                    } else if (hm2 & x) {
                        int L = l[16], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
                    }
                }
                // advance the level pointer by two entries after every odd
                // column bit (bits 1, 3, 5, 7), so that two consecutive
                // mask columns read the same level entry
                if (x & 0xAA)
                    l += 2;
            }
        }
        lvl = lflvl->level;
        dst = f->data[1 + p] + uvoff;
        // edges between rows: one mask row per iteration, columns paired
        // as bits x and x << 2 with levels l[0] and l[2]
        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
            unsigned vm = vmask[0] | vmask[1] | vmask[2];

            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
                // skip the first row's boundary edge when row == 0
                if (row || y) {
                    if (vm & x) {
                        int L = *l, H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        if (vmask[0] & x) {
                            if (vmask[0] & (x << 2)) {
                                av_assert2(l[2] == L);
                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
                            } else {
                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
                            }
                        } else if (vm & (x << 2)) {
                            L = l[2];
                            H |= (L >> 4) << 8;
                            E |= s->filter.mblim_lut[L] << 8;
                            I |= s->filter.lim_lut[L] << 8;
                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                                   [!!(vmask[1] & (x << 2))]
                                                   [1](ptr, ls_uv, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                                [1](ptr, ls_uv, E, I, H);
                        }
                    } else if (vm & (x << 2)) {
                        int L = l[2], H = L >> 4;
                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];

                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
                                            [1](ptr + 8, ls_uv, E, I, H);
                    }
                }
            }
            // the level pointer moves on by 16 entries only after every
            // second mask row
            if (y & 1)
                lvl += 16;
        }
    }
}
3400
/**
 * Compute the boundaries of tile number idx when n units are divided into
 * 1 << log2_n tiles.
 *
 * Tile idx covers the units [(idx * n) >> log2_n, ((idx + 1) * n) >> log2_n),
 * each bound clamped to n. Both results are then multiplied by 8
 * (presumably converting superblocks into the 8-pixel block units used by
 * the decoding functions; confirm against the caller).
 *
 * @param start  receives the scaled first position of the tile
 * @param end    receives the scaled position one past the tile's end
 * @param idx    tile index
 * @param log2_n base-2 logarithm of the number of tiles
 * @param n      total number of units being divided
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    const int first_unit = (idx * n) >> log2_n;
    const int next_unit  = ((idx + 1) * n) >> log2_n;
    const int clamped_first = FFMIN(first_unit, n);
    const int clamped_next  = FFMIN(next_unit,  n);

    *start = clamped_first << 3;
    *end   = clamped_next  << 3;
}
3408
3409 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3410                                         int max_count, int update_factor)
3411 {
3412     unsigned ct = ct0 + ct1, p2, p1;
3413
3414     if (!ct)
3415         return;
3416
3417     p1 = *p;
3418     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3419     p2 = av_clip(p2, 1, 255);
3420     ct = FFMIN(ct, max_count);
3421     update_factor = FASTDIV(update_factor * ct, max_count);
3422
3423     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3424     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3425 }
3426
3427 static void adapt_probs(VP9Context *s)
3428 {
3429     int i, j, k, l, m;
3430     prob_context *p = &s->prob_ctx[s->framectxid].p;
3431     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3432
3433     // coefficients
3434     for (i = 0; i < 4; i++)
3435         for (j = 0; j < 2; j++)
3436             for (k = 0; k < 2; k++)
3437                 for (l = 0; l < 6; l++)
3438                     for (m = 0; m < 6; m++) {
3439                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3440                         unsigned *e = s->counts.eob[i][j][k][l][m];
3441                         unsigned *c = s->counts.coef[i][j][k][l][m];
3442
3443                         if (l == 0 && m >= 3) // dc only has 3 pt
3444                             break;
3445
3446                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3447                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3448                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3449                     }
3450
3451     if (s->keyframe || s->intraonly) {
3452         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3453         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3454         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3455         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3456         return;
3457     }
3458
3459     // skip flag
3460     for (i = 0; i < 3; i++)
3461         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3462
3463     // intra/inter flag
3464     for (i = 0; i < 4; i++)
3465         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3466
3467     // comppred flag
3468     if (s->comppredmode == PRED_SWITCHABLE) {
3469       for (i = 0; i < 5; i++)
3470           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3471     }
3472
3473     // reference frames
3474     if (s->comppredmode != PRED_SINGLEREF) {
3475       for (i = 0; i < 5; i++)
3476           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3477                      s->counts.comp_ref[i][1], 20, 128);
3478     }
3479
3480     if (s->comppredmode != PRED_COMPREF) {
3481       for (i = 0; i < 5; i++) {
3482           uint8_t *pp = p->single_ref[i];
3483           unsigned (*c)[2] = s->counts.single_ref[i];
3484
3485           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3486           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3487       }
3488     }
3489
3490     // block partitioning
3491     for (i = 0; i < 4; i++)
3492         for (j = 0; j < 4; j++) {
3493             uint8_t *pp = p->partition[i][j];
3494             unsigned *c = s->counts.partition[i][j];
3495
3496             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3497             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3498             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3499         }
3500
3501     // tx size
3502     if (s->txfmmode == TX_SWITCHABLE) {
3503       for (i = 0; i < 2; i++) {
3504           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3505
3506           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3507           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3508           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3509           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3510           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3511           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3512       }
3513     }
3514
3515     // interpolation filter
3516     if (s->filtermode == FILTER_SWITCHABLE) {
3517         for (i = 0; i < 4; i++) {
3518             uint8_t *pp = p->filter[i];
3519             unsigned *c = s->counts.filter[i];
3520
3521             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3522             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3523         }
3524     }
3525
3526     // inter modes
3527     for (i = 0; i < 7; i++) {
3528         uint8_t *pp = p->mv_mode[i];
3529         unsigned *c = s->counts.mv_mode[i];
3530
3531         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3532         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3533         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3534     }
3535
3536     // mv joints
3537     {
3538         uint8_t *pp = p->mv_joint;
3539         unsigned *c = s->counts.mv_joint;
3540
3541         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3542         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3543         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3544     }
3545
3546     // mv components
3547     for (i = 0; i < 2; i++) {
3548         uint8_t *pp;
3549         unsigned *c, (*c2)[2], sum;
3550
3551         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3552                    s->counts.mv_comp[i].sign[1], 20, 128);
3553
3554         pp = p->mv_comp[i].classes;
3555         c = s->counts.mv_comp[i].classes;
3556         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3557         adapt_prob(&pp[0], c[0], sum, 20, 128);
3558         sum -= c[1];
3559         adapt_prob(&pp[1], c[1], sum, 20, 128);
3560         sum -= c[2] + c[3];
3561         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3562         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3563         sum -= c[4] + c[5];
3564         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3565         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3566         sum -= c[6];
3567         adapt_prob(&pp[6], c[6], sum, 20, 128);
3568         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3569         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3570         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3571
3572         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3573                    s->counts.mv_comp[i].class0[1], 20, 128);
3574         pp = p->mv_comp[i].bits;
3575         c2 = s->counts.mv_comp[i].bits;
3576         for (j = 0; j < 10; j++)
3577             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3578
3579         for (j = 0; j < 2; j++) {
3580             pp = p->mv_comp[i].class0_fp[j];
3581             c = s->counts.mv_comp[i].class0_fp[j];
3582             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3583             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3584             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3585         }
3586         pp = p->mv_comp[i].fp;
3587         c = s->counts.mv_comp[i].fp;
3588         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3589         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3590         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3591
3592         if (s->highprecisionmvs) {
3593             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3594                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3595             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3596                        s->counts.mv_comp[i].hp[1], 20, 128);
3597         }
3598     }
3599
3600     // y intra modes
3601     for (i = 0; i < 4; i++) {
3602         uint8_t *pp = p->y_mode[i];
3603         unsigned *c = s->counts.y_mode[i], sum, s2;
3604
3605         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3606         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3607         sum -= c[TM_VP8_PRED];
3608         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3609         sum -= c[VERT_PRED];
3610         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3611         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3612         sum -= s2;
3613         adapt_prob(&pp[3], s2, sum, 20, 128);
3614         s2 -= c[HOR_PRED];
3615         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3616         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3617         sum -= c[DIAG_DOWN_LEFT_PRED];
3618         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3619         sum -= c[VERT_LEFT_PRED];
3620         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3621         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3622     }
3623
3624     // uv intra modes
3625     for (i = 0; i < 10; i++) {
3626         uint8_t *pp = p->uv_mode[i];
3627         unsigned *c = s->counts.uv_mode[i], sum, s2;
3628
3629         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3630         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3631         sum -= c[TM_VP8_PRED];
3632         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3633         sum -= c[VERT_PRED];
3634         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3635         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3636         sum -= s2;
3637         adapt_prob(&pp[3], s2, sum, 20, 128);
3638         s2 -= c[HOR_PRED];
3639         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3640         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3641         sum -= c[DIAG_DOWN_LEFT_PRED];
3642         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3643         sum -= c[VERT_LEFT_PRED];
3644         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3645         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3646     }
3647 }
3648
3649 static void free_buffers(VP9Context *s)
3650 {
3651     av_freep(&s->intra_pred_data[0]);
3652     av_freep(&s->b_base);
3653     av_freep(&s->block_base);
3654 }
3655
3656 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3657 {
3658     VP9Context *s = ctx->priv_data;
3659     int i;
3660
3661     for (i = 0; i < 2; i++) {
3662         if (s->frames[i].tf.f->data[0])
3663             vp9_unref_frame(ctx, &s->frames[i]);
3664         av_frame_free(&s->frames[i].tf.f);
3665     }
3666     for (i = 0; i < 8; i++) {
3667         if (s->refs[i].f->data[0])
3668             ff_thread_release_buffer(ctx, &s->refs[i]);
3669         av_frame_free(&s->refs[i].f);
3670         if (s->next_refs[i].f->data[0])
3671             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3672         av_frame_free(&s->next_refs[i].f);
3673     }
3674     free_buffers(s);
3675     av_freep(&s->c_b);
3676     s->c_b_size = 0;
3677
3678     return 0;
3679 }
3680
3681
3682 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3683                             int *got_frame, AVPacket *pkt)
3684 {
3685     const uint8_t *data = pkt->data;
3686     int size = pkt->size;
3687     VP9Context *s = ctx->priv_data;
3688     int res, tile_row, tile_col, i, ref, row, col;
3689     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3690     AVFrame *f;
3691
3692     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3693         return res;
3694     } else if (res == 0) {
3695         if (!s->refs[ref].f->data[0]) {
3696             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3697             return AVERROR_INVALIDDATA;
3698         }
3699         if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3700             return res;
3701         *got_frame = 1;
3702         return 0;
3703     }
3704     data += res;
3705     size -= res;
3706
3707     if (s->frames[LAST_FRAME].tf.f->data[0])
3708         vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3709     if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3710         (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3711         return res;
3712     if (s->frames[CUR_FRAME].tf.f->data[0])
3713         vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3714     if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3715         return res;
3716     f = s->frames[CUR_FRAME].tf.f;
3717     f->key_frame = s->keyframe;
3718     f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3719     ls_y = f->linesize[0];
3720     ls_uv =f->linesize[1];
3721
3722     // ref frame setup
3723     for (i = 0; i < 8; i++) {
3724         if (s->next_refs[i].f->data[0])
3725             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3726         if (s->refreshrefmask & (1 << i)) {
3727             res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3728         } else {
3729             res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3730         }
3731         if (res < 0)
3732             return res;
3733     }
3734
3735     // main tile decode loop
3736     memset(s->above_partition_ctx, 0, s->cols);
3737     memset(s->above_skip_ctx, 0, s->cols);
3738     if (s->keyframe || s->intraonly) {
3739         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3740     } else {
3741         memset(s->above_mode_ctx, NEARESTMV, s->cols);
3742     }
3743     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3744     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3745     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3746     memset(s->above_segpred_ctx, 0, s->cols);
3747     s->pass = s->uses_2pass =
3748         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3749     if ((res = update_block_buffers(ctx)) < 0) {
3750         av_log(ctx, AV_LOG_ERROR,
3751                "Failed to allocate block buffers\n");
3752         return res;
3753     }
3754     if (s->refreshctx && s->parallelmode) {
3755         int j, k, l, m;
3756
3757         for (i = 0; i < 4; i++) {
3758             for (j = 0; j < 2; j++)
3759                 for (k = 0; k < 2; k++)
3760                     for (l = 0; l < 6; l++)
3761                         for (m = 0; m < 6; m++)
3762                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3763                                    s->prob.coef[i][j][k][l][m], 3);
3764             if (s->txfmmode == i)
3765                 break;
3766         }
3767         s->prob_ctx[s->framectxid].p = s->prob.p;
3768         ff_thread_finish_setup(ctx);
3769     }
3770
3771     do {
3772         yoff = uvoff = 0;
3773         s->b = s->b_base;
3774         s->block = s->block_base;
3775         s->uvblock[0] = s->uvblock_base[0];
3776         s->uvblock[1] = s->uvblock_base[1];
3777         s->eob = s->eob_base;
3778         s->uveob[0] = s->uveob_base[0];
3779         s->uveob[1] = s->uveob_base[1];
3780
3781         for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3782             set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3783                             tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3784             if (s->pass != 2) {
3785                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3786                     unsigned tile_size;
3787
3788                     if (tile_col == s->tiling.tile_cols - 1 &&
3789                         tile_row == s->tiling.tile_rows - 1) {
3790                         tile_size = size;
3791                     } else {
3792                         tile_size = AV_RB32(data);
3793                         data += 4;
3794                         size -= 4;
3795                     }
3796                     if (tile_size > size) {
3797                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3798                         return AVERROR_INVALIDDATA;
3799                     }
3800                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3801                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3802                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3803                         return AVERROR_INVALIDDATA;
3804                     }
3805                     data += tile_size;
3806                     size -= tile_size;
3807                 }
3808             }
3809
3810             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3811                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3812                 struct VP9Filter *lflvl_ptr = s->lflvl;
3813                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3814
3815                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3816                     set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3817                                     tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3818
3819                     if (s->pass != 2) {
3820                         memset(s->left_partition_ctx, 0, 8);
3821                         memset(s->left_skip_ctx, 0, 8);
3822                         if (s->keyframe || s->intraonly) {
3823                             memset(s->left_mode_ctx, DC_PRED, 16);
3824                         } else {
3825                             memset(s->left_mode_ctx, NEARESTMV, 8);
3826                         }
3827                         memset(s->left_y_nnz_ctx, 0, 16);
3828                         memset(s->left_uv_nnz_ctx, 0, 16);
3829                         memset(s->left_segpred_ctx, 0, 8);
3830
3831                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3832                     }
3833
3834                     for (col = s->tiling.tile_col_start;
3835                          col < s->tiling.tile_col_end;
3836                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3837                         // FIXME integrate with lf code (i.e. zero after each
3838                         // use, similar to invtxfm coefficients, or similar)
3839                         if (s->pass != 1) {
3840                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3841                         }
3842
3843                         if (s->pass == 2) {
3844                             decode_sb_mem(ctx, row, col, lflvl_ptr,
3845                                           yoff2, uvoff2, BL_64X64);
3846                         } else {
3847                             decode_sb(ctx, row, col, lflvl_ptr,
3848                                       yoff2, uvoff2, BL_64X64);
3849                         }
3850                     }
3851                     if (s->pass != 2) {
3852                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3853                     }
3854                 }
3855
3856                 if (s->pass == 1) {
3857                     continue;
3858                 }
3859
3860                 // backup pre-loopfilter reconstruction data for intra
3861                 // prediction of next row of sb64s
3862                 if (row + 8 < s->rows) {
3863                     memcpy(s->intra_pred_data[0],
3864                            f->data[0] + yoff + 63 * ls_y,
3865                            8 * s->cols);
3866                     memcpy(s->intra_pred_data[1],
3867                            f->data[1] + uvoff + 31 * ls_uv,
3868                            4 * s->cols);
3869                     memcpy(s->intra_pred_data[2],
3870                            f->data[2] + uvoff + 31 * ls_uv,
3871                            4 * s->cols);
3872                 }
3873
3874                 // loopfilter one row
3875                 if (s->filter.level) {
3876                     yoff2 = yoff;
3877                     uvoff2 = uvoff;
3878                     lflvl_ptr = s->lflvl;
3879                     for (col = 0; col < s->cols;
3880                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3881                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3882                     }
3883                 }
3884
3885                 // FIXME maybe we can make this more finegrained by running the
3886                 // loopfilter per-block instead of after each sbrow
3887                 // In fact that would also make intra pred left preparation easier?
3888                 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3889             }
3890         }
3891
3892         if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3893             adapt_probs(s);
3894             ff_thread_finish_setup(ctx);
3895         }
3896     } while (s->pass++ == 1);
3897     ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3898
3899     // ref frame setup
3900     for (i = 0; i < 8; i++) {
3901         if (s->refs[i].f->data[0])
3902             ff_thread_release_buffer(ctx, &s->refs[i]);
3903         ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3904     }
3905
3906     if (!s->invisible) {
3907         if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3908             return res;
3909         *got_frame = 1;
3910     }
3911
3912     return 0;
3913 }
3914
3915 static void vp9_decode_flush(AVCodecContext *ctx)
3916 {
3917     VP9Context *s = ctx->priv_data;
3918     int i;
3919
3920     for (i = 0; i < 2; i++)
3921         vp9_unref_frame(ctx, &s->frames[i]);
3922     for (i = 0; i < 8; i++)
3923         ff_thread_release_buffer(ctx, &s->refs[i]);
3924 }
3925
3926 static int init_frames(AVCodecContext *ctx)
3927 {
3928     VP9Context *s = ctx->priv_data;
3929     int i;
3930
3931     for (i = 0; i < 2; i++) {
3932         s->frames[i].tf.f = av_frame_alloc();
3933         if (!s->frames[i].tf.f) {
3934             vp9_decode_free(ctx);
3935             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3936             return AVERROR(ENOMEM);
3937         }
3938     }
3939     for (i = 0; i < 8; i++) {
3940         s->refs[i].f = av_frame_alloc();
3941         s->next_refs[i].f = av_frame_alloc();
3942         if (!s->refs[i].f || !s->next_refs[i].f) {
3943             vp9_decode_free(ctx);
3944             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3945             return AVERROR(ENOMEM);
3946         }
3947     }
3948
3949     return 0;
3950 }
3951
3952 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3953 {
3954     VP9Context *s = ctx->priv_data;
3955
3956     ctx->internal->allocate_progress = 1;
3957     ctx->pix_fmt = AV_PIX_FMT_YUV420P;
3958     ff_vp9dsp_init(&s->dsp);
3959     ff_videodsp_init(&s->vdsp, 8);
3960     s->filter.sharpness = -1;
3961
3962     return init_frames(ctx);
3963 }
3964
3965 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
3966 {
3967     return init_frames(avctx);
3968 }
3969
3970 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
3971 {
3972     int i, res;
3973     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
3974
3975     // detect size changes in other threads
3976     if (s->intra_pred_data[0] &&
3977         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
3978         free_buffers(s);
3979     }
3980
3981     for (i = 0; i < 2; i++) {
3982         if (s->frames[i].tf.f->data[0])
3983             vp9_unref_frame(dst, &s->frames[i]);
3984         if (ssrc->frames[i].tf.f->data[0]) {
3985             if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
3986                 return res;
3987         }
3988     }
3989     for (i = 0; i < 8; i++) {
3990         if (s->refs[i].f->data[0])
3991             ff_thread_release_buffer(dst, &s->refs[i]);
3992         if (ssrc->next_refs[i].f->data[0]) {
3993             if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
3994                 return res;
3995         }
3996     }
3997
3998     s->invisible = ssrc->invisible;
3999     s->keyframe = ssrc->keyframe;
4000     s->uses_2pass = ssrc->uses_2pass;
4001     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4002     memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4003     if (ssrc->segmentation.enabled) {
4004         memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4005                sizeof(s->segmentation.feat));
4006     }
4007
4008     return 0;
4009 }
4010
4011 AVCodec ff_vp9_decoder = {
4012     .name                  = "vp9",
4013     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
4014     .type                  = AVMEDIA_TYPE_VIDEO,
4015     .id                    = AV_CODEC_ID_VP9,
4016     .priv_data_size        = sizeof(VP9Context),
4017     .init                  = vp9_decode_init,
4018     .close                 = vp9_decode_free,
4019     .decode                = vp9_decode_frame,
4020     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4021     .flush                 = vp9_decode_flush,
4022     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4023     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4024 };