git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34
  35 #define VP9_SYNCCODE 0x498342
  36
  37 enum CompPredMode {
  38     PRED_SINGLEREF,
  39     PRED_COMPREF,
  40     PRED_SWITCHABLE,
  41 };
  42
  43 enum BlockLevel {
  44     BL_64X64,
  45     BL_32X32,
  46     BL_16X16,
  47     BL_8X8,
  48 };
  49
  50 enum BlockSize {
  51     BS_64x64,
  52     BS_64x32,
  53     BS_32x64,
  54     BS_32x32,
  55     BS_32x16,
  56     BS_16x32,
  57     BS_16x16,
  58     BS_16x8,
  59     BS_8x16,
  60     BS_8x8,
  61     BS_8x4,
  62     BS_4x8,
  63     BS_4x4,
  64     N_BS_SIZES,
  65 };
  66
  67 struct VP9mvrefPair {
  68     VP56mv mv[2];
  69     int8_t ref[2];
  70 };
  71
  72 typedef struct VP9Frame {
  73     ThreadFrame tf;
  74     AVBufferRef *extradata;
  75     uint8_t *segmentation_map;
  76     struct VP9mvrefPair *mv;
  77 } VP9Frame;
  78
  79 struct VP9Filter {
  80     uint8_t level[8 * 8];
  81     uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
  82                               [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
  83 };
  84
  85 typedef struct VP9Block {
  86     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
  87     enum FilterMode filter;
  88     VP56mv mv[4 /* b_idx */][2 /* ref */];
  89     enum BlockSize bs;
  90     enum TxfmMode tx, uvtx;
  91     enum BlockLevel bl;
  92     enum BlockPartition bp;
  93 } VP9Block;
  94
  95 typedef struct VP9Context {
  96     VP9DSPContext dsp;
  97     VideoDSPContext vdsp;
  98     GetBitContext gb;
  99     VP56RangeCoder c;
 100     VP56RangeCoder *c_b;
 101     unsigned c_b_size;
 102     VP9Block *b_base, *b;
 103     int pass, uses_2pass, last_uses_2pass;
 104     int row, row7, col, col7;
 105     uint8_t *dst[3];
 106     ptrdiff_t y_stride, uv_stride;
 107
 108     // bitstream header
 109     uint8_t profile;
 110     uint8_t keyframe, last_keyframe;
 111     uint8_t invisible;
 112     uint8_t use_last_frame_mvs;
 113     uint8_t errorres;
 114     uint8_t colorspace;
 115     uint8_t fullrange;
 116     uint8_t intraonly;
 117     uint8_t resetctx;
 118     uint8_t refreshrefmask;
 119     uint8_t highprecisionmvs;
 120     enum FilterMode filtermode;
 121     uint8_t allowcompinter;
 122     uint8_t fixcompref;
 123     uint8_t refreshctx;
 124     uint8_t parallelmode;
 125     uint8_t framectxid;
 126     uint8_t refidx[3];
 127     uint8_t signbias[3];
 128     uint8_t varcompref[2];
 129     ThreadFrame refs[8], next_refs[8];
 130 #define CUR_FRAME 0
 131 #define LAST_FRAME 1
 132     VP9Frame frames[2];
 133
 134     struct {
 135         uint8_t level;
 136         int8_t sharpness;
 137         uint8_t lim_lut[64];
 138         uint8_t mblim_lut[64];
 139     } filter;
 140     struct {
 141         uint8_t enabled;
 142         int8_t mode[2];
 143         int8_t ref[4];
 144     } lf_delta;
 145     uint8_t yac_qi;
 146     int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
 147     uint8_t lossless;
 148 #define MAX_SEGMENT 8
 149     struct {
 150         uint8_t enabled;
 151         uint8_t temporal;
 152         uint8_t absolute_vals;
 153         uint8_t update_map;
 154         struct {
 155             uint8_t q_enabled;
 156             uint8_t lf_enabled;
 157             uint8_t ref_enabled;
 158             uint8_t skip_enabled;
 159             uint8_t ref_val;
 160             int16_t q_val;
 161             int8_t lf_val;
 162             int16_t qmul[2][2];
 163             uint8_t lflvl[4][2];
 164         } feat[MAX_SEGMENT];
 165     } segmentation;
 166     struct {
 167         unsigned log2_tile_cols, log2_tile_rows;
 168         unsigned tile_cols, tile_rows;
 169         unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
 170     } tiling;
 171     unsigned sb_cols, sb_rows, rows, cols;
 172     struct {
 173         prob_context p;
 174         uint8_t coef[4][2][2][6][6][3];
 175     } prob_ctx[4];
 176     struct {
 177         prob_context p;
 178         uint8_t coef[4][2][2][6][6][11];
 179         uint8_t seg[7];
 180         uint8_t segpred[3];
 181     } prob;
 182     struct {
 183         unsigned y_mode[4][10];
 184         unsigned uv_mode[10][10];
 185         unsigned filter[4][3];
 186         unsigned mv_mode[7][4];
 187         unsigned intra[4][2];
 188         unsigned comp[5][2];
 189         unsigned single_ref[5][2][2];
 190         unsigned comp_ref[5][2];
 191         unsigned tx32p[2][4];
 192         unsigned tx16p[2][3];
 193         unsigned tx8p[2][2];
 194         unsigned skip[3][2];
 195         unsigned mv_joint[4];
 196         struct {
 197             unsigned sign[2];
 198             unsigned classes[11];
 199             unsigned class0[2];
 200             unsigned bits[10][2];
 201             unsigned class0_fp[2][4];
 202             unsigned fp[4];
 203             unsigned class0_hp[2];
 204             unsigned hp[2];
 205         } mv_comp[2];
 206         unsigned partition[4][4][4];
 207         unsigned coef[4][2][2][6][6][3];
 208         unsigned eob[4][2][2][6][6][2];
 209     } counts;
 210     enum TxfmMode txfmmode;
 211     enum CompPredMode comppredmode;
 212
 213     // contextual (left/above) cache
 214     DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
 215     DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
 216     DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
 217     DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
 218     DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
 219     DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
 220     DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
 221     DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
 222     DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
 223     DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
 224     DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
 225     DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
 226     uint8_t *above_partition_ctx;
 227     uint8_t *above_mode_ctx;
 228     // FIXME maybe merge some of the below in a flags field?
 229     uint8_t *above_y_nnz_ctx;
 230     uint8_t *above_uv_nnz_ctx[2];
 231     uint8_t *above_skip_ctx; // 1bit
 232     uint8_t *above_txfm_ctx; // 2bit
 233     uint8_t *above_segpred_ctx; // 1bit
 234     uint8_t *above_intra_ctx; // 1bit
 235     uint8_t *above_comp_ctx; // 1bit
 236     uint8_t *above_ref_ctx; // 2bit
 237     uint8_t *above_filter_ctx;
 238     VP56mv (*above_mv_ctx)[2];
 239
 240     // whole-frame cache
 241     uint8_t *intra_pred_data[3];
 242     struct VP9Filter *lflvl;
 243     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
 244
 245     // block reconstruction intermediates
 246     int block_alloc_using_2pass;
 247     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
 248     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
 249     struct { int x, y; } min_mv, max_mv;
 250     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
 251     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
 252 } VP9Context;
 253
 254 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
 255     {
 256         { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
 257         { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
 258     }, {
 259         { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
 260         { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
 261     }
 262 };
 263
 264 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 265 {
 266     VP9Context *s = ctx->priv_data;
 267     int ret, sz;
 268
 269     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 270         return ret;
 271     sz = 64 * s->sb_cols * s->sb_rows;
 272     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 273         ff_thread_release_buffer(ctx, &f->tf);
 274         return AVERROR(ENOMEM);
 275     }
 276
 277     f->segmentation_map = f->extradata->data;
 278     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 279
 280     // retain segmentation map if it doesn't update
 281     if (s->segmentation.enabled && !s->segmentation.update_map &&
 282         !s->intraonly && !s->keyframe && !s->errorres &&
 283         ctx->active_thread_type != FF_THREAD_FRAME) {
 284         memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
 285     }
 286
 287     return 0;
 288 }
 289
 290 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 291 {
 292     ff_thread_release_buffer(ctx, &f->tf);
 293     av_buffer_unref(&f->extradata);
 294 }
 295
 296 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 297 {
 298     int res;
 299
 300     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 301         return res;
 302     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 303         vp9_unref_frame(ctx, dst);
 304         return AVERROR(ENOMEM);
 305     }
 306
 307     dst->segmentation_map = src->segmentation_map;
 308     dst->mv = src->mv;
 309
 310     return 0;
 311 }
 312
 313 static int update_size(AVCodecContext *ctx, int w, int h)
 314 {
 315     VP9Context *s = ctx->priv_data;
 316     uint8_t *p;
 317
 318     av_assert0(w > 0 && h > 0);
 319
 320     if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
 321         return 0;
 322
 323     ctx->width  = w;
 324     ctx->height = h;
 325     s->sb_cols  = (w + 63) >> 6;
 326     s->sb_rows  = (h + 63) >> 6;
 327     s->cols     = (w + 7) >> 3;
 328     s->rows     = (h + 7) >> 3;
 329
 330 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
 331     av_freep(&s->intra_pred_data[0]);
 332     p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 333     if (!p)
 334         return AVERROR(ENOMEM);
 335     assign(s->intra_pred_data[0],  uint8_t *,             64);
 336     assign(s->intra_pred_data[1],  uint8_t *,             32);
 337     assign(s->intra_pred_data[2],  uint8_t *,             32);
 338     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 339     assign(s->above_mode_ctx,      uint8_t *,             16);
 340     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 341     assign(s->above_partition_ctx, uint8_t *,              8);
 342     assign(s->above_skip_ctx,      uint8_t *,              8);
 343     assign(s->above_txfm_ctx,      uint8_t *,              8);
 344     assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
 345     assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
 346     assign(s->above_segpred_ctx,   uint8_t *,              8);
 347     assign(s->above_intra_ctx,     uint8_t *,              8);
 348     assign(s->above_comp_ctx,      uint8_t *,              8);
 349     assign(s->above_ref_ctx,       uint8_t *,              8);
 350     assign(s->above_filter_ctx,    uint8_t *,              8);
 351     assign(s->lflvl,               struct VP9Filter *,     1);
 352 #undef assign
 353
 354     // these will be re-allocated a little later
 355     av_freep(&s->b_base);
 356     av_freep(&s->block_base);
 357
 358     return 0;
 359 }
 360
 361 static int update_block_buffers(AVCodecContext *ctx)
 362 {
 363     VP9Context *s = ctx->priv_data;
 364
 365     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
 366         return 0;
 367
 368     av_free(s->b_base);
 369     av_free(s->block_base);
 370     if (s->uses_2pass) {
 371         int sbs = s->sb_cols * s->sb_rows;
 372
 373         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
 374         s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
 375         if (!s->b_base || !s->block_base)
 376             return AVERROR(ENOMEM);
 377         s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
 378         s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
 379         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
 380         s->uveob_base[0] = s->eob_base + 256 * sbs;
 381         s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
 382     } else {
 383         s->b_base = av_malloc(sizeof(VP9Block));
 384         s->block_base = av_mallocz((64 * 64 + 128) * 3);
 385         if (!s->b_base || !s->block_base)
 386             return AVERROR(ENOMEM);
 387         s->uvblock_base[0] = s->block_base + 64 * 64;
 388         s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
 389         s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
 390         s->uveob_base[0] = s->eob_base + 256;
 391         s->uveob_base[1] = s->uveob_base[0] + 64;
 392     }
 393     s->block_alloc_using_2pass = s->uses_2pass;
 394
 395     return 0;
 396 }
 397
 398 // for some reason the sign bit is at the end, not the start, of a bit sequence
 399 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 400 {
 401     int v = get_bits(gb, n);
 402     return get_bits1(gb) ? -v : v;
 403 }
 404
 405 static av_always_inline int inv_recenter_nonneg(int v, int m)
 406 {
 407     return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 408 }
 409
 410 // differential forward probability updates
 411 static int update_prob(VP56RangeCoder *c, int p)
 412 {
 413     static const int inv_map_table[254] = {
 414           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 415         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 416          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 417          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 418          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 419          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 420          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 421          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 422         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 423         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 424         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 425         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 426         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 427         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 428         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 429         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 430         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 431         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 432         252, 253,
 433     };
 434     int d;
 435
 436     /* This code is trying to do a differential probability update. For a
 437      * current probability A in the range [1, 255], the difference to a new
 438      * probability of any value can be expressed differentially as 1-A,255-A
 439      * where some part of this (absolute range) exists both in positive as
 440      * well as the negative part, whereas another part only exists in one
 441      * half. We're trying to code this shared part differentially, i.e.
 442      * times two where the value of the lowest bit specifies the sign, and
 443      * the single part is then coded on top of this. This absolute difference
 444      * then again has a value of [0,254], but a bigger value in this range
 445      * indicates that we're further away from the original value A, so we
 446      * can code this as a VLC code, since higher values are increasingly
 447      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 448      * updates vs. the 'fine, exact' updates further down the range, which
 449      * adds one extra dimension to this differential update model. */
 450
 451     if (!vp8_rac_get(c)) {
 452         d = vp8_rac_get_uint(c, 4) + 0;
 453     } else if (!vp8_rac_get(c)) {
 454         d = vp8_rac_get_uint(c, 4) + 16;
 455     } else if (!vp8_rac_get(c)) {
 456         d = vp8_rac_get_uint(c, 5) + 32;
 457     } else {
 458         d = vp8_rac_get_uint(c, 7);
 459         if (d >= 65)
 460             d = (d << 1) - 65 + vp8_rac_get(c);
 461         d += 64;
 462     }
 463
 464     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 465                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 466 }
 467
 468 static int decode_frame_header(AVCodecContext *ctx,
 469                                const uint8_t *data, int size, int *ref)
 470 {
 471     VP9Context *s = ctx->priv_data;
 472     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 473     int last_invisible;
 474     const uint8_t *data2;
 475
 476     /* general header */
 477     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 478         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 479         return res;
 480     }
 481     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 482         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 483         return AVERROR_INVALIDDATA;
 484     }
 485     s->profile = get_bits1(&s->gb);
 486     if (get_bits1(&s->gb)) { // reserved bit
 487         av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
 488         return AVERROR_INVALIDDATA;
 489     }
 490     if (get_bits1(&s->gb)) {
 491         *ref = get_bits(&s->gb, 3);
 492         return 0;
 493     }
 494     s->last_uses_2pass = s->uses_2pass;
 495     s->last_keyframe  = s->keyframe;
 496     s->keyframe       = !get_bits1(&s->gb);
 497     last_invisible    = s->invisible;
 498     s->invisible      = !get_bits1(&s->gb);
 499     s->errorres       = get_bits1(&s->gb);
 500     s->use_last_frame_mvs = !s->errorres && !last_invisible;
 501     if (s->keyframe) {
 502         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 503             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 504             return AVERROR_INVALIDDATA;
 505         }
 506         s->colorspace = get_bits(&s->gb, 3);
 507         if (s->colorspace == 7) { // RGB = profile 1
 508             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
 509             return AVERROR_INVALIDDATA;
 510         }
 511         s->fullrange  = get_bits1(&s->gb);
 512         // for profile 1, here follows the subsampling bits
 513         s->refreshrefmask = 0xff;
 514         w = get_bits(&s->gb, 16) + 1;
 515         h = get_bits(&s->gb, 16) + 1;
 516         if (get_bits1(&s->gb)) // display size
 517             skip_bits(&s->gb, 32);
 518     } else {
 519         s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
 520         s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
 521         if (s->intraonly) {
 522             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 523                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 524                 return AVERROR_INVALIDDATA;
 525             }
 526             s->refreshrefmask = get_bits(&s->gb, 8);
 527             w = get_bits(&s->gb, 16) + 1;
 528             h = get_bits(&s->gb, 16) + 1;
 529             if (get_bits1(&s->gb)) // display size
 530                 skip_bits(&s->gb, 32);
 531         } else {
 532             s->refreshrefmask = get_bits(&s->gb, 8);
 533             s->refidx[0]      = get_bits(&s->gb, 3);
 534             s->signbias[0]    = get_bits1(&s->gb);
 535             s->refidx[1]      = get_bits(&s->gb, 3);
 536             s->signbias[1]    = get_bits1(&s->gb);
 537             s->refidx[2]      = get_bits(&s->gb, 3);
 538             s->signbias[2]    = get_bits1(&s->gb);
 539             if (!s->refs[s->refidx[0]].f->data[0] ||
 540                 !s->refs[s->refidx[1]].f->data[0] ||
 541                 !s->refs[s->refidx[2]].f->data[0]) {
 542                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 543                 return AVERROR_INVALIDDATA;
 544             }
 545             if (get_bits1(&s->gb)) {
 546                 w = s->refs[s->refidx[0]].f->width;
 547                 h = s->refs[s->refidx[0]].f->height;
 548             } else if (get_bits1(&s->gb)) {
 549                 w = s->refs[s->refidx[1]].f->width;
 550                 h = s->refs[s->refidx[1]].f->height;
 551             } else if (get_bits1(&s->gb)) {
 552                 w = s->refs[s->refidx[2]].f->width;
 553                 h = s->refs[s->refidx[2]].f->height;
 554             } else {
 555                 w = get_bits(&s->gb, 16) + 1;
 556                 h = get_bits(&s->gb, 16) + 1;
 557             }
 558             // Note that in this code, "CUR_FRAME" is actually before we
 559             // have formally allocated a frame, and thus actually represents
 560             // the _last_ frame
 561             s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
 562                                      s->frames[CUR_FRAME].tf.f->height == h;
 563             if (get_bits1(&s->gb)) // display size
 564                 skip_bits(&s->gb, 32);
 565             s->highprecisionmvs = get_bits1(&s->gb);
 566             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 567                                                 get_bits(&s->gb, 2);
 568             s->allowcompinter = s->signbias[0] != s->signbias[1] ||
 569                                 s->signbias[0] != s->signbias[2];
 570             if (s->allowcompinter) {
 571                 if (s->signbias[0] == s->signbias[1]) {
 572                     s->fixcompref    = 2;
 573                     s->varcompref[0] = 0;
 574                     s->varcompref[1] = 1;
 575                 } else if (s->signbias[0] == s->signbias[2]) {
 576                     s->fixcompref    = 1;
 577                     s->varcompref[0] = 0;
 578                     s->varcompref[1] = 2;
 579                 } else {
 580                     s->fixcompref    = 0;
 581                     s->varcompref[0] = 1;
 582                     s->varcompref[1] = 2;
 583                 }
 584             }
 585         }
 586     }
 587     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
 588     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
 589     s->framectxid   = c = get_bits(&s->gb, 2);
 590
 591     /* loopfilter header data */
 592     s->filter.level = get_bits(&s->gb, 6);
 593     sharp = get_bits(&s->gb, 3);
 594     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 595     // the old cache values since they are still valid
 596     if (s->filter.sharpness != sharp)
 597         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
 598     s->filter.sharpness = sharp;
 599     if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
 600         if (get_bits1(&s->gb)) {
 601             for (i = 0; i < 4; i++)
 602                 if (get_bits1(&s->gb))
 603                     s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 604             for (i = 0; i < 2; i++)
 605                 if (get_bits1(&s->gb))
 606                     s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 607         }
 608     } else {
 609         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 610     }
 611
 612     /* quantization header data */
 613     s->yac_qi      = get_bits(&s->gb, 8);
 614     s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 615     s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 616     s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 617     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
 618                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 619
 620     /* segmentation header info */
 621     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
 622         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
 623             for (i = 0; i < 7; i++)
 624                 s->prob.seg[i] = get_bits1(&s->gb) ?
 625                                  get_bits(&s->gb, 8) : 255;
 626             if ((s->segmentation.temporal = get_bits1(&s->gb))) {
 627                 for (i = 0; i < 3; i++)
 628                     s->prob.segpred[i] = get_bits1(&s->gb) ?
 629                                          get_bits(&s->gb, 8) : 255;
 630             }
 631         }
 632         if ((!s->segmentation.update_map || s->segmentation.temporal) &&
 633             (w != s->frames[CUR_FRAME].tf.f->width ||
 634              h != s->frames[CUR_FRAME].tf.f->height)) {
 635             av_log(ctx, AV_LOG_ERROR,
 636                    "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
 637                    s->segmentation.temporal, s->segmentation.update_map);
 638             return AVERROR_INVALIDDATA;
 639         }
 640
 641         if (get_bits1(&s->gb)) {
 642             s->segmentation.absolute_vals = get_bits1(&s->gb);
 643             for (i = 0; i < 8; i++) {
 644                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 645                     s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 646                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 647                     s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 648                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 649                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 650                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 651             }
 652         }
 653     } else {
 654         s->segmentation.feat[0].q_enabled    = 0;
 655         s->segmentation.feat[0].lf_enabled   = 0;
 656         s->segmentation.feat[0].skip_enabled = 0;
 657         s->segmentation.feat[0].ref_enabled  = 0;
 658     }
 659
 660     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 661     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
 662         int qyac, qydc, quvac, quvdc, lflvl, sh;
 663
 664         if (s->segmentation.feat[i].q_enabled) {
 665             if (s->segmentation.absolute_vals)
 666                 qyac = s->segmentation.feat[i].q_val;
 667             else
 668                 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
 669         } else {
 670             qyac  = s->yac_qi;
 671         }
 672         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
 673         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
 674         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
 675         qyac  = av_clip_uintp2(qyac, 8);
 676
 677         s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
 678         s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
 679         s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
 680         s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
 681
 682         sh = s->filter.level >= 32;
 683         if (s->segmentation.feat[i].lf_enabled) {
 684             if (s->segmentation.absolute_vals)
 685                 lflvl = s->segmentation.feat[i].lf_val;
 686             else
 687                 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
 688         } else {
 689             lflvl  = s->filter.level;
 690         }
 691         s->segmentation.feat[i].lflvl[0][0] =
 692         s->segmentation.feat[i].lflvl[0][1] =
 693             av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
 694         for (j = 1; j < 4; j++) {
 695             s->segmentation.feat[i].lflvl[j][0] =
 696                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 697                                          s->lf_delta.mode[0]) << sh), 6);
 698             s->segmentation.feat[i].lflvl[j][1] =
 699                 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
 700                                          s->lf_delta.mode[1]) << sh), 6);
 701         }
 702     }
 703
 704     /* tiling info */
 705     if ((res = update_size(ctx, w, h)) < 0) {
 706         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
 707         return res;
 708     }
 709     for (s->tiling.log2_tile_cols = 0;
 710          (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
 711          s->tiling.log2_tile_cols++) ;
 712     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 713     max = FFMAX(0, max - 1);
 714     while (max > s->tiling.log2_tile_cols) {
 715         if (get_bits1(&s->gb))
 716             s->tiling.log2_tile_cols++;
 717         else
 718             break;
 719     }
 720     s->tiling.log2_tile_rows = decode012(&s->gb);
 721     s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
 722     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
 723         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
 724         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 725                                  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
 726         if (!s->c_b) {
 727             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 728             return AVERROR(ENOMEM);
 729         }
 730     }
 731
 732     if (s->keyframe || s->errorres || s->intraonly) {
 733         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 734                            s->prob_ctx[3].p = vp9_default_probs;
 735         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 736                sizeof(vp9_default_coef_probs));
 737         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 738                sizeof(vp9_default_coef_probs));
 739         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 740                sizeof(vp9_default_coef_probs));
 741         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 742                sizeof(vp9_default_coef_probs));
 743     }
 744
 745     // next 16 bits is size of the rest of the header (arith-coded)
 746     size2 = get_bits(&s->gb, 16);
 747     data2 = align_get_bits(&s->gb);
 748     if (size2 > size - (data2 - data)) {
 749         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 750         return AVERROR_INVALIDDATA;
 751     }
 752     ff_vp56_init_range_decoder(&s->c, data2, size2);
 753     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 754         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 755         return AVERROR_INVALIDDATA;
 756     }
 757
 758     if (s->keyframe || s->intraonly) {
 759         memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
 760     } else {
 761         memset(&s->counts, 0, sizeof(s->counts));
 762     }
 763     // FIXME is it faster to not copy here, but do it down in the fw updates
 764     // as explicit copies if the fw update is missing (and skip the copy upon
 765     // fw update)?
 766     s->prob.p = s->prob_ctx[c].p;
 767
 768     // txfm updates
 769     if (s->lossless) {
 770         s->txfmmode = TX_4X4;
 771     } else {
 772         s->txfmmode = vp8_rac_get_uint(&s->c, 2);
 773         if (s->txfmmode == 3)
 774             s->txfmmode += vp8_rac_get(&s->c);
 775
 776         if (s->txfmmode == TX_SWITCHABLE) {
 777             for (i = 0; i < 2; i++)
 778                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 779                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 780             for (i = 0; i < 2; i++)
 781                 for (j = 0; j < 2; j++)
 782                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 783                         s->prob.p.tx16p[i][j] =
 784                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 785             for (i = 0; i < 2; i++)
 786                 for (j = 0; j < 3; j++)
 787                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 788                         s->prob.p.tx32p[i][j] =
 789                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 790         }
 791     }
 792
 793     // coef updates
 794     for (i = 0; i < 4; i++) {
 795         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 796         if (vp8_rac_get(&s->c)) {
 797             for (j = 0; j < 2; j++)
 798                 for (k = 0; k < 2; k++)
 799                     for (l = 0; l < 6; l++)
 800                         for (m = 0; m < 6; m++) {
 801                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 802                             uint8_t *r = ref[j][k][l][m];
 803                             if (m >= 3 && l == 0) // dc only has 3 pt
 804                                 break;
 805                             for (n = 0; n < 3; n++) {
 806                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 807                                     p[n] = update_prob(&s->c, r[n]);
 808                                 } else {
 809                                     p[n] = r[n];
 810                                 }
 811                             }
 812                             p[3] = 0;
 813                         }
 814         } else {
 815             for (j = 0; j < 2; j++)
 816                 for (k = 0; k < 2; k++)
 817                     for (l = 0; l < 6; l++)
 818                         for (m = 0; m < 6; m++) {
 819                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 820                             uint8_t *r = ref[j][k][l][m];
 821                             if (m > 3 && l == 0) // dc only has 3 pt
 822                                 break;
 823                             memcpy(p, r, 3);
 824                             p[3] = 0;
 825                         }
 826         }
 827         if (s->txfmmode == i)
 828             break;
 829     }
 830
 831     // mode updates
 832     for (i = 0; i < 3; i++)
 833         if (vp56_rac_get_prob_branchy(&s->c, 252))
 834             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 835     if (!s->keyframe && !s->intraonly) {
 836         for (i = 0; i < 7; i++)
 837             for (j = 0; j < 3; j++)
 838                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 839                     s->prob.p.mv_mode[i][j] =
 840                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 841
 842         if (s->filtermode == FILTER_SWITCHABLE)
 843             for (i = 0; i < 4; i++)
 844                 for (j = 0; j < 2; j++)
 845                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 846                         s->prob.p.filter[i][j] =
 847                             update_prob(&s->c, s->prob.p.filter[i][j]);
 848
 849         for (i = 0; i < 4; i++)
 850             if (vp56_rac_get_prob_branchy(&s->c, 252))
 851                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 852
 853         if (s->allowcompinter) {
 854             s->comppredmode = vp8_rac_get(&s->c);
 855             if (s->comppredmode)
 856                 s->comppredmode += vp8_rac_get(&s->c);
 857             if (s->comppredmode == PRED_SWITCHABLE)
 858                 for (i = 0; i < 5; i++)
 859                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 860                         s->prob.p.comp[i] =
 861                             update_prob(&s->c, s->prob.p.comp[i]);
 862         } else {
 863             s->comppredmode = PRED_SINGLEREF;
 864         }
 865
 866         if (s->comppredmode != PRED_COMPREF) {
 867             for (i = 0; i < 5; i++) {
 868                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 869                     s->prob.p.single_ref[i][0] =
 870                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
 871                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 872                     s->prob.p.single_ref[i][1] =
 873                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
 874             }
 875         }
 876
 877         if (s->comppredmode != PRED_SINGLEREF) {
 878             for (i = 0; i < 5; i++)
 879                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 880                     s->prob.p.comp_ref[i] =
 881                         update_prob(&s->c, s->prob.p.comp_ref[i]);
 882         }
 883
 884         for (i = 0; i < 4; i++)
 885             for (j = 0; j < 9; j++)
 886                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 887                     s->prob.p.y_mode[i][j] =
 888                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
 889
 890         for (i = 0; i < 4; i++)
 891             for (j = 0; j < 4; j++)
 892                 for (k = 0; k < 3; k++)
 893                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 894                         s->prob.p.partition[3 - i][j][k] =
 895                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 896
 897         // mv fields don't use the update_prob subexp model for some reason
 898         for (i = 0; i < 3; i++)
 899             if (vp56_rac_get_prob_branchy(&s->c, 252))
 900                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 901
 902         for (i = 0; i < 2; i++) {
 903             if (vp56_rac_get_prob_branchy(&s->c, 252))
 904                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 905
 906             for (j = 0; j < 10; j++)
 907                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 908                     s->prob.p.mv_comp[i].classes[j] =
 909                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 910
 911             if (vp56_rac_get_prob_branchy(&s->c, 252))
 912                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 913
 914             for (j = 0; j < 10; j++)
 915                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 916                     s->prob.p.mv_comp[i].bits[j] =
 917                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 918         }
 919
 920         for (i = 0; i < 2; i++) {
 921             for (j = 0; j < 2; j++)
 922                 for (k = 0; k < 3; k++)
 923                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 924                         s->prob.p.mv_comp[i].class0_fp[j][k] =
 925                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 926
 927             for (j = 0; j < 3; j++)
 928                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 929                     s->prob.p.mv_comp[i].fp[j] =
 930                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 931         }
 932
 933         if (s->highprecisionmvs) {
 934             for (i = 0; i < 2; i++) {
 935                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 936                     s->prob.p.mv_comp[i].class0_hp =
 937                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 938
 939                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 940                     s->prob.p.mv_comp[i].hp =
 941                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 942             }
 943         }
 944     }
 945
 946     return (data2 - data) + size2;
 947 }
 948
 949 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
 950                                       VP9Context *s)
 951 {
 952     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
 953     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
 954 }
 955
 956 static void find_ref_mvs(VP9Context *s,
 957                          VP56mv *pmv, int ref, int z, int idx, int sb)
 958 {
 959     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
 960         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
 961                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
 962         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
 963                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
 964         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
 965                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
 966         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
 967                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 968         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
 969                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 970         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
 971                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
 972         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
 973                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 974         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
 975                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
 976         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
 977                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
 978         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 979                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 980         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 981                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 982         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 983                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 984         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
 985                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
 986     };
 987     VP9Block *b = s->b;
 988     int row = s->row, col = s->col, row7 = s->row7;
 989     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
 990 #define INVALID_MV 0x80008000U
 991     uint32_t mem = INVALID_MV;
 992     int i;
 993
 994 #define RETURN_DIRECT_MV(mv) \
 995     do { \
 996         uint32_t m = AV_RN32A(&mv); \
 997         if (!idx) { \
 998             AV_WN32A(pmv, m); \
 999             return; \
1000         } else if (mem == INVALID_MV) { \
1001             mem = m; \
1002         } else if (m != mem) { \
1003             AV_WN32A(pmv, m); \
1004             return; \
1005         } \
1006     } while (0)
1007
1008     if (sb >= 0) {
1009         if (sb == 2 || sb == 1) {
1010             RETURN_DIRECT_MV(b->mv[0][z]);
1011         } else if (sb == 3) {
1012             RETURN_DIRECT_MV(b->mv[2][z]);
1013             RETURN_DIRECT_MV(b->mv[1][z]);
1014             RETURN_DIRECT_MV(b->mv[0][z]);
1015         }
1016
1017 #define RETURN_MV(mv) \
1018     do { \
1019         if (sb > 0) { \
1020             VP56mv tmp; \
1021             uint32_t m; \
1022             clamp_mv(&tmp, &mv, s); \
1023             m = AV_RN32A(&tmp); \
1024             if (!idx) { \
1025                 AV_WN32A(pmv, m); \
1026                 return; \
1027             } else if (mem == INVALID_MV) { \
1028                 mem = m; \
1029             } else if (m != mem) { \
1030                 AV_WN32A(pmv, m); \
1031                 return; \
1032             } \
1033         } else { \
1034             uint32_t m = AV_RN32A(&mv); \
1035             if (!idx) { \
1036                 clamp_mv(pmv, &mv, s); \
1037                 return; \
1038             } else if (mem == INVALID_MV) { \
1039                 mem = m; \
1040             } else if (m != mem) { \
1041                 clamp_mv(pmv, &mv, s); \
1042                 return; \
1043             } \
1044         } \
1045     } while (0)
1046
1047         if (row > 0) {
1048             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1049             if (mv->ref[0] == ref) {
1050                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1051             } else if (mv->ref[1] == ref) {
1052                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1053             }
1054         }
1055         if (col > s->tiling.tile_col_start) {
1056             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1057             if (mv->ref[0] == ref) {
1058                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1059             } else if (mv->ref[1] == ref) {
1060                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1061             }
1062         }
1063         i = 2;
1064     } else {
1065         i = 0;
1066     }
1067
1068     // previously coded MVs in this neighbourhood, using same reference frame
1069     for (; i < 8; i++) {
1070         int c = p[i][0] + col, r = p[i][1] + row;
1071
1072         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1073             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1074
1075             if (mv->ref[0] == ref) {
1076                 RETURN_MV(mv->mv[0]);
1077             } else if (mv->ref[1] == ref) {
1078                 RETURN_MV(mv->mv[1]);
1079             }
1080         }
1081     }
1082
1083     // MV at this position in previous frame, using same reference frame
1084     if (s->use_last_frame_mvs) {
1085         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1086
1087         if (!s->last_uses_2pass)
1088             ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1089         if (mv->ref[0] == ref) {
1090             RETURN_MV(mv->mv[0]);
1091         } else if (mv->ref[1] == ref) {
1092             RETURN_MV(mv->mv[1]);
1093         }
1094     }
1095
1096 #define RETURN_SCALE_MV(mv, scale) \
1097     do { \
1098         if (scale) { \
1099             VP56mv mv_temp = { -mv.x, -mv.y }; \
1100             RETURN_MV(mv_temp); \
1101         } else { \
1102             RETURN_MV(mv); \
1103         } \
1104     } while (0)
1105
1106     // previously coded MVs in this neighbourhood, using different reference frame
1107     for (i = 0; i < 8; i++) {
1108         int c = p[i][0] + col, r = p[i][1] + row;
1109
1110         if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1111             struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1112
1113             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1114                 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1115             }
1116             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1117                 // BUG - libvpx has this condition regardless of whether
1118                 // we used the first ref MV and pre-scaling
1119                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1120                 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1121             }
1122         }
1123     }
1124
1125     // MV at this position in previous frame, using different reference frame
1126     if (s->use_last_frame_mvs) {
1127         struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1128
1129         // no need to await_progress, because we already did that above
1130         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1131             RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1132         }
1133         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1134             // BUG - libvpx has this condition regardless of whether
1135             // we used the first ref MV and pre-scaling
1136             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1137             RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1138         }
1139     }
1140
1141     AV_ZERO32(pmv);
1142 #undef INVALID_MV
1143 #undef RETURN_MV
1144 #undef RETURN_SCALE_MV
1145 }
1146
1147 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1148 {
1149     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1150     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1151                                 s->prob.p.mv_comp[idx].classes);
1152
1153     s->counts.mv_comp[idx].sign[sign]++;
1154     s->counts.mv_comp[idx].classes[c]++;
1155     if (c) {
1156         int m;
1157
1158         for (n = 0, m = 0; m < c; m++) {
1159             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1160             n |= bit << m;
1161             s->counts.mv_comp[idx].bits[m][bit]++;
1162         }
1163         n <<= 3;
1164         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1165         n |= bit << 1;
1166         s->counts.mv_comp[idx].fp[bit]++;
1167         if (hp) {
1168             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1169             s->counts.mv_comp[idx].hp[bit]++;
1170             n |= bit;
1171         } else {
1172             n |= 1;
1173             // bug in libvpx - we count for bw entropy purposes even if the
1174             // bit wasn't coded
1175             s->counts.mv_comp[idx].hp[1]++;
1176         }
1177         n += 8 << c;
1178     } else {
1179         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1180         s->counts.mv_comp[idx].class0[n]++;
1181         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1182                                s->prob.p.mv_comp[idx].class0_fp[n]);
1183         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1184         n = (n << 3) | (bit << 1);
1185         if (hp) {
1186             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1187             s->counts.mv_comp[idx].class0_hp[bit]++;
1188             n |= bit;
1189         } else {
1190             n |= 1;
1191             // bug in libvpx - we count for bw entropy purposes even if the
1192             // bit wasn't coded
1193             s->counts.mv_comp[idx].class0_hp[1]++;
1194         }
1195     }
1196
1197     return sign ? -(n + 1) : (n + 1);
1198 }
1199
1200 static void fill_mv(VP9Context *s,
1201                     VP56mv *mv, int mode, int sb)
1202 {
1203     VP9Block *b = s->b;
1204
1205     if (mode == ZEROMV) {
1206         AV_ZERO64(mv);
1207     } else {
1208         int hp;
1209
1210         // FIXME cache this value and reuse for other subblocks
1211         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1212                      mode == NEWMV ? -1 : sb);
1213         // FIXME maybe move this code into find_ref_mvs()
1214         if ((mode == NEWMV || sb == -1) &&
1215             !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1216             if (mv[0].y & 1) {
1217                 if (mv[0].y < 0)
1218                     mv[0].y++;
1219                 else
1220                     mv[0].y--;
1221             }
1222             if (mv[0].x & 1) {
1223                 if (mv[0].x < 0)
1224                     mv[0].x++;
1225                 else
1226                     mv[0].x--;
1227             }
1228         }
1229         if (mode == NEWMV) {
1230             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1231                                               s->prob.p.mv_joint);
1232
1233             s->counts.mv_joint[j]++;
1234             if (j >= MV_JOINT_V)
1235                 mv[0].y += read_mv_component(s, 0, hp);
1236             if (j & 1)
1237                 mv[0].x += read_mv_component(s, 1, hp);
1238         }
1239
1240         if (b->comp) {
1241             // FIXME cache this value and reuse for other subblocks
1242             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1243                          mode == NEWMV ? -1 : sb);
1244             if ((mode == NEWMV || sb == -1) &&
1245                 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1246                 if (mv[1].y & 1) {
1247                     if (mv[1].y < 0)
1248                         mv[1].y++;
1249                     else
1250                         mv[1].y--;
1251                 }
1252                 if (mv[1].x & 1) {
1253                     if (mv[1].x < 0)
1254                         mv[1].x++;
1255                     else
1256                         mv[1].x--;
1257                 }
1258             }
1259             if (mode == NEWMV) {
1260                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1261                                                   s->prob.p.mv_joint);
1262
1263                 s->counts.mv_joint[j]++;
1264                 if (j >= MV_JOINT_V)
1265                     mv[1].y += read_mv_component(s, 0, hp);
1266                 if (j & 1)
1267                     mv[1].x += read_mv_component(s, 1, hp);
1268             }
1269         }
1270     }
1271 }
1272
1273 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1274                                        ptrdiff_t stride, int v)
1275 {
1276     switch (w) {
1277     case 1:
1278         do {
1279             *ptr = v;
1280             ptr += stride;
1281         } while (--h);
1282         break;
1283     case 2: {
1284         int v16 = v * 0x0101;
1285         do {
1286             AV_WN16A(ptr, v16);
1287             ptr += stride;
1288         } while (--h);
1289         break;
1290     }
1291     case 4: {
1292         uint32_t v32 = v * 0x01010101;
1293         do {
1294             AV_WN32A(ptr, v32);
1295             ptr += stride;
1296         } while (--h);
1297         break;
1298     }
1299     case 8: {
1300 #if HAVE_FAST_64BIT
1301         uint64_t v64 = v * 0x0101010101010101ULL;
1302         do {
1303             AV_WN64A(ptr, v64);
1304             ptr += stride;
1305         } while (--h);
1306 #else
1307         uint32_t v32 = v * 0x01010101;
1308         do {
1309             AV_WN32A(ptr,     v32);
1310             AV_WN32A(ptr + 4, v32);
1311             ptr += stride;
1312         } while (--h);
1313 #endif
1314         break;
1315     }
1316     }
1317 }
1318
1319 static void decode_mode(AVCodecContext *ctx)
1320 {
1321     static const uint8_t left_ctx[N_BS_SIZES] = {
1322         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1323     };
1324     static const uint8_t above_ctx[N_BS_SIZES] = {
1325         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1326     };
1327     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1328         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1329         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1330     };
1331     VP9Context *s = ctx->priv_data;
1332     VP9Block *b = s->b;
1333     int row = s->row, col = s->col, row7 = s->row7;
1334     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1335     int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1336     int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1337     int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1338     int vref, filter_id;
1339
1340     if (!s->segmentation.enabled) {
1341         b->seg_id = 0;
1342     } else if (s->keyframe || s->intraonly) {
1343         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1344     } else if (!s->segmentation.update_map ||
1345                (s->segmentation.temporal &&
1346                 vp56_rac_get_prob_branchy(&s->c,
1347                     s->prob.segpred[s->above_segpred_ctx[col] +
1348                                     s->left_segpred_ctx[row7]]))) {
1349         if (!s->errorres) {
1350             int pred = 8, x;
1351             uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1352
1353             if (!s->last_uses_2pass)
1354                 ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1355             for (y = 0; y < h4; y++) {
1356                 int idx_base = (y + row) * 8 * s->sb_cols + col;
1357                 for (x = 0; x < w4; x++)
1358                     pred = FFMIN(pred, refsegmap[idx_base + x]);
1359                 if (!s->segmentation.update_map && ctx->active_thread_type == FF_THREAD_FRAME) {
1360                     // FIXME maybe retain reference to previous frame as
1361                     // segmap reference instead of copying the whole map
1362                     // into a new buffer
1363                     memcpy(&s->frames[CUR_FRAME].segmentation_map[idx_base],
1364                            &refsegmap[idx_base], w4);
1365                 }
1366             }
1367             av_assert1(pred < 8);
1368             b->seg_id = pred;
1369         } else {
1370             b->seg_id = 0;
1371         }
1372
1373         memset(&s->above_segpred_ctx[col], 1, w4);
1374         memset(&s->left_segpred_ctx[row7], 1, h4);
1375     } else {
1376         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1377                                      s->prob.seg);
1378
1379         memset(&s->above_segpred_ctx[col], 0, w4);
1380         memset(&s->left_segpred_ctx[row7], 0, h4);
1381     }
1382     if (s->segmentation.enabled &&
1383         (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1384         setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1385                   w4, h4, 8 * s->sb_cols, b->seg_id);
1386     }
1387
1388     b->skip = s->segmentation.enabled &&
1389         s->segmentation.feat[b->seg_id].skip_enabled;
1390     if (!b->skip) {
1391         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1392         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1393         s->counts.skip[c][b->skip]++;
1394     }
1395
1396     if (s->keyframe || s->intraonly) {
1397         b->intra = 1;
1398     } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1399         b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1400     } else {
1401         int c, bit;
1402
1403         if (have_a && have_l) {
1404             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1405             c += (c == 2);
1406         } else {
1407             c = have_a ? 2 * s->above_intra_ctx[col] :
1408                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1409         }
1410         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1411         s->counts.intra[c][bit]++;
1412         b->intra = !bit;
1413     }
1414
1415     if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1416         int c;
1417         if (have_a) {
1418             if (have_l) {
1419                 c = (s->above_skip_ctx[col] ? max_tx :
1420                      s->above_txfm_ctx[col]) +
1421                     (s->left_skip_ctx[row7] ? max_tx :
1422                      s->left_txfm_ctx[row7]) > max_tx;
1423             } else {
1424                 c = s->above_skip_ctx[col] ? 1 :
1425                     (s->above_txfm_ctx[col] * 2 > max_tx);
1426             }
1427         } else if (have_l) {
1428             c = s->left_skip_ctx[row7] ? 1 :
1429                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1430         } else {
1431             c = 1;
1432         }
1433         switch (max_tx) {
1434         case TX_32X32:
1435             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1436             if (b->tx) {
1437                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1438                 if (b->tx == 2)
1439                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1440             }
1441             s->counts.tx32p[c][b->tx]++;
1442             break;
1443         case TX_16X16:
1444             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1445             if (b->tx)
1446                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1447             s->counts.tx16p[c][b->tx]++;
1448             break;
1449         case TX_8X8:
1450             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1451             s->counts.tx8p[c][b->tx]++;
1452             break;
1453         case TX_4X4:
1454             b->tx = TX_4X4;
1455             break;
1456         }
1457     } else {
1458         b->tx = FFMIN(max_tx, s->txfmmode);
1459     }
1460
1461     if (s->keyframe || s->intraonly) {
1462         uint8_t *a = &s->above_mode_ctx[col * 2];
1463         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1464
1465         b->comp = 0;
1466         if (b->bs > BS_8x8) {
1467             // FIXME the memory storage intermediates here aren't really
1468             // necessary, they're just there to make the code slightly
1469             // simpler for now
1470             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1471                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1472             if (b->bs != BS_8x4) {
1473                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1474                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1475                 l[0] = a[1] = b->mode[1];
1476             } else {
1477                 l[0] = a[1] = b->mode[1] = b->mode[0];
1478             }
1479             if (b->bs != BS_4x8) {
1480                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1481                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1482                 if (b->bs != BS_8x4) {
1483                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1484                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1485                     l[1] = a[1] = b->mode[3];
1486                 } else {
1487                     l[1] = a[1] = b->mode[3] = b->mode[2];
1488                 }
1489             } else {
1490                 b->mode[2] = b->mode[0];
1491                 l[1] = a[1] = b->mode[3] = b->mode[1];
1492             }
1493         } else {
1494             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1495                                           vp9_default_kf_ymode_probs[*a][*l]);
1496             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1497             // FIXME this can probably be optimized
1498             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1499             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1500         }
1501         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1502                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1503     } else if (b->intra) {
1504         b->comp = 0;
1505         if (b->bs > BS_8x8) {
1506             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507                                           s->prob.p.y_mode[0]);
1508             s->counts.y_mode[0][b->mode[0]]++;
1509             if (b->bs != BS_8x4) {
1510                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1511                                               s->prob.p.y_mode[0]);
1512                 s->counts.y_mode[0][b->mode[1]]++;
1513             } else {
1514                 b->mode[1] = b->mode[0];
1515             }
1516             if (b->bs != BS_4x8) {
1517                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1518                                               s->prob.p.y_mode[0]);
1519                 s->counts.y_mode[0][b->mode[2]]++;
1520                 if (b->bs != BS_8x4) {
1521                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1522                                                   s->prob.p.y_mode[0]);
1523                     s->counts.y_mode[0][b->mode[3]]++;
1524                 } else {
1525                     b->mode[3] = b->mode[2];
1526                 }
1527             } else {
1528                 b->mode[2] = b->mode[0];
1529                 b->mode[3] = b->mode[1];
1530             }
1531         } else {
1532             static const uint8_t size_group[10] = {
1533                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1534             };
1535             int sz = size_group[b->bs];
1536
1537             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1538                                           s->prob.p.y_mode[sz]);
1539             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1540             s->counts.y_mode[sz][b->mode[3]]++;
1541         }
1542         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1543                                      s->prob.p.uv_mode[b->mode[3]]);
1544         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1545     } else {
1546         static const uint8_t inter_mode_ctx_lut[14][14] = {
1547             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1548             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1549             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1550             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1551             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1552             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1553             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1554             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1555             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1556             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1557             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1558             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1559             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1560             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1561         };
1562
1563         if (s->segmentation.feat[b->seg_id].ref_enabled) {
1564             av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1565             b->comp = 0;
1566             b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1567         } else {
1568             // read comp_pred flag
1569             if (s->comppredmode != PRED_SWITCHABLE) {
1570                 b->comp = s->comppredmode == PRED_COMPREF;
1571             } else {
1572                 int c;
1573
1574                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1575                 if (have_a) {
1576                     if (have_l) {
1577                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1578                             c = 4;
1579                         } else if (s->above_comp_ctx[col]) {
1580                             c = 2 + (s->left_intra_ctx[row7] ||
1581                                      s->left_ref_ctx[row7] == s->fixcompref);
1582                         } else if (s->left_comp_ctx[row7]) {
1583                             c = 2 + (s->above_intra_ctx[col] ||
1584                                      s->above_ref_ctx[col] == s->fixcompref);
1585                         } else {
1586                             c = (!s->above_intra_ctx[col] &&
1587                                  s->above_ref_ctx[col] == s->fixcompref) ^
1588                             (!s->left_intra_ctx[row7] &&
1589                              s->left_ref_ctx[row & 7] == s->fixcompref);
1590                         }
1591                     } else {
1592                         c = s->above_comp_ctx[col] ? 3 :
1593                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1594                     }
1595                 } else if (have_l) {
1596                     c = s->left_comp_ctx[row7] ? 3 :
1597                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1598                 } else {
1599                     c = 1;
1600                 }
1601                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1602                 s->counts.comp[c][b->comp]++;
1603             }
1604
1605             // read actual references
1606             // FIXME probably cache a few variables here to prevent repetitive
1607             // memory accesses below
1608             if (b->comp) /* two references */ {
1609                 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1610
1611                 b->ref[fix_idx] = s->fixcompref;
1612                 // FIXME can this codeblob be replaced by some sort of LUT?
1613                 if (have_a) {
1614                     if (have_l) {
1615                         if (s->above_intra_ctx[col]) {
1616                             if (s->left_intra_ctx[row7]) {
1617                                 c = 2;
1618                             } else {
1619                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1620                             }
1621                         } else if (s->left_intra_ctx[row7]) {
1622                             c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1623                         } else {
1624                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1625
1626                             if (refl == refa && refa == s->varcompref[1]) {
1627                                 c = 0;
1628                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1629                                 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1630                                     (refl == s->fixcompref && refa == s->varcompref[0])) {
1631                                     c = 4;
1632                                 } else {
1633                                     c = (refa == refl) ? 3 : 1;
1634                                 }
1635                             } else if (!s->left_comp_ctx[row7]) {
1636                                 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1637                                     c = 1;
1638                                 } else {
1639                                     c = (refl == s->varcompref[1] &&
1640                                          refa != s->varcompref[1]) ? 2 : 4;
1641                                 }
1642                             } else if (!s->above_comp_ctx[col]) {
1643                                 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1644                                     c = 1;
1645                                 } else {
1646                                     c = (refa == s->varcompref[1] &&
1647                                          refl != s->varcompref[1]) ? 2 : 4;
1648                                 }
1649                             } else {
1650                                 c = (refl == refa) ? 4 : 2;
1651                             }
1652                         }
1653                     } else {
1654                         if (s->above_intra_ctx[col]) {
1655                             c = 2;
1656                         } else if (s->above_comp_ctx[col]) {
1657                             c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1658                         } else {
1659                             c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1660                         }
1661                     }
1662                 } else if (have_l) {
1663                     if (s->left_intra_ctx[row7]) {
1664                         c = 2;
1665                     } else if (s->left_comp_ctx[row7]) {
1666                         c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1667                     } else {
1668                         c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1669                     }
1670                 } else {
1671                     c = 2;
1672                 }
1673                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1674                 b->ref[var_idx] = s->varcompref[bit];
1675                 s->counts.comp_ref[c][bit]++;
1676             } else /* single reference */ {
1677                 int bit, c;
1678
1679                 if (have_a && !s->above_intra_ctx[col]) {
1680                     if (have_l && !s->left_intra_ctx[row7]) {
1681                         if (s->left_comp_ctx[row7]) {
1682                             if (s->above_comp_ctx[col]) {
1683                                 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1684                                          !s->above_ref_ctx[col]);
1685                             } else {
1686                                 c = (3 * !s->above_ref_ctx[col]) +
1687                                     (!s->fixcompref || !s->left_ref_ctx[row7]);
1688                             }
1689                         } else if (s->above_comp_ctx[col]) {
1690                             c = (3 * !s->left_ref_ctx[row7]) +
1691                                 (!s->fixcompref || !s->above_ref_ctx[col]);
1692                         } else {
1693                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1694                         }
1695                     } else if (s->above_intra_ctx[col]) {
1696                         c = 2;
1697                     } else if (s->above_comp_ctx[col]) {
1698                         c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1699                     } else {
1700                         c = 4 * (!s->above_ref_ctx[col]);
1701                     }
1702                 } else if (have_l && !s->left_intra_ctx[row7]) {
1703                     if (s->left_intra_ctx[row7]) {
1704                         c = 2;
1705                     } else if (s->left_comp_ctx[row7]) {
1706                         c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1707                     } else {
1708                         c = 4 * (!s->left_ref_ctx[row7]);
1709                     }
1710                 } else {
1711                     c = 2;
1712                 }
1713                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1714                 s->counts.single_ref[c][0][bit]++;
1715                 if (!bit) {
1716                     b->ref[0] = 0;
1717                 } else {
1718                     // FIXME can this codeblob be replaced by some sort of LUT?
1719                     if (have_a) {
1720                         if (have_l) {
1721                             if (s->left_intra_ctx[row7]) {
1722                                 if (s->above_intra_ctx[col]) {
1723                                     c = 2;
1724                                 } else if (s->above_comp_ctx[col]) {
1725                                     c = 1 + 2 * (s->fixcompref == 1 ||
1726                                                  s->above_ref_ctx[col] == 1);
1727                                 } else if (!s->above_ref_ctx[col]) {
1728                                     c = 3;
1729                                 } else {
1730                                     c = 4 * (s->above_ref_ctx[col] == 1);
1731                                 }
1732                             } else if (s->above_intra_ctx[col]) {
1733                                 if (s->left_intra_ctx[row7]) {
1734                                     c = 2;
1735                                 } else if (s->left_comp_ctx[row7]) {
1736                                     c = 1 + 2 * (s->fixcompref == 1 ||
1737                                                  s->left_ref_ctx[row7] == 1);
1738                                 } else if (!s->left_ref_ctx[row7]) {
1739                                     c = 3;
1740                                 } else {
1741                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1742                                 }
1743                             } else if (s->above_comp_ctx[col]) {
1744                                 if (s->left_comp_ctx[row7]) {
1745                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1746                                         c = 3 * (s->fixcompref == 1 ||
1747                                                  s->left_ref_ctx[row7] == 1);
1748                                     } else {
1749                                         c = 2;
1750                                     }
1751                                 } else if (!s->left_ref_ctx[row7]) {
1752                                     c = 1 + 2 * (s->fixcompref == 1 ||
1753                                                  s->above_ref_ctx[col] == 1);
1754                                 } else {
1755                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1756                                     (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1757                                 }
1758                             } else if (s->left_comp_ctx[row7]) {
1759                                 if (!s->above_ref_ctx[col]) {
1760                                     c = 1 + 2 * (s->fixcompref == 1 ||
1761                                                  s->left_ref_ctx[row7] == 1);
1762                                 } else {
1763                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1764                                     (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1765                                 }
1766                             } else if (!s->above_ref_ctx[col]) {
1767                                 if (!s->left_ref_ctx[row7]) {
1768                                     c = 3;
1769                                 } else {
1770                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1771                                 }
1772                             } else if (!s->left_ref_ctx[row7]) {
1773                                 c = 4 * (s->above_ref_ctx[col] == 1);
1774                             } else {
1775                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1776                                 2 * (s->above_ref_ctx[col] == 1);
1777                             }
1778                         } else {
1779                             if (s->above_intra_ctx[col] ||
1780                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1781                                 c = 2;
1782                             } else if (s->above_comp_ctx[col]) {
1783                                 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1784                             } else {
1785                                 c = 4 * (s->above_ref_ctx[col] == 1);
1786                             }
1787                         }
1788                     } else if (have_l) {
1789                         if (s->left_intra_ctx[row7] ||
1790                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1791                             c = 2;
1792                         } else if (s->left_comp_ctx[row7]) {
1793                             c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1794                         } else {
1795                             c = 4 * (s->left_ref_ctx[row7] == 1);
1796                         }
1797                     } else {
1798                         c = 2;
1799                     }
1800                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1801                     s->counts.single_ref[c][1][bit]++;
1802                     b->ref[0] = 1 + bit;
1803                 }
1804             }
1805         }
1806
1807         if (b->bs <= BS_8x8) {
1808             if (s->segmentation.feat[b->seg_id].skip_enabled) {
1809                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1810             } else {
1811                 static const uint8_t off[10] = {
1812                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1813                 };
1814
1815                 // FIXME this needs to use the LUT tables from find_ref_mvs
1816                 // because not all are -1,0/0,-1
1817                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1818                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1819
1820                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1821                                               s->prob.p.mv_mode[c]);
1822                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1823                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1824             }
1825         }
1826
1827         if (s->filtermode == FILTER_SWITCHABLE) {
1828             int c;
1829
1830             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1831                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1832                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1833                         s->left_filter_ctx[row7] : 3;
1834                 } else {
1835                     c = s->above_filter_ctx[col];
1836                 }
1837             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1838                 c = s->left_filter_ctx[row7];
1839             } else {
1840                 c = 3;
1841             }
1842
1843             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1844                                          s->prob.p.filter[c]);
1845             s->counts.filter[c][filter_id]++;
1846             b->filter = vp9_filter_lut[filter_id];
1847         } else {
1848             b->filter = s->filtermode;
1849         }
1850
1851         if (b->bs > BS_8x8) {
1852             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1853
1854             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1855                                           s->prob.p.mv_mode[c]);
1856             s->counts.mv_mode[c][b->mode[0] - 10]++;
1857             fill_mv(s, b->mv[0], b->mode[0], 0);
1858
1859             if (b->bs != BS_8x4) {
1860                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1861                                               s->prob.p.mv_mode[c]);
1862                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1863                 fill_mv(s, b->mv[1], b->mode[1], 1);
1864             } else {
1865                 b->mode[1] = b->mode[0];
1866                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1867                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1868             }
1869
1870             if (b->bs != BS_4x8) {
1871                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1872                                               s->prob.p.mv_mode[c]);
1873                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1874                 fill_mv(s, b->mv[2], b->mode[2], 2);
1875
1876                 if (b->bs != BS_8x4) {
1877                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1878                                                   s->prob.p.mv_mode[c]);
1879                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1880                     fill_mv(s, b->mv[3], b->mode[3], 3);
1881                 } else {
1882                     b->mode[3] = b->mode[2];
1883                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1884                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1885                 }
1886             } else {
1887                 b->mode[2] = b->mode[0];
1888                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1889                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1890                 b->mode[3] = b->mode[1];
1891                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1892                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1893             }
1894         } else {
1895             fill_mv(s, b->mv[0], b->mode[0], -1);
1896             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1897             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1898             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1899             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1900             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1901             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1902         }
1903
1904         vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1905     }
1906
1907 #if HAVE_FAST_64BIT
1908 #define SPLAT_CTX(var, val, n) \
1909     switch (n) { \
1910     case 1:  var = val;                                    break; \
1911     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
1912     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
1913     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
1914     case 16: { \
1915         uint64_t v64 = val * 0x0101010101010101ULL; \
1916         AV_WN64A(              &var,     v64); \
1917         AV_WN64A(&((uint8_t *) &var)[8], v64); \
1918         break; \
1919     } \
1920     }
1921 #else
1922 #define SPLAT_CTX(var, val, n) \
1923     switch (n) { \
1924     case 1:  var = val;                         break; \
1925     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
1926     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
1927     case 8: { \
1928         uint32_t v32 = val * 0x01010101; \
1929         AV_WN32A(              &var,     v32); \
1930         AV_WN32A(&((uint8_t *) &var)[4], v32); \
1931         break; \
1932     } \
1933     case 16: { \
1934         uint32_t v32 = val * 0x01010101; \
1935         AV_WN32A(              &var,      v32); \
1936         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
1937         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
1938         AV_WN32A(&((uint8_t *) &var)[12], v32); \
1939         break; \
1940     } \
1941     }
1942 #endif
1943
1944     switch (bwh_tab[1][b->bs][0]) {
1945 #define SET_CTXS(dir, off, n) \
1946     do { \
1947         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
1948         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
1949         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1950         if (!s->keyframe && !s->intraonly) { \
1951             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
1952             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
1953             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
1954             if (!b->intra) { \
1955                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1956                 if (s->filtermode == FILTER_SWITCHABLE) { \
1957                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1958                 } \
1959             } \
1960         } \
1961     } while (0)
1962     case 1: SET_CTXS(above, col, 1); break;
1963     case 2: SET_CTXS(above, col, 2); break;
1964     case 4: SET_CTXS(above, col, 4); break;
1965     case 8: SET_CTXS(above, col, 8); break;
1966     }
1967     switch (bwh_tab[1][b->bs][1]) {
1968     case 1: SET_CTXS(left, row7, 1); break;
1969     case 2: SET_CTXS(left, row7, 2); break;
1970     case 4: SET_CTXS(left, row7, 4); break;
1971     case 8: SET_CTXS(left, row7, 8); break;
1972     }
1973 #undef SPLAT_CTX
1974 #undef SET_CTXS
1975
1976     if (!s->keyframe && !s->intraonly) {
1977         if (b->bs > BS_8x8) {
1978             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1979
1980             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1981             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1982             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1983             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1984             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1985             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1986             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1987             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1988         } else {
1989             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1990
1991             for (n = 0; n < w4 * 2; n++) {
1992                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1993                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1994             }
1995             for (n = 0; n < h4 * 2; n++) {
1996                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1997                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1998             }
1999         }
2000     }
2001
2002     // FIXME kinda ugly
2003     for (y = 0; y < h4; y++) {
2004         int x, o = (row + y) * s->sb_cols * 8 + col;
2005         struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2006
2007         if (b->intra) {
2008             for (x = 0; x < w4; x++) {
2009                 mv[x].ref[0] =
2010                 mv[x].ref[1] = -1;
2011             }
2012         } else if (b->comp) {
2013             for (x = 0; x < w4; x++) {
2014                 mv[x].ref[0] = b->ref[0];
2015                 mv[x].ref[1] = b->ref[1];
2016                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2017                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2018             }
2019         } else {
2020             for (x = 0; x < w4; x++) {
2021                 mv[x].ref[0] = b->ref[0];
2022                 mv[x].ref[1] = -1;
2023                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2024             }
2025         }
2026     }
2027 }
2028
2029 // FIXME merge cnt/eob arguments?
/**
 * Decode the coefficient tokens of one transform block from the range coder.
 *
 * Tokens are read in scan order until an end-of-block flag is decoded or
 * n_coeffs scan positions have been processed. Every non-zero token gets a
 * sign bit, is multiplied by the matching qmul[] entry and is stored into
 * coef[]. Zero tokens do not write coef[].
 * NOTE(review): presumably the caller hands in coef[] already zeroed - confirm.
 *
 * @param c           range coder to read from
 * @param coef        output coefficients, written at position scan[i]
 * @param n_coeffs    maximum number of scan positions to process
 * @param is_tx32x32  if non-zero, the dequantised value is divided by 2
 * @param cnt         token counters, indexed [band][nnz][0 = zero, 1 = one,
 *                    2 = larger than one]
 * @param eob         counters for the end-of-block flag, indexed
 *                    [band][nnz][flag value]
 * @param p           probabilities, indexed [band][nnz]; entries 3..10 are
 *                    filled in here on demand from vp9_model_pareto8[]
 * @param nnz         context used for the first token
 * @param scan        maps a scan index to a coefficient position
 * @param nb          for each scan index, two positions whose cached
 *                    magnitude classes are combined into the next context
 * @param band_counts number of scan positions in each band
 * @param qmul        multipliers: qmul[0] at scan index 0, qmul[1] elsewhere
 * @return scan index at which decoding stopped (n_coeffs if the loop ran out
 *         of positions before an end-of-block flag was read)
 */
2030 static av_always_inline int
2031 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2032                         int is_tx32x32, unsigned (*cnt)[6][3],
2033                         unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2034                         int nnz, const int16_t *scan, const int16_t (*nb)[2],
2035                         const int16_t *band_counts, const int16_t *qmul)
2036 {
2037     int i = 0, band = 0, band_left = band_counts[band];
2038     uint8_t *tp = p[0][nnz];
         // per-position magnitude class (0..5) of the tokens decoded so far;
         // it is not initialised here.
         // NOTE(review): presumably nb[] only refers to positions that were
         // already decoded - verify against the scan tables.
2039     uint8_t cache[1024];
2040
2041     do {
2042         int val, rc;
2043
2044         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2045         eob[band][nnz][val]++;
             // a zero flag means end of block: stop without touching coef[]
2046         if (!val)
2047             break;
2048
             // re-entered by the goto below after a zero token, so that the
             // end-of-block flag is not read again
2049     skip_eob:
2050         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2051             cnt[band][nnz][0]++;
                 // move on to the next band once the current one is used up
2052             if (!--band_left)
2053                 band_left = band_counts[++band];
2054             cache[scan[i]] = 0;
                 // next context: rounded-up half of the two neighbour classes
2055             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2056             tp = p[band][nnz];
2057             if (++i == n_coeffs)
2058                 break; //invalid input; blocks should end with EOB
2059             goto skip_eob;
2060         }
2061
2062         rc = scan[i];
2063         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2064             cnt[band][nnz][1]++;
2065             val = 1;
2066             cache[rc] = 1;
2067         } else {
2068             // fill in p[3-10] (model fill) - only once per frame for each pos
2069             if (!tp[3])
2070                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2071
2072             cnt[band][nnz][2]++;
2073             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2074                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2075                     cache[rc] = val = 2;
2076                 } else {
2077                     val = 3 + vp56_rac_get_prob(c, tp[5]);
2078                     cache[rc] = 3;
2079                 }
2080             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2081                 cache[rc] = 4;
2082                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                         // 5 plus one extra bit
2083                     val = 5 + vp56_rac_get_prob(c, 159);
2084                 } else {
                         // 7 plus two extra bits, most significant bit first
2085                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
2086                     val +=      vp56_rac_get_prob(c, 145);
2087                 }
2088             } else { // cat 3-6
2089                 cache[rc] = 5;
2090                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2091                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                             // 11 plus three extra bits
2092                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
2093                         val +=      (vp56_rac_get_prob(c, 148) << 1);
2094                         val +=       vp56_rac_get_prob(c, 140);
2095                     } else {
                             // 19 plus four extra bits
2096                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
2097                         val +=      (vp56_rac_get_prob(c, 155) << 2);
2098                         val +=      (vp56_rac_get_prob(c, 140) << 1);
2099                         val +=       vp56_rac_get_prob(c, 135);
2100                     }
2101                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                         // 35 plus five extra bits
2102                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
2103                     val +=      (vp56_rac_get_prob(c, 157) << 3);
2104                     val +=      (vp56_rac_get_prob(c, 141) << 2);
2105                     val +=      (vp56_rac_get_prob(c, 134) << 1);
2106                     val +=       vp56_rac_get_prob(c, 130);
2107                 } else {
                         // 67 plus fourteen extra bits
2108                     val  = 67 + (vp56_rac_get_prob(c, 254) << 13);
2109                     val +=      (vp56_rac_get_prob(c, 254) << 12);
2110                     val +=      (vp56_rac_get_prob(c, 254) << 11);
2111                     val +=      (vp56_rac_get_prob(c, 252) << 10);
2112                     val +=      (vp56_rac_get_prob(c, 249) << 9);
2113                     val +=      (vp56_rac_get_prob(c, 243) << 8);
2114                     val +=      (vp56_rac_get_prob(c, 230) << 7);
2115                     val +=      (vp56_rac_get_prob(c, 196) << 6);
2116                     val +=      (vp56_rac_get_prob(c, 177) << 5);
2117                     val +=      (vp56_rac_get_prob(c, 153) << 4);
2118                     val +=      (vp56_rac_get_prob(c, 140) << 3);
2119                     val +=      (vp56_rac_get_prob(c, 133) << 2);
2120                     val +=      (vp56_rac_get_prob(c, 130) << 1);
2121                     val +=       vp56_rac_get_prob(c, 129);
2122                 }
2123             }
2124         }
             // move on to the next band once the current one is used up
2125         if (!--band_left)
2126             band_left = band_counts[++band];
             // read the sign bit, then scale: qmul[0] applies at scan index 0,
             // qmul[1] at every other index; the 32x32 case also halves the
             // product
2127         if (is_tx32x32)
2128             coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2129         else
2130             coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
             // next context: rounded-up half of the two neighbour classes
2131         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2132         tp = p[band][nnz];
2133     } while (++i < n_coeffs);
2134
2135     return i;
2136 }
2137
/**
 * Coefficient decoder entry point that calls decode_coeffs_b_generic() with
 * is_tx32x32 set to 0, i.e. without halving the dequantised values.
 * All other arguments are passed through unchanged and the callee's return
 * value is returned as is.
 */
2138 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2139                            unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2140                            uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2141                            const int16_t (*nb)[2], const int16_t *band_counts,
2142                            const int16_t *qmul)
2143 {
2144     return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2145                                    nnz, scan, nb, band_counts, qmul);
2146 }
2147
2148 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2149                              unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2150                              uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2151                              const int16_t (*nb)[2], const int16_t *band_counts,
2152                              const int16_t *qmul)
2153 {
2154     return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2155                                    nnz, scan, nb, band_counts, qmul);
2156 }
2157
/**
 * Decode all coefficient tokens of the current block (s->b): the luma plane
 * first, then both chroma planes.
 *
 * For every transform block the value returned by decode_coeffs_b() /
 * decode_coeffs_b32() is stored in s->eob[] (luma) or s->uveob[pl][]
 * (chroma), the coefficients go to s->block / s->uvblock[pl], and the
 * above/left non-zero context arrays are updated with whether that return
 * value was non-zero.
 *
 * @param ctx codec context; ctx->priv_data is used as the VP9Context
 */
2158 static void decode_coeffs(AVCodecContext *ctx)
2159 {
2160     VP9Context *s = ctx->priv_data;
2161     VP9Block *b = s->b;
2162     int row = s->row, col = s->col;
         // luma probabilities and counters, selected by the luma transform
         // size and by !b->intra; re-pointed at the uv tables further down
2163     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2164     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2165     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
         // block width/height from bwh_tab[1], doubled for the luma plane
2166     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
         // loop limits: the block extent clipped against s->cols / s->rows
2167     int end_x = FFMIN(2 * (s->cols - col), w4);
2168     int end_y = FFMIN(2 * (s->rows - row), h4);
2169     int n, pl, x, y, res;
         // multipliers of this block's segment: qmul[0] is handed to the
         // luma decode calls, qmul[1] to the chroma ones
2170     int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
         // index into the scan tables; s->lossless shifts it by 4
2171     int tx = 4 * s->lossless + b->tx;
2172     const int16_t * const *yscans = vp9_scans[tx];
2173     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
         // chroma always uses the DCT_DCT entry of the scan tables
2174     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2175     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
         // above / left non-zero context for luma: two entries per column
         // resp. per row, the left one wrapping every 8 rows
2176     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2177     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
         // one row per transform size, indexed by b->tx / b->uvtx below
2178     static const int16_t band_counts[4][8] = {
2179         { 1, 2, 3, 4,  3,   16 - 13 },
2180         { 1, 2, 3, 4, 11,   64 - 21 },
2181         { 1, 2, 3, 4, 11,  256 - 21 },
2182         { 1, 2, 3, 4, 11, 1024 - 21 },
2183     };
2184     const int16_t *y_band_counts = band_counts[b->tx];
2185     const int16_t *uv_band_counts = band_counts[b->uvtx];
2186
     // MERGE: for every transform-block position n (stride `step`), replace
     // la[n] by a 0/1 flag telling whether rd(&la[n]) is non-zero. The call
     // sites pair step 2/4/8 with AV_RN16A/AV_RN32A/AV_RN64A, i.e. presumably
     // one read covering all `step` context bytes of that transform block.
     // MERGE_CTX applies this to the left (end_y) and above (end_x) arrays.
2187 #define MERGE(la, end, step, rd) \
2188     for (n = 0; n < end; n += step) \
2189         la[n] = !!rd(&la[n])
2190 #define MERGE_CTX(step, rd) \
2191     do { \
2192         MERGE(l, end_y, step, rd); \
2193         MERGE(a, end_x, step, rd); \
2194     } while (0)
2195
     // DECODE_Y_COEF_LOOP: walk the luma transform blocks in raster order.
     // `mode_index` selects the entry of b->mode[] whose transform type picks
     // the scan tables, `v` is pasted onto decode_coeffs_b (empty or 32).
     // The context passed to the decoder is a[x] + l[y]; afterwards both are
     // overwritten with !!res. res itself goes to s->eob[n], written as a
     // 16-bit value when step >= 4 and as a single element otherwise.
2196 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2197     for (n = 0, y = 0; y < end_y; y += step) { \
2198         for (x = 0; x < end_x; x += step, n += step * step) { \
2199             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2200             res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2201                                      c, e, p, a[x] + l[y], yscans[txtp], \
2202                                      ynbs[txtp], y_band_counts, qmul[0]); \
2203             a[x] = l[y] = !!res; \
2204             if (step >= 4) { \
2205                 AV_WN16A(&s->eob[n], res); \
2206             } else { \
2207                 s->eob[n] = res; \
2208             } \
2209         } \
2210     }
2211
     // SPLAT: counterpart of MERGE, run after decoding. The flag left in
     // la[n] is replicated into the following step - 1 entries. When `cond`
     // holds (the clipped limit equals the full block size) this is done with
     // whole-word writes; otherwise memset is used with a length clipped to
     // `end` so nothing past the limit is written.
2212 #define SPLAT(la, end, step, cond) \
2213     if (step == 2) { \
2214         for (n = 1; n < end; n += step) \
2215             la[n] = la[n - 1]; \
2216     } else if (step == 4) { \
2217         if (cond) { \
2218             for (n = 0; n < end; n += step) \
2219                 AV_WN32A(&la[n], la[n] * 0x01010101); \
2220         } else { \
2221             for (n = 0; n < end; n += step) \
2222                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2223         } \
2224     } else /* step == 8 */ { \
2225         if (cond) { \
2226             if (HAVE_FAST_64BIT) { \
2227                 for (n = 0; n < end; n += step) \
2228                     AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2229             } else { \
2230                 for (n = 0; n < end; n += step) { \
2231                     uint32_t v32 = la[n] * 0x01010101; \
2232                     AV_WN32A(&la[n],     v32); \
2233                     AV_WN32A(&la[n + 4], v32); \
2234                 } \
2235             } \
2236         } else { \
2237             for (n = 0; n < end; n += step) \
2238                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2239         } \
2240     }
2241 #define SPLAT_CTX(step) \
2242     do { \
2243         SPLAT(a, end_x, step, end_x == w4); \
2244         SPLAT(l, end_y, step, end_y == h4); \
2245     } while (0)
2246
2247     /* y tokens */
2248     switch (b->tx) {
2249     case TX_4X4:
             // block sizes past BS_8x8 in the enum use one b->mode[] entry
             // per 4x4 block (index n), everything else uses b->mode[0]
2250         DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2251         break;
2252     case TX_8X8:
2253         MERGE_CTX(2, AV_RN16A);
2254         DECODE_Y_COEF_LOOP(2, 0,);
2255         SPLAT_CTX(2);
2256         break;
2257     case TX_16X16:
2258         MERGE_CTX(4, AV_RN32A);
2259         DECODE_Y_COEF_LOOP(4, 0,);
2260         SPLAT_CTX(4);
2261         break;
2262     case TX_32X32:
2263         MERGE_CTX(8, AV_RN64A);
             // the only luma case that goes through decode_coeffs_b32()
2264         DECODE_Y_COEF_LOOP(8, 0, 32);
2265         SPLAT_CTX(8);
2266         break;
2267     }
2268
     // DECODE_UV_COEF_LOOP: same walk as the luma loop, but for chroma plane
     // `pl`: fixed uvscan/uvnb tables, qmul[1], output to s->uvblock[pl] and
     // s->uveob[pl].
2269 #define DECODE_UV_COEF_LOOP(step) \
2270     for (n = 0, y = 0; y < end_y; y += step) { \
2271         for (x = 0; x < end_x; x += step, n += step * step) { \
2272             res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2273                                   16 * step * step, c, e, p, a[x] + l[y], \
2274                                   uvscan, uvnb, uv_band_counts, qmul[1]); \
2275             a[x] = l[y] = !!res; \
2276             if (step >= 4) { \
2277                 AV_WN16A(&s->uveob[pl][n], res); \
2278             } else { \
2279                 s->uveob[pl][n] = res; \
2280             } \
2281         } \
2282     }
2283
         // switch probabilities/counters to the uv tables and halve all
         // extents for the chroma planes
2284     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2285     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2286     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2287     w4 >>= 1;
2288     h4 >>= 1;
2289     end_x >>= 1;
2290     end_y >>= 1;
2291     for (pl = 0; pl < 2; pl++) {
             // chroma contexts hold one entry per column / row
2292         a = &s->above_uv_nnz_ctx[pl][col];
2293         l = &s->left_uv_nnz_ctx[pl][row & 7];
2294         switch (b->uvtx) {
2295         case TX_4X4:
2296             DECODE_UV_COEF_LOOP(1);
2297             break;
2298         case TX_8X8:
2299             MERGE_CTX(2, AV_RN16A);
2300             DECODE_UV_COEF_LOOP(2);
2301             SPLAT_CTX(2);
2302             break;
2303         case TX_16X16:
2304             MERGE_CTX(4, AV_RN32A);
2305             DECODE_UV_COEF_LOOP(4);
2306             SPLAT_CTX(4);
2307             break;
2308         case TX_32X32:
2309             MERGE_CTX(8, AV_RN64A);
2310             // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
2311             // so there is no need to loop
2312             res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2313                                     1024, c, e, p, a[0] + l[0],
2314                                     uvscan, uvnb, uv_band_counts, qmul[1]);
2315             a[0] = l[0] = !!res;
2316             AV_WN16A(&s->uveob[pl][0], res);
2317             SPLAT_CTX(8);
2318             break;
2319         }
2320     }
2321 }
2322
/**
 * Pick the intra prediction mode to actually run for one transform block,
 * given which neighbouring edges exist, and prepare the top and left edge
 * pixels that this mode reads.
 *
 * @param s            decoder context
 * @param mode         requested intra mode, asserted to be in 0..9
 * @param a            in: points at a caller-provided buffer for the top
 *                     edge; out: either still that buffer (now filled) or
 *                     redirected to point straight at the pixels above the
 *                     block when they can be used in place
 * @param dst_edge     pixels used for the row above when y == 0 and for the
 *                     column to the left when x == 0
 * @param stride_edge  stride of dst_edge
 * @param dst_inner    pixels used for the above row / left column otherwise
 * @param stride_inner stride of dst_inner
 * @param l            output buffer for the left edge; 4 << tx bytes are
 *                     written when the mode needs the left edge
 * @param col          block column, compared against the tile start and
 *                     s->cols
 * @param x            horizontal transform-block offset inside the block
 * @param w            block width in the same units as x
 * @param row          block row, compared against s->rows
 * @param y            vertical transform-block offset inside the block
 * @param tx           transform size; the edge length is 4 << tx pixels
 * @param p            plane index; for p == 0 the remaining column/row count
 *                     is doubled before computing the available pixels
 * @return the mode after substitution through mode_conv[]
 */
2323 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2324                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2325                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2326                                              uint8_t *l, int col, int x, int w,
2327                                              int row, int y, enum TxfmMode tx,
2328                                              int p)
2329 {
         // edge availability: anything but the very first row has a top
         // edge, anything but the first column of the tile has a left edge
2330     int have_top = row > 0 || y > 0;
2331     int have_left = col > s->tiling.tile_col_start || x > 0;
2332     int have_right = x < w - 1;
         // replacement mode for each requested mode, depending on whether the
         // left and top edges are available
2333     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2334         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2335                                    { DC_127_PRED,          VERT_PRED } },
2336         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2337                                    { HOR_PRED,             HOR_PRED } },
2338         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2339                                    { LEFT_DC_PRED,         DC_PRED } },
2340         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2341                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2342         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2343                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2344         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2345                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2346         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2347                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2348         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2349                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2350         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2351                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2352         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2353                                    { HOR_PRED,             TM_VP8_PRED } },
2354     };
         // which edge pixels each (already converted) mode reads; invert_left
         // selects the order in which the left column is stored in l[]
2355     static const struct {
2356         uint8_t needs_left:1;
2357         uint8_t needs_top:1;
2358         uint8_t needs_topleft:1;
2359         uint8_t needs_topright:1;
2360         uint8_t invert_left:1;
2361     } edges[N_INTRA_PRED_MODES] = {
2362         [VERT_PRED]            = { .needs_top  = 1 },
2363         [HOR_PRED]             = { .needs_left = 1 },
2364         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2365         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2366         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2367         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2368         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2369         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2370         [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
2371         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2372         [LEFT_DC_PRED]         = { .needs_left = 1 },
2373         [TOP_DC_PRED]          = { .needs_top  = 1 },
2374         [DC_128_PRED]          = { 0 },
2375         [DC_127_PRED]          = { 0 },
2376         [DC_129_PRED]          = { 0 }
2377     };
2378
2379     av_assert2(mode >= 0 && mode < 10);
2380     mode = mode_conv[mode][have_left][have_top];
2381     if (edges[mode].needs_top) {
2382         uint8_t *top, *topleft;
             // pixels the predictor wants vs. pixels that exist to the right
             // of this position before the s->cols limit
2383         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2384         int n_px_need_tr = 0;
2385
             // only 4x4 transforms ever request 4 extra top-right pixels
2386         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2387             n_px_need_tr = 4;
2388
2389         // if top of sb64-row, use s->intra_pred_data[] instead of
2390         // dst[-stride] for intra prediction (it contains pre- instead of
2391         // post-loopfilter data)
2392         if (have_top) {
2393             top = !(row & 7) && !y ?
2394                 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2395                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
                 // topleft is only assigned (and only read below) when both
                 // the top and the left edge exist
2396             if (have_left)
2397                 topleft = !(row & 7) && !y ?
2398                     s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2399                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2400                     &dst_inner[-stride_inner];
2401         }
2402
             // fast path: every pixel the mode reads (top, top-left at
             // top[-1], top-right) is available contiguously at `top`, so the
             // caller's pointer is simply redirected there
2403         if (have_top &&
2404             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2405             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2406             n_px_need + n_px_need_tr <= n_px_have) {
2407             *a = top;
2408         } else {
                 // slow path: assemble the top edge in the caller's buffer
2409             if (have_top) {
2410                 if (n_px_need <= n_px_have) {
2411                     memcpy(*a, top, n_px_need);
2412                 } else {
                         // copy what exists, then repeat the last copied pixel
2413                     memcpy(*a, top, n_px_have);
2414                     memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2415                            n_px_need - n_px_have);
2416                 }
2417             } else {
                     // no top edge at all: constant 127
2418                 memset(*a, 127, n_px_need);
2419             }
2420             if (edges[mode].needs_topleft) {
2421                 if (have_left && have_top) {
2422                     (*a)[-1] = topleft[-1];
2423                 } else {
                         // substitute constant: 129 with a top edge, 127 without
2424                     (*a)[-1] = have_top ? 129 : 127;
2425                 }
2426             }
2427             if (tx == TX_4X4 && edges[mode].needs_topright) {
2428                 if (have_top && have_right &&
2429                     n_px_need + n_px_need_tr <= n_px_have) {
2430                     memcpy(&(*a)[4], &top[4], 4);
2431                 } else {
                         // top-right missing: repeat the last top pixel
2432                     memset(&(*a)[4], (*a)[3], 4);
2433                 }
2434             }
2435         }
2436     }
2437     if (edges[mode].needs_left) {
2438         if (have_left) {
                 // rows the predictor wants vs. rows that exist below this
                 // position before the s->rows limit
2439             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2440             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2441             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2442
2443             if (edges[mode].invert_left) {
                     // l[0] holds the topmost left pixel; missing rows at the
                     // end repeat the last available one
2444                 if (n_px_need <= n_px_have) {
2445                     for (i = 0; i < n_px_need; i++)
2446                         l[i] = dst[i * stride - 1];
2447                 } else {
2448                     for (i = 0; i < n_px_have; i++)
2449                         l[i] = dst[i * stride - 1];
2450                     memset(&l[n_px_have], l[n_px_have - 1], n_px_need - n_px_have);
2451                 }
2452             } else {
                     // reversed storage: the topmost left pixel ends up in
                     // l[n_px_need - 1]; missing rows fill the start of l[]
                     // with the last available pixel
2453                 if (n_px_need <= n_px_have) {
2454                     for (i = 0; i < n_px_need; i++)
2455                         l[n_px_need - 1 - i] = dst[i * stride - 1];
2456                 } else {
2457                     for (i = 0; i < n_px_have; i++)
2458                         l[n_px_need - 1 - i] = dst[i * stride - 1];
2459                     memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2460                 }
2461             }
2462         } else {
                 // no left edge at all: constant 129
2463             memset(l, 129, 4 << tx);
2464         }
2465     }
2466
2467     return mode;
2468 }
2469
2470 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2471 {
2472     VP9Context *s = ctx->priv_data;
2473     VP9Block *b = s->b;
2474     int row = s->row, col = s->col;
2475     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2476     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2477     int end_x = FFMIN(2 * (s->cols - col), w4);
2478     int end_y = FFMIN(2 * (s->rows - row), h4);
2479     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2480     int uvstep1d = 1 << b->uvtx, p;
2481     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2482     LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2483     LOCAL_ALIGNED_32(uint8_t, l, [32]);
2484
2485     for (n = 0, y = 0; y < end_y; y += step1d) {
2486         uint8_t *ptr = dst, *ptr_r = dst_r;
2487         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2488                                ptr_r += 4 * step1d, n += step) {
2489             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2490                                y * 2 + x : 0];
2491             uint8_t *a = &a_buf[32];
2492             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2493             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2494
2495             mode = check_intra_mode(s, mode, &a, ptr_r,
2496                                     s->frames[CUR_FRAME].tf.f->linesize[0],
2497                                     ptr, s->y_stride, l,
2498                                     col, x, w4, row, y, b->tx, 0);
2499             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2500             if (eob)
2501                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2502                                            s->block + 16 * n, eob);
2503         }
2504         dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2505         dst   += 4 * step1d * s->y_stride;
2506     }
2507
2508     // U/V
2509     w4 >>= 1;
2510     end_x >>= 1;
2511     end_y >>= 1;
2512     step = 1 << (b->uvtx * 2);
2513     for (p = 0; p < 2; p++) {
2514         dst   = s->dst[1 + p];
2515         dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2516         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2517             uint8_t *ptr = dst, *ptr_r = dst_r;
2518             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2519                                    ptr_r += 4 * uvstep1d, n += step) {
2520                 int mode = b->uvmode;
2521                 uint8_t *a = &a_buf[16];
2522                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2523
2524                 mode = check_intra_mode(s, mode, &a, ptr_r,
2525                                         s->frames[CUR_FRAME].tf.f->linesize[1],
2526                                         ptr, s->uv_stride, l,
2527                                         col, x, w4, row, y, b->uvtx, p + 1);
2528                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2529                 if (eob)
2530                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2531                                                     s->uvblock[p] + 16 * n, eob);
2532             }
2533             dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2534             dst   += 4 * uvstep1d * s->uv_stride;
2535         }
2536     }
2537 }
2538
2539 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2540                                          uint8_t *dst, ptrdiff_t dst_stride,
2541                                          const uint8_t *ref, ptrdiff_t ref_stride,
2542                                          ThreadFrame *ref_frame,
2543                                          ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2544                                          int bw, int bh, int w, int h)
2545 {
2546     int mx = mv->x, my = mv->y, th;
2547
2548     y += my >> 3;
2549     x += mx >> 3;
2550     ref += y * ref_stride + x;
2551     mx &= 7;
2552     my &= 7;
2553     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2554     // we use +7 because the last 7 pixels of each sbrow can be changed in
2555     // the longest loopfilter of the next sbrow
2556     th = (y + bh + 4 * !!my + 7) >> 6;
2557     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2558     if (x < !!mx * 3 || y < !!my * 3 ||
2559         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2560         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2561                                  ref - !!my * 3 * ref_stride - !!mx * 3,
2562                                  80, ref_stride,
2563                                  bw + !!mx * 7, bh + !!my * 7,
2564                                  x - !!mx * 3, y - !!my * 3, w, h);
2565         ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2566         ref_stride = 80;
2567     }
2568     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2569 }
2570
2571 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2572                                            uint8_t *dst_u, uint8_t *dst_v,
2573                                            ptrdiff_t dst_stride,
2574                                            const uint8_t *ref_u, ptrdiff_t src_stride_u,
2575                                            const uint8_t *ref_v, ptrdiff_t src_stride_v,
2576                                            ThreadFrame *ref_frame,
2577                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2578                                            int bw, int bh, int w, int h)
2579 {
2580     int mx = mv->x, my = mv->y, th;
2581
2582     y += my >> 4;
2583     x += mx >> 4;
2584     ref_u += y * src_stride_u + x;
2585     ref_v += y * src_stride_v + x;
2586     mx &= 15;
2587     my &= 15;
2588     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2589     // we use +7 because the last 7 pixels of each sbrow can be changed in
2590     // the longest loopfilter of the next sbrow
2591     th = (y + bh + 4 * !!my + 7) >> 5;
2592     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2593     if (x < !!mx * 3 || y < !!my * 3 ||
2594         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2595         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2596                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2597                                  80, src_stride_u,
2598                                  bw + !!mx * 7, bh + !!my * 7,
2599                                  x - !!mx * 3, y - !!my * 3, w, h);
2600         ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2601         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2602
2603         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2604                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2605                                  80, src_stride_v,
2606                                  bw + !!mx * 7, bh + !!my * 7,
2607                                  x - !!mx * 3, y - !!my * 3, w, h);
2608         ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2609         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2610     } else {
2611         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2612         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2613     }
2614 }
2615
/**
 * Reconstruct the current inter-coded block (s->b).
 *
 * Luma and chroma are predicted by motion compensation from the first
 * reference and, when b->comp is set, from a second reference as well.
 * Unless b->skip is set, the residual of every transform block with a
 * non-zero end-of-block value is then added with s->dsp.itxfm_add.
 *
 * @param ctx codec context; ctx->priv_data is used as the VP9Context
 */
2616 static void inter_recon(AVCodecContext *ctx)
2617 {
         // first index into s->dsp.mc[] for each block size: row 0 is used
         // for luma, row 1 for chroma
2618     static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2619         { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2620         { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2621     };
2622     VP9Context *s = ctx->priv_data;
2623     VP9Block *b = s->b;
2624     int row = s->row, col = s->col;
2625     ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2626     AVFrame *ref1 = tref1->f, *ref2;
2627     int w1 = ref1->width, h1 = ref1->height, w2, h2;
2628     ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2629
         // the second reference is only set up for compound prediction;
         // tref2/ref2/w2/h2 are never read unless b->comp is set
2630     if (b->comp) {
2631         tref2 = &s->refs[s->refidx[b->ref[1]]];
2632         ref2 = tref2->f;
2633         w2 = ref2->width;
2634         h2 = ref2->height;
2635     }
2636
2637     // y inter pred
         // block sizes past BS_8x8 in the enum are predicted in several
         // pieces, each with its own motion vector b->mv[i]; positions are
         // passed as row << 3 / col << 3, plus 4 for the second half.
         // The [0] entry of the MC table is used for the first reference and
         // the [1] entry for the second (presumably the averaging variant;
         // confirm in vp9dsp).
2638     if (b->bs > BS_8x8) {
2639         if (b->bs == BS_8x4) {
                 // top half uses mv[0], bottom half (4 rows down) uses mv[2]
2640             mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2641                         ref1->data[0], ref1->linesize[0], tref1,
2642                         row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2643             mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2644                         s->dst[0] + 4 * ls_y, ls_y,
2645                         ref1->data[0], ref1->linesize[0], tref1,
2646                         (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2647
2648             if (b->comp) {
2649                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2650                             ref2->data[0], ref2->linesize[0], tref2,
2651                             row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2652                 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2653                             s->dst[0] + 4 * ls_y, ls_y,
2654                             ref2->data[0], ref2->linesize[0], tref2,
2655                             (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2656             }
2657         } else if (b->bs == BS_4x8) {
                 // left half uses mv[0], right half (4 columns over) uses mv[1]
2658             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2659                         ref1->data[0], ref1->linesize[0], tref1,
2660                         row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2661             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2662                         ref1->data[0], ref1->linesize[0], tref1,
2663                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2664
2665             if (b->comp) {
2666                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2667                             ref2->data[0], ref2->linesize[0], tref2,
2668                             row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2669                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2670                             ref2->data[0], ref2->linesize[0], tref2,
2671                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2672             }
2673         } else {
2674             av_assert2(b->bs == BS_4x4);
2675
                 // four 4x4 quadrants in raster order, using mv[0]..mv[3]
2676             // FIXME if two horizontally adjacent blocks have the same MV,
2677             // do a w8 instead of a w4 call
2678             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2679                         ref1->data[0], ref1->linesize[0], tref1,
2680                         row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2681             mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2682                         ref1->data[0], ref1->linesize[0], tref1,
2683                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2684             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2685                         s->dst[0] + 4 * ls_y, ls_y,
2686                         ref1->data[0], ref1->linesize[0], tref1,
2687                         (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2688             mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2689                         s->dst[0] + 4 * ls_y + 4, ls_y,
2690                         ref1->data[0], ref1->linesize[0], tref1,
2691                         (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2692
2693             if (b->comp) {
2694                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2695                             ref2->data[0], ref2->linesize[0], tref2,
2696                             row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2697                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2698                             ref2->data[0], ref2->linesize[0], tref2,
2699                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2700                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2701                             s->dst[0] + 4 * ls_y, ls_y,
2702                             ref2->data[0], ref2->linesize[0], tref2,
2703                             (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2704                 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2705                             s->dst[0] + 4 * ls_y + 4, ls_y,
2706                             ref2->data[0], ref2->linesize[0], tref2,
2707                             (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2708             }
2709         }
2710     } else {
             // all other block sizes: one call per reference using mv[0],
             // with the block dimensions taken from bwh_tab[0]
2711         int bwl = bwlog_tab[0][b->bs];
2712         int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2713
2714         mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2715                     ref1->data[0], ref1->linesize[0], tref1,
2716                     row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2717
2718         if (b->comp)
2719             mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2720                         ref2->data[0], ref2->linesize[0], tref2,
2721                         row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2722     }
2723
2724     // uv inter pred
2725     {
2726         int bwl = bwlog_tab[1][b->bs];
2727         int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2728         VP56mv mvuv;
2729
             // chroma reference dimensions: half the luma ones, rounded up
2730         w1 = (w1 + 1) >> 1;
2731         h1 = (h1 + 1) >> 1;
2732         if (b->comp) {
2733             w2 = (w2 + 1) >> 1;
2734             h2 = (h2 + 1) >> 1;
2735         }
             // chroma is always predicted in one piece per plane; for the
             // split block sizes its MV is the rounded mean of the four luma
             // MVs, otherwise it is mv[0] itself
2736         if (b->bs > BS_8x8) {
2737             mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2738             mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2739         } else {
2740             mvuv = b->mv[0][0];
2741         }
2742
2743         mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2744                       s->dst[1], s->dst[2], ls_uv,
2745                       ref1->data[1], ref1->linesize[1],
2746                       ref1->data[2], ref1->linesize[2], tref1,
2747                       row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2748
2749         if (b->comp) {
                 // same derivation, using the second reference's MVs
2750             if (b->bs > BS_8x8) {
2751                 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2752                 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2753             } else {
2754                 mvuv = b->mv[0][1];
2755             }
2756             mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2757                           s->dst[1], s->dst[2], ls_uv,
2758                           ref2->data[1], ref2->linesize[1],
2759                           ref2->data[2], ref2->linesize[2], tref2,
2760                           row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2761         }
2762     }
2763
         // residual: only present when the block is not skipped
2764     if (!b->skip) {
2765         /* mostly copied from intra_recon() */
2766
2767         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2768         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
             // loop limits, clipped against s->cols / s->rows
2769         int end_x = FFMIN(2 * (s->cols - col), w4);
2770         int end_y = FFMIN(2 * (s->rows - row), h4);
             // inverse-transform table indices; s->lossless shifts them by 4
2771         int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2772         int uvstep1d = 1 << b->uvtx, p;
2773         uint8_t *dst = s->dst[0];
2774
2775         // y itxfm add
2776         for (n = 0, y = 0; y < end_y; y += step1d) {
2777             uint8_t *ptr = dst;
2778             for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
                     // eob is kept as a 16-bit value for transforms above 8x8
2779                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2780
                     // inter blocks always use the DCT_DCT transform entry
2781                 if (eob)
2782                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2783                                                   s->block + 16 * n, eob);
2784             }
2785             dst += 4 * s->y_stride * step1d;
2786         }
2787
2788         // uv itxfm add
2789         end_x >>= 1;
2790         end_y >>= 1;
2791         step = 1 << (b->uvtx * 2);
2792         for (p = 0; p < 2; p++) {
2793             dst = s->dst[p + 1];
2794             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2795                 uint8_t *ptr = dst;
2796                 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2797                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2798
2799                     if (eob)
2800                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2801                                                         s->uvblock[p] + 16 * n, eob);
2802                 }
2803                 dst += 4 * uvstep1d * s->uv_stride;
2804             }
2805         }
2806     }
2807 }
2808
2809 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2810                                         int row_and_7, int col_and_7,
2811                                         int w, int h, int col_end, int row_end,
2812                                         enum TxfmMode tx, int skip_inter)
2813 {
2814     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2815     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2816     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2817     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2818
2819     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2820     // edges. This means that for UV, we work on two subsampled blocks at
2821     // a time, and we only use the topleft block's mode information to set
2822     // things like block strength. Thus, for any block size smaller than
2823     // 16x16, ignore the odd portion of the block.
2824     if (tx == TX_4X4 && is_uv) {
2825         if (h == 1) {
2826             if (row_and_7 & 1)
2827                 return;
2828             if (!row_end)
2829                 h += 1;
2830         }
2831         if (w == 1) {
2832             if (col_and_7 & 1)
2833                 return;
2834             if (!col_end)
2835                 w += 1;
2836         }
2837     }
2838
2839     if (tx == TX_4X4 && !skip_inter) {
2840         int t = 1 << col_and_7, m_col = (t << w) - t, y;
2841         int m_col_odd = (t << (w - 1)) - t;
2842
2843         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2844         if (is_uv) {
2845             int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2846
2847             for (y = row_and_7; y < h + row_and_7; y++) {
2848                 int col_mask_id = 2 - !(y & 7);
2849
2850                 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2851                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2852                 // for odd lines, if the odd col is not being filtered,
2853                 // skip odd row also:
2854                 // .---. <-- a
2855                 // |   |
2856                 // |___| <-- b
2857                 // ^   ^
2858                 // c   d
2859                 //
2860                 // if a/c are even row/col and b/d are odd, and d is skipped,
2861                 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2862                 if ((col_end & 1) && (y & 1)) {
2863                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2864                 } else {
2865                     lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2866                 }
2867             }
2868         } else {
2869             int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2870
2871             for (y = row_and_7; y < h + row_and_7; y++) {
2872                 int col_mask_id = 2 - !(y & 3);
2873
2874                 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2875                 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2876                 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2877                 lflvl->mask[is_uv][0][y][3] |= m_col;
2878                 lflvl->mask[is_uv][1][y][3] |= m_col;
2879             }
2880         }
2881     } else {
2882         int y, t = 1 << col_and_7, m_col = (t << w) - t;
2883
2884         if (!skip_inter) {
2885             int mask_id = (tx == TX_8X8);
2886             int l2 = tx + is_uv - 1, step1d = 1 << l2;
2887             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2888             int m_row = m_col & masks[l2];
2889
2890             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2891             // 8wd loopfilter to prevent going off the visible edge.
2892             if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2893                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2894                 int m_row_8 = m_row - m_row_16;
2895
2896                 for (y = row_and_7; y < h + row_and_7; y++) {
2897                     lflvl->mask[is_uv][0][y][0] |= m_row_16;
2898                     lflvl->mask[is_uv][0][y][1] |= m_row_8;
2899                 }
2900             } else {
2901                 for (y = row_and_7; y < h + row_and_7; y++)
2902                     lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2903             }
2904
2905             if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2906                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2907                     lflvl->mask[is_uv][1][y][0] |= m_col;
2908                 if (y - row_and_7 == h - 1)
2909                     lflvl->mask[is_uv][1][y][1] |= m_col;
2910             } else {
2911                 for (y = row_and_7; y < h + row_and_7; y += step1d)
2912                     lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2913             }
2914         } else if (tx != TX_4X4) {
2915             int mask_id;
2916
2917             mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2918             lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2919             mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2920             for (y = row_and_7; y < h + row_and_7; y++)
2921                 lflvl->mask[is_uv][0][y][mask_id] |= t;
2922         } else if (is_uv) {
2923             int t8 = t & 0x01, t4 = t - t8;
2924
2925             for (y = row_and_7; y < h + row_and_7; y++) {
2926                 lflvl->mask[is_uv][0][y][2] |= t4;
2927                 lflvl->mask[is_uv][0][y][1] |= t8;
2928             }
2929             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2930         } else {
2931             int t8 = t & 0x11, t4 = t - t8;
2932
2933             for (y = row_and_7; y < h + row_and_7; y++) {
2934                 lflvl->mask[is_uv][0][y][2] |= t4;
2935                 lflvl->mask[is_uv][0][y][1] |= t8;
2936             }
2937             lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2938         }
2939     }
2940 }
2941
2942 static void decode_b(AVCodecContext *ctx, int row, int col,
2943                      struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2944                      enum BlockLevel bl, enum BlockPartition bp)
2945 {
2946     VP9Context *s = ctx->priv_data;
2947     VP9Block *b = s->b;
2948     enum BlockSize bs = bl * 3 + bp;
2949     int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2950     int emu[2];
2951     AVFrame *f = s->frames[CUR_FRAME].tf.f;
2952
2953     s->row = row;
2954     s->row7 = row & 7;
2955     s->col = col;
2956     s->col7 = col & 7;
2957     s->min_mv.x = -(128 + col * 64);
2958     s->min_mv.y = -(128 + row * 64);
2959     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2960     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2961     if (s->pass < 2) {
2962         b->bs = bs;
2963         b->bl = bl;
2964         b->bp = bp;
2965         decode_mode(ctx);
2966         b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2967
2968         if (!b->skip) {
2969             decode_coeffs(ctx);
2970         } else {
2971             int row7 = s->row7;
2972
2973 #define SPLAT_ZERO_CTX(v, n) \
2974     switch (n) { \
2975     case 1:  v = 0;          break; \
2976     case 2:  AV_ZERO16(&v);  break; \
2977     case 4:  AV_ZERO32(&v);  break; \
2978     case 8:  AV_ZERO64(&v);  break; \
2979     case 16: AV_ZERO128(&v); break; \
2980     }
2981 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2982     do { \
2983         SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2984         SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2985         SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2986     } while (0)
2987
2988             switch (w4) {
2989             case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2990             case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2991             case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2992             case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2993             }
2994             switch (h4) {
2995             case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2996             case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2997             case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2998             case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2999             }
3000         }
3001         if (s->pass == 1) {
3002             s->b++;
3003             s->block += w4 * h4 * 64;
3004             s->uvblock[0] += w4 * h4 * 16;
3005             s->uvblock[1] += w4 * h4 * 16;
3006             s->eob += 4 * w4 * h4;
3007             s->uveob[0] += w4 * h4;
3008             s->uveob[1] += w4 * h4;
3009
3010             return;
3011         }
3012     }
3013
3014     // emulated overhangs if the stride of the target buffer can't hold. This
3015     // allows to support emu-edge and so on even if we have large block
3016     // overhangs
3017     emu[0] = (col + w4) * 8 > f->linesize[0] ||
3018              (row + h4) > s->rows;
3019     emu[1] = (col + w4) * 4 > f->linesize[1] ||
3020              (row + h4) > s->rows;
3021     if (emu[0]) {
3022         s->dst[0] = s->tmp_y;
3023         s->y_stride = 64;
3024     } else {
3025         s->dst[0] = f->data[0] + yoff;
3026         s->y_stride = f->linesize[0];
3027     }
3028     if (emu[1]) {
3029         s->dst[1] = s->tmp_uv[0];
3030         s->dst[2] = s->tmp_uv[1];
3031         s->uv_stride = 32;
3032     } else {
3033         s->dst[1] = f->data[1] + uvoff;
3034         s->dst[2] = f->data[2] + uvoff;
3035         s->uv_stride = f->linesize[1];
3036     }
3037     if (b->intra) {
3038         intra_recon(ctx, yoff, uvoff);
3039     } else {
3040         inter_recon(ctx);
3041     }
3042     if (emu[0]) {
3043         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3044
3045         for (n = 0; o < w; n++) {
3046             int bw = 64 >> n;
3047
3048             av_assert2(n <= 4);
3049             if (w & bw) {
3050                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3051                                          s->tmp_y + o, 64, h, 0, 0);
3052                 o += bw;
3053             }
3054         }
3055     }
3056     if (emu[1]) {
3057         int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3058
3059         for (n = 1; o < w; n++) {
3060             int bw = 64 >> n;
3061
3062             av_assert2(n <= 4);
3063             if (w & bw) {
3064                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3065                                          s->tmp_uv[0] + o, 32, h, 0, 0);
3066                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3067                                          s->tmp_uv[1] + o, 32, h, 0, 0);
3068                 o += bw;
3069             }
3070         }
3071     }
3072
3073     // pick filter level and find edges to apply filter to
3074     if (s->filter.level &&
3075         (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3076                                                     [b->mode[3] != ZEROMV]) > 0) {
3077         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3078         int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3079
3080         setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3081         mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3082         mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3083                    s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3084                    s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3085                    b->uvtx, skip_inter);
3086
3087         if (!s->filter.lim_lut[lvl]) {
3088             int sharp = s->filter.sharpness;
3089             int limit = lvl;
3090
3091             if (sharp > 0) {
3092                 limit >>= (sharp + 3) >> 2;
3093                 limit = FFMIN(limit, 9 - sharp);
3094             }
3095             limit = FFMAX(limit, 1);
3096
3097             s->filter.lim_lut[lvl] = limit;
3098             s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3099         }
3100     }
3101
3102     if (s->pass == 2) {
3103         s->b++;
3104         s->block += w4 * h4 * 64;
3105         s->uvblock[0] += w4 * h4 * 16;
3106         s->uvblock[1] += w4 * h4 * 16;
3107         s->eob += 4 * w4 * h4;
3108         s->uveob[0] += w4 * h4;
3109         s->uveob[1] += w4 * h4;
3110     }
3111 }
3112
3113 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3114                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3115 {
3116     VP9Context *s = ctx->priv_data;
3117     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3118             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3119     const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3120                                      s->prob.p.partition[bl][c];
3121     enum BlockPartition bp;
3122     ptrdiff_t hbs = 4 >> bl;
3123     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3124     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3125
3126     if (bl == BL_8X8) {
3127         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3128         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3129     } else if (col + hbs < s->cols) { // FIXME why not <=?
3130         if (row + hbs < s->rows) { // FIXME why not <=?
3131             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3132             switch (bp) {
3133             case PARTITION_NONE:
3134                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3135                 break;
3136             case PARTITION_H:
3137                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3138                 yoff  += hbs * 8 * y_stride;
3139                 uvoff += hbs * 4 * uv_stride;
3140                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3141                 break;
3142             case PARTITION_V:
3143                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3144                 yoff  += hbs * 8;
3145                 uvoff += hbs * 4;
3146                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3147                 break;
3148             case PARTITION_SPLIT:
3149                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3150                 decode_sb(ctx, row, col + hbs, lflvl,
3151                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3152                 yoff  += hbs * 8 * y_stride;
3153                 uvoff += hbs * 4 * uv_stride;
3154                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3155                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3156                           yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3157                 break;
3158             default:
3159                 av_assert0(0);
3160             }
3161         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3162             bp = PARTITION_SPLIT;
3163             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3164             decode_sb(ctx, row, col + hbs, lflvl,
3165                       yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3166         } else {
3167             bp = PARTITION_H;
3168             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3169         }
3170     } else if (row + hbs < s->rows) { // FIXME why not <=?
3171         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3172             bp = PARTITION_SPLIT;
3173             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3174             yoff  += hbs * 8 * y_stride;
3175             uvoff += hbs * 4 * uv_stride;
3176             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3177         } else {
3178             bp = PARTITION_V;
3179             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3180         }
3181     } else {
3182         bp = PARTITION_SPLIT;
3183         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3184     }
3185     s->counts.partition[bl][c][bp]++;
3186 }
3187
3188 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3189                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3190 {
3191     VP9Context *s = ctx->priv_data;
3192     VP9Block *b = s->b;
3193     ptrdiff_t hbs = 4 >> bl;
3194     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3195     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3196
3197     if (bl == BL_8X8) {
3198         av_assert2(b->bl == BL_8X8);
3199         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3200     } else if (s->b->bl == bl) {
3201         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3202         if (b->bp == PARTITION_H && row + hbs < s->rows) {
3203             yoff  += hbs * 8 * y_stride;
3204             uvoff += hbs * 4 * uv_stride;
3205             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3206         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3207             yoff  += hbs * 8;
3208             uvoff += hbs * 4;
3209             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3210         }
3211     } else {
3212         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3213         if (col + hbs < s->cols) { // FIXME why not <=?
3214             if (row + hbs < s->rows) {
3215                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3216                               uvoff + 4 * hbs, bl + 1);
3217                 yoff  += hbs * 8 * y_stride;
3218                 uvoff += hbs * 4 * uv_stride;
3219                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3220                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3221                                     yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3222             } else {
3223                 yoff  += hbs * 8;
3224                 uvoff += hbs * 4;
3225                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3226             }
3227         } else if (row + hbs < s->rows) {
3228             yoff  += hbs * 8 * y_stride;
3229             uvoff += hbs * 4 * uv_stride;
3230             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3231         }
3232     }
3233 }
3234
3235 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3236                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3237 {
3238     VP9Context *s = ctx->priv_data;
3239     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3240     uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3241     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3242     int y, x, p;
3243
3244     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3245     // if you think of them as acting on a 8x8 block max, we can interleave
3246     // each v/h within the single x loop, but that only works if we work on
3247     // 8 pixel blocks, and we won't always do that (we want at least 16px
3248     // to use SSE2 optimizations, perhaps 32 for AVX2)
3249
3250     // filter edges between columns, Y plane (e.g. block1 | block2)
3251     for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3252         uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3253         uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3254         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3255         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3256         unsigned hm = hm1 | hm2 | hm13 | hm23;
3257
3258         for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3259             if (hm1 & x) {
3260                 int L = *l, H = L >> 4;
3261                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3262
3263                 if (col || x > 1) {
3264                     if (hmask1[0] & x) {
3265                         if (hmask2[0] & x) {
3266                             av_assert2(l[8] == L);
3267                             s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3268                         } else {
3269                             s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3270                         }
3271                     } else if (hm2 & x) {
3272                         L = l[8];
3273                         H |= (L >> 4) << 8;
3274                         E |= s->filter.mblim_lut[L] << 8;
3275                         I |= s->filter.lim_lut[L] << 8;
3276                         s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3277                                                [!!(hmask2[1] & x)]
3278                                                [0](ptr, ls_y, E, I, H);
3279                     } else {
3280                         s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3281                                             [0](ptr, ls_y, E, I, H);
3282                     }
3283                 }
3284             } else if (hm2 & x) {
3285                 int L = l[8], H = L >> 4;
3286                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3287
3288                 if (col || x > 1) {
3289                     s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3290                                         [0](ptr + 8 * ls_y, ls_y, E, I, H);
3291                 }
3292             }
3293             if (hm13 & x) {
3294                 int L = *l, H = L >> 4;
3295                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3296
3297                 if (hm23 & x) {
3298                     L = l[8];
3299                     H |= (L >> 4) << 8;
3300                     E |= s->filter.mblim_lut[L] << 8;
3301                     I |= s->filter.lim_lut[L] << 8;
3302                     s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3303                 } else {
3304                     s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3305                 }
3306             } else if (hm23 & x) {
3307                 int L = l[8], H = L >> 4;
3308                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3309
3310                 s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3311             }
3312         }
3313     }
3314
3315     //                                          block1
3316     // filter edges between rows, Y plane (e.g. ------)
3317     //                                          block2
3318     dst = f->data[0] + yoff;
3319     lvl = lflvl->level;
3320     for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3321         uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3322         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3323
3324         for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3325             if (row || y) {
3326                 if (vm & x) {
3327                     int L = *l, H = L >> 4;
3328                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3329
3330                     if (vmask[0] & x) {
3331                         if (vmask[0] & (x << 1)) {
3332                             av_assert2(l[1] == L);
3333                             s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3334                         } else {
3335                             s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3336                         }
3337                     } else if (vm & (x << 1)) {
3338                         L = l[1];
3339                         H |= (L >> 4) << 8;
3340                         E |= s->filter.mblim_lut[L] << 8;
3341                         I |= s->filter.lim_lut[L] << 8;
3342                         s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3343                                                [!!(vmask[1] & (x << 1))]
3344                                                [1](ptr, ls_y, E, I, H);
3345                     } else {
3346                         s->dsp.loop_filter_8[!!(vmask[1] & x)]
3347                                             [1](ptr, ls_y, E, I, H);
3348                     }
3349                 } else if (vm & (x << 1)) {
3350                     int L = l[1], H = L >> 4;
3351                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3352
3353                     s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3354                                         [1](ptr + 8, ls_y, E, I, H);
3355                 }
3356             }
3357             if (vm3 & x) {
3358                 int L = *l, H = L >> 4;
3359                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3360
3361                 if (vm3 & (x << 1)) {
3362                     L = l[1];
3363                     H |= (L >> 4) << 8;
3364                     E |= s->filter.mblim_lut[L] << 8;
3365                     I |= s->filter.lim_lut[L] << 8;
3366                     s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3367                 } else {
3368                     s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3369                 }
3370             } else if (vm3 & (x << 1)) {
3371                 int L = l[1], H = L >> 4;
3372                 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3373
3374                 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3375             }
3376         }
3377     }
3378
3379     // same principle but for U/V planes
3380     for (p = 0; p < 2; p++) {
3381         lvl = lflvl->level;
3382         dst = f->data[1 + p] + uvoff;
3383         for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3384             uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3385             uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3386             unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3387             unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3388
3389             for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3390                 if (col || x > 1) {
3391                     if (hm1 & x) {
3392                         int L = *l, H = L >> 4;
3393                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3394
3395                         if (hmask1[0] & x) {
3396                             if (hmask2[0] & x) {
3397                                 av_assert2(l[16] == L);
3398                                 s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3399                             } else {
3400                                 s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3401                             }
3402                         } else if (hm2 & x) {
3403                             L = l[16];
3404                             H |= (L >> 4) << 8;
3405                             E |= s->filter.mblim_lut[L] << 8;
3406                             I |= s->filter.lim_lut[L] << 8;
3407                             s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3408                                                    [!!(hmask2[1] & x)]
3409                                                    [0](ptr, ls_uv, E, I, H);
3410                         } else {
3411                             s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3412                                                 [0](ptr, ls_uv, E, I, H);
3413                         }
3414                     } else if (hm2 & x) {
3415                         int L = l[16], H = L >> 4;
3416                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3417
3418                         s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3419                                             [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3420                     }
3421                 }
3422                 if (x & 0xAA)
3423                     l += 2;
3424             }
3425         }
3426         lvl = lflvl->level;
3427         dst = f->data[1 + p] + uvoff;
3428         for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3429             uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3430             unsigned vm = vmask[0] | vmask[1] | vmask[2];
3431
3432             for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3433                 if (row || y) {
3434                     if (vm & x) {
3435                         int L = *l, H = L >> 4;
3436                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3437
3438                         if (vmask[0] & x) {
3439                             if (vmask[0] & (x << 2)) {
3440                                 av_assert2(l[2] == L);
3441                                 s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3442                             } else {
3443                                 s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3444                             }
3445                         } else if (vm & (x << 2)) {
3446                             L = l[2];
3447                             H |= (L >> 4) << 8;
3448                             E |= s->filter.mblim_lut[L] << 8;
3449                             I |= s->filter.lim_lut[L] << 8;
3450                             s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
3451                                                    [!!(vmask[1] & (x << 2))]
3452                                                    [1](ptr, ls_uv, E, I, H);
3453                         } else {
3454                             s->dsp.loop_filter_8[!!(vmask[1] & x)]
3455                                                 [1](ptr, ls_uv, E, I, H);
3456                         }
3457                     } else if (vm & (x << 2)) {
3458                         int L = l[2], H = L >> 4;
3459                         int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3460
3461                         s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3462                                             [1](ptr + 8, ls_uv, E, I, H);
3463                     }
3464                 }
3465             }
3466             if (y & 1)
3467                 lvl += 16;
3468         }
3469     }
3470 }
3471
/**
 * Compute the bounds of tile idx when n units are divided into
 * (1 << log2_n) tiles.
 *
 * Both bounds are clamped to n and then multiplied by 8 (presumably n
 * counts superblocks and the result is in 8-pixel block units; confirm
 * against the caller).
 *
 * @param start   receives the first position of the tile
 * @param end     receives the position one past the tile
 * @param idx     tile index
 * @param log2_n  log2 of the number of tiles
 * @param n       number of units being divided
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    const int first = (idx * n) >> log2_n;
    const int limit = ((idx + 1) * n) >> log2_n;

    *start = (first > n ? n : first) << 3;
    *end   = (limit > n ? n : limit) << 3;
}
3479
3480 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3481                                         int max_count, int update_factor)
3482 {
3483     unsigned ct = ct0 + ct1, p2, p1;
3484
3485     if (!ct)
3486         return;
3487
3488     p1 = *p;
3489     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3490     p2 = av_clip(p2, 1, 255);
3491     ct = FFMIN(ct, max_count);
3492     update_factor = FASTDIV(update_factor * ct, max_count);
3493
3494     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3495     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3496 }
3497
3498 static void adapt_probs(VP9Context *s)
3499 {
3500     int i, j, k, l, m;
3501     prob_context *p = &s->prob_ctx[s->framectxid].p;
3502     int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3503
3504     // coefficients
3505     for (i = 0; i < 4; i++)
3506         for (j = 0; j < 2; j++)
3507             for (k = 0; k < 2; k++)
3508                 for (l = 0; l < 6; l++)
3509                     for (m = 0; m < 6; m++) {
3510                         uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3511                         unsigned *e = s->counts.eob[i][j][k][l][m];
3512                         unsigned *c = s->counts.coef[i][j][k][l][m];
3513
3514                         if (l == 0 && m >= 3) // dc only has 3 pt
3515                             break;
3516
3517                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3518                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3519                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3520                     }
3521
3522     if (s->keyframe || s->intraonly) {
3523         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3524         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3525         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3526         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3527         return;
3528     }
3529
3530     // skip flag
3531     for (i = 0; i < 3; i++)
3532         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3533
3534     // intra/inter flag
3535     for (i = 0; i < 4; i++)
3536         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3537
3538     // comppred flag
3539     if (s->comppredmode == PRED_SWITCHABLE) {
3540       for (i = 0; i < 5; i++)
3541           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3542     }
3543
3544     // reference frames
3545     if (s->comppredmode != PRED_SINGLEREF) {
3546       for (i = 0; i < 5; i++)
3547           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3548                      s->counts.comp_ref[i][1], 20, 128);
3549     }
3550
3551     if (s->comppredmode != PRED_COMPREF) {
3552       for (i = 0; i < 5; i++) {
3553           uint8_t *pp = p->single_ref[i];
3554           unsigned (*c)[2] = s->counts.single_ref[i];
3555
3556           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3557           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3558       }
3559     }
3560
3561     // block partitioning
3562     for (i = 0; i < 4; i++)
3563         for (j = 0; j < 4; j++) {
3564             uint8_t *pp = p->partition[i][j];
3565             unsigned *c = s->counts.partition[i][j];
3566
3567             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3568             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3569             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3570         }
3571
3572     // tx size
3573     if (s->txfmmode == TX_SWITCHABLE) {
3574       for (i = 0; i < 2; i++) {
3575           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3576
3577           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3578           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3579           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3580           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3581           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3582           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3583       }
3584     }
3585
3586     // interpolation filter
3587     if (s->filtermode == FILTER_SWITCHABLE) {
3588         for (i = 0; i < 4; i++) {
3589             uint8_t *pp = p->filter[i];
3590             unsigned *c = s->counts.filter[i];
3591
3592             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3593             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3594         }
3595     }
3596
3597     // inter modes
3598     for (i = 0; i < 7; i++) {
3599         uint8_t *pp = p->mv_mode[i];
3600         unsigned *c = s->counts.mv_mode[i];
3601
3602         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3603         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3604         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3605     }
3606
3607     // mv joints
3608     {
3609         uint8_t *pp = p->mv_joint;
3610         unsigned *c = s->counts.mv_joint;
3611
3612         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3613         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3614         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3615     }
3616
3617     // mv components
3618     for (i = 0; i < 2; i++) {
3619         uint8_t *pp;
3620         unsigned *c, (*c2)[2], sum;
3621
3622         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3623                    s->counts.mv_comp[i].sign[1], 20, 128);
3624
3625         pp = p->mv_comp[i].classes;
3626         c = s->counts.mv_comp[i].classes;
3627         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3628         adapt_prob(&pp[0], c[0], sum, 20, 128);
3629         sum -= c[1];
3630         adapt_prob(&pp[1], c[1], sum, 20, 128);
3631         sum -= c[2] + c[3];
3632         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3633         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3634         sum -= c[4] + c[5];
3635         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3636         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3637         sum -= c[6];
3638         adapt_prob(&pp[6], c[6], sum, 20, 128);
3639         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3640         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3641         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3642
3643         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3644                    s->counts.mv_comp[i].class0[1], 20, 128);
3645         pp = p->mv_comp[i].bits;
3646         c2 = s->counts.mv_comp[i].bits;
3647         for (j = 0; j < 10; j++)
3648             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3649
3650         for (j = 0; j < 2; j++) {
3651             pp = p->mv_comp[i].class0_fp[j];
3652             c = s->counts.mv_comp[i].class0_fp[j];
3653             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3654             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3655             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3656         }
3657         pp = p->mv_comp[i].fp;
3658         c = s->counts.mv_comp[i].fp;
3659         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3660         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3661         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3662
3663         if (s->highprecisionmvs) {
3664             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3665                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3666             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3667                        s->counts.mv_comp[i].hp[1], 20, 128);
3668         }
3669     }
3670
3671     // y intra modes
3672     for (i = 0; i < 4; i++) {
3673         uint8_t *pp = p->y_mode[i];
3674         unsigned *c = s->counts.y_mode[i], sum, s2;
3675
3676         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3677         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3678         sum -= c[TM_VP8_PRED];
3679         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3680         sum -= c[VERT_PRED];
3681         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3682         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3683         sum -= s2;
3684         adapt_prob(&pp[3], s2, sum, 20, 128);
3685         s2 -= c[HOR_PRED];
3686         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3687         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3688         sum -= c[DIAG_DOWN_LEFT_PRED];
3689         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3690         sum -= c[VERT_LEFT_PRED];
3691         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3692         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3693     }
3694
3695     // uv intra modes
3696     for (i = 0; i < 10; i++) {
3697         uint8_t *pp = p->uv_mode[i];
3698         unsigned *c = s->counts.uv_mode[i], sum, s2;
3699
3700         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3701         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3702         sum -= c[TM_VP8_PRED];
3703         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3704         sum -= c[VERT_PRED];
3705         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3706         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3707         sum -= s2;
3708         adapt_prob(&pp[3], s2, sum, 20, 128);
3709         s2 -= c[HOR_PRED];
3710         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3711         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3712         sum -= c[DIAG_DOWN_LEFT_PRED];
3713         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3714         sum -= c[VERT_LEFT_PRED];
3715         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3716         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3717     }
3718 }
3719
3720 static void free_buffers(VP9Context *s)
3721 {
3722     av_freep(&s->intra_pred_data[0]);
3723     av_freep(&s->b_base);
3724     av_freep(&s->block_base);
3725 }
3726
3727 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3728 {
3729     VP9Context *s = ctx->priv_data;
3730     int i;
3731
3732     for (i = 0; i < 2; i++) {
3733         if (s->frames[i].tf.f->data[0])
3734             vp9_unref_frame(ctx, &s->frames[i]);
3735         av_frame_free(&s->frames[i].tf.f);
3736     }
3737     for (i = 0; i < 8; i++) {
3738         if (s->refs[i].f->data[0])
3739             ff_thread_release_buffer(ctx, &s->refs[i]);
3740         av_frame_free(&s->refs[i].f);
3741         if (s->next_refs[i].f->data[0])
3742             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3743         av_frame_free(&s->next_refs[i].f);
3744     }
3745     free_buffers(s);
3746     av_freep(&s->c_b);
3747     s->c_b_size = 0;
3748
3749     return 0;
3750 }
3751
3752
3753 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3754                             int *got_frame, AVPacket *pkt)
3755 {
3756     const uint8_t *data = pkt->data;
3757     int size = pkt->size;
3758     VP9Context *s = ctx->priv_data;
3759     int res, tile_row, tile_col, i, ref, row, col;
3760     ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3761     AVFrame *f;
3762
3763     if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3764         return res;
3765     } else if (res == 0) {
3766         if (!s->refs[ref].f->data[0]) {
3767             av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3768             return AVERROR_INVALIDDATA;
3769         }
3770         if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3771             return res;
3772         *got_frame = 1;
3773         return pkt->size;
3774     }
3775     data += res;
3776     size -= res;
3777
3778     if (s->frames[LAST_FRAME].tf.f->data[0])
3779         vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3780     if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3781         (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3782         return res;
3783     if (s->frames[CUR_FRAME].tf.f->data[0])
3784         vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3785     if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3786         return res;
3787     f = s->frames[CUR_FRAME].tf.f;
3788     f->key_frame = s->keyframe;
3789     f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3790     ls_y = f->linesize[0];
3791     ls_uv =f->linesize[1];
3792
3793     // ref frame setup
3794     for (i = 0; i < 8; i++) {
3795         if (s->next_refs[i].f->data[0])
3796             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3797         if (s->refreshrefmask & (1 << i)) {
3798             res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3799         } else {
3800             res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3801         }
3802         if (res < 0)
3803             return res;
3804     }
3805
3806     if (s->fullrange)
3807         ctx->color_range = AVCOL_RANGE_JPEG;
3808     else
3809         ctx->color_range = AVCOL_RANGE_MPEG;
3810
3811     switch (s->colorspace) {
3812     case 1: ctx->colorspace = AVCOL_SPC_BT470BG; break;
3813     case 2: ctx->colorspace = AVCOL_SPC_BT709; break;
3814     case 3: ctx->colorspace = AVCOL_SPC_SMPTE170M; break;
3815     case 4: ctx->colorspace = AVCOL_SPC_SMPTE240M; break;
3816     }
3817
3818     // main tile decode loop
3819     memset(s->above_partition_ctx, 0, s->cols);
3820     memset(s->above_skip_ctx, 0, s->cols);
3821     if (s->keyframe || s->intraonly) {
3822         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3823     } else {
3824         memset(s->above_mode_ctx, NEARESTMV, s->cols);
3825     }
3826     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3827     memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3828     memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3829     memset(s->above_segpred_ctx, 0, s->cols);
3830     s->pass = s->uses_2pass =
3831         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3832     if ((res = update_block_buffers(ctx)) < 0) {
3833         av_log(ctx, AV_LOG_ERROR,
3834                "Failed to allocate block buffers\n");
3835         return res;
3836     }
3837     if (s->refreshctx && s->parallelmode) {
3838         int j, k, l, m;
3839
3840         for (i = 0; i < 4; i++) {
3841             for (j = 0; j < 2; j++)
3842                 for (k = 0; k < 2; k++)
3843                     for (l = 0; l < 6; l++)
3844                         for (m = 0; m < 6; m++)
3845                             memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3846                                    s->prob.coef[i][j][k][l][m], 3);
3847             if (s->txfmmode == i)
3848                 break;
3849         }
3850         s->prob_ctx[s->framectxid].p = s->prob.p;
3851         ff_thread_finish_setup(ctx);
3852     } else if (!s->refreshctx) {
3853         ff_thread_finish_setup(ctx);
3854     }
3855
3856     do {
3857         yoff = uvoff = 0;
3858         s->b = s->b_base;
3859         s->block = s->block_base;
3860         s->uvblock[0] = s->uvblock_base[0];
3861         s->uvblock[1] = s->uvblock_base[1];
3862         s->eob = s->eob_base;
3863         s->uveob[0] = s->uveob_base[0];
3864         s->uveob[1] = s->uveob_base[1];
3865
3866         for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3867             set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3868                             tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3869             if (s->pass != 2) {
3870                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3871                     unsigned tile_size;
3872
3873                     if (tile_col == s->tiling.tile_cols - 1 &&
3874                         tile_row == s->tiling.tile_rows - 1) {
3875                         tile_size = size;
3876                     } else {
3877                         tile_size = AV_RB32(data);
3878                         data += 4;
3879                         size -= 4;
3880                     }
3881                     if (tile_size > size) {
3882                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3883                         return AVERROR_INVALIDDATA;
3884                     }
3885                     ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3886                     if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3887                         ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3888                         return AVERROR_INVALIDDATA;
3889                     }
3890                     data += tile_size;
3891                     size -= tile_size;
3892                 }
3893             }
3894
3895             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3896                  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3897                 struct VP9Filter *lflvl_ptr = s->lflvl;
3898                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3899
3900                 for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3901                     set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3902                                     tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3903
3904                     if (s->pass != 2) {
3905                         memset(s->left_partition_ctx, 0, 8);
3906                         memset(s->left_skip_ctx, 0, 8);
3907                         if (s->keyframe || s->intraonly) {
3908                             memset(s->left_mode_ctx, DC_PRED, 16);
3909                         } else {
3910                             memset(s->left_mode_ctx, NEARESTMV, 8);
3911                         }
3912                         memset(s->left_y_nnz_ctx, 0, 16);
3913                         memset(s->left_uv_nnz_ctx, 0, 16);
3914                         memset(s->left_segpred_ctx, 0, 8);
3915
3916                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3917                     }
3918
3919                     for (col = s->tiling.tile_col_start;
3920                          col < s->tiling.tile_col_end;
3921                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3922                         // FIXME integrate with lf code (i.e. zero after each
3923                         // use, similar to invtxfm coefficients, or similar)
3924                         if (s->pass != 1) {
3925                             memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3926                         }
3927
3928                         if (s->pass == 2) {
3929                             decode_sb_mem(ctx, row, col, lflvl_ptr,
3930                                           yoff2, uvoff2, BL_64X64);
3931                         } else {
3932                             decode_sb(ctx, row, col, lflvl_ptr,
3933                                       yoff2, uvoff2, BL_64X64);
3934                         }
3935                     }
3936                     if (s->pass != 2) {
3937                         memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3938                     }
3939                 }
3940
3941                 if (s->pass == 1) {
3942                     continue;
3943                 }
3944
3945                 // backup pre-loopfilter reconstruction data for intra
3946                 // prediction of next row of sb64s
3947                 if (row + 8 < s->rows) {
3948                     memcpy(s->intra_pred_data[0],
3949                            f->data[0] + yoff + 63 * ls_y,
3950                            8 * s->cols);
3951                     memcpy(s->intra_pred_data[1],
3952                            f->data[1] + uvoff + 31 * ls_uv,
3953                            4 * s->cols);
3954                     memcpy(s->intra_pred_data[2],
3955                            f->data[2] + uvoff + 31 * ls_uv,
3956                            4 * s->cols);
3957                 }
3958
3959                 // loopfilter one row
3960                 if (s->filter.level) {
3961                     yoff2 = yoff;
3962                     uvoff2 = uvoff;
3963                     lflvl_ptr = s->lflvl;
3964                     for (col = 0; col < s->cols;
3965                          col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3966                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3967                     }
3968                 }
3969
3970                 // FIXME maybe we can make this more finegrained by running the
3971                 // loopfilter per-block instead of after each sbrow
3972                 // In fact that would also make intra pred left preparation easier?
3973                 ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3974             }
3975         }
3976
3977         if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3978             adapt_probs(s);
3979             ff_thread_finish_setup(ctx);
3980         }
3981     } while (s->pass++ == 1);
3982     ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3983
3984     // ref frame setup
3985     for (i = 0; i < 8; i++) {
3986         if (s->refs[i].f->data[0])
3987             ff_thread_release_buffer(ctx, &s->refs[i]);
3988         ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3989     }
3990
3991     if (!s->invisible) {
3992         if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3993             return res;
3994         *got_frame = 1;
3995     }
3996
3997     return pkt->size;
3998 }
3999
4000 static void vp9_decode_flush(AVCodecContext *ctx)
4001 {
4002     VP9Context *s = ctx->priv_data;
4003     int i;
4004
4005     for (i = 0; i < 2; i++)
4006         vp9_unref_frame(ctx, &s->frames[i]);
4007     for (i = 0; i < 8; i++)
4008         ff_thread_release_buffer(ctx, &s->refs[i]);
4009 }
4010
4011 static int init_frames(AVCodecContext *ctx)
4012 {
4013     VP9Context *s = ctx->priv_data;
4014     int i;
4015
4016     for (i = 0; i < 2; i++) {
4017         s->frames[i].tf.f = av_frame_alloc();
4018         if (!s->frames[i].tf.f) {
4019             vp9_decode_free(ctx);
4020             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4021             return AVERROR(ENOMEM);
4022         }
4023     }
4024     for (i = 0; i < 8; i++) {
4025         s->refs[i].f = av_frame_alloc();
4026         s->next_refs[i].f = av_frame_alloc();
4027         if (!s->refs[i].f || !s->next_refs[i].f) {
4028             vp9_decode_free(ctx);
4029             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4030             return AVERROR(ENOMEM);
4031         }
4032     }
4033
4034     return 0;
4035 }
4036
4037 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4038 {
4039     VP9Context *s = ctx->priv_data;
4040
4041     ctx->internal->allocate_progress = 1;
4042     ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4043     ff_vp9dsp_init(&s->dsp);
4044     ff_videodsp_init(&s->vdsp, 8);
4045     s->filter.sharpness = -1;
4046
4047     return init_frames(ctx);
4048 }
4049
4050 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4051 {
4052     return init_frames(avctx);
4053 }
4054
4055 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4056 {
4057     int i, res;
4058     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4059
4060     // detect size changes in other threads
4061     if (s->intra_pred_data[0] &&
4062         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4063         free_buffers(s);
4064     }
4065
4066     for (i = 0; i < 2; i++) {
4067         if (s->frames[i].tf.f->data[0])
4068             vp9_unref_frame(dst, &s->frames[i]);
4069         if (ssrc->frames[i].tf.f->data[0]) {
4070             if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4071                 return res;
4072         }
4073     }
4074     for (i = 0; i < 8; i++) {
4075         if (s->refs[i].f->data[0])
4076             ff_thread_release_buffer(dst, &s->refs[i]);
4077         if (ssrc->next_refs[i].f->data[0]) {
4078             if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4079                 return res;
4080         }
4081     }
4082
4083     s->invisible = ssrc->invisible;
4084     s->keyframe = ssrc->keyframe;
4085     s->uses_2pass = ssrc->uses_2pass;
4086     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4087     memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4088     if (ssrc->segmentation.enabled) {
4089         memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4090                sizeof(s->segmentation.feat));
4091     }
4092
4093     return 0;
4094 }
4095
4096 AVCodec ff_vp9_decoder = {
4097     .name                  = "vp9",
4098     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
4099     .type                  = AVMEDIA_TYPE_VIDEO,
4100     .id                    = AV_CODEC_ID_VP9,
4101     .priv_data_size        = sizeof(VP9Context),
4102     .init                  = vp9_decode_init,
4103     .close                 = vp9_decode_free,
4104     .decode                = vp9_decode_frame,
4105     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4106     .flush                 = vp9_decode_flush,
4107     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4108     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4109 };