git.sesse.net Git - ffmpeg/blob - libavcodec/vp9.c

   1 /*
   2  * VP9 compatible video decoder
   3  *
   4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
   5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "avcodec.h"
  25 #include "get_bits.h"
  26 #include "internal.h"
  27 #include "thread.h"
  28 #include "videodsp.h"
  29 #include "vp56.h"
  30 #include "vp9.h"
  31 #include "vp9data.h"
  32 #include "vp9dsp.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/pixdesc.h"
  35
  36 #define VP9_SYNCCODE 0x498342
  37
  38 struct VP9Filter {
  39     uint8_t level[8 * 8];
  40     uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
  41                               [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
  42 };
  43
  44 typedef struct VP9Block {
  45     uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
  46     enum FilterMode filter;
  47     VP56mv mv[4 /* b_idx */][2 /* ref */];
  48     enum BlockSize bs;
  49     enum TxfmMode tx, uvtx;
  50     enum BlockLevel bl;
  51     enum BlockPartition bp;
  52 } VP9Block;
  53
  54 typedef struct VP9Context {
  55     VP9SharedContext s;
  56
  57     VP9DSPContext dsp;
  58     VideoDSPContext vdsp;
  59     GetBitContext gb;
  60     VP56RangeCoder c;
  61     VP56RangeCoder *c_b;
  62     unsigned c_b_size;
  63     VP9Block *b_base, *b;
  64     int pass;
  65     int row, row7, col, col7;
  66     uint8_t *dst[3];
  67     ptrdiff_t y_stride, uv_stride;
  68
  69     uint8_t ss_h, ss_v;
  70     uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
  71     uint8_t last_keyframe;
  72     ThreadFrame next_refs[8];
  73
  74     struct {
  75         uint8_t lim_lut[64];
  76         uint8_t mblim_lut[64];
  77     } filter_lut;
  78     unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
  79     unsigned sb_cols, sb_rows, rows, cols;
  80     struct {
  81         prob_context p;
  82         uint8_t coef[4][2][2][6][6][3];
  83     } prob_ctx[4];
  84     struct {
  85         prob_context p;
  86         uint8_t coef[4][2][2][6][6][11];
  87     } prob;
  88     struct {
  89         unsigned y_mode[4][10];
  90         unsigned uv_mode[10][10];
  91         unsigned filter[4][3];
  92         unsigned mv_mode[7][4];
  93         unsigned intra[4][2];
  94         unsigned comp[5][2];
  95         unsigned single_ref[5][2][2];
  96         unsigned comp_ref[5][2];
  97         unsigned tx32p[2][4];
  98         unsigned tx16p[2][3];
  99         unsigned tx8p[2][2];
 100         unsigned skip[3][2];
 101         unsigned mv_joint[4];
 102         struct {
 103             unsigned sign[2];
 104             unsigned classes[11];
 105             unsigned class0[2];
 106             unsigned bits[10][2];
 107             unsigned class0_fp[2][4];
 108             unsigned fp[4];
 109             unsigned class0_hp[2];
 110             unsigned hp[2];
 111         } mv_comp[2];
 112         unsigned partition[4][4][4];
 113         unsigned coef[4][2][2][6][6][3];
 114         unsigned eob[4][2][2][6][6][2];
 115     } counts;
 116
 117     // contextual (left/above) cache
 118     DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
 119     DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
 120     DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
 121     DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
 122     DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
 123     DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
 124     DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
 125     DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
 126     DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
 127     DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
 128     DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
 129     DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
 130     uint8_t *above_partition_ctx;
 131     uint8_t *above_mode_ctx;
 132     // FIXME maybe merge some of the below in a flags field?
 133     uint8_t *above_y_nnz_ctx;
 134     uint8_t *above_uv_nnz_ctx[2];
 135     uint8_t *above_skip_ctx; // 1bit
 136     uint8_t *above_txfm_ctx; // 2bit
 137     uint8_t *above_segpred_ctx; // 1bit
 138     uint8_t *above_intra_ctx; // 1bit
 139     uint8_t *above_comp_ctx; // 1bit
 140     uint8_t *above_ref_ctx; // 2bit
 141     uint8_t *above_filter_ctx;
 142     VP56mv (*above_mv_ctx)[2];
 143
 144     // whole-frame cache
 145     uint8_t *intra_pred_data[3];
 146     struct VP9Filter *lflvl;
 147     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
 148
 149     // block reconstruction intermediates
 150     int block_alloc_using_2pass;
 151     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
 152     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
 153     struct { int x, y; } min_mv, max_mv;
 154     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
 155     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
 156     uint16_t mvscale[3][2];
 157     uint8_t mvstep[3][2];
 158 } VP9Context;
 159
 160 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
 161     {
 162         { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
 163         { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
 164     }, {
 165         { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
 166         { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
 167     }
 168 };
 169
 170 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 171 {
 172     VP9Context *s = ctx->priv_data;
 173     int ret, sz;
 174
 175     if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
 176         return ret;
 177     sz = 64 * s->sb_cols * s->sb_rows;
 178     if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
 179         ff_thread_release_buffer(ctx, &f->tf);
 180         return AVERROR(ENOMEM);
 181     }
 182
 183     f->segmentation_map = f->extradata->data;
 184     f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
 185
 186     return 0;
 187 }
 188
 189 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
 190 {
 191     ff_thread_release_buffer(ctx, &f->tf);
 192     av_buffer_unref(&f->extradata);
 193     f->segmentation_map = NULL;
 194 }
 195
 196 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 197 {
 198     int res;
 199
 200     if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
 201         return res;
 202     } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
 203         vp9_unref_frame(ctx, dst);
 204         return AVERROR(ENOMEM);
 205     }
 206
 207     dst->segmentation_map = src->segmentation_map;
 208     dst->mv = src->mv;
 209     dst->uses_2pass = src->uses_2pass;
 210
 211     return 0;
 212 }
 213
 214 static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
 215 {
 216     VP9Context *s = ctx->priv_data;
 217     uint8_t *p;
 218     int bytesperpixel = s->bytesperpixel;
 219
 220     av_assert0(w > 0 && h > 0);
 221
 222     if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
 223         return 0;
 224
 225     ctx->width   = w;
 226     ctx->height  = h;
 227     ctx->pix_fmt = fmt;
 228     s->sb_cols   = (w + 63) >> 6;
 229     s->sb_rows   = (h + 63) >> 6;
 230     s->cols      = (w + 7) >> 3;
 231     s->rows      = (h + 7) >> 3;
 232
 233 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
 234     av_freep(&s->intra_pred_data[0]);
 235     // FIXME we slightly over-allocate here for subsampled chroma, but a little
 236     // bit of padding shouldn't affect performance...
 237     p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
 238                                 sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
 239     if (!p)
 240         return AVERROR(ENOMEM);
 241     assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
 242     assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
 243     assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
 244     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
 245     assign(s->above_mode_ctx,      uint8_t *,             16);
 246     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
 247     assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
 248     assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
 249     assign(s->above_partition_ctx, uint8_t *,              8);
 250     assign(s->above_skip_ctx,      uint8_t *,              8);
 251     assign(s->above_txfm_ctx,      uint8_t *,              8);
 252     assign(s->above_segpred_ctx,   uint8_t *,              8);
 253     assign(s->above_intra_ctx,     uint8_t *,              8);
 254     assign(s->above_comp_ctx,      uint8_t *,              8);
 255     assign(s->above_ref_ctx,       uint8_t *,              8);
 256     assign(s->above_filter_ctx,    uint8_t *,              8);
 257     assign(s->lflvl,               struct VP9Filter *,     1);
 258 #undef assign
 259
 260     // these will be re-allocated a little later
 261     av_freep(&s->b_base);
 262     av_freep(&s->block_base);
 263
 264     if (s->bpp != s->last_bpp) {
 265         ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
 266         ff_videodsp_init(&s->vdsp, s->bpp);
 267         s->last_bpp = s->bpp;
 268     }
 269
 270     return 0;
 271 }
 272
 273 static int update_block_buffers(AVCodecContext *ctx)
 274 {
 275     VP9Context *s = ctx->priv_data;
 276     int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
 277
 278     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
 279         return 0;
 280
 281     av_free(s->b_base);
 282     av_free(s->block_base);
 283     chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
 284     chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
 285     if (s->s.frames[CUR_FRAME].uses_2pass) {
 286         int sbs = s->sb_cols * s->sb_rows;
 287
 288         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
 289         s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
 290                                     16 * 16 + 2 * chroma_eobs) * sbs);
 291         if (!s->b_base || !s->block_base)
 292             return AVERROR(ENOMEM);
 293         s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
 294         s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
 295         s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
 296         s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
 297         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
 298     } else {
 299         s->b_base = av_malloc(sizeof(VP9Block));
 300         s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
 301                                    16 * 16 + 2 * chroma_eobs);
 302         if (!s->b_base || !s->block_base)
 303             return AVERROR(ENOMEM);
 304         s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
 305         s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
 306         s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
 307         s->uveob_base[0] = s->eob_base + 16 * 16;
 308         s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
 309     }
 310     s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
 311
 312     return 0;
 313 }
 314
 315 // for some reason the sign bit is at the end, not the start, of a bit sequence
 316 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 317 {
 318     int v = get_bits(gb, n);
 319     return get_bits1(gb) ? -v : v;
 320 }
 321
 322 static av_always_inline int inv_recenter_nonneg(int v, int m)
 323 {
 324     return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 325 }
 326
 327 // differential forward probability updates
 328 static int update_prob(VP56RangeCoder *c, int p)
 329 {
 330     static const int inv_map_table[255] = {
 331           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 332         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 333          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 334          25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 335          40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 336          55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 337          70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 338          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 339         101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 340         116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 341         131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 342         146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 343         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 344         177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 345         192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 346         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 347         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 348         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 349         252, 253, 253,
 350     };
 351     int d;
 352
 353     /* This code is trying to do a differential probability update. For a
 354      * current probability A in the range [1, 255], the difference to a new
 355      * probability of any value can be expressed differentially as 1-A,255-A
 356      * where some part of this (absolute range) exists both in positive as
 357      * well as the negative part, whereas another part only exists in one
 358      * half. We're trying to code this shared part differentially, i.e.
 359      * times two where the value of the lowest bit specifies the sign, and
 360      * the single part is then coded on top of this. This absolute difference
 361      * then again has a value of [0,254], but a bigger value in this range
 362      * indicates that we're further away from the original value A, so we
 363      * can code this as a VLC code, since higher values are increasingly
 364      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
 365      * updates vs. the 'fine, exact' updates further down the range, which
 366      * adds one extra dimension to this differential update model. */
 367
 368     if (!vp8_rac_get(c)) {
 369         d = vp8_rac_get_uint(c, 4) + 0;
 370     } else if (!vp8_rac_get(c)) {
 371         d = vp8_rac_get_uint(c, 4) + 16;
 372     } else if (!vp8_rac_get(c)) {
 373         d = vp8_rac_get_uint(c, 5) + 32;
 374     } else {
 375         d = vp8_rac_get_uint(c, 7);
 376         if (d >= 65)
 377             d = (d << 1) - 65 + vp8_rac_get(c);
 378         d += 64;
 379         av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
 380     }
 381
 382     return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
 383                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 384 }
 385
 386 static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
 387 {
 388     static const enum AVColorSpace colorspaces[8] = {
 389         AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
 390         AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
 391     };
 392     VP9Context *s = ctx->priv_data;
 393     enum AVPixelFormat res;
 394     int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
 395
 396     s->bpp_index = bits;
 397     s->bpp = 8 + bits * 2;
 398     s->bytesperpixel = (7 + s->bpp) >> 3;
 399     ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
 400     if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
 401         static const enum AVPixelFormat pix_fmt_rgb[3] = {
 402             AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
 403         };
 404         if (ctx->profile & 1) {
 405             s->ss_h = s->ss_v = 0;
 406             res = pix_fmt_rgb[bits];
 407             ctx->color_range = AVCOL_RANGE_JPEG;
 408             if (get_bits1(&s->gb)) {
 409                 av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
 410                 return AVERROR_INVALIDDATA;
 411             }
 412         } else {
 413             av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
 414                    ctx->profile);
 415             return AVERROR_INVALIDDATA;
 416         }
 417     } else {
 418         static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
 419             { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
 420               { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
 421             { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
 422               { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
 423             { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
 424               { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
 425         };
 426         ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
 427         if (ctx->profile & 1) {
 428             s->ss_h = get_bits1(&s->gb);
 429             s->ss_v = get_bits1(&s->gb);
 430             if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
 431                 av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
 432                        ctx->profile);
 433                 return AVERROR_INVALIDDATA;
 434             } else if (get_bits1(&s->gb)) {
 435                 av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
 436                        ctx->profile);
 437                 return AVERROR_INVALIDDATA;
 438             }
 439         } else {
 440             s->ss_h = s->ss_v = 1;
 441             res = pix_fmt_for_ss[bits][1][1];
 442         }
 443     }
 444
 445     return res;
 446 }
 447
 448 static int decode_frame_header(AVCodecContext *ctx,
 449                                const uint8_t *data, int size, int *ref)
 450 {
 451     VP9Context *s = ctx->priv_data;
 452     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
 453     enum AVPixelFormat fmt = ctx->pix_fmt;
 454     int last_invisible;
 455     const uint8_t *data2;
 456
 457     /* general header */
 458     if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
 459         av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
 460         return res;
 461     }
 462     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
 463         av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
 464         return AVERROR_INVALIDDATA;
 465     }
 466     ctx->profile  = get_bits1(&s->gb);
 467     ctx->profile |= get_bits1(&s->gb) << 1;
 468     if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
 469     if (ctx->profile > 3) {
 470         av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
 471         return AVERROR_INVALIDDATA;
 472     }
 473     s->s.h.profile = ctx->profile;
 474     if (get_bits1(&s->gb)) {
 475         *ref = get_bits(&s->gb, 3);
 476         return 0;
 477     }
 478     s->last_keyframe  = s->s.h.keyframe;
 479     s->s.h.keyframe     = !get_bits1(&s->gb);
 480     last_invisible    = s->s.h.invisible;
 481     s->s.h.invisible    = !get_bits1(&s->gb);
 482     s->s.h.errorres     = get_bits1(&s->gb);
 483     s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
 484     if (s->s.h.keyframe) {
 485         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 486             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 487             return AVERROR_INVALIDDATA;
 488         }
 489         if ((fmt = read_colorspace_details(ctx)) < 0)
 490             return fmt;
 491         // for profile 1, here follows the subsampling bits
 492         s->s.h.refreshrefmask = 0xff;
 493         w = get_bits(&s->gb, 16) + 1;
 494         h = get_bits(&s->gb, 16) + 1;
 495         if (get_bits1(&s->gb)) // display size
 496             skip_bits(&s->gb, 32);
 497     } else {
 498         s->s.h.intraonly  = s->s.h.invisible ? get_bits1(&s->gb) : 0;
 499         s->s.h.resetctx   = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
 500         if (s->s.h.intraonly) {
 501             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
 502                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
 503                 return AVERROR_INVALIDDATA;
 504             }
 505             if (ctx->profile >= 1) {
 506                 if ((fmt = read_colorspace_details(ctx)) < 0)
 507                     return fmt;
 508             } else {
 509                 s->ss_h = s->ss_v = 1;
 510                 s->bpp = 8;
 511                 s->bpp_index = 0;
 512                 s->bytesperpixel = 1;
 513                 fmt = AV_PIX_FMT_YUV420P;
 514                 ctx->colorspace = AVCOL_SPC_BT470BG;
 515                 ctx->color_range = AVCOL_RANGE_JPEG;
 516             }
 517             s->s.h.refreshrefmask = get_bits(&s->gb, 8);
 518             w = get_bits(&s->gb, 16) + 1;
 519             h = get_bits(&s->gb, 16) + 1;
 520             if (get_bits1(&s->gb)) // display size
 521                 skip_bits(&s->gb, 32);
 522         } else {
 523             s->s.h.refreshrefmask = get_bits(&s->gb, 8);
 524             s->s.h.refidx[0]      = get_bits(&s->gb, 3);
 525             s->s.h.signbias[0]    = get_bits1(&s->gb) && !s->s.h.errorres;
 526             s->s.h.refidx[1]      = get_bits(&s->gb, 3);
 527             s->s.h.signbias[1]    = get_bits1(&s->gb) && !s->s.h.errorres;
 528             s->s.h.refidx[2]      = get_bits(&s->gb, 3);
 529             s->s.h.signbias[2]    = get_bits1(&s->gb) && !s->s.h.errorres;
 530             if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
 531                 !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
 532                 !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
 533                 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
 534                 return AVERROR_INVALIDDATA;
 535             }
 536             if (get_bits1(&s->gb)) {
 537                 w = s->s.refs[s->s.h.refidx[0]].f->width;
 538                 h = s->s.refs[s->s.h.refidx[0]].f->height;
 539             } else if (get_bits1(&s->gb)) {
 540                 w = s->s.refs[s->s.h.refidx[1]].f->width;
 541                 h = s->s.refs[s->s.h.refidx[1]].f->height;
 542             } else if (get_bits1(&s->gb)) {
 543                 w = s->s.refs[s->s.h.refidx[2]].f->width;
 544                 h = s->s.refs[s->s.h.refidx[2]].f->height;
 545             } else {
 546                 w = get_bits(&s->gb, 16) + 1;
 547                 h = get_bits(&s->gb, 16) + 1;
 548             }
 549             // Note that in this code, "CUR_FRAME" is actually before we
 550             // have formally allocated a frame, and thus actually represents
 551             // the _last_ frame
 552             s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
 553                                        s->s.frames[CUR_FRAME].tf.f->height == h;
 554             if (get_bits1(&s->gb)) // display size
 555                 skip_bits(&s->gb, 32);
 556             s->s.h.highprecisionmvs = get_bits1(&s->gb);
 557             s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
 558                                                   get_bits(&s->gb, 2);
 559             s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
 560                                   s->s.h.signbias[0] != s->s.h.signbias[2];
 561             if (s->s.h.allowcompinter) {
 562                 if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
 563                     s->s.h.fixcompref    = 2;
 564                     s->s.h.varcompref[0] = 0;
 565                     s->s.h.varcompref[1] = 1;
 566                 } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
 567                     s->s.h.fixcompref    = 1;
 568                     s->s.h.varcompref[0] = 0;
 569                     s->s.h.varcompref[1] = 2;
 570                 } else {
 571                     s->s.h.fixcompref    = 0;
 572                     s->s.h.varcompref[0] = 1;
 573                     s->s.h.varcompref[1] = 2;
 574                 }
 575             }
 576
 577             for (i = 0; i < 3; i++) {
 578                 AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
 579                 int refw = ref->width, refh = ref->height;
 580
 581                 if (ref->format != fmt) {
 582                     av_log(ctx, AV_LOG_ERROR,
 583                            "Ref pixfmt (%s) did not match current frame (%s)",
 584                            av_get_pix_fmt_name(ref->format),
 585                            av_get_pix_fmt_name(fmt));
 586                     return AVERROR_INVALIDDATA;
 587                 } else if (refw == w && refh == h) {
 588                     s->mvscale[i][0] = s->mvscale[i][1] = 0;
 589                 } else {
 590                     if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
 591                         av_log(ctx, AV_LOG_ERROR,
 592                                "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
 593                                refw, refh, w, h);
 594                         return AVERROR_INVALIDDATA;
 595                     }
 596                     s->mvscale[i][0] = (refw << 14) / w;
 597                     s->mvscale[i][1] = (refh << 14) / h;
 598                     s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
 599                     s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
 600                 }
 601             }
 602         }
 603     }
 604     s->s.h.refreshctx   = s->s.h.errorres ? 0 : get_bits1(&s->gb);
 605     s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
 606     s->s.h.framectxid   = c = get_bits(&s->gb, 2);
 607
 608     /* loopfilter header data */
 609     if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
 610         // reset loopfilter defaults
 611         s->s.h.lf_delta.ref[0] = 1;
 612         s->s.h.lf_delta.ref[1] = 0;
 613         s->s.h.lf_delta.ref[2] = -1;
 614         s->s.h.lf_delta.ref[3] = -1;
 615         s->s.h.lf_delta.mode[0] = 0;
 616         s->s.h.lf_delta.mode[1] = 0;
 617         memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
 618     }
 619     s->s.h.filter.level = get_bits(&s->gb, 6);
 620     sharp = get_bits(&s->gb, 3);
 621     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
 622     // the old cache values since they are still valid
 623     if (s->s.h.filter.sharpness != sharp)
 624         memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
 625     s->s.h.filter.sharpness = sharp;
 626     if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
 627         if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
 628             for (i = 0; i < 4; i++)
 629                 if (get_bits1(&s->gb))
 630                     s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
 631             for (i = 0; i < 2; i++)
 632                 if (get_bits1(&s->gb))
 633                     s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
 634         }
 635     }
 636
 637     /* quantization header data */
 638     s->s.h.yac_qi      = get_bits(&s->gb, 8);
 639     s->s.h.ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 640     s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 641     s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
 642     s->s.h.lossless    = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
 643                        s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
 644     if (s->s.h.lossless)
 645         ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
 646
 647     /* segmentation header info */
 648     if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
 649         if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
 650             for (i = 0; i < 7; i++)
 651                 s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
 652                                  get_bits(&s->gb, 8) : 255;
 653             if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
 654                 for (i = 0; i < 3; i++)
 655                     s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
 656                                          get_bits(&s->gb, 8) : 255;
 657             }
 658         }
 659
 660         if (get_bits1(&s->gb)) {
 661             s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
 662             for (i = 0; i < 8; i++) {
 663                 if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
 664                     s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
 665                 if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
 666                     s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
 667                 if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
 668                     s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
 669                 s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
 670             }
 671         }
 672     }
 673
 674     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
 675     for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
 676         int qyac, qydc, quvac, quvdc, lflvl, sh;
 677
 678         if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
 679             if (s->s.h.segmentation.absolute_vals)
 680                 qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
 681             else
 682                 qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
 683         } else {
 684             qyac  = s->s.h.yac_qi;
 685         }
 686         qydc  = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
 687         quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
 688         quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
 689         qyac  = av_clip_uintp2(qyac, 8);
 690
 691         s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
 692         s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
 693         s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
 694         s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
 695
 696         sh = s->s.h.filter.level >= 32;
 697         if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
 698             if (s->s.h.segmentation.absolute_vals)
 699                 lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
 700             else
 701                 lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
 702         } else {
 703             lflvl  = s->s.h.filter.level;
 704         }
 705         if (s->s.h.lf_delta.enabled) {
 706             s->s.h.segmentation.feat[i].lflvl[0][0] =
 707             s->s.h.segmentation.feat[i].lflvl[0][1] =
 708                 av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] << sh), 6);
 709             for (j = 1; j < 4; j++) {
 710                 s->s.h.segmentation.feat[i].lflvl[j][0] =
 711                     av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
 712                                              s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
 713                 s->s.h.segmentation.feat[i].lflvl[j][1] =
 714                     av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
 715                                              s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
 716             }
 717         } else {
 718             memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
 719                    sizeof(s->s.h.segmentation.feat[i].lflvl));
 720         }
 721     }
 722
 723     /* tiling info */
 724     if ((res = update_size(ctx, w, h, fmt)) < 0) {
 725         av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
 726         return res;
 727     }
 728     for (s->s.h.tiling.log2_tile_cols = 0;
 729          s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
 730          s->s.h.tiling.log2_tile_cols++) ;
 731     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
 732     max = FFMAX(0, max - 1);
 733     while (max > s->s.h.tiling.log2_tile_cols) {
 734         if (get_bits1(&s->gb))
 735             s->s.h.tiling.log2_tile_cols++;
 736         else
 737             break;
 738     }
 739     s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
 740     s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
 741     if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
 742         s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
 743         s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
 744                                  sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
 745         if (!s->c_b) {
 746             av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
 747             return AVERROR(ENOMEM);
 748         }
 749     }
 750
 751     if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
 752         s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
 753                            s->prob_ctx[3].p = vp9_default_probs;
 754         memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
 755                sizeof(vp9_default_coef_probs));
 756         memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
 757                sizeof(vp9_default_coef_probs));
 758         memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
 759                sizeof(vp9_default_coef_probs));
 760         memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
 761                sizeof(vp9_default_coef_probs));
 762     } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
 763         s->prob_ctx[c].p = vp9_default_probs;
 764         memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
 765                sizeof(vp9_default_coef_probs));
 766     }
 767
 768     // next 16 bits is size of the rest of the header (arith-coded)
 769     s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
 770     s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
 771
 772     data2 = align_get_bits(&s->gb);
 773     if (size2 > size - (data2 - data)) {
 774         av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
 775         return AVERROR_INVALIDDATA;
 776     }
 777     ff_vp56_init_range_decoder(&s->c, data2, size2);
 778     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
 779         av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
 780         return AVERROR_INVALIDDATA;
 781     }
 782
 783     if (s->s.h.keyframe || s->s.h.intraonly) {
 784         memset(s->counts.coef, 0, sizeof(s->counts.coef));
 785         memset(s->counts.eob,  0, sizeof(s->counts.eob));
 786     } else {
 787         memset(&s->counts, 0, sizeof(s->counts));
 788     }
 789     // FIXME is it faster to not copy here, but do it down in the fw updates
 790     // as explicit copies if the fw update is missing (and skip the copy upon
 791     // fw update)?
 792     s->prob.p = s->prob_ctx[c].p;
 793
 794     // txfm updates
 795     if (s->s.h.lossless) {
 796         s->s.h.txfmmode = TX_4X4;
 797     } else {
 798         s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
 799         if (s->s.h.txfmmode == 3)
 800             s->s.h.txfmmode += vp8_rac_get(&s->c);
 801
 802         if (s->s.h.txfmmode == TX_SWITCHABLE) {
 803             for (i = 0; i < 2; i++)
 804                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 805                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
 806             for (i = 0; i < 2; i++)
 807                 for (j = 0; j < 2; j++)
 808                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 809                         s->prob.p.tx16p[i][j] =
 810                             update_prob(&s->c, s->prob.p.tx16p[i][j]);
 811             for (i = 0; i < 2; i++)
 812                 for (j = 0; j < 3; j++)
 813                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 814                         s->prob.p.tx32p[i][j] =
 815                             update_prob(&s->c, s->prob.p.tx32p[i][j]);
 816         }
 817     }
 818
 819     // coef updates
 820     for (i = 0; i < 4; i++) {
 821         uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
 822         if (vp8_rac_get(&s->c)) {
 823             for (j = 0; j < 2; j++)
 824                 for (k = 0; k < 2; k++)
 825                     for (l = 0; l < 6; l++)
 826                         for (m = 0; m < 6; m++) {
 827                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 828                             uint8_t *r = ref[j][k][l][m];
 829                             if (m >= 3 && l == 0) // dc only has 3 pt
 830                                 break;
 831                             for (n = 0; n < 3; n++) {
 832                                 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
 833                                     p[n] = update_prob(&s->c, r[n]);
 834                                 } else {
 835                                     p[n] = r[n];
 836                                 }
 837                             }
 838                             p[3] = 0;
 839                         }
 840         } else {
 841             for (j = 0; j < 2; j++)
 842                 for (k = 0; k < 2; k++)
 843                     for (l = 0; l < 6; l++)
 844                         for (m = 0; m < 6; m++) {
 845                             uint8_t *p = s->prob.coef[i][j][k][l][m];
 846                             uint8_t *r = ref[j][k][l][m];
 847                             if (m > 3 && l == 0) // dc only has 3 pt
 848                                 break;
 849                             memcpy(p, r, 3);
 850                             p[3] = 0;
 851                         }
 852         }
 853         if (s->s.h.txfmmode == i)
 854             break;
 855     }
 856
 857     // mode updates
 858     for (i = 0; i < 3; i++)
 859         if (vp56_rac_get_prob_branchy(&s->c, 252))
 860             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
 861     if (!s->s.h.keyframe && !s->s.h.intraonly) {
 862         for (i = 0; i < 7; i++)
 863             for (j = 0; j < 3; j++)
 864                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 865                     s->prob.p.mv_mode[i][j] =
 866                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 867
 868         if (s->s.h.filtermode == FILTER_SWITCHABLE)
 869             for (i = 0; i < 4; i++)
 870                 for (j = 0; j < 2; j++)
 871                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 872                         s->prob.p.filter[i][j] =
 873                             update_prob(&s->c, s->prob.p.filter[i][j]);
 874
 875         for (i = 0; i < 4; i++)
 876             if (vp56_rac_get_prob_branchy(&s->c, 252))
 877                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 878
 879         if (s->s.h.allowcompinter) {
 880             s->s.h.comppredmode = vp8_rac_get(&s->c);
 881             if (s->s.h.comppredmode)
 882                 s->s.h.comppredmode += vp8_rac_get(&s->c);
 883             if (s->s.h.comppredmode == PRED_SWITCHABLE)
 884                 for (i = 0; i < 5; i++)
 885                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 886                         s->prob.p.comp[i] =
 887                             update_prob(&s->c, s->prob.p.comp[i]);
 888         } else {
 889             s->s.h.comppredmode = PRED_SINGLEREF;
 890         }
 891
 892         if (s->s.h.comppredmode != PRED_COMPREF) {
 893             for (i = 0; i < 5; i++) {
 894                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 895                     s->prob.p.single_ref[i][0] =
 896                         update_prob(&s->c, s->prob.p.single_ref[i][0]);
 897                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 898                     s->prob.p.single_ref[i][1] =
 899                         update_prob(&s->c, s->prob.p.single_ref[i][1]);
 900             }
 901         }
 902
 903         if (s->s.h.comppredmode != PRED_SINGLEREF) {
 904             for (i = 0; i < 5; i++)
 905                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 906                     s->prob.p.comp_ref[i] =
 907                         update_prob(&s->c, s->prob.p.comp_ref[i]);
 908         }
 909
 910         for (i = 0; i < 4; i++)
 911             for (j = 0; j < 9; j++)
 912                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 913                     s->prob.p.y_mode[i][j] =
 914                         update_prob(&s->c, s->prob.p.y_mode[i][j]);
 915
 916         for (i = 0; i < 4; i++)
 917             for (j = 0; j < 4; j++)
 918                 for (k = 0; k < 3; k++)
 919                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 920                         s->prob.p.partition[3 - i][j][k] =
 921                             update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 922
 923         // mv fields don't use the update_prob subexp model for some reason
 924         for (i = 0; i < 3; i++)
 925             if (vp56_rac_get_prob_branchy(&s->c, 252))
 926                 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 927
 928         for (i = 0; i < 2; i++) {
 929             if (vp56_rac_get_prob_branchy(&s->c, 252))
 930                 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 931
 932             for (j = 0; j < 10; j++)
 933                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 934                     s->prob.p.mv_comp[i].classes[j] =
 935                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 936
 937             if (vp56_rac_get_prob_branchy(&s->c, 252))
 938                 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 939
 940             for (j = 0; j < 10; j++)
 941                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 942                     s->prob.p.mv_comp[i].bits[j] =
 943                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 944         }
 945
 946         for (i = 0; i < 2; i++) {
 947             for (j = 0; j < 2; j++)
 948                 for (k = 0; k < 3; k++)
 949                     if (vp56_rac_get_prob_branchy(&s->c, 252))
 950                         s->prob.p.mv_comp[i].class0_fp[j][k] =
 951                             (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 952
 953             for (j = 0; j < 3; j++)
 954                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 955                     s->prob.p.mv_comp[i].fp[j] =
 956                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 957         }
 958
 959         if (s->s.h.highprecisionmvs) {
 960             for (i = 0; i < 2; i++) {
 961                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 962                     s->prob.p.mv_comp[i].class0_hp =
 963                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 964
 965                 if (vp56_rac_get_prob_branchy(&s->c, 252))
 966                     s->prob.p.mv_comp[i].hp =
 967                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 968             }
 969         }
 970     }
 971
 972     return (data2 - data) + size2;
 973 }
 974
 975 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
 976                                       VP9Context *s)
 977 {
 978     dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
 979     dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
 980 }
 981
 982 static void find_ref_mvs(VP9Context *s,
 983                          VP56mv *pmv, int ref, int z, int idx, int sb)
 984 {
 985     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
 986         [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
 987                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
 988         [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
 989                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
 990         [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
 991                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
 992         [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
 993                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 994         [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
 995                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
 996         [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
 997                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
 998         [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
 999                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
1000         [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
1001                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
1002         [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
1003                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
1004         [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1005                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1006         [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1007                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1008         [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1009                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1010         [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
1011                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1012     };
1013     VP9Block *b = s->b;
1014     int row = s->row, col = s->col, row7 = s->row7;
1015     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
1016 #define INVALID_MV 0x80008000U
1017     uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
1018     int i;
1019
1020 #define RETURN_DIRECT_MV(mv) \
1021     do { \
1022         uint32_t m = AV_RN32A(&mv); \
1023         if (!idx) { \
1024             AV_WN32A(pmv, m); \
1025             return; \
1026         } else if (mem == INVALID_MV) { \
1027             mem = m; \
1028         } else if (m != mem) { \
1029             AV_WN32A(pmv, m); \
1030             return; \
1031         } \
1032     } while (0)
1033
1034     if (sb >= 0) {
1035         if (sb == 2 || sb == 1) {
1036             RETURN_DIRECT_MV(b->mv[0][z]);
1037         } else if (sb == 3) {
1038             RETURN_DIRECT_MV(b->mv[2][z]);
1039             RETURN_DIRECT_MV(b->mv[1][z]);
1040             RETURN_DIRECT_MV(b->mv[0][z]);
1041         }
1042
1043 #define RETURN_MV(mv) \
1044     do { \
1045         if (sb > 0) { \
1046             VP56mv tmp; \
1047             uint32_t m; \
1048             av_assert2(idx == 1); \
1049             av_assert2(mem != INVALID_MV); \
1050             if (mem_sub8x8 == INVALID_MV) { \
1051                 clamp_mv(&tmp, &mv, s); \
1052                 m = AV_RN32A(&tmp); \
1053                 if (m != mem) { \
1054                     AV_WN32A(pmv, m); \
1055                     return; \
1056                 } \
1057                 mem_sub8x8 = AV_RN32A(&mv); \
1058             } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1059                 clamp_mv(&tmp, &mv, s); \
1060                 m = AV_RN32A(&tmp); \
1061                 if (m != mem) { \
1062                     AV_WN32A(pmv, m); \
1063                 } else { \
1064                     /* BUG I'm pretty sure this isn't the intention */ \
1065                     AV_WN32A(pmv, 0); \
1066                 } \
1067                 return; \
1068             } \
1069         } else { \
1070             uint32_t m = AV_RN32A(&mv); \
1071             if (!idx) { \
1072                 clamp_mv(pmv, &mv, s); \
1073                 return; \
1074             } else if (mem == INVALID_MV) { \
1075                 mem = m; \
1076             } else if (m != mem) { \
1077                 clamp_mv(pmv, &mv, s); \
1078                 return; \
1079             } \
1080         } \
1081     } while (0)
1082
1083         if (row > 0) {
1084             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1085             if (mv->ref[0] == ref) {
1086                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1087             } else if (mv->ref[1] == ref) {
1088                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1089             }
1090         }
1091         if (col > s->tile_col_start) {
1092             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1093             if (mv->ref[0] == ref) {
1094                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1095             } else if (mv->ref[1] == ref) {
1096                 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1097             }
1098         }
1099         i = 2;
1100     } else {
1101         i = 0;
1102     }
1103
1104     // previously coded MVs in this neighbourhood, using same reference frame
1105     for (; i < 8; i++) {
1106         int c = p[i][0] + col, r = p[i][1] + row;
1107
1108         if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1109             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1110
1111             if (mv->ref[0] == ref) {
1112                 RETURN_MV(mv->mv[0]);
1113             } else if (mv->ref[1] == ref) {
1114                 RETURN_MV(mv->mv[1]);
1115             }
1116         }
1117     }
1118
1119     // MV at this position in previous frame, using same reference frame
1120     if (s->s.h.use_last_frame_mvs) {
1121         struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1122
1123         if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
1124             ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
1125         if (mv->ref[0] == ref) {
1126             RETURN_MV(mv->mv[0]);
1127         } else if (mv->ref[1] == ref) {
1128             RETURN_MV(mv->mv[1]);
1129         }
1130     }
1131
1132 #define RETURN_SCALE_MV(mv, scale) \
1133     do { \
1134         if (scale) { \
1135             VP56mv mv_temp = { -mv.x, -mv.y }; \
1136             RETURN_MV(mv_temp); \
1137         } else { \
1138             RETURN_MV(mv); \
1139         } \
1140     } while (0)
1141
1142     // previously coded MVs in this neighbourhood, using different reference frame
1143     for (i = 0; i < 8; i++) {
1144         int c = p[i][0] + col, r = p[i][1] + row;
1145
1146         if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1147             struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1148
1149             if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1150                 RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1151             }
1152             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1153                 // BUG - libvpx has this condition regardless of whether
1154                 // we used the first ref MV and pre-scaling
1155                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1156                 RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1157             }
1158         }
1159     }
1160
1161     // MV at this position in previous frame, using different reference frame
1162     if (s->s.h.use_last_frame_mvs) {
1163         struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1164
1165         // no need to await_progress, because we already did that above
1166         if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1167             RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1168         }
1169         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1170             // BUG - libvpx has this condition regardless of whether
1171             // we used the first ref MV and pre-scaling
1172             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1173             RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1174         }
1175     }
1176
1177     AV_ZERO32(pmv);
1178     clamp_mv(pmv, pmv, s);
1179 #undef INVALID_MV
1180 #undef RETURN_MV
1181 #undef RETURN_SCALE_MV
1182 }
1183
1184 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1185 {
1186     int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1187     int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1188                                 s->prob.p.mv_comp[idx].classes);
1189
1190     s->counts.mv_comp[idx].sign[sign]++;
1191     s->counts.mv_comp[idx].classes[c]++;
1192     if (c) {
1193         int m;
1194
1195         for (n = 0, m = 0; m < c; m++) {
1196             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1197             n |= bit << m;
1198             s->counts.mv_comp[idx].bits[m][bit]++;
1199         }
1200         n <<= 3;
1201         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1202         n |= bit << 1;
1203         s->counts.mv_comp[idx].fp[bit]++;
1204         if (hp) {
1205             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1206             s->counts.mv_comp[idx].hp[bit]++;
1207             n |= bit;
1208         } else {
1209             n |= 1;
1210             // bug in libvpx - we count for bw entropy purposes even if the
1211             // bit wasn't coded
1212             s->counts.mv_comp[idx].hp[1]++;
1213         }
1214         n += 8 << c;
1215     } else {
1216         n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1217         s->counts.mv_comp[idx].class0[n]++;
1218         bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1219                                s->prob.p.mv_comp[idx].class0_fp[n]);
1220         s->counts.mv_comp[idx].class0_fp[n][bit]++;
1221         n = (n << 3) | (bit << 1);
1222         if (hp) {
1223             bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1224             s->counts.mv_comp[idx].class0_hp[bit]++;
1225             n |= bit;
1226         } else {
1227             n |= 1;
1228             // bug in libvpx - we count for bw entropy purposes even if the
1229             // bit wasn't coded
1230             s->counts.mv_comp[idx].class0_hp[1]++;
1231         }
1232     }
1233
1234     return sign ? -(n + 1) : (n + 1);
1235 }
1236
1237 static void fill_mv(VP9Context *s,
1238                     VP56mv *mv, int mode, int sb)
1239 {
1240     VP9Block *b = s->b;
1241
1242     if (mode == ZEROMV) {
1243         AV_ZERO64(mv);
1244     } else {
1245         int hp;
1246
1247         // FIXME cache this value and reuse for other subblocks
1248         find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1249                      mode == NEWMV ? -1 : sb);
1250         // FIXME maybe move this code into find_ref_mvs()
1251         if ((mode == NEWMV || sb == -1) &&
1252             !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1253             if (mv[0].y & 1) {
1254                 if (mv[0].y < 0)
1255                     mv[0].y++;
1256                 else
1257                     mv[0].y--;
1258             }
1259             if (mv[0].x & 1) {
1260                 if (mv[0].x < 0)
1261                     mv[0].x++;
1262                 else
1263                     mv[0].x--;
1264             }
1265         }
1266         if (mode == NEWMV) {
1267             enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1268                                               s->prob.p.mv_joint);
1269
1270             s->counts.mv_joint[j]++;
1271             if (j >= MV_JOINT_V)
1272                 mv[0].y += read_mv_component(s, 0, hp);
1273             if (j & 1)
1274                 mv[0].x += read_mv_component(s, 1, hp);
1275         }
1276
1277         if (b->comp) {
1278             // FIXME cache this value and reuse for other subblocks
1279             find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1280                          mode == NEWMV ? -1 : sb);
1281             if ((mode == NEWMV || sb == -1) &&
1282                 !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1283                 if (mv[1].y & 1) {
1284                     if (mv[1].y < 0)
1285                         mv[1].y++;
1286                     else
1287                         mv[1].y--;
1288                 }
1289                 if (mv[1].x & 1) {
1290                     if (mv[1].x < 0)
1291                         mv[1].x++;
1292                     else
1293                         mv[1].x--;
1294                 }
1295             }
1296             if (mode == NEWMV) {
1297                 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1298                                                   s->prob.p.mv_joint);
1299
1300                 s->counts.mv_joint[j]++;
1301                 if (j >= MV_JOINT_V)
1302                     mv[1].y += read_mv_component(s, 0, hp);
1303                 if (j & 1)
1304                     mv[1].x += read_mv_component(s, 1, hp);
1305             }
1306         }
1307     }
1308 }
1309
1310 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1311                                        ptrdiff_t stride, int v)
1312 {
1313     switch (w) {
1314     case 1:
1315         do {
1316             *ptr = v;
1317             ptr += stride;
1318         } while (--h);
1319         break;
1320     case 2: {
1321         int v16 = v * 0x0101;
1322         do {
1323             AV_WN16A(ptr, v16);
1324             ptr += stride;
1325         } while (--h);
1326         break;
1327     }
1328     case 4: {
1329         uint32_t v32 = v * 0x01010101;
1330         do {
1331             AV_WN32A(ptr, v32);
1332             ptr += stride;
1333         } while (--h);
1334         break;
1335     }
1336     case 8: {
1337 #if HAVE_FAST_64BIT
1338         uint64_t v64 = v * 0x0101010101010101ULL;
1339         do {
1340             AV_WN64A(ptr, v64);
1341             ptr += stride;
1342         } while (--h);
1343 #else
1344         uint32_t v32 = v * 0x01010101;
1345         do {
1346             AV_WN32A(ptr,     v32);
1347             AV_WN32A(ptr + 4, v32);
1348             ptr += stride;
1349         } while (--h);
1350 #endif
1351         break;
1352     }
1353     }
1354 }
1355
1356 static void decode_mode(AVCodecContext *ctx)
1357 {
1358     static const uint8_t left_ctx[N_BS_SIZES] = {
1359         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1360     };
1361     static const uint8_t above_ctx[N_BS_SIZES] = {
1362         0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1363     };
1364     static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1365         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1366         TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1367     };
1368     VP9Context *s = ctx->priv_data;
1369     VP9Block *b = s->b;
1370     int row = s->row, col = s->col, row7 = s->row7;
1371     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1372     int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1373     int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1374     int have_a = row > 0, have_l = col > s->tile_col_start;
1375     int vref, filter_id;
1376
1377     if (!s->s.h.segmentation.enabled) {
1378         b->seg_id = 0;
1379     } else if (s->s.h.keyframe || s->s.h.intraonly) {
1380         b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1381                     vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
1382     } else if (!s->s.h.segmentation.update_map ||
1383                (s->s.h.segmentation.temporal &&
1384                 vp56_rac_get_prob_branchy(&s->c,
1385                     s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
1386                                     s->left_segpred_ctx[row7]]))) {
1387         if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
1388             int pred = 8, x;
1389             uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
1390
1391             if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
1392                 ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
1393             for (y = 0; y < h4; y++) {
1394                 int idx_base = (y + row) * 8 * s->sb_cols + col;
1395                 for (x = 0; x < w4; x++)
1396                     pred = FFMIN(pred, refsegmap[idx_base + x]);
1397             }
1398             av_assert1(pred < 8);
1399             b->seg_id = pred;
1400         } else {
1401             b->seg_id = 0;
1402         }
1403
1404         memset(&s->above_segpred_ctx[col], 1, w4);
1405         memset(&s->left_segpred_ctx[row7], 1, h4);
1406     } else {
1407         b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1408                                      s->s.h.segmentation.prob);
1409
1410         memset(&s->above_segpred_ctx[col], 0, w4);
1411         memset(&s->left_segpred_ctx[row7], 0, h4);
1412     }
1413     if (s->s.h.segmentation.enabled &&
1414         (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1415         setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1416                   bw4, bh4, 8 * s->sb_cols, b->seg_id);
1417     }
1418
1419     b->skip = s->s.h.segmentation.enabled &&
1420         s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1421     if (!b->skip) {
1422         int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1423         b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1424         s->counts.skip[c][b->skip]++;
1425     }
1426
1427     if (s->s.h.keyframe || s->s.h.intraonly) {
1428         b->intra = 1;
1429     } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1430         b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1431     } else {
1432         int c, bit;
1433
1434         if (have_a && have_l) {
1435             c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1436             c += (c == 2);
1437         } else {
1438             c = have_a ? 2 * s->above_intra_ctx[col] :
1439                 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1440         }
1441         bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1442         s->counts.intra[c][bit]++;
1443         b->intra = !bit;
1444     }
1445
1446     if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1447         int c;
1448         if (have_a) {
1449             if (have_l) {
1450                 c = (s->above_skip_ctx[col] ? max_tx :
1451                      s->above_txfm_ctx[col]) +
1452                     (s->left_skip_ctx[row7] ? max_tx :
1453                      s->left_txfm_ctx[row7]) > max_tx;
1454             } else {
1455                 c = s->above_skip_ctx[col] ? 1 :
1456                     (s->above_txfm_ctx[col] * 2 > max_tx);
1457             }
1458         } else if (have_l) {
1459             c = s->left_skip_ctx[row7] ? 1 :
1460                 (s->left_txfm_ctx[row7] * 2 > max_tx);
1461         } else {
1462             c = 1;
1463         }
1464         switch (max_tx) {
1465         case TX_32X32:
1466             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1467             if (b->tx) {
1468                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1469                 if (b->tx == 2)
1470                     b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1471             }
1472             s->counts.tx32p[c][b->tx]++;
1473             break;
1474         case TX_16X16:
1475             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1476             if (b->tx)
1477                 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1478             s->counts.tx16p[c][b->tx]++;
1479             break;
1480         case TX_8X8:
1481             b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1482             s->counts.tx8p[c][b->tx]++;
1483             break;
1484         case TX_4X4:
1485             b->tx = TX_4X4;
1486             break;
1487         }
1488     } else {
1489         b->tx = FFMIN(max_tx, s->s.h.txfmmode);
1490     }
1491
1492     if (s->s.h.keyframe || s->s.h.intraonly) {
1493         uint8_t *a = &s->above_mode_ctx[col * 2];
1494         uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1495
1496         b->comp = 0;
1497         if (b->bs > BS_8x8) {
1498             // FIXME the memory storage intermediates here aren't really
1499             // necessary, they're just there to make the code slightly
1500             // simpler for now
1501             b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1502                                     vp9_default_kf_ymode_probs[a[0]][l[0]]);
1503             if (b->bs != BS_8x4) {
1504                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1505                                  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1506                 l[0] = a[1] = b->mode[1];
1507             } else {
1508                 l[0] = a[1] = b->mode[1] = b->mode[0];
1509             }
1510             if (b->bs != BS_4x8) {
1511                 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1512                                         vp9_default_kf_ymode_probs[a[0]][l[1]]);
1513                 if (b->bs != BS_8x4) {
1514                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1515                                   vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1516                     l[1] = a[1] = b->mode[3];
1517                 } else {
1518                     l[1] = a[1] = b->mode[3] = b->mode[2];
1519                 }
1520             } else {
1521                 b->mode[2] = b->mode[0];
1522                 l[1] = a[1] = b->mode[3] = b->mode[1];
1523             }
1524         } else {
1525             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1526                                           vp9_default_kf_ymode_probs[*a][*l]);
1527             b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1528             // FIXME this can probably be optimized
1529             memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1530             memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1531         }
1532         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1533                                      vp9_default_kf_uvmode_probs[b->mode[3]]);
1534     } else if (b->intra) {
1535         b->comp = 0;
1536         if (b->bs > BS_8x8) {
1537             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1538                                           s->prob.p.y_mode[0]);
1539             s->counts.y_mode[0][b->mode[0]]++;
1540             if (b->bs != BS_8x4) {
1541                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1542                                               s->prob.p.y_mode[0]);
1543                 s->counts.y_mode[0][b->mode[1]]++;
1544             } else {
1545                 b->mode[1] = b->mode[0];
1546             }
1547             if (b->bs != BS_4x8) {
1548                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1549                                               s->prob.p.y_mode[0]);
1550                 s->counts.y_mode[0][b->mode[2]]++;
1551                 if (b->bs != BS_8x4) {
1552                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1553                                                   s->prob.p.y_mode[0]);
1554                     s->counts.y_mode[0][b->mode[3]]++;
1555                 } else {
1556                     b->mode[3] = b->mode[2];
1557                 }
1558             } else {
1559                 b->mode[2] = b->mode[0];
1560                 b->mode[3] = b->mode[1];
1561             }
1562         } else {
1563             static const uint8_t size_group[10] = {
1564                 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1565             };
1566             int sz = size_group[b->bs];
1567
1568             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1569                                           s->prob.p.y_mode[sz]);
1570             b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1571             s->counts.y_mode[sz][b->mode[3]]++;
1572         }
1573         b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1574                                      s->prob.p.uv_mode[b->mode[3]]);
1575         s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1576     } else {
1577         static const uint8_t inter_mode_ctx_lut[14][14] = {
1578             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1579             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1580             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1581             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1582             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1583             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1584             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1585             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1586             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1587             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1588             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1589             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1590             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1591             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1592         };
1593
1594         if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1595             av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1596             b->comp = 0;
1597             b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1598         } else {
1599             // read comp_pred flag
1600             if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1601                 b->comp = s->s.h.comppredmode == PRED_COMPREF;
1602             } else {
1603                 int c;
1604
1605                 // FIXME add intra as ref=0xff (or -1) to make these easier?
1606                 if (have_a) {
1607                     if (have_l) {
1608                         if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1609                             c = 4;
1610                         } else if (s->above_comp_ctx[col]) {
1611                             c = 2 + (s->left_intra_ctx[row7] ||
1612                                      s->left_ref_ctx[row7] == s->s.h.fixcompref);
1613                         } else if (s->left_comp_ctx[row7]) {
1614                             c = 2 + (s->above_intra_ctx[col] ||
1615                                      s->above_ref_ctx[col] == s->s.h.fixcompref);
1616                         } else {
1617                             c = (!s->above_intra_ctx[col] &&
1618                                  s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1619                             (!s->left_intra_ctx[row7] &&
1620                              s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1621                         }
1622                     } else {
1623                         c = s->above_comp_ctx[col] ? 3 :
1624                         (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1625                     }
1626                 } else if (have_l) {
1627                     c = s->left_comp_ctx[row7] ? 3 :
1628                     (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1629                 } else {
1630                     c = 1;
1631                 }
1632                 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1633                 s->counts.comp[c][b->comp]++;
1634             }
1635
1636             // read actual references
1637             // FIXME probably cache a few variables here to prevent repetitive
1638             // memory accesses below
1639             if (b->comp) /* two references */ {
1640                 int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1641
1642                 b->ref[fix_idx] = s->s.h.fixcompref;
1643                 // FIXME can this codeblob be replaced by some sort of LUT?
1644                 if (have_a) {
1645                     if (have_l) {
1646                         if (s->above_intra_ctx[col]) {
1647                             if (s->left_intra_ctx[row7]) {
1648                                 c = 2;
1649                             } else {
1650                                 c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1651                             }
1652                         } else if (s->left_intra_ctx[row7]) {
1653                             c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1654                         } else {
1655                             int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1656
1657                             if (refl == refa && refa == s->s.h.varcompref[1]) {
1658                                 c = 0;
1659                             } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1660                                 if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1661                                     (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1662                                     c = 4;
1663                                 } else {
1664                                     c = (refa == refl) ? 3 : 1;
1665                                 }
1666                             } else if (!s->left_comp_ctx[row7]) {
1667                                 if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1668                                     c = 1;
1669                                 } else {
1670                                     c = (refl == s->s.h.varcompref[1] &&
1671                                          refa != s->s.h.varcompref[1]) ? 2 : 4;
1672                                 }
1673                             } else if (!s->above_comp_ctx[col]) {
1674                                 if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1675                                     c = 1;
1676                                 } else {
1677                                     c = (refa == s->s.h.varcompref[1] &&
1678                                          refl != s->s.h.varcompref[1]) ? 2 : 4;
1679                                 }
1680                             } else {
1681                                 c = (refl == refa) ? 4 : 2;
1682                             }
1683                         }
1684                     } else {
1685                         if (s->above_intra_ctx[col]) {
1686                             c = 2;
1687                         } else if (s->above_comp_ctx[col]) {
1688                             c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1689                         } else {
1690                             c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1691                         }
1692                     }
1693                 } else if (have_l) {
1694                     if (s->left_intra_ctx[row7]) {
1695                         c = 2;
1696                     } else if (s->left_comp_ctx[row7]) {
1697                         c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1698                     } else {
1699                         c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1700                     }
1701                 } else {
1702                     c = 2;
1703                 }
1704                 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1705                 b->ref[var_idx] = s->s.h.varcompref[bit];
1706                 s->counts.comp_ref[c][bit]++;
1707             } else /* single reference */ {
1708                 int bit, c;
1709
1710                 if (have_a && !s->above_intra_ctx[col]) {
1711                     if (have_l && !s->left_intra_ctx[row7]) {
1712                         if (s->left_comp_ctx[row7]) {
1713                             if (s->above_comp_ctx[col]) {
1714                                 c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1715                                          !s->above_ref_ctx[col]);
1716                             } else {
1717                                 c = (3 * !s->above_ref_ctx[col]) +
1718                                     (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1719                             }
1720                         } else if (s->above_comp_ctx[col]) {
1721                             c = (3 * !s->left_ref_ctx[row7]) +
1722                                 (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1723                         } else {
1724                             c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1725                         }
1726                     } else if (s->above_intra_ctx[col]) {
1727                         c = 2;
1728                     } else if (s->above_comp_ctx[col]) {
1729                         c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1730                     } else {
1731                         c = 4 * (!s->above_ref_ctx[col]);
1732                     }
1733                 } else if (have_l && !s->left_intra_ctx[row7]) {
1734                     if (s->left_intra_ctx[row7]) {
1735                         c = 2;
1736                     } else if (s->left_comp_ctx[row7]) {
1737                         c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1738                     } else {
1739                         c = 4 * (!s->left_ref_ctx[row7]);
1740                     }
1741                 } else {
1742                     c = 2;
1743                 }
1744                 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1745                 s->counts.single_ref[c][0][bit]++;
1746                 if (!bit) {
1747                     b->ref[0] = 0;
1748                 } else {
1749                     // FIXME can this codeblob be replaced by some sort of LUT?
1750                     if (have_a) {
1751                         if (have_l) {
1752                             if (s->left_intra_ctx[row7]) {
1753                                 if (s->above_intra_ctx[col]) {
1754                                     c = 2;
1755                                 } else if (s->above_comp_ctx[col]) {
1756                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1757                                                  s->above_ref_ctx[col] == 1);
1758                                 } else if (!s->above_ref_ctx[col]) {
1759                                     c = 3;
1760                                 } else {
1761                                     c = 4 * (s->above_ref_ctx[col] == 1);
1762                                 }
1763                             } else if (s->above_intra_ctx[col]) {
1764                                 if (s->left_intra_ctx[row7]) {
1765                                     c = 2;
1766                                 } else if (s->left_comp_ctx[row7]) {
1767                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1768                                                  s->left_ref_ctx[row7] == 1);
1769                                 } else if (!s->left_ref_ctx[row7]) {
1770                                     c = 3;
1771                                 } else {
1772                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1773                                 }
1774                             } else if (s->above_comp_ctx[col]) {
1775                                 if (s->left_comp_ctx[row7]) {
1776                                     if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1777                                         c = 3 * (s->s.h.fixcompref == 1 ||
1778                                                  s->left_ref_ctx[row7] == 1);
1779                                     } else {
1780                                         c = 2;
1781                                     }
1782                                 } else if (!s->left_ref_ctx[row7]) {
1783                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1784                                                  s->above_ref_ctx[col] == 1);
1785                                 } else {
1786                                     c = 3 * (s->left_ref_ctx[row7] == 1) +
1787                                     (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1788                                 }
1789                             } else if (s->left_comp_ctx[row7]) {
1790                                 if (!s->above_ref_ctx[col]) {
1791                                     c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1792                                                  s->left_ref_ctx[row7] == 1);
1793                                 } else {
1794                                     c = 3 * (s->above_ref_ctx[col] == 1) +
1795                                     (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1796                                 }
1797                             } else if (!s->above_ref_ctx[col]) {
1798                                 if (!s->left_ref_ctx[row7]) {
1799                                     c = 3;
1800                                 } else {
1801                                     c = 4 * (s->left_ref_ctx[row7] == 1);
1802                                 }
1803                             } else if (!s->left_ref_ctx[row7]) {
1804                                 c = 4 * (s->above_ref_ctx[col] == 1);
1805                             } else {
1806                                 c = 2 * (s->left_ref_ctx[row7] == 1) +
1807                                 2 * (s->above_ref_ctx[col] == 1);
1808                             }
1809                         } else {
1810                             if (s->above_intra_ctx[col] ||
1811                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1812                                 c = 2;
1813                             } else if (s->above_comp_ctx[col]) {
1814                                 c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1815                             } else {
1816                                 c = 4 * (s->above_ref_ctx[col] == 1);
1817                             }
1818                         }
1819                     } else if (have_l) {
1820                         if (s->left_intra_ctx[row7] ||
1821                             (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1822                             c = 2;
1823                         } else if (s->left_comp_ctx[row7]) {
1824                             c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1825                         } else {
1826                             c = 4 * (s->left_ref_ctx[row7] == 1);
1827                         }
1828                     } else {
1829                         c = 2;
1830                     }
1831                     bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1832                     s->counts.single_ref[c][1][bit]++;
1833                     b->ref[0] = 1 + bit;
1834                 }
1835             }
1836         }
1837
1838         if (b->bs <= BS_8x8) {
1839             if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1840                 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1841             } else {
1842                 static const uint8_t off[10] = {
1843                     3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1844                 };
1845
1846                 // FIXME this needs to use the LUT tables from find_ref_mvs
1847                 // because not all are -1,0/0,-1
1848                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1849                                           [s->left_mode_ctx[row7 + off[b->bs]]];
1850
1851                 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1852                                               s->prob.p.mv_mode[c]);
1853                 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1854                 s->counts.mv_mode[c][b->mode[0] - 10]++;
1855             }
1856         }
1857
1858         if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1859             int c;
1860
1861             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1862                 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1863                     c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1864                         s->left_filter_ctx[row7] : 3;
1865                 } else {
1866                     c = s->above_filter_ctx[col];
1867                 }
1868             } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1869                 c = s->left_filter_ctx[row7];
1870             } else {
1871                 c = 3;
1872             }
1873
1874             filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1875                                          s->prob.p.filter[c]);
1876             s->counts.filter[c][filter_id]++;
1877             b->filter = vp9_filter_lut[filter_id];
1878         } else {
1879             b->filter = s->s.h.filtermode;
1880         }
1881
1882         if (b->bs > BS_8x8) {
1883             int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1884
1885             b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1886                                           s->prob.p.mv_mode[c]);
1887             s->counts.mv_mode[c][b->mode[0] - 10]++;
1888             fill_mv(s, b->mv[0], b->mode[0], 0);
1889
1890             if (b->bs != BS_8x4) {
1891                 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1892                                               s->prob.p.mv_mode[c]);
1893                 s->counts.mv_mode[c][b->mode[1] - 10]++;
1894                 fill_mv(s, b->mv[1], b->mode[1], 1);
1895             } else {
1896                 b->mode[1] = b->mode[0];
1897                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1898                 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1899             }
1900
1901             if (b->bs != BS_4x8) {
1902                 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1903                                               s->prob.p.mv_mode[c]);
1904                 s->counts.mv_mode[c][b->mode[2] - 10]++;
1905                 fill_mv(s, b->mv[2], b->mode[2], 2);
1906
1907                 if (b->bs != BS_8x4) {
1908                     b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1909                                                   s->prob.p.mv_mode[c]);
1910                     s->counts.mv_mode[c][b->mode[3] - 10]++;
1911                     fill_mv(s, b->mv[3], b->mode[3], 3);
1912                 } else {
1913                     b->mode[3] = b->mode[2];
1914                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1915                     AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1916                 }
1917             } else {
1918                 b->mode[2] = b->mode[0];
1919                 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1920                 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1921                 b->mode[3] = b->mode[1];
1922                 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1923                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1924             }
1925         } else {
1926             fill_mv(s, b->mv[0], b->mode[0], -1);
1927             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1928             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1929             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1930             AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1931             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1932             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1933         }
1934
1935         vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
1936     }
1937
1938 #if HAVE_FAST_64BIT
1939 #define SPLAT_CTX(var, val, n) \
1940     switch (n) { \
1941     case 1:  var = val;                                    break; \
1942     case 2:  AV_WN16A(&var, val *             0x0101);     break; \
1943     case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
1944     case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
1945     case 16: { \
1946         uint64_t v64 = val * 0x0101010101010101ULL; \
1947         AV_WN64A(              &var,     v64); \
1948         AV_WN64A(&((uint8_t *) &var)[8], v64); \
1949         break; \
1950     } \
1951     }
1952 #else
1953 #define SPLAT_CTX(var, val, n) \
1954     switch (n) { \
1955     case 1:  var = val;                         break; \
1956     case 2:  AV_WN16A(&var, val *     0x0101);  break; \
1957     case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
1958     case 8: { \
1959         uint32_t v32 = val * 0x01010101; \
1960         AV_WN32A(              &var,     v32); \
1961         AV_WN32A(&((uint8_t *) &var)[4], v32); \
1962         break; \
1963     } \
1964     case 16: { \
1965         uint32_t v32 = val * 0x01010101; \
1966         AV_WN32A(              &var,      v32); \
1967         AV_WN32A(&((uint8_t *) &var)[4],  v32); \
1968         AV_WN32A(&((uint8_t *) &var)[8],  v32); \
1969         AV_WN32A(&((uint8_t *) &var)[12], v32); \
1970         break; \
1971     } \
1972     }
1973 #endif
1974
1975     switch (bwh_tab[1][b->bs][0]) {
1976 #define SET_CTXS(dir, off, n) \
1977     do { \
1978         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
1979         SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
1980         SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1981         if (!s->s.h.keyframe && !s->s.h.intraonly) { \
1982             SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
1983             SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
1984             SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
1985             if (!b->intra) { \
1986                 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1987                 if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
1988                     SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1989                 } \
1990             } \
1991         } \
1992     } while (0)
1993     case 1: SET_CTXS(above, col, 1); break;
1994     case 2: SET_CTXS(above, col, 2); break;
1995     case 4: SET_CTXS(above, col, 4); break;
1996     case 8: SET_CTXS(above, col, 8); break;
1997     }
1998     switch (bwh_tab[1][b->bs][1]) {
1999     case 1: SET_CTXS(left, row7, 1); break;
2000     case 2: SET_CTXS(left, row7, 2); break;
2001     case 4: SET_CTXS(left, row7, 4); break;
2002     case 8: SET_CTXS(left, row7, 8); break;
2003     }
2004 #undef SPLAT_CTX
2005 #undef SET_CTXS
2006
2007     if (!s->s.h.keyframe && !s->s.h.intraonly) {
2008         if (b->bs > BS_8x8) {
2009             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2010
2011             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2012             AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2013             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2014             AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2015             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2016             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2017             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2018             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2019         } else {
2020             int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2021
2022             for (n = 0; n < w4 * 2; n++) {
2023                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2024                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2025             }
2026             for (n = 0; n < h4 * 2; n++) {
2027                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2028                 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
2029             }
2030         }
2031     }
2032
2033     // FIXME kinda ugly
2034     for (y = 0; y < h4; y++) {
2035         int x, o = (row + y) * s->sb_cols * 8 + col;
2036         struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2037
2038         if (b->intra) {
2039             for (x = 0; x < w4; x++) {
2040                 mv[x].ref[0] =
2041                 mv[x].ref[1] = -1;
2042             }
2043         } else if (b->comp) {
2044             for (x = 0; x < w4; x++) {
2045                 mv[x].ref[0] = b->ref[0];
2046                 mv[x].ref[1] = b->ref[1];
2047                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2048                 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2049             }
2050         } else {
2051             for (x = 0; x < w4; x++) {
2052                 mv[x].ref[0] = b->ref[0];
2053                 mv[x].ref[1] = -1;
2054                 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2055             }
2056         }
2057     }
2058 }
2059
2060 // FIXME merge cnt/eob arguments?
2061 static av_always_inline int
2062 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2063                         int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2064                         unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2065                         int nnz, const int16_t *scan, const int16_t (*nb)[2],
2066                         const int16_t *band_counts, const int16_t *qmul)
2067 {
2068     int i = 0, band = 0, band_left = band_counts[band];
2069     uint8_t *tp = p[0][nnz];
2070     uint8_t cache[1024];
2071
2072     do {
2073         int val, rc;
2074
2075         val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2076         eob[band][nnz][val]++;
2077         if (!val)
2078             break;
2079
2080     skip_eob:
2081         if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2082             cnt[band][nnz][0]++;
2083             if (!--band_left)
2084                 band_left = band_counts[++band];
2085             cache[scan[i]] = 0;
2086             nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2087             tp = p[band][nnz];
2088             if (++i == n_coeffs)
2089                 break; //invalid input; blocks should end with EOB
2090             goto skip_eob;
2091         }
2092
2093         rc = scan[i];
2094         if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2095             cnt[band][nnz][1]++;
2096             val = 1;
2097             cache[rc] = 1;
2098         } else {
2099             // fill in p[3-10] (model fill) - only once per frame for each pos
2100             if (!tp[3])
2101                 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2102
2103             cnt[band][nnz][2]++;
2104             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2105                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2106                     cache[rc] = val = 2;
2107                 } else {
2108                     val = 3 + vp56_rac_get_prob(c, tp[5]);
2109                     cache[rc] = 3;
2110                 }
2111             } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2112                 cache[rc] = 4;
2113                 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2114                     val = 5 + vp56_rac_get_prob(c, 159);
2115                 } else {
2116                     val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
2117                     val +=      vp56_rac_get_prob(c, 145);
2118                 }
2119             } else { // cat 3-6
2120                 cache[rc] = 5;
2121                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2122                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2123                         val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
2124                         val +=      (vp56_rac_get_prob(c, 148) << 1);
2125                         val +=       vp56_rac_get_prob(c, 140);
2126                     } else {
2127                         val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
2128                         val +=      (vp56_rac_get_prob(c, 155) << 2);
2129                         val +=      (vp56_rac_get_prob(c, 140) << 1);
2130                         val +=       vp56_rac_get_prob(c, 135);
2131                     }
2132                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2133                     val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
2134                     val +=      (vp56_rac_get_prob(c, 157) << 3);
2135                     val +=      (vp56_rac_get_prob(c, 141) << 2);
2136                     val +=      (vp56_rac_get_prob(c, 134) << 1);
2137                     val +=       vp56_rac_get_prob(c, 130);
2138                 } else {
2139                     val = 67;
2140                     if (!is8bitsperpixel) {
2141                         if (bpp == 12) {
2142                             val += vp56_rac_get_prob(c, 255) << 17;
2143                             val += vp56_rac_get_prob(c, 255) << 16;
2144                         }
2145                         val +=  (vp56_rac_get_prob(c, 255) << 15);
2146                         val +=  (vp56_rac_get_prob(c, 255) << 14);
2147                     }
2148                     val +=      (vp56_rac_get_prob(c, 254) << 13);
2149                     val +=      (vp56_rac_get_prob(c, 254) << 12);
2150                     val +=      (vp56_rac_get_prob(c, 254) << 11);
2151                     val +=      (vp56_rac_get_prob(c, 252) << 10);
2152                     val +=      (vp56_rac_get_prob(c, 249) << 9);
2153                     val +=      (vp56_rac_get_prob(c, 243) << 8);
2154                     val +=      (vp56_rac_get_prob(c, 230) << 7);
2155                     val +=      (vp56_rac_get_prob(c, 196) << 6);
2156                     val +=      (vp56_rac_get_prob(c, 177) << 5);
2157                     val +=      (vp56_rac_get_prob(c, 153) << 4);
2158                     val +=      (vp56_rac_get_prob(c, 140) << 3);
2159                     val +=      (vp56_rac_get_prob(c, 133) << 2);
2160                     val +=      (vp56_rac_get_prob(c, 130) << 1);
2161                     val +=       vp56_rac_get_prob(c, 129);
2162                 }
2163             }
2164         }
2165 #define STORE_COEF(c, i, v) do { \
2166     if (is8bitsperpixel) { \
2167         c[i] = v; \
2168     } else { \
2169         AV_WN32A(&c[i * 2], v); \
2170     } \
2171 } while (0)
2172         if (!--band_left)
2173             band_left = band_counts[++band];
2174         if (is_tx32x32)
2175             STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2176         else
2177             STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2178         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2179         tp = p[band][nnz];
2180     } while (++i < n_coeffs);
2181
2182     return i;
2183 }
2184
2185 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2186                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2187                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2188                                 const int16_t (*nb)[2], const int16_t *band_counts,
2189                                 const int16_t *qmul)
2190 {
2191     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2192                                    nnz, scan, nb, band_counts, qmul);
2193 }
2194
2195 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2196                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2197                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2198                                   const int16_t (*nb)[2], const int16_t *band_counts,
2199                                   const int16_t *qmul)
2200 {
2201     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2202                                    nnz, scan, nb, band_counts, qmul);
2203 }
2204
2205 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2206                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2207                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2208                                  const int16_t (*nb)[2], const int16_t *band_counts,
2209                                  const int16_t *qmul)
2210 {
2211     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
2212                                    nnz, scan, nb, band_counts, qmul);
2213 }
2214
2215 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2216                                    unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2217                                    uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2218                                    const int16_t (*nb)[2], const int16_t *band_counts,
2219                                    const int16_t *qmul)
2220 {
2221     return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
2222                                    nnz, scan, nb, band_counts, qmul);
2223 }
2224
/**
 * Decode all coefficient tokens (luma, then both chroma planes) of the
 * current block s->b.
 *
 * Coefficients go to s->block / s->uvblock[pl], the per-transform result of
 * the token decoder goes to s->eob / s->uveob[pl], and the above/left
 * non-zero contexts are updated to 0/1 per transform block.
 *
 * @param ctx             codec context; priv_data is the VP9Context
 * @param is8bitsperpixel non-zero selects the *_8bpp token decoders and one
 *                        byte per pixel, zero the *_16bpp ones and two bytes
 * @return non-zero if any transform block returned a non-zero result from
 *         the token decoder, 0 otherwise
 */
2225 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2226 {
2227     VP9Context *s = ctx->priv_data;
2228     VP9Block *b = s->b;
2229     int row = s->row, col = s->col;
2230     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2231     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2232     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2233     int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
    /* end_x/end_y: block extent clipped against s->cols/s->rows
     * (presumably the frame size in 8x8 units - confirm) */
2234     int end_x = FFMIN(2 * (s->cols - col), w4);
2235     int end_y = FFMIN(2 * (s->rows - row), h4);
2236     int n, pl, x, y, res;
2237     int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
    /* the scan tables are indexed with an offset of 4 when lossless is set */
2238     int tx = 4 * s->s.h.lossless + b->tx;
2239     const int16_t * const *yscans = vp9_scans[tx];
2240     const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2241     const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2242     const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2243     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2244     uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
    /* one row per transform size; the last listed entry of each row is the
     * remainder of the block's coefficient count */
2245     static const int16_t band_counts[4][8] = {
2246         { 1, 2, 3, 4,  3,   16 - 13 },
2247         { 1, 2, 3, 4, 11,   64 - 21 },
2248         { 1, 2, 3, 4, 11,  256 - 21 },
2249         { 1, 2, 3, 4, 11, 1024 - 21 },
2250     };
2251     const int16_t *y_band_counts = band_counts[b->tx];
2252     const int16_t *uv_band_counts = band_counts[b->uvtx];
2253     int bytesperpixel = is8bitsperpixel ? 1 : 2;
2254     int total_coeff = 0;
2255
    /* MERGE: for every group of `step` context bytes, read the group with
     * `rd` and store a single 0/1 flag in the group's first byte. */
2256 #define MERGE(la, end, step, rd) \
2257     for (n = 0; n < end; n += step) \
2258         la[n] = !!rd(&la[n])
2259 #define MERGE_CTX(step, rd) \
2260     do { \
2261         MERGE(l, end_y, step, rd); \
2262         MERGE(a, end_x, step, rd); \
2263     } while (0)
2264
    /* DECODE_Y_COEF_LOOP: decode every luma transform block inside the
     * clipped area. The token context is a[x] + l[y]; afterwards both are
     * set to whether the decoder returned non-zero. The result is stored in
     * s->eob[n], as 16 bits when step >= 4. */
2265 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2266     for (n = 0, y = 0; y < end_y; y += step) { \
2267         for (x = 0; x < end_x; x += step, n += step * step) { \
2268             enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2269             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2270                                     (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2271                                      c, e, p, a[x] + l[y], yscans[txtp], \
2272                                      ynbs[txtp], y_band_counts, qmul[0]); \
2273             a[x] = l[y] = !!res; \
2274             total_coeff |= !!res; \
2275             if (step >= 4) { \
2276                 AV_WN16A(&s->eob[n], res); \
2277             } else { \
2278                 s->eob[n] = res; \
2279             } \
2280         } \
2281     }
2282
    /* SPLAT: copy the first byte of every group of `step` context bytes to
     * the rest of the group. When `cond` holds (the block is not clipped in
     * that direction) whole groups are written with aligned wide stores;
     * otherwise memset() is used and limited to the entries below `end`. */
2283 #define SPLAT(la, end, step, cond) \
2284     if (step == 2) { \
2285         for (n = 1; n < end; n += step) \
2286             la[n] = la[n - 1]; \
2287     } else if (step == 4) { \
2288         if (cond) { \
2289             for (n = 0; n < end; n += step) \
2290                 AV_WN32A(&la[n], la[n] * 0x01010101); \
2291         } else { \
2292             for (n = 0; n < end; n += step) \
2293                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2294         } \
2295     } else /* step == 8 */ { \
2296         if (cond) { \
2297             if (HAVE_FAST_64BIT) { \
2298                 for (n = 0; n < end; n += step) \
2299                     AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2300             } else { \
2301                 for (n = 0; n < end; n += step) { \
2302                     uint32_t v32 = la[n] * 0x01010101; \
2303                     AV_WN32A(&la[n],     v32); \
2304                     AV_WN32A(&la[n + 4], v32); \
2305                 } \
2306             } \
2307         } else { \
2308             for (n = 0; n < end; n += step) \
2309                 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2310         } \
2311     }
2312 #define SPLAT_CTX(step) \
2313     do { \
2314         SPLAT(a, end_x, step, end_x == w4); \
2315         SPLAT(l, end_y, step, end_y == h4); \
2316     } while (0)
2317
2318     /* y tokens */
    /* For transforms larger than 4x4 the contexts are merged to one flag per
     * transform block before decoding and splatted back afterwards. For 4x4
     * the per-sub-block mode b->mode[n] is used only when bs > BS_8x8. */
2319     switch (b->tx) {
2320     case TX_4X4:
2321         DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2322         break;
2323     case TX_8X8:
2324         MERGE_CTX(2, AV_RN16A);
2325         DECODE_Y_COEF_LOOP(2, 0,);
2326         SPLAT_CTX(2);
2327         break;
2328     case TX_16X16:
2329         MERGE_CTX(4, AV_RN32A);
2330         DECODE_Y_COEF_LOOP(4, 0,);
2331         SPLAT_CTX(4);
2332         break;
2333     case TX_32X32:
2334         MERGE_CTX(8, AV_RN64A);
2335         DECODE_Y_COEF_LOOP(8, 0, 32);
2336         SPLAT_CTX(8);
2337         break;
2338     }
2339
    /* DECODE_UV_COEF_LOOP: same as the luma loop, but for chroma plane `pl`,
     * always with the DCT_DCT scan (uvscan/uvnb) and qmul[1]. */
2340 #define DECODE_UV_COEF_LOOP(step, v) \
2341     for (n = 0, y = 0; y < end_y; y += step) { \
2342         for (x = 0; x < end_x; x += step, n += step * step) { \
2343             res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2344                                     (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2345                                      16 * step * step, c, e, p, a[x] + l[y], \
2346                                      uvscan, uvnb, uv_band_counts, qmul[1]); \
2347             a[x] = l[y] = !!res; \
2348             total_coeff |= !!res; \
2349             if (step >= 4) { \
2350                 AV_WN16A(&s->uveob[pl][n], res); \
2351             } else { \
2352                 s->uveob[pl][n] = res; \
2353             } \
2354         } \
2355     }
2356
    /* switch probabilities/counters to the uv tables and scale the block
     * extent by the chroma subsampling shifts */
2357     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2358     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2359     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2360     w4 >>= s->ss_h;
2361     end_x >>= s->ss_h;
2362     h4 >>= s->ss_v;
2363     end_y >>= s->ss_v;
2364     for (pl = 0; pl < 2; pl++) {
2365         a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2366         l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2367         switch (b->uvtx) {
2368         case TX_4X4:
2369             DECODE_UV_COEF_LOOP(1,);
2370             break;
2371         case TX_8X8:
2372             MERGE_CTX(2, AV_RN16A);
2373             DECODE_UV_COEF_LOOP(2,);
2374             SPLAT_CTX(2);
2375             break;
2376         case TX_16X16:
2377             MERGE_CTX(4, AV_RN32A);
2378             DECODE_UV_COEF_LOOP(4,);
2379             SPLAT_CTX(4);
2380             break;
2381         case TX_32X32:
2382             MERGE_CTX(8, AV_RN64A);
2383             DECODE_UV_COEF_LOOP(8, 32);
2384             SPLAT_CTX(8);
2385             break;
2386         }
2387     }
2388
2389     return total_coeff;
2390 }
2391
2392 static int decode_coeffs_8bpp(AVCodecContext *ctx)
2393 {
2394     return decode_coeffs(ctx, 1);
2395 }
2396
2397 static int decode_coeffs_16bpp(AVCodecContext *ctx)
2398 {
2399     return decode_coeffs(ctx, 0);
2400 }
2401
/**
 * Pick the intra prediction function variant that matches the available
 * neighbours and prepare the top and left edge pixels it needs.
 *
 * @param s             decoder context
 * @param mode          requested intra mode (asserted to be in 0..9)
 * @param a             in: pointer to a caller-provided top edge buffer;
 *                      out: either unchanged (buffer was filled here) or
 *                      redirected to the existing top pixels when they can
 *                      be used in place
 * @param dst_edge      pixel pointer used for the row above when y == 0 and
 *                      for the column to the left when x == 0
 * @param stride_edge   stride belonging to dst_edge
 * @param dst_inner     pixel pointer used for the remaining positions
 * @param stride_inner  stride belonging to dst_inner
 * @param l             left edge buffer to fill
 * @param col, row      block position
 * @param x, y          transform block position inside the block, in units
 *                      of 4 pixels
 * @param w             block width in the same units as x
 * @param tx            transform size; 4 << tx edge pixels are needed
 * @param p             plane index into s->intra_pred_data
 * @param ss_h, ss_v    subsampling shifts of this plane
 * @param bytesperpixel 1 or 2
 * @return the mode to index the intra prediction function table with
 */
2402 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2403                                              uint8_t *dst_edge, ptrdiff_t stride_edge,
2404                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
2405                                              uint8_t *l, int col, int x, int w,
2406                                              int row, int y, enum TxfmMode tx,
2407                                              int p, int ss_h, int ss_v, int bytesperpixel)
2408 {
2409     int have_top = row > 0 || y > 0;
2410     int have_left = col > s->tile_col_start || x > 0;
2411     int have_right = x < w - 1;
2412     int bpp = s->bpp;
    /* replacement mode for each requested mode, given edge availability */
2413     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2414         [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
2415                                    { DC_127_PRED,          VERT_PRED } },
2416         [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
2417                                    { HOR_PRED,             HOR_PRED } },
2418         [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
2419                                    { LEFT_DC_PRED,         DC_PRED } },
2420         [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
2421                                    { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
2422         [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2423                                    { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2424         [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
2425                                    { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
2426         [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
2427                                    { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
2428         [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
2429                                    { DC_127_PRED,          VERT_LEFT_PRED } },
2430         [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
2431                                    { HOR_UP_PRED,          HOR_UP_PRED } },
2432         [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
2433                                    { HOR_PRED,             TM_VP8_PRED } },
2434     };
    /* which edges each (converted) mode reads; invert_left selects
     * top-to-bottom storage of the left edge instead of bottom-to-top */
2435     static const struct {
2436         uint8_t needs_left:1;
2437         uint8_t needs_top:1;
2438         uint8_t needs_topleft:1;
2439         uint8_t needs_topright:1;
2440         uint8_t invert_left:1;
2441     } edges[N_INTRA_PRED_MODES] = {
2442         [VERT_PRED]            = { .needs_top  = 1 },
2443         [HOR_PRED]             = { .needs_left = 1 },
2444         [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
2445         [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
2446         [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2447         [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2448         [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2449         [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
2450         [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
2451         [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2452         [LEFT_DC_PRED]         = { .needs_left = 1 },
2453         [TOP_DC_PRED]          = { .needs_top  = 1 },
2454         [DC_128_PRED]          = { 0 },
2455         [DC_127_PRED]          = { 0 },
2456         [DC_129_PRED]          = { 0 }
2457     };
2458
2459     av_assert2(mode >= 0 && mode < 10);
2460     mode = mode_conv[mode][have_left][have_top];
2461     if (edges[mode].needs_top) {
        /* topleft is only assigned, and only read, when have_left is set */
2462         uint8_t *top, *topleft;
2463         int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2464         int n_px_need_tr = 0;
2465
2466         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2467             n_px_need_tr = 4;
2468
2469         // if top of sb64-row, use s->intra_pred_data[] instead of
2470         // dst[-stride] for intra prediction (it contains pre- instead of
2471         // post-loopfilter data)
2472         if (have_top) {
2473             top = !(row & 7) && !y ?
2474                 s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2475                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2476             if (have_left)
2477                 topleft = !(row & 7) && !y ?
2478                     s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2479                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2480                     &dst_inner[-stride_inner];
2481         }
2482
        /* Fast path: everything the mode needs is contiguous in the source,
         * so point *a straight at it. Otherwise build the edge in the
         * caller's buffer. */
2483         if (have_top &&
2484             (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2485             (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2486             n_px_need + n_px_need_tr <= n_px_have) {
2487             *a = top;
2488         } else {
2489             if (have_top) {
2490                 if (n_px_need <= n_px_have) {
2491                     memcpy(*a, top, n_px_need * bytesperpixel);
2492                 } else {
2493 #define memset_bpp(c, i1, v, i2, num) do { \
2494     if (bytesperpixel == 1) { \
2495         memset(&(c)[(i1)], (v)[(i2)], (num)); \
2496     } else { \
2497         int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2498         for (n = 0; n < (num); n++) { \
2499             AV_WN16A(&(c)[((i1) + n) * 2], val); \
2500         } \
2501     } \
2502 } while (0)
                    /* copy what exists, then repeat the last available pixel */
2503                     memcpy(*a, top, n_px_have * bytesperpixel);
2504                     memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2505                 }
2506             } else {
2507 #define memset_val(c, val, num) do { \
2508     if (bytesperpixel == 1) { \
2509         memset((c), (val), (num)); \
2510     } else { \
2511         int n; \
2512         for (n = 0; n < (num); n++) { \
2513             AV_WN16A(&(c)[n * 2], (val)); \
2514         } \
2515     } \
2516 } while (0)
                /* no row above: fill with (128 << (bpp - 8)) - 1 */
2517                 memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2518             }
2519             if (edges[mode].needs_topleft) {
2520                 if (have_left && have_top) {
2521 #define assign_bpp(c, i1, v, i2) do { \
2522     if (bytesperpixel == 1) { \
2523         (c)[(i1)] = (v)[(i2)]; \
2524     } else { \
2525         AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2526     } \
2527 } while (0)
2528                     assign_bpp(*a, -1, topleft, -1);
2529                 } else {
2530 #define assign_val(c, i, v) do { \
2531     if (bytesperpixel == 1) { \
2532         (c)[(i)] = (v); \
2533     } else { \
2534         AV_WN16A(&(c)[(i) * 2], (v)); \
2535     } \
2536 } while (0)
2537                     assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2538                 }
2539             }
2540             if (tx == TX_4X4 && edges[mode].needs_topright) {
2541                 if (have_top && have_right &&
2542                     n_px_need + n_px_need_tr <= n_px_have) {
2543                     memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2544                 } else {
                    /* no top-right pixels: repeat the last top pixel */
2545                     memset_bpp(*a, 4, *a, 3, 4);
2546                 }
2547             }
2548         }
2549     }
2550     if (edges[mode].needs_left) {
2551         if (have_left) {
2552             int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2553             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2554             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2555
            /* The left column is read from the pixel at index -1 of each row.
             * invert_left stores row i at l[i]; otherwise row i goes to
             * l[n_px_need - 1 - i]. Missing rows repeat the last available
             * one. */
2556             if (edges[mode].invert_left) {
2557                 if (n_px_need <= n_px_have) {
2558                     for (i = 0; i < n_px_need; i++)
2559                         assign_bpp(l, i, &dst[i * stride], -1);
2560                 } else {
2561                     for (i = 0; i < n_px_have; i++)
2562                         assign_bpp(l, i, &dst[i * stride], -1);
2563                     memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2564                 }
2565             } else {
2566                 if (n_px_need <= n_px_have) {
2567                     for (i = 0; i < n_px_need; i++)
2568                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2569                 } else {
2570                     for (i = 0; i < n_px_have; i++)
2571                         assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2572                     memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2573                 }
2574             }
2575         } else {
            /* no column to the left: fill with (128 << (bpp - 8)) + 1 */
2576             memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2577         }
2578     }
2579
2580     return mode;
2581 }
2582
2583 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2584                                          ptrdiff_t uv_off, int bytesperpixel)
2585 {
2586     VP9Context *s = ctx->priv_data;
2587     VP9Block *b = s->b;
2588     int row = s->row, col = s->col;
2589     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2590     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2591     int end_x = FFMIN(2 * (s->cols - col), w4);
2592     int end_y = FFMIN(2 * (s->rows - row), h4);
2593     int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2594     int uvstep1d = 1 << b->uvtx, p;
2595     uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2596     LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2597     LOCAL_ALIGNED_32(uint8_t, l, [64]);
2598
2599     for (n = 0, y = 0; y < end_y; y += step1d) {
2600         uint8_t *ptr = dst, *ptr_r = dst_r;
2601         for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2602                                ptr_r += 4 * step1d * bytesperpixel, n += step) {
2603             int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2604                                y * 2 + x : 0];
2605             uint8_t *a = &a_buf[32];
2606             enum TxfmType txtp = vp9_intra_txfm_type[mode];
2607             int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2608
2609             mode = check_intra_mode(s, mode, &a, ptr_r,
2610                                     s->s.frames[CUR_FRAME].tf.f->linesize[0],
2611                                     ptr, s->y_stride, l,
2612                                     col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2613             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2614             if (eob)
2615                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2616                                            s->block + 16 * n * bytesperpixel, eob);
2617         }
2618         dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2619         dst   += 4 * step1d * s->y_stride;
2620     }
2621
2622     // U/V
2623     w4 >>= s->ss_h;
2624     end_x >>= s->ss_h;
2625     end_y >>= s->ss_v;
2626     step = 1 << (b->uvtx * 2);
2627     for (p = 0; p < 2; p++) {
2628         dst   = s->dst[1 + p];
2629         dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2630         for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2631             uint8_t *ptr = dst, *ptr_r = dst_r;
2632             for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2633                                    ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2634                 int mode = b->uvmode;
2635                 uint8_t *a = &a_buf[32];
2636                 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2637
2638                 mode = check_intra_mode(s, mode, &a, ptr_r,
2639                                         s->s.frames[CUR_FRAME].tf.f->linesize[1],
2640                                         ptr, s->uv_stride, l, col, x, w4, row, y,
2641                                         b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2642                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2643                 if (eob)
2644                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2645                                                     s->uvblock[p] + 16 * n * bytesperpixel, eob);
2646             }
2647             dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2648             dst   += 4 * uvstep1d * s->uv_stride;
2649         }
2650     }
2651 }
2652
2653 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2654 {
2655     intra_recon(ctx, y_off, uv_off, 1);
2656 }
2657
2658 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2659 {
2660     intra_recon(ctx, y_off, uv_off, 2);
2661 }
2662
2663 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2664                                               uint8_t *dst, ptrdiff_t dst_stride,
2665                                               const uint8_t *ref, ptrdiff_t ref_stride,
2666                                               ThreadFrame *ref_frame,
2667                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2668                                               int bw, int bh, int w, int h, int bytesperpixel)
2669 {
2670     int mx = mv->x, my = mv->y, th;
2671
2672     y += my >> 3;
2673     x += mx >> 3;
2674     ref += y * ref_stride + x * bytesperpixel;
2675     mx &= 7;
2676     my &= 7;
2677     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2678     // we use +7 because the last 7 pixels of each sbrow can be changed in
2679     // the longest loopfilter of the next sbrow
2680     th = (y + bh + 4 * !!my + 7) >> 6;
2681     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2682     if (x < !!mx * 3 || y < !!my * 3 ||
2683         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2684         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2685                                  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2686                                  160, ref_stride,
2687                                  bw + !!mx * 7, bh + !!my * 7,
2688                                  x - !!mx * 3, y - !!my * 3, w, h);
2689         ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2690         ref_stride = 160;
2691     }
2692     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2693 }
2694
2695 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
2696                                                 uint8_t *dst_u, uint8_t *dst_v,
2697                                                 ptrdiff_t dst_stride,
2698                                                 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2699                                                 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2700                                                 ThreadFrame *ref_frame,
2701                                                 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2702                                                 int bw, int bh, int w, int h, int bytesperpixel)
2703 {
2704     int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
2705
2706     y += my >> 4;
2707     x += mx >> 4;
2708     ref_u += y * src_stride_u + x * bytesperpixel;
2709     ref_v += y * src_stride_v + x * bytesperpixel;
2710     mx &= 15;
2711     my &= 15;
2712     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2713     // we use +7 because the last 7 pixels of each sbrow can be changed in
2714     // the longest loopfilter of the next sbrow
2715     th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2716     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2717     if (x < !!mx * 3 || y < !!my * 3 ||
2718         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2719         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2720                                  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2721                                  160, src_stride_u,
2722                                  bw + !!mx * 7, bh + !!my * 7,
2723                                  x - !!mx * 3, y - !!my * 3, w, h);
2724         ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2725         mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2726
2727         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2728                                  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2729                                  160, src_stride_v,
2730                                  bw + !!mx * 7, bh + !!my * 7,
2731                                  x - !!mx * 3, y - !!my * 3, w, h);
2732         ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2733         mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2734     } else {
2735         mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2736         mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2737     }
2738 }
2739
// Instantiate the motion-compensation template for the unscaled case.
// mc_luma_dir/mc_chroma_dir forward to mc_luma_unscaled()/mc_chroma_unscaled(),
// dropping the px/py/pw/ph/i arguments and always passing s->dsp.mc as the
// function table. `bytesperpixel` is not a macro parameter; it is taken from
// the scope the macro is expanded in (presumably provided by
// vp9_mc_template.c -- confirm there).
2740 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2741                     px, py, pw, ph, bw, bh, w, h, i) \
2742     mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2743                      mv, bw, bh, w, h, bytesperpixel)
2744 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2745                       row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2746     mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2747                        row, col, mv, bw, bh, w, h, bytesperpixel)
// The template is included twice: FN() appends _8bpp with BYTES_PER_PIXEL 1,
// then _16bpp with BYTES_PER_PIXEL 2. SCALED stays 0 for both inclusions.
2748 #define SCALED 0
2749 #define FN(x) x##_8bpp
2750 #define BYTES_PER_PIXEL 1
2751 #include "vp9_mc_template.c"
2752 #undef FN
2753 #undef BYTES_PER_PIXEL
2754 #define FN(x) x##_16bpp
2755 #define BYTES_PER_PIXEL 2
2756 #include "vp9_mc_template.c"
// Drop every helper macro so the scaled instantiation can redefine them.
2757 #undef mc_luma_dir
2758 #undef mc_chroma_dir
2759 #undef FN
2760 #undef BYTES_PER_PIXEL
2761 #undef SCALED
2762
2763 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2764                                             vp9_mc_func (*mc)[2],
2765                                             uint8_t *dst, ptrdiff_t dst_stride,
2766                                             const uint8_t *ref, ptrdiff_t ref_stride,
2767                                             ThreadFrame *ref_frame,
2768                                             ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2769                                             int px, int py, int pw, int ph,
2770                                             int bw, int bh, int w, int h, int bytesperpixel,
2771                                             const uint16_t *scale, const uint8_t *step)
2772 {
2773     if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2774         s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2775         mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2776                          y, x, in_mv, bw, bh, w, h, bytesperpixel);
2777     } else {
2778 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
2779     int mx, my;
2780     int refbw_m1, refbh_m1;
2781     int th;
2782     VP56mv mv;
2783
2784     mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2785     mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2786     // BUG libvpx seems to scale the two components separately. This introduces
2787     // rounding errors but we have to reproduce them to be exactly compatible
2788     // with the output from libvpx...
2789     mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2790     my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2791
2792     y = my >> 4;
2793     x = mx >> 4;
2794     ref += y * ref_stride + x * bytesperpixel;
2795     mx &= 15;
2796     my &= 15;
2797     refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2798     refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2799     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2800     // we use +7 because the last 7 pixels of each sbrow can be changed in
2801     // the longest loopfilter of the next sbrow
2802     th = (y + refbh_m1 + 4 + 7) >> 6;
2803     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2804     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2805         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2806                                  ref - 3 * ref_stride - 3 * bytesperpixel,
2807                                  288, ref_stride,
2808                                  refbw_m1 + 8, refbh_m1 + 8,
2809                                  x - 3, y - 3, w, h);
2810         ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2811         ref_stride = 288;
2812     }
2813     smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2814     }
2815 }
2816
2817 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
2818                                               vp9_mc_func (*mc)[2],
2819                                               uint8_t *dst_u, uint8_t *dst_v,
2820                                               ptrdiff_t dst_stride,
2821                                               const uint8_t *ref_u, ptrdiff_t src_stride_u,
2822                                               const uint8_t *ref_v, ptrdiff_t src_stride_v,
2823                                               ThreadFrame *ref_frame,
2824                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2825                                               int px, int py, int pw, int ph,
2826                                               int bw, int bh, int w, int h, int bytesperpixel,
2827                                               const uint16_t *scale, const uint8_t *step)
2828 {
2829     if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2830         s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2831         mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2832                            ref_v, src_stride_v, ref_frame,
2833                            y, x, in_mv, bw, bh, w, h, bytesperpixel);
2834     } else {
2835     int mx, my;
2836     int refbw_m1, refbh_m1;
2837     int th;
2838     VP56mv mv;
2839
2840     if (s->ss_h) {
2841         // BUG https://code.google.com/p/webm/issues/detail?id=820
2842         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
2843         mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2844     } else {
2845         mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
2846         mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
2847     }
2848     if (s->ss_v) {
2849         // BUG https://code.google.com/p/webm/issues/detail?id=820
2850         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
2851         my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2852     } else {
2853         mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
2854         my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
2855     }
2856 #undef scale_mv
2857     y = my >> 4;
2858     x = mx >> 4;
2859     ref_u += y * src_stride_u + x * bytesperpixel;
2860     ref_v += y * src_stride_v + x * bytesperpixel;
2861     mx &= 15;
2862     my &= 15;
2863     refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2864     refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2865     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2866     // we use +7 because the last 7 pixels of each sbrow can be changed in
2867     // the longest loopfilter of the next sbrow
2868     th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2869     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2870     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
2871         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2872                                  ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2873                                  288, src_stride_u,
2874                                  refbw_m1 + 8, refbh_m1 + 8,
2875                                  x - 3, y - 3, w, h);
2876         ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2877         smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2878
2879         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2880                                  ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2881                                  288, src_stride_v,
2882                                  refbw_m1 + 8, refbh_m1 + 8,
2883                                  x - 3, y - 3, w, h);
2884         ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2885         smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2886     } else {
2887         smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2888         smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2889     }
2890     }
2891 }
2892
// Instantiate the motion-compensation template for the scaled case.
// mc_luma_dir/mc_chroma_dir forward to mc_luma_scaled()/mc_chroma_scaled().
// The `mc` argument is token-pasted onto `s` (s->dsp.s##mc) to select the
// scaled function, while s->dsp.mc is passed alongside as the unscaled table.
// Scale and step come from s->mvscale / s->mvstep indexed by b->ref[i];
// `b` and `bytesperpixel` are not macro parameters and are taken from the
// scope the macro is expanded in (presumably provided by vp9_mc_template.c --
// confirm there).
2893 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2894                     px, py, pw, ph, bw, bh, w, h, i) \
2895     mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2896                    mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2897                    s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2898 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2899                       row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2900     mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2901                      row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2902                      s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
// The template is included twice: FN() appends _scaled_8bpp with
// BYTES_PER_PIXEL 1, then _scaled_16bpp with BYTES_PER_PIXEL 2.
// SCALED stays 1 for both inclusions.
2903 #define SCALED 1
2904 #define FN(x) x##_scaled_8bpp
2905 #define BYTES_PER_PIXEL 1
2906 #include "vp9_mc_template.c"
2907 #undef FN
2908 #undef BYTES_PER_PIXEL
2909 #define FN(x) x##_scaled_16bpp
2910 #define BYTES_PER_PIXEL 2
2911 #include "vp9_mc_template.c"
// Drop every helper macro again so they do not leak into the rest of the file.
2912 #undef mc_luma_dir
2913 #undef mc_chroma_dir
2914 #undef FN
2915 #undef BYTES_PER_PIXEL
2916 #undef SCALED
2917
2918 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
2919 {
2920     VP9Context *s = ctx->priv_data;
2921     VP9Block *b = s->b;
2922     int row = s->row, col = s->col;
2923
2924     if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
2925         if (bytesperpixel == 1) {
2926             inter_pred_scaled_8bpp(ctx);
2927         } else {
2928             inter_pred_scaled_16bpp(ctx);
2929         }
2930     } else {
2931         if (bytesperpixel == 1) {
2932             inter_pred_8bpp(ctx);
2933         } else {
2934             inter_pred_16bpp(ctx);
2935         }
2936     }
2937     if (!b->skip) {
2938         /* mostly copied intra_recon() */
2939
2940         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2941         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2942         int end_x = FFMIN(2 * (s->cols - col), w4);
2943         int end_y = FFMIN(2 * (s->rows - row), h4);
2944         int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2945         int uvstep1d = 1 << b->uvtx, p;
2946         uint8_t *dst = s->dst[0];
2947
2948         // y itxfm add
2949         for (n = 0, y = 0; y < end_y; y += step1d) {
2950             uint8_t *ptr = dst;
2951             for (x = 0; x < end_x; x += step1d,
2952                  ptr += 4 * step1d * bytesperpixel, n += step) {
2953                 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2954
2955                 if (eob)
2956                     s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2957                                                   s->block + 16 * n * bytesperpixel, eob);
2958             }
2959             dst += 4 * s->y_stride * step1d;
2960         }
2961
2962         // uv itxfm add
2963         end_x >>= s->ss_h;
2964         end_y >>= s->ss_v;
2965         step = 1 << (b->uvtx * 2);
2966         for (p = 0; p < 2; p++) {
2967             dst = s->dst[p + 1];
2968             for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2969                 uint8_t *ptr = dst;
2970                 for (x = 0; x < end_x; x += uvstep1d,
2971                      ptr += 4 * uvstep1d * bytesperpixel, n += step) {
2972                     int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2973
2974                     if (eob)
2975                         s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2976                                                         s->uvblock[p] + 16 * n * bytesperpixel, eob);
2977                 }
2978                 dst += 4 * uvstep1d * s->uv_stride;
2979             }
2980         }
2981     }
2982 }
2983
// Non-inline entry point for inter_recon() with bytesperpixel fixed to 1;
// decode_b() calls it when s->bpp is not greater than 8.
2984 static void inter_recon_8bpp(AVCodecContext *ctx)
2985 {
2986     inter_recon(ctx, 1);
2987 }
2988
// Non-inline entry point for inter_recon() with bytesperpixel fixed to 2;
// decode_b() calls it when s->bpp is greater than 8.
2989 static void inter_recon_16bpp(AVCodecContext *ctx)
2990 {
2991     inter_recon(ctx, 2);
2992 }
2993
/**
 * Add the loop-filter edges contributed by one block to the edge masks of
 * one plane type.
 *
 * @param mask        masks to update; following the VP9Filter.mask layout,
 *                    mask[0] holds column edges and mask[1] row edges, each
 *                    indexed as [row][0=16, 1=8, 2=4, 3=inner4] with one bit
 *                    per column
 * @param ss_h        horizontal subsampling flag (decode_b() passes 0 for luma)
 * @param ss_v        vertical subsampling flag (decode_b() passes 0 for luma)
 * @param row_and_7   first row of the block inside the mask (row & 7 in decode_b())
 * @param col_and_7   first column of the block inside the mask (col & 7 in decode_b())
 * @param w           block width in mask columns (decode_b() passes the width
 *                    clipped to the frame)
 * @param h           block height in mask rows (decode_b() passes the height
 *                    clipped to the frame)
 * @param col_end     decode_b() passes s->cols & 7 for a chroma block that
 *                    reaches the right edge of a frame with an odd number of
 *                    columns, 0 otherwise
 * @param row_end     same as col_end, for rows and the bottom frame edge
 * @param tx          transform size used for this plane
 * @param skip_inter  non-zero for a skipped inter block; then only the
 *                    block's first row edge and first column edge are marked
 */
2994 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
2995                                         int row_and_7, int col_and_7,
2996                                         int w, int h, int col_end, int row_end,
2997                                         enum TxfmMode tx, int skip_inter)
2998 {
2999     static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3000     static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3001
3002     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3003     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3004     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3005     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3006
3007     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3008     // edges. This means that for UV, we work on two subsampled blocks at
3009     // a time, and we only use the topleft block's mode information to set
3010     // things like block strength. Thus, for any block size smaller than
3011     // 16x16, ignore the odd portion of the block.
3012     if (tx == TX_4X4 && (ss_v | ss_h)) {
3013         if (h == ss_v) {
3014             if (row_and_7 & 1)
3015                 return;
3016             if (!row_end)
3017                 h += 1;
3018         }
3019         if (w == ss_h) {
3020             if (col_and_7 & 1)
3021                 return;
3022             if (!col_end)
3023                 w += 1;
3024         }
3025     }
3026
     // case 1: 4x4 transform with residual -- every edge inside the block
     // is a candidate
3027     if (tx == TX_4X4 && !skip_inter) {
         // t is the bit of the block's first column, m_col has one bit set
         // for each of the w columns the block covers
3028         int t = 1 << col_and_7, m_col = (t << w) - t, y;
3029         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3030         int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3031
3032         for (y = row_and_7; y < h + row_and_7; y++) {
             // row edges use the 8-wide slot (1) on rows selected by
             // wide_filter_row_mask and the 4-wide slot (2) on all others
3033             int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3034
3035             mask[0][y][1] |= m_row_8;
3036             mask[0][y][2] |= m_row_4;
3037             // for odd lines, if the odd col is not being filtered,
3038             // skip odd row also:
3039             // .---. <-- a
3040             // |   |
3041             // |___| <-- b
3042             // ^   ^
3043             // c   d
3044             //
3045             // if a/c are even row/col and b/d are odd, and d is skipped,
3046             // e.g. right edge of size-66x66.webm, then skip b also (bug)
3047             if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3048                 mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3049             } else {
3050                 mask[1][y][col_mask_id] |= m_col;
3051             }
             // inner4 edges (slot 3) are only marked in non-subsampled directions
3052             if (!ss_h)
3053                 mask[0][y][3] |= m_col;
3054             if (!ss_v) {
3055                 if (ss_h && (col_end & 1))
3056                     mask[1][y][3] |= (t << (w - 1)) - t;
3057                 else
3058                     mask[1][y][3] |= m_col;
3059             }
3060         }
3061     } else {
3062         int y, t = 1 << col_and_7, m_col = (t << w) - t;
3063
         // case 2: transform larger than 4x4 with residual -- mark edges at
         // transform-size intervals
3064         if (!skip_inter) {
3065             int mask_id = (tx == TX_8X8);
3066             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3067             int l2 = tx + ss_h - 1, step1d;
3068             int m_row = m_col & masks[l2];
3069
3070             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
3071             // 8wd loopfilter to prevent going off the visible edge.
3072             if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3073                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3074                 int m_row_8 = m_row - m_row_16;
3075
3076                 for (y = row_and_7; y < h + row_and_7; y++) {
3077                     mask[0][y][0] |= m_row_16;
3078                     mask[0][y][1] |= m_row_8;
3079                 }
3080             } else {
3081                 for (y = row_and_7; y < h + row_and_7; y++)
3082                     mask[0][y][mask_id] |= m_row;
3083             }
3084
3085             l2 = tx + ss_v - 1;
3086             step1d = 1 << l2;
3087             if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3088                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3089                     mask[1][y][0] |= m_col;
3090                 if (y - row_and_7 == h - 1)
3091                     mask[1][y][1] |= m_col;
3092             } else {
3093                 for (y = row_and_7; y < h + row_and_7; y += step1d)
3094                     mask[1][y][mask_id] |= m_col;
3095             }
         // case 3: skipped inter block with a transform larger than 4x4 --
         // only the first row edge and the first column edge are marked
3096         } else if (tx != TX_4X4) {
3097             int mask_id;
3098
3099             mask_id = (tx == TX_8X8) || (h == ss_v);
3100             mask[1][row_and_7][mask_id] |= m_col;
3101             mask_id = (tx == TX_8X8) || (w == ss_h);
3102             for (y = row_and_7; y < h + row_and_7; y++)
3103                 mask[0][y][mask_id] |= t;
         // case 4: skipped inter block with a 4x4 transform -- again only the
         // first row/column edges, split between the 8-wide and 4-wide slots
3104         } else {
3105             int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3106
3107             for (y = row_and_7; y < h + row_and_7; y++) {
3108                 mask[0][y][2] |= t4;
3109                 mask[0][y][1] |= t8;
3110             }
3111             mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
3112         }
3113     }
3114 }
3115
/**
 * Decode and/or reconstruct one block.
 *
 * Depending on s->pass the function
 *  - (pass < 2) stores the block geometry, decodes the mode info and, unless
 *    the block is skipped, its coefficients; in pass 1 it then advances the
 *    per-block buffers and returns without reconstructing anything;
 *  - (pass 0 or 2) reconstructs the block (intra or inter), routing the
 *    output through temporary buffers when the block overhangs the frame,
 *    and records loop-filter level and edge masks in @p lflvl;
 *  - (pass 2) advances the per-block buffers at the very end.
 *
 * @param ctx    codec context; its priv_data is the VP9Context
 * @param row    block row, in units of 8 pixels
 * @param col    block column, in units of 8 pixels
 * @param lflvl  loop-filter levels and edge masks to update
 * @param yoff   byte offset of the block inside the luma plane
 * @param uvoff  byte offset of the block inside each chroma plane
 * @param bl     block level
 * @param bp     block partition; together with bl it selects the block size
 */
3116 static void decode_b(AVCodecContext *ctx, int row, int col,
3117                      struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3118                      enum BlockLevel bl, enum BlockPartition bp)
3119 {
3120     VP9Context *s = ctx->priv_data;
3121     VP9Block *b = s->b;
3122     enum BlockSize bs = bl * 3 + bp;
3123     int bytesperpixel = s->bytesperpixel;
3124     int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3125     int emu[2];
3126     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3127
     // publish block position and the motion vector range derived from it
3128     s->row = row;
3129     s->row7 = row & 7;
3130     s->col = col;
3131     s->col7 = col & 7;
3132     s->min_mv.x = -(128 + col * 64);
3133     s->min_mv.y = -(128 + row * 64);
3134     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3135     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
     // bitstream parsing is skipped in pass 2
3136     if (s->pass < 2) {
3137         b->bs = bs;
3138         b->bl = bl;
3139         b->bp = bp;
3140         decode_mode(ctx);
         // chroma uses the next smaller transform when, in a subsampled
         // direction, the luma transform spans the whole block
3141         b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3142                            (s->ss_v && h4 * 2 == (1 << b->tx)));
3143
3144         if (!b->skip) {
3145             int has_coeffs;
3146
3147             if (bytesperpixel == 1) {
3148                 has_coeffs = decode_coeffs_8bpp(ctx);
3149             } else {
3150                 has_coeffs = decode_coeffs_16bpp(ctx);
3151             }
             // an inter block without any coefficient is turned into a
             // skipped block (only for b->bs <= BS_8x8)
3152             if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3153                 b->skip = 1;
3154                 memset(&s->above_skip_ctx[col], 1, w4);
3155                 memset(&s->left_skip_ctx[s->row7], 1, h4);
3156             }
3157         } else {
             // skipped block: clear the above/left non-zero contexts it covers
3158             int row7 = s->row7;
3159
3160 #define SPLAT_ZERO_CTX(v, n) \
3161     switch (n) { \
3162     case 1:  v = 0;          break; \
3163     case 2:  AV_ZERO16(&v);  break; \
3164     case 4:  AV_ZERO32(&v);  break; \
3165     case 8:  AV_ZERO64(&v);  break; \
3166     case 16: AV_ZERO128(&v); break; \
3167     }
3168 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3169     do { \
3170         SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3171         if (s->ss_##dir2) { \
3172             SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3173             SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3174         } else { \
3175             SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3176             SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3177         } \
3178     } while (0)
3179
3180             switch (w4) {
3181             case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3182             case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3183             case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3184             case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3185             }
3186             switch (h4) {
3187             case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3188             case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3189             case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3190             case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
3191             }
3192         }
3193
         // pass 1 only parses: step to the next block's storage and leave
         // before any reconstruction happens
3194         if (s->pass == 1) {
3195             s->b++;
3196             s->block += w4 * h4 * 64 * bytesperpixel;
3197             s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3198             s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3199             s->eob += 4 * w4 * h4;
3200             s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3201             s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3202
3203             return;
3204         }
3205     }
3206
3207     // emulated overhangs if the stride of the target buffer can't hold. This
3208     // makes it possible to support emu-edge and so on even if we have large block
3209     // overhangs
3210     emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3211              (row + h4) > s->rows;
3212     emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3213              (row + h4) > s->rows;
     // reconstruct into the temporary buffers (stride 128) when emulating,
     // straight into the frame otherwise
3214     if (emu[0]) {
3215         s->dst[0] = s->tmp_y;
3216         s->y_stride = 128;
3217     } else {
3218         s->dst[0] = f->data[0] + yoff;
3219         s->y_stride = f->linesize[0];
3220     }
3221     if (emu[1]) {
3222         s->dst[1] = s->tmp_uv[0];
3223         s->dst[2] = s->tmp_uv[1];
3224         s->uv_stride = 128;
3225     } else {
3226         s->dst[1] = f->data[1] + uvoff;
3227         s->dst[2] = f->data[2] + uvoff;
3228         s->uv_stride = f->linesize[1];
3229     }
3230     if (b->intra) {
3231         if (s->bpp > 8) {
3232             intra_recon_16bpp(ctx, yoff, uvoff);
3233         } else {
3234             intra_recon_8bpp(ctx, yoff, uvoff);
3235         }
3236     } else {
3237         if (s->bpp > 8) {
3238             inter_recon_16bpp(ctx);
3239         } else {
3240             inter_recon_8bpp(ctx);
3241         }
3242     }
     // copy the part of the temporary luma block that lies inside the frame
     // back, one power-of-two wide column strip (64 >> n) at a time
3243     if (emu[0]) {
3244         int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3245
3246         for (n = 0; o < w; n++) {
3247             int bw = 64 >> n;
3248
3249             av_assert2(n <= 4);
3250             if (w & bw) {
3251                 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3252                                          s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
3253                 o += bw;
3254             }
3255         }
3256     }
     // same copy-back for both chroma planes, with subsampled dimensions
3257     if (emu[1]) {
3258         int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3259         int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3260
3261         for (n = s->ss_h; o < w; n++) {
3262             int bw = 64 >> n;
3263
3264             av_assert2(n <= 4);
3265             if (w & bw) {
3266                 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3267                                          s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3268                 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3269                                          s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3270                 o += bw;
3271             }
3272         }
3273     }
3274
3275     // pick filter level and find edges to apply filter to
3276     if (s->s.h.filter.level &&
3277         (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3278                                                       [b->mode[3] != ZEROMV]) > 0) {
3279         int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3280         int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3281
3282         setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3283         mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
         // chroma masks are only filled here when chroma is subsampled
3284         if (s->ss_h || s->ss_v)
3285             mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3286                        s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3287                        s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3288                        b->uvtx, skip_inter);
3289
         // compute the limit table entries for this level on first use
3290         if (!s->filter_lut.lim_lut[lvl]) {
3291             int sharp = s->s.h.filter.sharpness;
3292             int limit = lvl;
3293
3294             if (sharp > 0) {
3295                 limit >>= (sharp + 3) >> 2;
3296                 limit = FFMIN(limit, 9 - sharp);
3297             }
3298             limit = FFMAX(limit, 1);
3299
3300             s->filter_lut.lim_lut[lvl] = limit;
3301             s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3302         }
3303     }
3304
     // pass 2 consumes the data stored by pass 1: step to the next block
3305     if (s->pass == 2) {
3306         s->b++;
3307         s->block += w4 * h4 * 64 * bytesperpixel;
3308         s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3309         s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3310         s->eob += 4 * w4 * h4;
3311         s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3312         s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3313     }
3314 }
3315
3316 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3317                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3318 {
3319     VP9Context *s = ctx->priv_data;
3320     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3321             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3322     const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3323                                                      s->prob.p.partition[bl][c];
3324     enum BlockPartition bp;
3325     ptrdiff_t hbs = 4 >> bl;
3326     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3327     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3328     int bytesperpixel = s->bytesperpixel;
3329
3330     if (bl == BL_8X8) {
3331         bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3332         decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3333     } else if (col + hbs < s->cols) { // FIXME why not <=?
3334         if (row + hbs < s->rows) { // FIXME why not <=?
3335             bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3336             switch (bp) {
3337             case PARTITION_NONE:
3338                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3339                 break;
3340             case PARTITION_H:
3341                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3342                 yoff  += hbs * 8 * y_stride;
3343                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3344                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3345                 break;
3346             case PARTITION_V:
3347                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3348                 yoff  += hbs * 8 * bytesperpixel;
3349                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3350                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3351                 break;
3352             case PARTITION_SPLIT:
3353                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3354                 decode_sb(ctx, row, col + hbs, lflvl,
3355                           yoff + 8 * hbs * bytesperpixel,
3356                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3357                 yoff  += hbs * 8 * y_stride;
3358                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3359                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3360                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3361                           yoff + 8 * hbs * bytesperpixel,
3362                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3363                 break;
3364             default:
3365                 av_assert0(0);
3366             }
3367         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3368             bp = PARTITION_SPLIT;
3369             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3370             decode_sb(ctx, row, col + hbs, lflvl,
3371                       yoff + 8 * hbs * bytesperpixel,
3372                       uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3373         } else {
3374             bp = PARTITION_H;
3375             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3376         }
3377     } else if (row + hbs < s->rows) { // FIXME why not <=?
3378         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3379             bp = PARTITION_SPLIT;
3380             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3381             yoff  += hbs * 8 * y_stride;
3382             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3383             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3384         } else {
3385             bp = PARTITION_V;
3386             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3387         }
3388     } else {
3389         bp = PARTITION_SPLIT;
3390         decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3391     }
3392     s->counts.partition[bl][c][bp]++;
3393 }
3394
3395 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3396                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3397 {
3398     VP9Context *s = ctx->priv_data;
3399     VP9Block *b = s->b;
3400     ptrdiff_t hbs = 4 >> bl;
3401     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3402     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3403     int bytesperpixel = s->bytesperpixel;
3404
3405     if (bl == BL_8X8) {
3406         av_assert2(b->bl == BL_8X8);
3407         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3408     } else if (s->b->bl == bl) {
3409         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3410         if (b->bp == PARTITION_H && row + hbs < s->rows) {
3411             yoff  += hbs * 8 * y_stride;
3412             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3413             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3414         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3415             yoff  += hbs * 8 * bytesperpixel;
3416             uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3417             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3418         }
3419     } else {
3420         decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3421         if (col + hbs < s->cols) { // FIXME why not <=?
3422             if (row + hbs < s->rows) {
3423                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3424                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3425                 yoff  += hbs * 8 * y_stride;
3426                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
3427                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3428                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3429                               yoff + 8 * hbs * bytesperpixel,
3430                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3431             } else {
3432                 yoff  += hbs * 8 * bytesperpixel;
3433                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3434                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3435             }
3436         } else if (row + hbs < s->rows) {
3437             yoff  += hbs * 8 * y_stride;
3438             uvoff += hbs * 8 * uv_stride >> s->ss_v;
3439             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3440         }
3441     }
3442 }
3443
/**
 * Apply the loop filter to the edges between horizontally adjacent blocks
 * ("column" edges) of one plane, for the area described by one VP9Filter.
 *
 * Two mask rows are processed per outer iteration so that a single DSP call
 * can cover 16 lines where both rows need the same kind of filter.
 *
 * @param s     decoder context; supplies bytesperpixel, the filter limit
 *              lookup tables and the DSP function pointers
 * @param col   column position of this area; when it is 0 the leftmost edge
 *              (x == 1) is skipped
 * @param ss_h  horizontal subsampling shift of the plane (0 for luma)
 * @param ss_v  vertical subsampling shift of the plane (0 for luma)
 * @param lvl   filter level table, 8 entries per row (see VP9Filter.level)
 * @param mask  edge masks, one bit per column; the second index selects the
 *              filter size (0=16, 1=8, 2=4, 3=inner4, see struct VP9Filter)
 * @param dst   top-left sample of the plane area to filter
 * @param ls    line size (stride) of dst in bytes
 */
static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
                                               uint8_t *lvl, uint8_t (*mask)[4],
                                               uint8_t *dst, ptrdiff_t ls)
{
    int y, x, bytesperpixel = s->bytesperpixel;

    // filter edges between columns (e.g. block1 | block2)
    // each iteration covers 16 lines of dst, described by two mask rows
    for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
        uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
        // NOTE(review): hmask2[0] is intentionally not part of hm2; it is only
        // consulted together with hmask1[0] below. Presumably a 16-wide edge
        // in the second row never occurs without one in the first - confirm
        // against the code that builds the masks.
        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
        unsigned hm = hm1 | hm2 | hm13 | hm23;

        // x is a single-bit column selector; stop once no mask bit at or
        // above x remains set
        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
            // the very first column edge of column 0 is not filtered
            if (col || x > 1) {
                if (hm1 & x) {
                    // L: filter level, H: L >> 4, E/I: limits looked up by level
                    int L = *l, H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    if (hmask1[0] & x) {
                        if (hmask2[0] & x) {
                            // both rows want the widest filter: one 16-line call
                            av_assert2(l[8 << ss_v] == L);
                            s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
                        }
                    } else if (hm2 & x) {
                        // both rows need filtering but possibly with different
                        // sizes/levels: pack the second row's parameters into
                        // bits 8 and up and use the combined "mix2" function
                        L = l[8 << ss_v];
                        H |= (L >> 4) << 8;
                        E |= s->filter_lut.mblim_lut[L] << 8;
                        I |= s->filter_lut.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                               [!!(hmask2[1] & x)]
                                               [0](ptr, ls, E, I, H);
                    } else {
                        // only the first row has an edge here
                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
                                            [0](ptr, ls, E, I, H);
                    }
                } else if (hm2 & x) {
                    // only the second row has an edge here: filter 8 lines
                    // starting 8 lines below ptr, using that row's level
                    int L = l[8 << ss_v], H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
                                        [0](ptr + 8 * ls, ls, E, I, H);
                }
            }
            if (ss_h) {
                // horizontally subsampled plane: the level pointer advances by
                // two entries after every odd bit position (0xAA = bits 1,3,5,7)
                if (x & 0xAA)
                    l += 2;
            } else {
                // not horizontally subsampled: also handle the inner edge
                // 4 pixels into the 8-pixel column (mask index 3)
                if (hm13 & x) {
                    int L = *l, H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    if (hm23 & x) {
                        L = l[8 << ss_v];
                        H |= (L >> 4) << 8;
                        E |= s->filter_lut.mblim_lut[L] << 8;
                        I |= s->filter_lut.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
                    }
                } else if (hm23 & x) {
                    int L = l[8 << ss_v], H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
                }
                l++;
            }
        }
    }
}
3518
/**
 * Apply the loop filter to the edges between vertically adjacent blocks
 * ("row" edges) of one plane, for the area described by one VP9Filter.
 *
 * Two horizontally neighbouring mask bits are processed per inner iteration
 * so that a single DSP call can cover 16 pixels where both need the same
 * kind of filter.
 *
 * @param s     decoder context; supplies bytesperpixel, the filter limit
 *              lookup tables and the DSP function pointers
 * @param row   row position of this area; when it is 0 the topmost edge
 *              (y == 0) is skipped
 * @param ss_h  horizontal subsampling shift of the plane (0 for luma)
 * @param ss_v  vertical subsampling shift of the plane (0 for luma)
 * @param lvl   filter level table, 8 entries per row (see VP9Filter.level)
 * @param mask  edge masks, one bit per column; the second index selects the
 *              filter size (0=16, 1=8, 2=4, 3=inner4, see struct VP9Filter)
 * @param dst   top-left sample of the plane area to filter
 * @param ls    line size (stride) of dst in bytes
 */
static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
                                               uint8_t *lvl, uint8_t (*mask)[4],
                                               uint8_t *dst, ptrdiff_t ls)
{
    int y, x, bytesperpixel = s->bytesperpixel;

    //                                 block1
    // filter edges between rows (e.g. ------)
    //                                 block2
    for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
        uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];

        // x selects the first of a pair of mask bits; the partner bit is
        // x << (1 + ss_h). Each step advances 16 pixels in dst.
        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
            // the very first row edge of row 0 is not filtered
            if (row || y) {
                if (vm & x) {
                    // L: filter level, H: L >> 4, E/I: limits looked up by level
                    int L = *l, H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    if (vmask[0] & x) {
                        if (vmask[0] & (x << (1 + ss_h))) {
                            // both halves want the widest filter: one 16-pixel call
                            av_assert2(l[1 + ss_h] == L);
                            s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
                        } else {
                            s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
                        }
                    } else if (vm & (x << (1 + ss_h))) {
                        // both halves need filtering but possibly with different
                        // sizes/levels: pack the second half's parameters into
                        // bits 8 and up and use the combined "mix2" function
                        L = l[1 + ss_h];
                        H |= (L >> 4) << 8;
                        E |= s->filter_lut.mblim_lut[L] << 8;
                        I |= s->filter_lut.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
                                               [!!(vmask[1] & (x << (1 + ss_h)))]
                                               [1](ptr, ls, E, I, H);
                    } else {
                        // only the first half has an edge here
                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
                                            [1](ptr, ls, E, I, H);
                    }
                } else if (vm & (x << (1 + ss_h))) {
                    // only the second half has an edge here: filter 8 pixels
                    // starting 8 pixels right of ptr, using that half's level
                    int L = l[1 + ss_h], H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
                                        [1](ptr + 8 * bytesperpixel, ls, E, I, H);
                }
            }
            if (!ss_v) {
                // not vertically subsampled: also handle the inner edge
                // 4 lines into the 8-line row (mask index 3)
                if (vm3 & x) {
                    int L = *l, H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    if (vm3 & (x << (1 + ss_h))) {
                        L = l[1 + ss_h];
                        H |= (L >> 4) << 8;
                        E |= s->filter_lut.mblim_lut[L] << 8;
                        I |= s->filter_lut.lim_lut[L] << 8;
                        s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
                    } else {
                        s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
                    }
                } else if (vm3 & (x << (1 + ss_h))) {
                    int L = l[1 + ss_h], H = L >> 4;
                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];

                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
                }
            }
        }
        // advance the level table: with vertical subsampling one plane row
        // spans two level rows, so step 16 entries after every second y
        if (ss_v) {
            if (y & 1)
                lvl += 16;
        } else {
            lvl += 8;
        }
    }
}
3595
3596 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3597                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3598 {
3599     VP9Context *s = ctx->priv_data;
3600     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3601     uint8_t *dst = f->data[0] + yoff;
3602     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3603     uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3604     int p;
3605
3606     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3607     // if you think of them as acting on a 8x8 block max, we can interleave
3608     // each v/h within the single x loop, but that only works if we work on
3609     // 8 pixel blocks, and we won't always do that (we want at least 16px
3610     // to use SSE2 optimizations, perhaps 32 for AVX2)
3611
3612     filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3613     filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3614
3615     for (p = 0; p < 2; p++) {
3616         dst = f->data[1 + p] + uvoff;
3617         filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3618         filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
3619     }
3620 }
3621
/**
 * Compute the extent of one tile along a single dimension.
 *
 * The n superblocks are divided into (1 << log2_n) tiles; tile idx covers
 * superblocks [idx * n >> log2_n, (idx + 1) * n >> log2_n), clamped to n.
 * Both bounds are returned multiplied by 8 (the callers step row/col by 8
 * per superblock).
 *
 * @param start   receives the first position of the tile
 * @param end     receives the position one past the end of the tile
 * @param idx     tile index
 * @param log2_n  log2 of the number of tiles in this dimension
 * @param n       number of superblocks in this dimension
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    const int first_sb = FFMIN((idx * n) >> log2_n, n);
    const int limit_sb = FFMIN(((idx + 1) * n) >> log2_n, n);

    *start = first_sb << 3;
    *end   = limit_sb << 3;
}
3629
3630 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3631                                         int max_count, int update_factor)
3632 {
3633     unsigned ct = ct0 + ct1, p2, p1;
3634
3635     if (!ct)
3636         return;
3637
3638     p1 = *p;
3639     p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3640     p2 = av_clip(p2, 1, 255);
3641     ct = FFMIN(ct, max_count);
3642     update_factor = FASTDIV(update_factor * ct, max_count);
3643
3644     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3645     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3646 }
3647
3648 static void adapt_probs(VP9Context *s)
3649 {
3650     int i, j, k, l, m;
3651     prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3652     int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
3653
3654     // coefficients
3655     for (i = 0; i < 4; i++)
3656         for (j = 0; j < 2; j++)
3657             for (k = 0; k < 2; k++)
3658                 for (l = 0; l < 6; l++)
3659                     for (m = 0; m < 6; m++) {
3660                         uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3661                         unsigned *e = s->counts.eob[i][j][k][l][m];
3662                         unsigned *c = s->counts.coef[i][j][k][l][m];
3663
3664                         if (l == 0 && m >= 3) // dc only has 3 pt
3665                             break;
3666
3667                         adapt_prob(&pp[0], e[0], e[1], 24, uf);
3668                         adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3669                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
3670                     }
3671
3672     if (s->s.h.keyframe || s->s.h.intraonly) {
3673         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
3674         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3675         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3676         memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
3677         return;
3678     }
3679
3680     // skip flag
3681     for (i = 0; i < 3; i++)
3682         adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3683
3684     // intra/inter flag
3685     for (i = 0; i < 4; i++)
3686         adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3687
3688     // comppred flag
3689     if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3690       for (i = 0; i < 5; i++)
3691           adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3692     }
3693
3694     // reference frames
3695     if (s->s.h.comppredmode != PRED_SINGLEREF) {
3696       for (i = 0; i < 5; i++)
3697           adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3698                      s->counts.comp_ref[i][1], 20, 128);
3699     }
3700
3701     if (s->s.h.comppredmode != PRED_COMPREF) {
3702       for (i = 0; i < 5; i++) {
3703           uint8_t *pp = p->single_ref[i];
3704           unsigned (*c)[2] = s->counts.single_ref[i];
3705
3706           adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3707           adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3708       }
3709     }
3710
3711     // block partitioning
3712     for (i = 0; i < 4; i++)
3713         for (j = 0; j < 4; j++) {
3714             uint8_t *pp = p->partition[i][j];
3715             unsigned *c = s->counts.partition[i][j];
3716
3717             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3718             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3719             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3720         }
3721
3722     // tx size
3723     if (s->s.h.txfmmode == TX_SWITCHABLE) {
3724       for (i = 0; i < 2; i++) {
3725           unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3726
3727           adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3728           adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3729           adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3730           adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3731           adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3732           adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3733       }
3734     }
3735
3736     // interpolation filter
3737     if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3738         for (i = 0; i < 4; i++) {
3739             uint8_t *pp = p->filter[i];
3740             unsigned *c = s->counts.filter[i];
3741
3742             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3743             adapt_prob(&pp[1], c[1], c[2], 20, 128);
3744         }
3745     }
3746
3747     // inter modes
3748     for (i = 0; i < 7; i++) {
3749         uint8_t *pp = p->mv_mode[i];
3750         unsigned *c = s->counts.mv_mode[i];
3751
3752         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3753         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3754         adapt_prob(&pp[2], c[1], c[3], 20, 128);
3755     }
3756
3757     // mv joints
3758     {
3759         uint8_t *pp = p->mv_joint;
3760         unsigned *c = s->counts.mv_joint;
3761
3762         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3763         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3764         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3765     }
3766
3767     // mv components
3768     for (i = 0; i < 2; i++) {
3769         uint8_t *pp;
3770         unsigned *c, (*c2)[2], sum;
3771
3772         adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3773                    s->counts.mv_comp[i].sign[1], 20, 128);
3774
3775         pp = p->mv_comp[i].classes;
3776         c = s->counts.mv_comp[i].classes;
3777         sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3778         adapt_prob(&pp[0], c[0], sum, 20, 128);
3779         sum -= c[1];
3780         adapt_prob(&pp[1], c[1], sum, 20, 128);
3781         sum -= c[2] + c[3];
3782         adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3783         adapt_prob(&pp[3], c[2], c[3], 20, 128);
3784         sum -= c[4] + c[5];
3785         adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3786         adapt_prob(&pp[5], c[4], c[5], 20, 128);
3787         sum -= c[6];
3788         adapt_prob(&pp[6], c[6], sum, 20, 128);
3789         adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3790         adapt_prob(&pp[8], c[7], c[8], 20, 128);
3791         adapt_prob(&pp[9], c[9], c[10], 20, 128);
3792
3793         adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3794                    s->counts.mv_comp[i].class0[1], 20, 128);
3795         pp = p->mv_comp[i].bits;
3796         c2 = s->counts.mv_comp[i].bits;
3797         for (j = 0; j < 10; j++)
3798             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3799
3800         for (j = 0; j < 2; j++) {
3801             pp = p->mv_comp[i].class0_fp[j];
3802             c = s->counts.mv_comp[i].class0_fp[j];
3803             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3804             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3805             adapt_prob(&pp[2], c[2], c[3], 20, 128);
3806         }
3807         pp = p->mv_comp[i].fp;
3808         c = s->counts.mv_comp[i].fp;
3809         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3810         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3811         adapt_prob(&pp[2], c[2], c[3], 20, 128);
3812
3813         if (s->s.h.highprecisionmvs) {
3814             adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3815                        s->counts.mv_comp[i].class0_hp[1], 20, 128);
3816             adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3817                        s->counts.mv_comp[i].hp[1], 20, 128);
3818         }
3819     }
3820
3821     // y intra modes
3822     for (i = 0; i < 4; i++) {
3823         uint8_t *pp = p->y_mode[i];
3824         unsigned *c = s->counts.y_mode[i], sum, s2;
3825
3826         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3827         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3828         sum -= c[TM_VP8_PRED];
3829         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3830         sum -= c[VERT_PRED];
3831         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3832         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3833         sum -= s2;
3834         adapt_prob(&pp[3], s2, sum, 20, 128);
3835         s2 -= c[HOR_PRED];
3836         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3837         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3838         sum -= c[DIAG_DOWN_LEFT_PRED];
3839         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3840         sum -= c[VERT_LEFT_PRED];
3841         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3842         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3843     }
3844
3845     // uv intra modes
3846     for (i = 0; i < 10; i++) {
3847         uint8_t *pp = p->uv_mode[i];
3848         unsigned *c = s->counts.uv_mode[i], sum, s2;
3849
3850         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3851         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3852         sum -= c[TM_VP8_PRED];
3853         adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3854         sum -= c[VERT_PRED];
3855         adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3856         s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3857         sum -= s2;
3858         adapt_prob(&pp[3], s2, sum, 20, 128);
3859         s2 -= c[HOR_PRED];
3860         adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3861         adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3862         sum -= c[DIAG_DOWN_LEFT_PRED];
3863         adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3864         sum -= c[VERT_LEFT_PRED];
3865         adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3866         adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3867     }
3868 }
3869
3870 static void free_buffers(VP9Context *s)
3871 {
3872     av_freep(&s->intra_pred_data[0]);
3873     av_freep(&s->b_base);
3874     av_freep(&s->block_base);
3875 }
3876
3877 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3878 {
3879     VP9Context *s = ctx->priv_data;
3880     int i;
3881
3882     for (i = 0; i < 3; i++) {
3883         if (s->s.frames[i].tf.f->buf[0])
3884             vp9_unref_frame(ctx, &s->s.frames[i]);
3885         av_frame_free(&s->s.frames[i].tf.f);
3886     }
3887     for (i = 0; i < 8; i++) {
3888         if (s->s.refs[i].f->buf[0])
3889             ff_thread_release_buffer(ctx, &s->s.refs[i]);
3890         av_frame_free(&s->s.refs[i].f);
3891         if (s->next_refs[i].f->buf[0])
3892             ff_thread_release_buffer(ctx, &s->next_refs[i]);
3893         av_frame_free(&s->next_refs[i].f);
3894     }
3895     free_buffers(s);
3896     av_freep(&s->c_b);
3897     s->c_b_size = 0;
3898
3899     return 0;
3900 }
3901
3902
/**
 * Decode one VP9 packet.
 *
 * Parses the frame header, rotates the internal helper frames (segmentation
 * map reference, motion vector reference, current frame), decodes all tiles
 * superblock by superblock, runs the loop filter per superblock row and
 * finally updates the reference frame slots.
 *
 * @param ctx        codec context
 * @param frame      output AVFrame (passed as void *)
 * @param got_frame  set to 1 when a picture was written to frame
 * @param pkt        input packet
 * @return pkt->size on success, a negative error code on failure
 */
static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                            int *got_frame, AVPacket *pkt)
{
    const uint8_t *data = pkt->data;
    int size = pkt->size;
    VP9Context *s = ctx->priv_data;
    int res, tile_row, tile_col, i, ref, row, col;
    // evaluated before the new header is parsed, so this reflects the
    // segmentation state left in s->s.h by earlier calls
    int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
                            (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
    AVFrame *f;
    int bytesperpixel;

    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
        return res;
    } else if (res == 0) {
        // header-only packet: output the existing reference frame 'ref'
        // and carry all reference slots over unchanged
        if (!s->s.refs[ref].f->buf[0]) {
            av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
            return AVERROR_INVALIDDATA;
        }
        if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
            return res;
        ((AVFrame *)frame)->pkt_pts = pkt->pts;
        ((AVFrame *)frame)->pkt_dts = pkt->dts;
        for (i = 0; i < 8; i++) {
            if (s->next_refs[i].f->buf[0])
                ff_thread_release_buffer(ctx, &s->next_refs[i]);
            if (s->s.refs[i].f->buf[0] &&
                (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
                return res;
        }
        *got_frame = 1;
        return pkt->size;
    }
    // a positive return value is the number of header bytes to skip
    data += res;
    size -= res;

    // replace the segmentation map reference by the previous current frame,
    // unless the old one has to be retained; keyframes, intra-only and
    // error-resilient frames end up without such a reference
    if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
        if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
            vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
        if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
            (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
            return res;
    }
    // same rotation for the motion vector reference frame
    if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
        vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
    if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
        (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
        return res;
    // allocate a fresh current frame
    if (s->s.frames[CUR_FRAME].tf.f->buf[0])
        vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
    if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
        return res;
    f = s->s.frames[CUR_FRAME].tf.f;
    f->key_frame = s->s.h.keyframe;
    f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
    ls_y = f->linesize[0];
    ls_uv =f->linesize[1];

    // drop the segmentation map reference on a size mismatch
    // NOTE(review): the dimensions compared are those of REF_FRAME_MVPAIR,
    // not REF_FRAME_SEGMAP - confirm this is intended
    if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
        (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
         s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
        vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
    }

    // ref frame setup
    // next_refs[i] becomes the new frame for slots selected by
    // refreshrefmask, otherwise it keeps pointing at the existing reference.
    // When neither branch runs, res still holds the non-negative result of
    // an earlier call, so the check below does not trigger.
    for (i = 0; i < 8; i++) {
        if (s->next_refs[i].f->buf[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        if (s->s.h.refreshrefmask & (1 << i)) {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
        } else if (s->s.refs[i].f->buf[0]) {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
        }
        if (res < 0)
            return res;
    }

    // main tile decode loop
    // reset the "above" contexts for the top of the frame
    bytesperpixel = s->bytesperpixel;
    memset(s->above_partition_ctx, 0, s->cols);
    memset(s->above_skip_ctx, 0, s->cols);
    if (s->s.h.keyframe || s->s.h.intraonly) {
        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
    } else {
        memset(s->above_mode_ctx, NEARESTMV, s->cols);
    }
    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
    memset(s->above_segpred_ctx, 0, s->cols);
    // two-pass decoding (pass starts at 1) is used with frame threading when
    // the probability context is refreshed and parallel mode is off;
    // otherwise pass stays 0 and the loop below runs once
    s->pass = s->s.frames[CUR_FRAME].uses_2pass =
        ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
    if ((res = update_block_buffers(ctx)) < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Failed to allocate block buffers\n");
        return res;
    }
    if (s->s.h.refreshctx && s->s.h.parallelmode) {
        int j, k, l, m;

        // parallel mode: the saved context is a plain copy of the
        // probabilities in s->prob (no adapt_probs() call), so it can be
        // stored before decoding; coefficient probabilities are copied for
        // indices 0 up to and including txfmmode
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++)
                            memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
                                   s->prob.coef[i][j][k][l][m], 3);
            if (s->s.h.txfmmode == i)
                break;
        }
        s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
        ff_thread_finish_setup(ctx);
    } else if (!s->s.h.refreshctx) {
        ff_thread_finish_setup(ctx);
    }

    // runs once when pass starts at 0, twice (pass 1, then pass 2) when it
    // starts at 1
    do {
        yoff = uvoff = 0;
        // rewind the block/coefficient/eob cursors to their buffer starts
        s->b = s->b_base;
        s->block = s->block_base;
        s->uvblock[0] = s->uvblock_base[0];
        s->uvblock[1] = s->uvblock_base[1];
        s->eob = s->eob_base;
        s->uveob[0] = s->uveob_base[0];
        s->uveob[1] = s->uveob_base[1];

        for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
            set_tile_offset(&s->tile_row_start, &s->tile_row_end,
                            tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
            if (s->pass != 2) {
                // set up one range decoder per tile column of this tile row
                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                    int64_t tile_size;

                    // the last tile takes all remaining bytes; every other
                    // tile is preceded by a 4-byte size field
                    if (tile_col == s->s.h.tiling.tile_cols - 1 &&
                        tile_row == s->s.h.tiling.tile_rows - 1) {
                        tile_size = size;
                    } else {
                        tile_size = AV_RB32(data);
                        data += 4;
                        size -= 4;
                    }
                    if (tile_size > size) {
                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
                        return AVERROR_INVALIDDATA;
                    }
                    ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
                        return AVERROR_INVALIDDATA;
                    }
                    data += tile_size;
                    size -= tile_size;
                }
            }

            // one iteration per superblock row (row advances by 8, the plane
            // offsets by 64 lines)
            for (row = s->tile_row_start; row < s->tile_row_end;
                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
                struct VP9Filter *lflvl_ptr = s->lflvl;
                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;

                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                    set_tile_offset(&s->tile_col_start, &s->tile_col_end,
                                    tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);

                    if (s->pass != 2) {
                        // reset the "left" contexts at the start of each tile
                        // and switch to this tile column's range decoder
                        memset(s->left_partition_ctx, 0, 8);
                        memset(s->left_skip_ctx, 0, 8);
                        if (s->s.h.keyframe || s->s.h.intraonly) {
                            memset(s->left_mode_ctx, DC_PRED, 16);
                        } else {
                            memset(s->left_mode_ctx, NEARESTMV, 8);
                        }
                        memset(s->left_y_nnz_ctx, 0, 16);
                        memset(s->left_uv_nnz_ctx, 0, 32);
                        memset(s->left_segpred_ctx, 0, 8);

                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
                    }

                    for (col = s->tile_col_start;
                         col < s->tile_col_end;
                         col += 8, yoff2 += 64 * bytesperpixel,
                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                        // FIXME integrate with lf code (i.e. zero after each
                        // use, similar to invtxfm coefficients, or similar)
                        if (s->pass != 1) {
                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
                        }

                        // pass 2 replays the stored block data, passes 0 and
                        // 1 parse the bitstream
                        if (s->pass == 2) {
                            decode_sb_mem(ctx, row, col, lflvl_ptr,
                                          yoff2, uvoff2, BL_64X64);
                        } else {
                            decode_sb(ctx, row, col, lflvl_ptr,
                                      yoff2, uvoff2, BL_64X64);
                        }
                    }
                    // save the range decoder state for the next superblock row
                    if (s->pass != 2) {
                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
                    }
                }

                // pass 1 stops here: no intra edge backup, loop filter or
                // progress report
                if (s->pass == 1) {
                    continue;
                }

                // backup pre-loopfilter reconstruction data for intra
                // prediction of next row of sb64s
                if (row + 8 < s->rows) {
                    memcpy(s->intra_pred_data[0],
                           f->data[0] + yoff + 63 * ls_y,
                           8 * s->cols * bytesperpixel);
                    memcpy(s->intra_pred_data[1],
                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
                           8 * s->cols * bytesperpixel >> s->ss_h);
                    memcpy(s->intra_pred_data[2],
                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
                           8 * s->cols * bytesperpixel >> s->ss_h);
                }

                // loopfilter one row
                if (s->s.h.filter.level) {
                    yoff2 = yoff;
                    uvoff2 = uvoff;
                    lflvl_ptr = s->lflvl;
                    for (col = 0; col < s->cols;
                         col += 8, yoff2 += 64 * bytesperpixel,
                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
                    }
                }

                // FIXME maybe we can make this more finegrained by running the
                // loopfilter per-block instead of after each sbrow
                // In fact that would also make intra pred left preparation easier?
                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
            }
        }

        // backward adaptation after the first (or only) parsing pass
        if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
            adapt_probs(s);
            ff_thread_finish_setup(ctx);
        }
    } while (s->pass++ == 1);
    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);

    // ref frame setup
    // make the pending references (next_refs) the current ones
    for (i = 0; i < 8; i++) {
        if (s->s.refs[i].f->buf[0])
            ff_thread_release_buffer(ctx, &s->s.refs[i]);
        if (s->next_refs[i].f->buf[0] &&
            (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
            return res;
    }

    // invisible frames are decoded but not output
    if (!s->s.h.invisible) {
        if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
            return res;
        *got_frame = 1;
    }

    return pkt->size;
}
4167
4168 static void vp9_decode_flush(AVCodecContext *ctx)
4169 {
4170     VP9Context *s = ctx->priv_data;
4171     int i;
4172
4173     for (i = 0; i < 3; i++)
4174         vp9_unref_frame(ctx, &s->s.frames[i]);
4175     for (i = 0; i < 8; i++)
4176         ff_thread_release_buffer(ctx, &s->s.refs[i]);
4177 }
4178
4179 static int init_frames(AVCodecContext *ctx)
4180 {
4181     VP9Context *s = ctx->priv_data;
4182     int i;
4183
4184     for (i = 0; i < 3; i++) {
4185         s->s.frames[i].tf.f = av_frame_alloc();
4186         if (!s->s.frames[i].tf.f) {
4187             vp9_decode_free(ctx);
4188             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4189             return AVERROR(ENOMEM);
4190         }
4191     }
4192     for (i = 0; i < 8; i++) {
4193         s->s.refs[i].f = av_frame_alloc();
4194         s->next_refs[i].f = av_frame_alloc();
4195         if (!s->s.refs[i].f || !s->next_refs[i].f) {
4196             vp9_decode_free(ctx);
4197             av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4198             return AVERROR(ENOMEM);
4199         }
4200     }
4201
4202     return 0;
4203 }
4204
4205 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4206 {
4207     VP9Context *s = ctx->priv_data;
4208
4209     ctx->internal->allocate_progress = 1;
4210     s->last_bpp = 0;
4211     s->s.h.filter.sharpness = -1;
4212
4213     return init_frames(ctx);
4214 }
4215
4216 #if HAVE_THREADS
4217 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4218 {
4219     return init_frames(avctx);
4220 }
4221
4222 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4223 {
4224     int i, res;
4225     VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4226
4227     // detect size changes in other threads
4228     if (s->intra_pred_data[0] &&
4229         (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols ||
4230          s->rows != ssrc->rows || s->bpp != ssrc->bpp)) {
4231         free_buffers(s);
4232     }
4233
4234     for (i = 0; i < 3; i++) {
4235         if (s->s.frames[i].tf.f->buf[0])
4236             vp9_unref_frame(dst, &s->s.frames[i]);
4237         if (ssrc->s.frames[i].tf.f->buf[0]) {
4238             if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
4239                 return res;
4240         }
4241     }
4242     for (i = 0; i < 8; i++) {
4243         if (s->s.refs[i].f->buf[0])
4244             ff_thread_release_buffer(dst, &s->s.refs[i]);
4245         if (ssrc->next_refs[i].f->buf[0]) {
4246             if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
4247                 return res;
4248         }
4249     }
4250
4251     s->s.h.invisible = ssrc->s.h.invisible;
4252     s->s.h.keyframe = ssrc->s.h.keyframe;
4253     s->s.h.intraonly = ssrc->s.h.intraonly;
4254     s->ss_v = ssrc->ss_v;
4255     s->ss_h = ssrc->ss_h;
4256     s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
4257     s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
4258     s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
4259     s->bytesperpixel = ssrc->bytesperpixel;
4260     s->bpp = ssrc->bpp;
4261     s->bpp_index = ssrc->bpp_index;
4262     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4263     memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
4264     memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
4265            sizeof(s->s.h.segmentation.feat));
4266
4267     return 0;
4268 }
4269 #endif
4270
4271 static const AVProfile profiles[] = {
4272     { FF_PROFILE_VP9_0, "Profile 0" },
4273     { FF_PROFILE_VP9_1, "Profile 1" },
4274     { FF_PROFILE_VP9_2, "Profile 2" },
4275     { FF_PROFILE_VP9_3, "Profile 3" },
4276     { FF_PROFILE_UNKNOWN },
4277 };
4278
4279 AVCodec ff_vp9_decoder = {
4280     .name                  = "vp9",
4281     .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
4282     .type                  = AVMEDIA_TYPE_VIDEO,
4283     .id                    = AV_CODEC_ID_VP9,
4284     .priv_data_size        = sizeof(VP9Context),
4285     .init                  = vp9_decode_init,
4286     .close                 = vp9_decode_free,
4287     .decode                = vp9_decode_frame,
4288     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
4289     .flush                 = vp9_decode_flush,
4290     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4291     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4292     .profiles              = NULL_IF_CONFIG_SMALL(profiles),
4293 };