vp9: support non-420 chroma subsampling for profile 1 token decoding.

[ffmpeg] / libavcodec / vp9.c
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c

index 89257fa2185db10c71ef0d999ecebb415a0c688e..754d829da06a10809d8fb23774b5427fb64038ca 100644 (file)
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -112,8 +112,7 @@ typedef struct VP9Context {
      uint8_t invisible;
      uint8_t use_last_frame_mvs;
      uint8_t errorres;
-    uint8_t colorspace;
-    uint8_t fullrange;
+    uint8_t ss_h, ss_v;
      uint8_t intraonly;
      uint8_t resetctx;
      uint8_t refreshrefmask;
@@ -216,7 +215,7 @@ typedef struct VP9Context {
      DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
      DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
      DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
-    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
+    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
      DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
      DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
      DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
@@ -242,15 +241,17 @@ typedef struct VP9Context {
      // whole-frame cache
      uint8_t *intra_pred_data[3];
      struct VP9Filter *lflvl;
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135*144];
  
      // block reconstruction intermediates
      int block_alloc_using_2pass;
      int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
      uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
      struct { int x, y; } min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64];
+    uint16_t mvscale[3][2];
+    uint8_t mvstep[3][2];
  } VP9Context;
  
  static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
@@ -306,39 +307,42 @@ static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
      return 0;
  }
  
-static int update_size(AVCodecContext *ctx, int w, int h)
+static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
  {
      VP9Context *s = ctx->priv_data;
      uint8_t *p;
  
      av_assert0(w > 0 && h > 0);
  
-    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
+    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
          return 0;
  
-    ctx->width  = w;
-    ctx->height = h;
-    s->sb_cols  = (w + 63) >> 6;
-    s->sb_rows  = (h + 63) >> 6;
-    s->cols     = (w + 7) >> 3;
-    s->rows     = (h + 7) >> 3;
+    ctx->width   = w;
+    ctx->height  = h;
+    ctx->pix_fmt = fmt;
+    s->sb_cols   = (w + 63) >> 6;
+    s->sb_rows   = (h + 63) >> 6;
+    s->cols      = (w + 7) >> 3;
+    s->rows      = (h + 7) >> 3;
  
  #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
      av_freep(&s->intra_pred_data[0]);
-    p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+    // FIXME we slightly over-allocate here for subsampled chroma, but a little
+    // bit of padding shouldn't affect performance...
+    p = av_malloc(s->sb_cols * (320 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
      if (!p)
          return AVERROR(ENOMEM);
      assign(s->intra_pred_data[0],  uint8_t *,             64);
-    assign(s->intra_pred_data[1],  uint8_t *,             32);
-    assign(s->intra_pred_data[2],  uint8_t *,             32);
+    assign(s->intra_pred_data[1],  uint8_t *,             64);
+    assign(s->intra_pred_data[2],  uint8_t *,             64);
      assign(s->above_y_nnz_ctx,     uint8_t *,             16);
      assign(s->above_mode_ctx,      uint8_t *,             16);
      assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
      assign(s->above_partition_ctx, uint8_t *,              8);
      assign(s->above_skip_ctx,      uint8_t *,              8);
      assign(s->above_txfm_ctx,      uint8_t *,              8);
-    assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
-    assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
      assign(s->above_segpred_ctx,   uint8_t *,              8);
      assign(s->above_intra_ctx,     uint8_t *,              8);
      assign(s->above_comp_ctx,      uint8_t *,              8);
@@ -357,34 +361,39 @@ static int update_size(AVCodecContext *ctx, int w, int h)
  static int update_block_buffers(AVCodecContext *ctx)
  {
      VP9Context *s = ctx->priv_data;
+    int chroma_blocks, chroma_eobs;
  
      if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
          return 0;
  
      av_free(s->b_base);
      av_free(s->block_base);
+    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
+    chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
      if (s->frames[CUR_FRAME].uses_2pass) {
          int sbs = s->sb_cols * s->sb_rows;
  
          s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
+        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+                                    16 * 16 + 2 * chroma_eobs) * sbs);
          if (!s->b_base || !s->block_base)
              return AVERROR(ENOMEM);
          s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
-        s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
-        s->uveob_base[0] = s->eob_base + 256 * sbs;
-        s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
+        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks);
+        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
      } else {
          s->b_base = av_malloc(sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 128) * 3);
+        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+                                   16 * 16 + 2 * chroma_eobs);
          if (!s->b_base || !s->block_base)
              return AVERROR(ENOMEM);
          s->uvblock_base[0] = s->block_base + 64 * 64;
-        s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
-        s->uveob_base[0] = s->eob_base + 256;
-        s->uveob_base[1] = s->uveob_base[0] + 64;
+        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks);
+        s->uveob_base[0] = s->eob_base + 16 * 16;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
      }
      s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
  
@@ -461,11 +470,56 @@ static int update_prob(VP56RangeCoder *c, int p)
                      255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
  }
  
+static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
+{
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+    VP9Context *s = ctx->priv_data;
+    enum AVPixelFormat res;
+
+    ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
+    if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
+        if (s->profile == 1) {
+            s->ss_h = s->ss_v = 1;
+            res = AV_PIX_FMT_GBRP;
+            ctx->color_range = AVCOL_RANGE_JPEG;
+        } else {
+            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        static const enum AVPixelFormat pix_fmt_for_ss[2 /* v */][2 /* h */] = {
+            { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
+            { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P },
+        };
+        ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
+        if (s->profile == 1) {
+            s->ss_h = get_bits1(&s->gb);
+            s->ss_v = get_bits1(&s->gb);
+            if ((res = pix_fmt_for_ss[s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
+                av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile 1\n");
+                return AVERROR_INVALIDDATA;
+            } else if (get_bits1(&s->gb)) {
+                av_log(ctx, AV_LOG_ERROR, "Profile 1 color details reserved bit set\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            s->ss_h = s->ss_v = 1;
+            res = AV_PIX_FMT_YUV420P;
+        }
+    }
+
+    return res;
+}
+
  static int decode_frame_header(AVCodecContext *ctx,
                                 const uint8_t *data, int size, int *ref)
  {
      VP9Context *s = ctx->priv_data;
      int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
+    enum AVPixelFormat fmt = ctx->pix_fmt;
      int last_invisible;
      const uint8_t *data2;
  
@@ -479,8 +533,9 @@ static int decode_frame_header(AVCodecContext *ctx,
          return AVERROR_INVALIDDATA;
      }
      s->profile = get_bits1(&s->gb);
-    if (get_bits1(&s->gb)) { // reserved bit
-        av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
+    s->profile |= get_bits1(&s->gb) << 1;
+    if (s->profile > 1) {
+        av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", s->profile);
          return AVERROR_INVALIDDATA;
      }
      if (get_bits1(&s->gb)) {
@@ -498,12 +553,8 @@ static int decode_frame_header(AVCodecContext *ctx,
              av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
              return AVERROR_INVALIDDATA;
          }
-        s->colorspace = get_bits(&s->gb, 3);
-        if (s->colorspace == 7) { // RGB = profile 1
-            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->fullrange  = get_bits1(&s->gb);
+        if ((fmt = read_colorspace_details(ctx)) < 0)
+            return fmt;
          // for profile 1, here follows the subsampling bits
          s->refreshrefmask = 0xff;
          w = get_bits(&s->gb, 16) + 1;
@@ -518,6 +569,15 @@ static int decode_frame_header(AVCodecContext *ctx,
                  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                  return AVERROR_INVALIDDATA;
              }
+            if (s->profile == 1) {
+                if ((fmt = read_colorspace_details(ctx)) < 0)
+                    return fmt;
+            } else {
+                s->ss_h = s->ss_v = 1;
+                fmt = AV_PIX_FMT_YUV420P;
+                ctx->colorspace = AVCOL_SPC_BT470BG;
+                ctx->color_range = AVCOL_RANGE_JPEG;
+            }
              s->refreshrefmask = get_bits(&s->gb, 8);
              w = get_bits(&s->gb, 16) + 1;
              h = get_bits(&s->gb, 16) + 1;
@@ -577,6 +637,26 @@ static int decode_frame_header(AVCodecContext *ctx,
                      s->varcompref[1] = 2;
                  }
              }
+
+            for (i = 0; i < 3; i++) {
+                AVFrame *ref = s->refs[s->refidx[i]].f;
+                int refw = ref->width, refh = ref->height;
+
+                if (refw == w && refh == h) {
+                    s->mvscale[i][0] = s->mvscale[i][1] = 0;
+                } else {
+                    if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
+                        av_log(ctx, AV_LOG_ERROR,
+                               "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
+                               refw, refh, w, h);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    s->mvscale[i][0] = (refw << 14) / w;
+                    s->mvscale[i][1] = (refh << 14) / h;
+                    s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
+                    s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
+                }
+            }
          }
      }
      s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
@@ -600,8 +680,6 @@ static int decode_frame_header(AVCodecContext *ctx,
                  if (get_bits1(&s->gb))
                      s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
          }
-    } else {
-        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
      }
  
      /* quantization header data */
@@ -683,22 +761,27 @@ static int decode_frame_header(AVCodecContext *ctx,
          } else {
              lflvl  = s->filter.level;
          }
-        s->segmentation.feat[i].lflvl[0][0] =
-        s->segmentation.feat[i].lflvl[0][1] =
-            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
-        for (j = 1; j < 4; j++) {
-            s->segmentation.feat[i].lflvl[j][0] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[0]) * (1 << sh)), 6);
-            s->segmentation.feat[i].lflvl[j][1] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[1]) * (1 << sh)), 6);
+        if (s->lf_delta.enabled) {
+            s->segmentation.feat[i].lflvl[0][0] =
+            s->segmentation.feat[i].lflvl[0][1] =
+                av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
+            for (j = 1; j < 4; j++) {
+                s->segmentation.feat[i].lflvl[j][0] =
+                    av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
+                                             s->lf_delta.mode[0]) * (1 << sh)), 6);
+                s->segmentation.feat[i].lflvl[j][1] =
+                    av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
+                                             s->lf_delta.mode[1]) * (1 << sh)), 6);
+            }
+        } else {
+            memset(s->segmentation.feat[i].lflvl, lflvl,
+                   sizeof(s->segmentation.feat[i].lflvl));
          }
      }
  
      /* tiling info */
-    if ((res = update_size(ctx, w, h)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
+    if ((res = update_size(ctx, w, h, fmt)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
          return res;
      }
      for (s->tiling.log2_tile_cols = 0;
@@ -1327,8 +1410,8 @@ static void decode_mode(AVCodecContext *ctx)
      VP9Block *b = s->b;
      int row = s->row, col = s->col, row7 = s->row7;
      enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
-    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
-    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
+    int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
+    int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
      int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
      int vref, filter_id;
  
@@ -1370,7 +1453,7 @@ static void decode_mode(AVCodecContext *ctx)
      if (s->segmentation.enabled &&
          (s->segmentation.update_map || s->keyframe || s->intraonly)) {
          setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
-                  w4, h4, 8 * s->sb_cols, b->seg_id);
+                  bw4, bh4, 8 * s->sb_cols, b->seg_id);
      }
  
      b->skip = s->segmentation.enabled &&
@@ -2254,12 +2337,12 @@ static void decode_coeffs(AVCodecContext *ctx)
          break;
      }
  
-#define DECODE_UV_COEF_LOOP(step) \
+#define DECODE_UV_COEF_LOOP(step, decode_coeffs_fn) \
      for (n = 0, y = 0; y < end_y; y += step) { \
          for (x = 0; x < end_x; x += step, n += step * step) { \
-            res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
-                                  16 * step * step, c, e, p, a[x] + l[y], \
-                                  uvscan, uvnb, uv_band_counts, qmul[1]); \
+            res = decode_coeffs_fn(&s->c, s->uvblock[pl] + 16 * n, \
+                                   16 * step * step, c, e, p, a[x] + l[y], \
+                                   uvscan, uvnb, uv_band_counts, qmul[1]); \
              a[x] = l[y] = !!res; \
              if (step >= 4) { \
                  AV_WN16A(&s->uveob[pl][n], res); \
@@ -2272,36 +2355,30 @@ static void decode_coeffs(AVCodecContext *ctx)
      p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
      c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
      e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
-    w4 >>= 1;
-    h4 >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    h4 >>= s->ss_v;
+    end_y >>= s->ss_v;
      for (pl = 0; pl < 2; pl++) {
-        a = &s->above_uv_nnz_ctx[pl][col];
-        l = &s->left_uv_nnz_ctx[pl][row & 7];
+        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
+        l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
          switch (b->uvtx) {
          case TX_4X4:
-            DECODE_UV_COEF_LOOP(1);
+            DECODE_UV_COEF_LOOP(1, decode_coeffs_b);
              break;
          case TX_8X8:
              MERGE_CTX(2, AV_RN16A);
-            DECODE_UV_COEF_LOOP(2);
+            DECODE_UV_COEF_LOOP(2, decode_coeffs_b);
              SPLAT_CTX(2);
              break;
          case TX_16X16:
              MERGE_CTX(4, AV_RN32A);
-            DECODE_UV_COEF_LOOP(4);
+            DECODE_UV_COEF_LOOP(4, decode_coeffs_b);
              SPLAT_CTX(4);
              break;
          case TX_32X32:
              MERGE_CTX(8, AV_RN64A);
-            // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
-            // so there is no need to loop
-            res = decode_coeffs_b32(&s->c, s->uvblock[pl],
-                                    1024, c, e, p, a[0] + l[0],
-                                    uvscan, uvnb, uv_band_counts, qmul[1]);
-            a[0] = l[0] = !!res;
-            AV_WN16A(&s->uveob[pl][0], res);
+            DECODE_UV_COEF_LOOP(8, decode_coeffs_b32);
              SPLAT_CTX(8);
              break;
          }
@@ -2524,12 +2601,118 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
      }
  }
  
-static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
-                                         uint8_t *dst, ptrdiff_t dst_stride,
-                                         const uint8_t *ref, ptrdiff_t ref_stride,
-                                         ThreadFrame *ref_frame,
-                                         ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
-                                         int bw, int bh, int w, int h)
+static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                            uint8_t *dst, ptrdiff_t dst_stride,
+                                            const uint8_t *ref, ptrdiff_t ref_stride,
+                                            ThreadFrame *ref_frame,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                            int bw, int bh, int w, int h,
+                                            const uint16_t *scale, const uint8_t *step)
+{
+#define scale_mv(n, dim) (((int64_t)n * scale[dim]) >> 14)
+    // BUG libvpx seems to scale the two components separately. This introduces
+    // rounding errors but we have to reproduce them to be exactly compatible
+    // with the output from libvpx...
+    int mx = scale_mv(mv->x * 2, 0) + scale_mv(x * 16, 0);
+    int my = scale_mv(mv->y * 2, 1) + scale_mv(y * 16, 1);
+    int refbw_m1, refbh_m1;
+    int th;
+
+    y = my >> 4;
+    x = mx >> 4;
+    ref += y * ref_stride + x;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - 3 * ref_stride - 3,
+                                 144, ref_stride,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref = s->edge_emu_buffer + 3 * 144 + 3;
+        ref_stride = 144;
+    }
+    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
+}
+
+static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                              uint8_t *dst_u, uint8_t *dst_v,
+                                              ptrdiff_t dst_stride,
+                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h,
+                                              const uint16_t *scale, const uint8_t *step)
+{
+    // BUG https://code.google.com/p/webm/issues/detail?id=820
+    int mx = scale_mv(mv->x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    int my = scale_mv(mv->y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+#undef scale_mv
+    int refbw_m1, refbh_m1;
+    int th;
+
+    y = my >> 4;
+    x = mx >> 4;
+    ref_u += y * src_stride_u + x;
+    ref_v += y * src_stride_v + x;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> 5;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - 3 * src_stride_u - 3,
+                                 144, src_stride_u,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_u = s->edge_emu_buffer + 3 * 144 + 3;
+        smc(dst_u, dst_stride, ref_u, 144, bh, mx, my, step[0], step[1]);
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - 3 * src_stride_v - 3,
+                                 144, src_stride_v,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_v = s->edge_emu_buffer + 3 * 144 + 3;
+        smc(dst_v, dst_stride, ref_v, 144, bh, mx, my, step[0], step[1]);
+    } else {
+        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
+        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
+    }
+}
+
+#define FN(x) x##_scaled
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
+    mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                   mv, bw, bh, w, h, s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, bw, bh, w, h, i) \
+    mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                     row, col, mv, bw, bh, w, h, s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#include "vp9_mc_template.c"
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+
+static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                              uint8_t *dst, ptrdiff_t dst_stride,
+                                              const uint8_t *ref, ptrdiff_t ref_stride,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h)
  {
      int mx = mv->x, my = mv->y, th;
  
@@ -2556,14 +2739,14 @@ static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
      mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
  }
  
-static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
-                                           uint8_t *dst_u, uint8_t *dst_v,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *ref_u, ptrdiff_t src_stride_u,
-                                           const uint8_t *ref_v, ptrdiff_t src_stride_v,
-                                           ThreadFrame *ref_frame,
-                                           ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
-                                           int bw, int bh, int w, int h)
+static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                                uint8_t *dst_u, uint8_t *dst_v,
+                                                ptrdiff_t dst_stride,
+                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                                ThreadFrame *ref_frame,
+                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                                int bw, int bh, int w, int h)
  {
      int mx = mv->x, my = mv->y, th;
  
@@ -2601,156 +2784,32 @@ static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
      }
  }
  
+#define FN(x) x
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
+    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                     mv, bw, bh, w, h)
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, bw, bh, w, h, i) \
+    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                       row, col, mv, bw, bh, w, h)
+#include "vp9_mc_template.c"
+#undef mc_luma_dir_dir
+#undef mc_chroma_dir_dir
+#undef FN
+
  static void inter_recon(AVCodecContext *ctx)
  {
-    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
-        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
-        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
-    };
      VP9Context *s = ctx->priv_data;
      VP9Block *b = s->b;
      int row = s->row, col = s->col;
-    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
-    AVFrame *ref1 = tref1->f, *ref2;
-    int w1 = ref1->width, h1 = ref1->height, w2, h2;
-    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
-
-    if (b->comp) {
-        tref2 = &s->refs[s->refidx[b->ref[1]]];
-        ref2 = tref2->f;
-        w2 = ref2->width;
-        h2 = ref2->height;
-    }
  
-    // y inter pred
-    if (b->bs > BS_8x8) {
-        if (b->bs == BS_8x4) {
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
-                        s->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
-                            s->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
-            }
-        } else if (b->bs == BS_4x8) {
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
-            }
-        } else {
-            av_assert2(b->bs == BS_4x4);
-
-            // FIXME if two horizontally adjacent blocks have the same MV,
-            // do a w8 instead of a w4 call
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        s->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        s->dst[0] + 4 * ls_y + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            s->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            s->dst[0] + 4 * ls_y + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
-            }
-        }
+    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
+        inter_pred_scaled(ctx);
      } else {
-        int bwl = bwlog_tab[0][b->bs];
-        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
-
-        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
-                    ref1->data[0], ref1->linesize[0], tref1,
-                    row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
-
-        if (b->comp)
-            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
-                        ref2->data[0], ref2->linesize[0], tref2,
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
+        inter_pred(ctx);
      }
-
-    // uv inter pred
-    {
-        int bwl = bwlog_tab[1][b->bs];
-        int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
-        VP56mv mvuv;
-
-        w1 = (w1 + 1) >> 1;
-        h1 = (h1 + 1) >> 1;
-        if (b->comp) {
-            w2 = (w2 + 1) >> 1;
-            h2 = (h2 + 1) >> 1;
-        }
-        if (b->bs > BS_8x8) {
-            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
-            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
-        } else {
-            mvuv = b->mv[0][0];
-        }
-
-        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
-                      s->dst[1], s->dst[2], ls_uv,
-                      ref1->data[1], ref1->linesize[1],
-                      ref1->data[2], ref1->linesize[2], tref1,
-                      row << 2, col << 2, &mvuv, bw, bh, w1, h1);
-
-        if (b->comp) {
-            if (b->bs > BS_8x8) {
-                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
-                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
-            } else {
-                mvuv = b->mv[0][1];
-            }
-            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
-                          s->dst[1], s->dst[2], ls_uv,
-                          ref2->data[1], ref2->linesize[1],
-                          ref2->data[2], ref2->linesize[2], tref2,
-                          row << 2, col << 2, &mvuv, bw, bh, w2, h2);
-        }
-    }
-
      if (!b->skip) {
-        /* mostly copied intra_reconn() */
+        /* mostly copied intra_recon() */
  
          int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
          int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
@@ -2966,34 +3025,39 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
      case 8:  AV_ZERO64(&v);  break; \
      case 16: AV_ZERO128(&v); break; \
      }
-#define SPLAT_ZERO_YUV(dir, var, off, n) \
+#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
      do { \
          SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
-        SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
-        SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+        if (s->ss_##dir2) { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+        } else { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
+        } \
      } while (0)
  
              switch (w4) {
-            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
-            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
-            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
-            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
+            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
+            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
+            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
+            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
              }
              switch (h4) {
-            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
-            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
-            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
-            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
+            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
+            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
+            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
+            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
              }
          }
          if (s->pass == 1) {
              s->b++;
              s->block += w4 * h4 * 64;
-            s->uvblock[0] += w4 * h4 * 16;
-            s->uvblock[1] += w4 * h4 * 16;
+            s->uvblock[0] += w4 * h4 * 64 >> (s->ss_h + s->ss_v);
+            s->uvblock[1] += w4 * h4 * 64 >> (s->ss_h + s->ss_v);
              s->eob += 4 * w4 * h4;
-            s->uveob[0] += w4 * h4;
-            s->uveob[1] += w4 * h4;
+            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
  
              return;
          }
@@ -3090,11 +3154,11 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
      if (s->pass == 2) {
          s->b++;
          s->block += w4 * h4 * 64;
-        s->uvblock[0] += w4 * h4 * 16;
-        s->uvblock[1] += w4 * h4 * 16;
+        s->uvblock[0] += w4 * h4 * 64 >> (s->ss_v + s->ss_h);
+        s->uvblock[1] += w4 * h4 * 64 >> (s->ss_v + s->ss_h);
          s->eob += 4 * w4 * h4;
-        s->uveob[0] += w4 * h4;
-        s->uveob[1] += w4 * h4;
+        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
      }
  }
  
@@ -3104,8 +3168,8 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
      VP9Context *s = ctx->priv_data;
      int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
              (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
-    const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
-                                     s->prob.p.partition[bl][c];
+    const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
+                                                     s->prob.p.partition[bl][c];
      enum BlockPartition bp;
      ptrdiff_t hbs = 4 >> bl;
      AVFrame *f = s->frames[CUR_FRAME].tf.f;
@@ -3758,6 +3822,15 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
          }
          if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
              return res;
+        ((AVFrame *)frame)->pkt_pts = pkt->pts;
+        ((AVFrame *)frame)->pkt_dts = pkt->dts;
+        for (i = 0; i < 8; i++) {
+            if (s->next_refs[i].f->data[0])
+                ff_thread_release_buffer(ctx, &s->next_refs[i]);
+            if (s->refs[i].f->data[0] &&
+                (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
+                return res;
+        }
          *got_frame = 1;
          return pkt->size;
      }
@@ -3782,7 +3855,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
          return res;
      f = s->frames[CUR_FRAME].tf.f;
      f->key_frame = s->keyframe;
-    f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
      ls_y = f->linesize[0];
      ls_uv =f->linesize[1];
  
@@ -3792,25 +3865,13 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
              ff_thread_release_buffer(ctx, &s->next_refs[i]);
          if (s->refreshrefmask & (1 << i)) {
              res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
-        } else {
+        } else if (s->refs[i].f->data[0]) {
              res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
          }
          if (res < 0)
              return res;
      }
  
-    if (s->fullrange)
-        ctx->color_range = AVCOL_RANGE_JPEG;
-    else
-        ctx->color_range = AVCOL_RANGE_MPEG;
-
-    switch (s->colorspace) {
-    case 1: ctx->colorspace = AVCOL_SPC_BT470BG; break;
-    case 2: ctx->colorspace = AVCOL_SPC_BT709; break;
-    case 3: ctx->colorspace = AVCOL_SPC_SMPTE170M; break;
-    case 4: ctx->colorspace = AVCOL_SPC_SMPTE240M; break;
-    }
-
      // main tile decode loop
      memset(s->above_partition_ctx, 0, s->cols);
      memset(s->above_skip_ctx, 0, s->cols);
@@ -3820,8 +3881,8 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
          memset(s->above_mode_ctx, NEARESTMV, s->cols);
      }
      memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
-    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
-    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
+    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
+    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
      memset(s->above_segpred_ctx, 0, s->cols);
      s->pass = s->frames[CUR_FRAME].uses_2pass =
          ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
@@ -3906,7 +3967,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                              memset(s->left_mode_ctx, NEARESTMV, 8);
                          }
                          memset(s->left_y_nnz_ctx, 0, 16);
-                        memset(s->left_uv_nnz_ctx, 0, 16);
+                        memset(s->left_uv_nnz_ctx, 0, 32);
                          memset(s->left_segpred_ctx, 0, 8);
  
                          memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
@@ -4035,7 +4096,6 @@ static av_cold int vp9_decode_init(AVCodecContext *ctx)
      VP9Context *s = ctx->priv_data;
  
      ctx->internal->allocate_progress = 1;
-    ctx->pix_fmt = AV_PIX_FMT_YUV420P;
      ff_vp9dsp_init(&s->dsp);
      ff_videodsp_init(&s->vdsp, 8);
      s->filter.sharpness = -1;
@@ -4078,6 +4138,8 @@ static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecCo
  
      s->invisible = ssrc->invisible;
      s->keyframe = ssrc->keyframe;
+    s->ss_v = ssrc->ss_v;
+    s->ss_h = ssrc->ss_h;
      s->segmentation.enabled = ssrc->segmentation.enabled;
      s->segmentation.update_map = ssrc->segmentation.update_map;
      memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));