Merge commit '5c8a5765dc5f4e29afb85b95be393c30f45412a8'

[ffmpeg] / libavcodec / vp9.c
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c

index 4d7310f6d4d149c82fa0e69ed95a1d60de26700d..6b5de19266f334c32e0960fbb8a696d672870131 100644 (file)
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -36,6 +36,62 @@
  
  #define VP9_SYNCCODE 0x498342
  
+#if HAVE_THREADS
+/**
+ * Release the progress counters used by slice threading, together with the
+ * mutex and condition variable that protect them.
+ *
+ * Nothing is done unless slice threading is active and the counters are
+ * currently allocated. The mutex and condition variable are only
+ * initialised by vp9_alloc_entries() once the counter array exists, and
+ * destroying an uninitialised pthread object is undefined behaviour, so a
+ * NULL s->entries is treated as "nothing to tear down".
+ *
+ * @param avctx codec context; avctx->priv_data is the VP9Context
+ */
+static void vp9_free_entries(AVCodecContext *avctx) {
+    VP9Context *s = avctx->priv_data;
+
+    if (!(avctx->active_thread_type & FF_THREAD_SLICE) || !s->entries)
+        return;
+
+    pthread_mutex_destroy(&s->progress_mutex);
+    pthread_cond_destroy(&s->progress_cond);
+    av_freep(&s->entries);
+}
+
+/**
+ * Allocate and zero n progress counters for slice threading and initialise
+ * the mutex/condition variable used to wait on them.
+ *
+ * When slice threading is not active this is a no-op that returns 0.
+ * On any failure s->entries is left NULL and no pthread object is left
+ * initialised.
+ *
+ * @param avctx codec context; avctx->priv_data is the VP9Context
+ * @param n     number of counters to allocate
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) {
+    VP9Context *s = avctx->priv_data;
+    int i, err;
+
+    if (!(avctx->active_thread_type & FF_THREAD_SLICE))
+        return 0;
+
+    /* av_freep() is safe on a NULL pointer, so no guard is needed */
+    av_freep(&s->entries);
+
+    s->entries = av_malloc_array(n, sizeof(atomic_int));
+    if (!s->entries)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < n; i++)
+        atomic_init(&s->entries[i], 0);
+
+    /* pthread init functions return a positive errno value on failure */
+    err = pthread_mutex_init(&s->progress_mutex, NULL);
+    if (err) {
+        av_freep(&s->entries);
+        return AVERROR(err);
+    }
+
+    err = pthread_cond_init(&s->progress_cond, NULL);
+    if (err) {
+        pthread_mutex_destroy(&s->progress_mutex);
+        av_freep(&s->entries);
+        return AVERROR(err);
+    }
+
+    return 0;
+}
+
+/**
+ * Add n to the progress counter at index field and signal progress_cond.
+ *
+ * The update and the signal both happen with progress_mutex held.
+ *
+ * @param s     decoder context owning the counters
+ * @param field index into s->entries
+ * @param n     amount to add to the counter
+ */
+static void vp9_report_tile_progress(VP9Context *s, int field, int n) {
+    atomic_int *counter = &s->entries[field];
+
+    pthread_mutex_lock(&s->progress_mutex);
+    atomic_fetch_add_explicit(counter, n, memory_order_release);
+    pthread_cond_signal(&s->progress_cond);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+
+/**
+ * Block until the progress counter at index field has reached at least n.
+ *
+ * @param s     decoder context owning the counters
+ * @param field index into s->entries
+ * @param n     counter value to wait for
+ */
+static void vp9_await_tile_progress(VP9Context *s, int field, int n) {
+    /* fast path: target already reached, no need to take the lock */
+    if (atomic_load_explicit(&s->entries[field], memory_order_acquire) >= n)
+        return;
+
+    pthread_mutex_lock(&s->progress_mutex);
+    /* Use the same predicate as the fast path: waiting for exact equality
+     * would sleep forever if the counter ever moved past n. */
+    while (atomic_load_explicit(&s->entries[field], memory_order_relaxed) < n)
+        pthread_cond_wait(&s->progress_cond, &s->progress_mutex);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+#else
+/* Fallbacks for builds where HAVE_THREADS is 0: nothing is allocated or
+ * freed, and allocation always reports success. */
+static void vp9_free_entries(AVCodecContext *avctx) {}
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) { return 0; }
+#endif
+
  static void vp9_frame_unref(AVCodecContext *avctx, VP9Frame *f)
  {
      ff_thread_release_buffer(avctx, &f->tf);
@@ -113,11 +169,12 @@ fail:
  
  static int update_size(AVCodecContext *avctx, int w, int h)
  {
-#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
+#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL * 2 + CONFIG_VP9_VAAPI_HWACCEL)
      enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
      VP9Context *s = avctx->priv_data;
      uint8_t *p;
      int bytesperpixel = s->bytesperpixel, ret, cols, rows;
+    int lflvl_len, i;
  
      av_assert0(w > 0 && h > 0);
  
@@ -132,6 +189,7 @@ static int update_size(AVCodecContext *avctx, int w, int h)
  #endif
  #if CONFIG_VP9_D3D11VA_HWACCEL
              *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
+            *fmtp++ = AV_PIX_FMT_D3D11;
  #endif
  #if CONFIG_VP9_VAAPI_HWACCEL
              *fmtp++ = AV_PIX_FMT_VAAPI;
@@ -169,13 +227,14 @@ static int update_size(AVCodecContext *avctx, int w, int h)
      s->sb_rows   = (h + 63) >> 6;
      s->cols      = (w + 7) >> 3;
      s->rows      = (h + 7) >> 3;
+    lflvl_len    = avctx->active_thread_type == FF_THREAD_SLICE ? s->sb_rows : 1;
  
  #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
      av_freep(&s->intra_pred_data[0]);
      // FIXME we slightly over-allocate here for subsampled chroma, but a little
      // bit of padding shouldn't affect performance...
      p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
-                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+                                lflvl_len * sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
      if (!p)
          return AVERROR(ENOMEM);
      assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
@@ -194,12 +253,15 @@ static int update_size(AVCodecContext *avctx, int w, int h)
      assign(s->above_comp_ctx,      uint8_t *,              8);
      assign(s->above_ref_ctx,       uint8_t *,              8);
      assign(s->above_filter_ctx,    uint8_t *,              8);
-    assign(s->lflvl,               VP9Filter *,            1);
+    assign(s->lflvl,               VP9Filter *,            lflvl_len);
  #undef assign
  
-    // these will be re-allocated a little later
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
+    if (s->td) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            av_freep(&s->td[i].b_base);
+            av_freep(&s->td[i].block_base);
+        }
+    }
  
      if (s->s.h.bpp != s->last_bpp) {
          ff_vp9dsp_init(&s->dsp, s->s.h.bpp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
@@ -212,40 +274,50 @@ static int update_size(AVCodecContext *avctx, int w, int h)
  
  static int update_block_buffers(AVCodecContext *avctx)
  {
+    int i;
      VP9Context *s = avctx->priv_data;
      int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
+    VP9TileData *td = &s->td[0];
  
-    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
+    if (td->b_base && td->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
          return 0;
  
-    av_free(s->b_base);
-    av_free(s->block_base);
+    av_free(td->b_base);
+    av_free(td->block_base);
      chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
      chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
      if (s->s.frames[CUR_FRAME].uses_2pass) {
          int sbs = s->sb_cols * s->sb_rows;
  
-        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
-        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+        td->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
+        td->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                      16 * 16 + 2 * chroma_eobs) * sbs);
-        if (!s->b_base || !s->block_base)
+        if (!td->b_base || !td->block_base)
              return AVERROR(ENOMEM);
-        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
-        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
-        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
-        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
+        td->uvblock_base[0] = td->block_base + sbs * 64 * 64 * bytesperpixel;
+        td->uvblock_base[1] = td->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
+        td->eob_base = (uint8_t *) (td->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
+        td->uveob_base[0] = td->eob_base + 16 * 16 * sbs;
+        td->uveob_base[1] = td->uveob_base[0] + chroma_eobs * sbs;
      } else {
-        s->b_base = av_malloc(sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
-                                   16 * 16 + 2 * chroma_eobs);
-        if (!s->b_base || !s->block_base)
-            return AVERROR(ENOMEM);
-        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
-        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
-        s->uveob_base[0] = s->eob_base + 16 * 16;
-        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
+        for (i = 1; i < s->active_tile_cols; i++) {
+            if (s->td[i].b_base && s->td[i].block_base) {
+                av_free(s->td[i].b_base);
+                av_free(s->td[i].block_base);
+            }
+        }
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].b_base = av_malloc(sizeof(VP9Block));
+            s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                       16 * 16 + 2 * chroma_eobs);
+            if (!s->td[i].b_base || !s->td[i].block_base)
+                return AVERROR(ENOMEM);
+            s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel;
+            s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel;
+            s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel);
+            s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16;
+            s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs;
+        }
      }
      s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
  
@@ -458,7 +530,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                  s->bytesperpixel = 1;
                  s->pix_fmt = AV_PIX_FMT_YUV420P;
                  avctx->colorspace = AVCOL_SPC_BT470BG;
-                avctx->color_range = AVCOL_RANGE_JPEG;
+                avctx->color_range = AVCOL_RANGE_MPEG;
              }
              s->s.h.refreshrefmask = get_bits(&s->gb, 8);
              w = get_bits(&s->gb, 16) + 1;
@@ -542,8 +614,20 @@ static int decode_frame_header(AVCodecContext *avctx,
      sharp = get_bits(&s->gb, 3);
      // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
      // the old cache values since they are still valid
-    if (s->s.h.filter.sharpness != sharp)
-        memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
+    if (s->s.h.filter.sharpness != sharp) {
+        for (i = 1; i <= 63; i++) {
+            int limit = i;
+
+            if (sharp > 0) {
+                limit >>= (sharp + 3) >> 2;
+                limit = FFMIN(limit, 9 - sharp);
+            }
+            limit = FFMAX(limit, 1);
+
+            s->filter_lut.lim_lut[i] = limit;
+            s->filter_lut.mblim_lut[i] = 2 * (i + 2) + limit;
+        }
+    }
      s->s.h.filter.sharpness = sharp;
      if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
          if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
@@ -661,12 +745,36 @@ static int decode_frame_header(AVCodecContext *avctx,
      s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
      s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
      if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
+        int n_range_coders;
+        VP56RangeCoder *rc;
+
+        if (s->td) {
+            for (i = 0; i < s->active_tile_cols; i++) {
+                av_free(s->td[i].b_base);
+                av_free(s->td[i].block_base);
+            }
+            av_free(s->td);
+        }
+
          s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
-        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
-                                 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
-        if (!s->c_b) {
-            av_log(avctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
+        vp9_free_entries(avctx);
+        s->active_tile_cols = avctx->active_thread_type == FF_THREAD_SLICE ?
+                              s->s.h.tiling.tile_cols : 1;
+        vp9_alloc_entries(avctx, s->sb_rows);
+        if (avctx->active_thread_type == FF_THREAD_SLICE) {
+            n_range_coders = 4; // max_tile_rows
+        } else {
+            n_range_coders = s->s.h.tiling.tile_cols;
+        }
+        s->td = av_mallocz_array(s->active_tile_cols, sizeof(VP9TileData) +
+                                 n_range_coders * sizeof(VP56RangeCoder));
+        if (!s->td)
              return AVERROR(ENOMEM);
+        rc = (VP56RangeCoder *) &s->td[s->active_tile_cols];
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].s = s;
+            s->td[i].c_b = rc;
+            rc += n_range_coders;
          }
      }
  
@@ -734,12 +842,15 @@ static int decode_frame_header(AVCodecContext *avctx,
          return AVERROR_INVALIDDATA;
      }
  
-    if (s->s.h.keyframe || s->s.h.intraonly) {
-        memset(s->counts.coef, 0, sizeof(s->counts.coef));
-        memset(s->counts.eob,  0, sizeof(s->counts.eob));
-    } else {
-        memset(&s->counts, 0, sizeof(s->counts));
+    for (i = 0; i < s->active_tile_cols; i++) {
+        if (s->s.h.keyframe || s->s.h.intraonly) {
+            memset(s->td[i].counts.coef, 0, sizeof(s->td[0].counts.coef));
+            memset(s->td[i].counts.eob,  0, sizeof(s->td[0].counts.eob));
+        } else {
+            memset(&s->td[i].counts, 0, sizeof(s->td[0].counts));
+        }
      }
+
      /* FIXME is it faster to not copy here, but do it down in the fw updates
       * as explicit copies if the fw update is missing (and skip the copy upon
       * fw update)? */
@@ -788,7 +899,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                                  else
                                      p[n] = r[n];
                              }
-                            p[3] = 0;
+                            memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
                          }
          } else {
              for (j = 0; j < 2; j++)
@@ -800,7 +911,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                              if (m > 3 && l == 0) // dc only has 3 pt
                                  break;
                              memcpy(p, r, 3);
-                            p[3] = 0;
+                            memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
                          }
          }
          if (s->s.h.txfmmode == i)
@@ -928,12 +1039,12 @@ static int decode_frame_header(AVCodecContext *avctx,
      return (data2 - data) + size2;
  }
  
-static void decode_sb(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
+static void decode_sb(VP9TileData *td, int row, int col, VP9Filter *lflvl,
                        ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
  {
-    VP9Context *s = avctx->priv_data;
+    const VP9Context *s = td->s;
      int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
-            (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
+            (((td->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
      const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? ff_vp9_default_kf_partition_probs[bl][c] :
                                                       s->prob.p.partition[bl][c];
      enum BlockPartition bp;
@@ -943,75 +1054,75 @@ static void decode_sb(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
      int bytesperpixel = s->bytesperpixel;
  
      if (bl == BL_8X8) {
-        bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
-        ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+        bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
      } else if (col + hbs < s->cols) { // FIXME why not <=?
          if (row + hbs < s->rows) { // FIXME why not <=?
-            bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+            bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
              switch (bp) {
              case PARTITION_NONE:
-                ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                  break;
              case PARTITION_H:
-                ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                  yoff  += hbs * 8 * y_stride;
                  uvoff += hbs * 8 * uv_stride >> s->ss_v;
-                ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                  break;
              case PARTITION_V:
-                ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                  yoff  += hbs * 8 * bytesperpixel;
                  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
-                ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                  break;
              case PARTITION_SPLIT:
-                decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-                decode_sb(avctx, row, col + hbs, lflvl,
+                decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(td, row, col + hbs, lflvl,
                            yoff + 8 * hbs * bytesperpixel,
                            uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                  yoff  += hbs * 8 * y_stride;
                  uvoff += hbs * 8 * uv_stride >> s->ss_v;
-                decode_sb(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
-                decode_sb(avctx, row + hbs, col + hbs, lflvl,
+                decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(td, row + hbs, col + hbs, lflvl,
                            yoff + 8 * hbs * bytesperpixel,
                            uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                  break;
              default:
                  av_assert0(0);
              }
-        } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
+        } else if (vp56_rac_get_prob_branchy(td->c, p[1])) {
              bp = PARTITION_SPLIT;
-            decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            decode_sb(avctx, row, col + hbs, lflvl,
+            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row, col + hbs, lflvl,
                        yoff + 8 * hbs * bytesperpixel,
                        uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
          } else {
              bp = PARTITION_H;
-            ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
          }
      } else if (row + hbs < s->rows) { // FIXME why not <=?
-        if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
+        if (vp56_rac_get_prob_branchy(td->c, p[2])) {
              bp = PARTITION_SPLIT;
-            decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
              yoff  += hbs * 8 * y_stride;
              uvoff += hbs * 8 * uv_stride >> s->ss_v;
-            decode_sb(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
          } else {
              bp = PARTITION_V;
-            ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
          }
      } else {
          bp = PARTITION_SPLIT;
-        decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
      }
-    s->counts.partition[bl][c][bp]++;
+    td->counts.partition[bl][c][bp]++;
  }
  
-static void decode_sb_mem(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
+static void decode_sb_mem(VP9TileData *td, int row, int col, VP9Filter *lflvl,
                            ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
  {
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
+    const VP9Context *s = td->s;
+    VP9Block *b = td->b;
      ptrdiff_t hbs = 4 >> bl;
      AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
      ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
@@ -1019,39 +1130,39 @@ static void decode_sb_mem(AVCodecContext *avctx, int row, int col, VP9Filter *lf
  
      if (bl == BL_8X8) {
          av_assert2(b->bl == BL_8X8);
-        ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
-    } else if (s->b->bl == bl) {
-        ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+    } else if (td->b->bl == bl) {
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
          if (b->bp == PARTITION_H && row + hbs < s->rows) {
              yoff  += hbs * 8 * y_stride;
              uvoff += hbs * 8 * uv_stride >> s->ss_v;
-            ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
+            ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
          } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
              yoff  += hbs * 8 * bytesperpixel;
              uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
-            ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
+            ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
          }
      } else {
-        decode_sb_mem(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        decode_sb_mem(td, row, col, lflvl, yoff, uvoff, bl + 1);
          if (col + hbs < s->cols) { // FIXME why not <=?
              if (row + hbs < s->rows) {
-                decode_sb_mem(avctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
+                decode_sb_mem(td, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
                                uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                  yoff  += hbs * 8 * y_stride;
                  uvoff += hbs * 8 * uv_stride >> s->ss_v;
-                decode_sb_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
-                decode_sb_mem(avctx, row + hbs, col + hbs, lflvl,
+                decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(td, row + hbs, col + hbs, lflvl,
                                yoff + 8 * hbs * bytesperpixel,
                                uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
              } else {
                  yoff  += hbs * 8 * bytesperpixel;
                  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
-                decode_sb_mem(avctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(td, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
              }
          } else if (row + hbs < s->rows) {
              yoff  += hbs * 8 * y_stride;
              uvoff += hbs * 8 * uv_stride >> s->ss_v;
-            decode_sb_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
          }
      }
  }
@@ -1066,9 +1177,13 @@ static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
  
  static void free_buffers(VP9Context *s)
  {
+    int i;
+
      av_freep(&s->intra_pred_data[0]);
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
+    for (i = 0; i < s->active_tile_cols; i++) {
+        av_freep(&s->td[i].b_base);
+        av_freep(&s->td[i].block_base);
+    }
  }
  
  static av_cold int vp9_decode_free(AVCodecContext *avctx)
@@ -1089,13 +1204,252 @@ static av_cold int vp9_decode_free(AVCodecContext *avctx)
              ff_thread_release_buffer(avctx, &s->next_refs[i]);
          av_frame_free(&s->next_refs[i].f);
      }
+
      free_buffers(s);
-    av_freep(&s->c_b);
-    s->c_b_size = 0;
+    vp9_free_entries(avctx);
+    av_freep(&s->td);
+    return 0;
+}
+
+/**
+ * Decode all tiles of the current frame on the calling thread, using the
+ * first tile-data slot (s->td[0]) for every tile column.
+ *
+ * For each tile row, one range decoder per tile column is set up from the
+ * bitstream; every tile except the very last one is preceded by a size
+ * field read with AV_RB32, the last tile takes the remaining bytes. Each
+ * row of 64x64 superblocks is then decoded across all tile columns,
+ * followed (unless s->pass == 1) by the intra prediction backup, the loop
+ * filter and a progress report for that row.
+ *
+ * Every error exit reports INT_MAX progress on the current frame before
+ * returning, so that all failure paths behave the same way.
+ *
+ * @param avctx codec context; avctx->priv_data is the VP9Context
+ * @param data  start of the tile data
+ * @param size  number of bytes available at data
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int decode_tiles(AVCodecContext *avctx,
+                        const uint8_t *data, int size)
+{
+    VP9Context *s = avctx->priv_data;
+    VP9TileData *td = &s->td[0];
+    int row, col, tile_row, tile_col, ret;
+    int bytesperpixel;
+    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    AVFrame *f;
+    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
+
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv = f->linesize[1];
+    bytesperpixel = s->bytesperpixel;
+
+    yoff = uvoff = 0;
+    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+        set_tile_offset(&tile_row_start, &tile_row_end,
+                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+
+        // set up one range decoder per tile column of this tile row
+        for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+            int64_t tile_size;
+
+            if (tile_col == s->s.h.tiling.tile_cols - 1 &&
+                tile_row == s->s.h.tiling.tile_rows - 1) {
+                // last tile of the frame: no size field, use what is left
+                tile_size = size;
+            } else {
+                tile_size = AV_RB32(data);
+                data += 4;
+                size -= 4;
+            }
+            if (tile_size > size) {
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return AVERROR_INVALIDDATA;
+            }
+            ret = ff_vp56_init_range_decoder(&td->c_b[tile_col], data, tile_size);
+            if (ret < 0) {
+                // report progress here as well, like the other error exits
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return ret;
+            }
+            if (vp56_rac_get_prob_branchy(&td->c_b[tile_col], 128)) { // marker bit
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return AVERROR_INVALIDDATA;
+            }
+            data += tile_size;
+            size -= tile_size;
+        }
+
+        for (row = tile_row_start; row < tile_row_end;
+             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+            VP9Filter *lflvl_ptr = s->lflvl;
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+            for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+                set_tile_offset(&tile_col_start, &tile_col_end,
+                                tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+                td->tile_col_start = tile_col_start;
+                if (s->pass != 2) {
+                    // reset the left-edge contexts at the start of each tile
+                    memset(td->left_partition_ctx, 0, 8);
+                    memset(td->left_skip_ctx, 0, 8);
+                    if (s->s.h.keyframe || s->s.h.intraonly) {
+                        memset(td->left_mode_ctx, DC_PRED, 16);
+                    } else {
+                        memset(td->left_mode_ctx, NEARESTMV, 8);
+                    }
+                    memset(td->left_y_nnz_ctx, 0, 16);
+                    memset(td->left_uv_nnz_ctx, 0, 32);
+                    memset(td->left_segpred_ctx, 0, 8);
+
+                    td->c = &td->c_b[tile_col];
+                }
+
+                for (col = tile_col_start;
+                     col < tile_col_end;
+                     col += 8, yoff2 += 64 * bytesperpixel,
+                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                    // FIXME integrate with lf code (i.e. zero after each
+                    // use, similar to invtxfm coefficients, or similar)
+                    if (s->pass != 1) {
+                        memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                    }
  
+                    if (s->pass == 2) {
+                        decode_sb_mem(td, row, col, lflvl_ptr,
+                                      yoff2, uvoff2, BL_64X64);
+                    } else {
+                        decode_sb(td, row, col, lflvl_ptr,
+                                  yoff2, uvoff2, BL_64X64);
+                    }
+                }
+            }
+
+            // pass 1 stops here: no intra backup, loop filter or progress
+            if (s->pass == 1)
+                continue;
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0],
+                       f->data[0] + yoff + 63 * ls_y,
+                       8 * s->cols * bytesperpixel);
+                memcpy(s->intra_pred_data[1],
+                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * s->cols * bytesperpixel >> s->ss_h);
+                memcpy(s->intra_pred_data[2],
+                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * s->cols * bytesperpixel >> s->ss_h);
+            }
+
+            // loopfilter one row
+            if (s->s.h.filter.level) {
+                yoff2 = yoff;
+                uvoff2 = uvoff;
+                lflvl_ptr = s->lflvl;
+                for (col = 0; col < s->cols;
+                     col += 8, yoff2 += 64 * bytesperpixel,
+                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                    ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
+                                         yoff2, uvoff2);
+                }
+            }
+
+            // FIXME maybe we can make this more finegrained by running the
+            // loopfilter per-block instead of after each sbrow
+            // In fact that would also make intra pred left preparation easier?
+            ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
+        }
+    }
      return 0;
  }
  
+#if HAVE_THREADS
+/**
+ * Slice-threading job: decode one tile column of the current frame.
+ *
+ * jobnr selects both the tile-data slot (s->td[jobnr]) and the tile column
+ * whose bounds are computed with set_tile_offset(). The job walks every
+ * tile row, switching to the range coder td->c_b[tile_row] for each, and
+ * decodes the superblocks of its column range row by row. After each
+ * superblock row it backs up its part of the intra prediction data and
+ * adds 1 to that row's progress counter. The loop filter is not run here.
+ *
+ * @param avctx    codec context; avctx->priv_data is the VP9Context
+ * @param tdata    not used by this function
+ * @param jobnr    job index, used as tile column index
+ * @param threadnr not used by this function
+ * @return always 0
+ */
+static av_always_inline
+int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
+                              int threadnr)
+{
+    VP9Context *s = avctx->priv_data;
+    VP9TileData *td = &s->td[jobnr];
+    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
+    int bytesperpixel = s->bytesperpixel, row, col, tile_row;
+    unsigned tile_cols_len;
+    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    VP9Filter *lflvl_ptr_base;
+    AVFrame *f;
+
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
+
+    set_tile_offset(&tile_col_start, &tile_col_end,
+                    jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+    td->tile_col_start  = tile_col_start;
+    // starting offsets of this tile column within the first superblock row
+    uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
+    yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
+    lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);
+
+    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+        set_tile_offset(&tile_row_start, &tile_row_end,
+                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+
+        // within one job the range coders are indexed by tile row
+        td->c = &td->c_b[tile_row];
+        for (row = tile_row_start; row < tile_row_end;
+             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+            // filter levels are stored per superblock row: s->sb_cols
+            // entries for each row, indexed by row >> 3
+            VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3);
+
+            // reset the left-edge contexts at the start of each row
+            memset(td->left_partition_ctx, 0, 8);
+            memset(td->left_skip_ctx, 0, 8);
+            if (s->s.h.keyframe || s->s.h.intraonly) {
+                memset(td->left_mode_ctx, DC_PRED, 16);
+            } else {
+                memset(td->left_mode_ctx, NEARESTMV, 8);
+            }
+            memset(td->left_y_nnz_ctx, 0, 16);
+            memset(td->left_uv_nnz_ctx, 0, 32);
+            memset(td->left_segpred_ctx, 0, 8);
+
+            for (col = tile_col_start;
+                 col < tile_col_end;
+                 col += 8, yoff2 += 64 * bytesperpixel,
+                 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                // FIXME integrate with lf code (i.e. zero after each
+                // use, similar to invtxfm coefficients, or similar)
+                memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                decode_sb(td, row, col, lflvl_ptr,
+                            yoff2, uvoff2, BL_64X64);
+            }
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            // (only the columns covered by this tile are copied)
+            tile_cols_len = tile_col_end - tile_col_start;
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * bytesperpixel),
+                       f->data[0] + yoff + 63 * ls_y,
+                       8 * tile_cols_len * bytesperpixel);
+                memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
+                memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
+            }
+
+            // one more tile column has finished this superblock row
+            vp9_report_tile_progress(s, row >> 3, 1);
+        }
+    }
+    return 0;
+}
+
+/**
+ * Run the loop filter over the current frame, one superblock row at a time.
+ *
+ * Before touching a row, wait until its progress counter has reached the
+ * number of tile columns. Rows are still awaited when the filter level is
+ * zero, but no filtering is performed for them.
+ *
+ * @param avctx codec context; avctx->priv_data is the VP9Context
+ * @return always 0
+ */
+static av_always_inline
+int loopfilter_proc(AVCodecContext *avctx)
+{
+    VP9Context *s = avctx->priv_data;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    const ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
+    const int bytesperpixel = s->bytesperpixel;
+    int sb_row;
+
+    for (sb_row = 0; sb_row < s->sb_rows; sb_row++) {
+        ptrdiff_t yoff, uvoff;
+        VP9Filter *lflvl_ptr;
+        int col;
+
+        vp9_await_tile_progress(s, sb_row, s->s.h.tiling.tile_cols);
+
+        if (!s->s.h.filter.level)
+            continue;
+
+        yoff      = (ls_y * 64) * sb_row;
+        uvoff     = (ls_uv * 64 >> s->ss_v) * sb_row;
+        lflvl_ptr = s->lflvl + s->sb_cols * sb_row;
+        for (col = 0; col < s->cols; col += 8) {
+            ff_vp9_loopfilter_sb(avctx, lflvl_ptr, sb_row << 3, col,
+                                 yoff, uvoff);
+            yoff  += 64 * bytesperpixel;
+            uvoff += 64 * bytesperpixel >> s->ss_h;
+            lflvl_ptr++;
+        }
+    }
+    return 0;
+}
+#endif
  
  static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
                              int *got_frame, AVPacket *pkt)
@@ -1103,12 +1457,10 @@ static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
      const uint8_t *data = pkt->data;
      int size = pkt->size;
      VP9Context *s = avctx->priv_data;
-    int ret, tile_row, tile_col, i, ref, row, col;
+    int ret, i, j, ref;
      int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
                              (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
-    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
      AVFrame *f;
-    int bytesperpixel;
  
      if ((ret = decode_frame_header(avctx, data, size, &ref)) < 0) {
          return ret;
@@ -1158,8 +1510,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
      f = s->s.frames[CUR_FRAME].tf.f;
      f->key_frame = s->s.h.keyframe;
      f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
-    ls_y = f->linesize[0];
-    ls_uv =f->linesize[1];
  
      if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
          (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
@@ -1194,7 +1544,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
      }
  
      // main tile decode loop
-    bytesperpixel = s->bytesperpixel;
      memset(s->above_partition_ctx, 0, s->cols);
      memset(s->above_skip_ctx, 0, s->cols);
      if (s->s.h.keyframe || s->s.h.intraonly) {
@@ -1232,20 +1581,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
          ff_thread_finish_setup(avctx);
      }
  
+#if HAVE_THREADS
+    if (avctx->active_thread_type & FF_THREAD_SLICE) {
+        for (i = 0; i < s->sb_rows; i++)
+            atomic_store(&s->entries[i], 0);
+    }
+#endif
+
      do {
-        yoff = uvoff = 0;
-        s->b = s->b_base;
-        s->block = s->block_base;
-        s->uvblock[0] = s->uvblock_base[0];
-        s->uvblock[1] = s->uvblock_base[1];
-        s->eob = s->eob_base;
-        s->uveob[0] = s->uveob_base[0];
-        s->uveob[1] = s->uveob_base[1];
-
-        for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
-            set_tile_offset(&s->tile_row_start, &s->tile_row_end,
-                            tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
-            if (s->pass != 2) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].b = s->td[i].b_base;
+            s->td[i].block = s->td[i].block_base;
+            s->td[i].uvblock[0] = s->td[i].uvblock_base[0];
+            s->td[i].uvblock[1] = s->td[i].uvblock_base[1];
+            s->td[i].eob = s->td[i].eob_base;
+            s->td[i].uveob[0] = s->td[i].uveob_base[0];
+            s->td[i].uveob[1] = s->td[i].uveob_base[1];
+        }
+
+#if HAVE_THREADS
+        if (avctx->active_thread_type == FF_THREAD_SLICE) {
+            int tile_row, tile_col;
+
+            av_assert1(!s->pass);
+
+            for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
                  for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                      int64_t tile_size;
  
@@ -1257,105 +1617,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
                          data += 4;
                          size -= 4;
                      }
-                    if (tile_size > size) {
-                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                    if (tile_size > size)
                          return AVERROR_INVALIDDATA;
-                    }
-                    ret = ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
+                    ret = ff_vp56_init_range_decoder(&s->td[tile_col].c_b[tile_row], data, tile_size);
                      if (ret < 0)
                          return ret;
-                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
-                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                    if (vp56_rac_get_prob_branchy(&s->td[tile_col].c_b[tile_row], 128)) // marker bit
                          return AVERROR_INVALIDDATA;
-                    }
                      data += tile_size;
                      size -= tile_size;
                  }
              }
  
-            for (row = s->tile_row_start; row < s->tile_row_end;
-                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
-                VP9Filter *lflvl_ptr = s->lflvl;
-                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
-
-                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
-                    set_tile_offset(&s->tile_col_start, &s->tile_col_end,
-                                    tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
-
-                    if (s->pass != 2) {
-                        memset(s->left_partition_ctx, 0, 8);
-                        memset(s->left_skip_ctx, 0, 8);
-                        if (s->s.h.keyframe || s->s.h.intraonly) {
-                            memset(s->left_mode_ctx, DC_PRED, 16);
-                        } else {
-                            memset(s->left_mode_ctx, NEARESTMV, 8);
-                        }
-                        memset(s->left_y_nnz_ctx, 0, 16);
-                        memset(s->left_uv_nnz_ctx, 0, 32);
-                        memset(s->left_segpred_ctx, 0, 8);
-
-                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
-                    }
-
-                    for (col = s->tile_col_start;
-                         col < s->tile_col_end;
-                         col += 8, yoff2 += 64 * bytesperpixel,
-                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
-                        // FIXME integrate with lf code (i.e. zero after each
-                        // use, similar to invtxfm coefficients, or similar)
-                        if (s->pass != 1) {
-                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
-                        }
-
-                        if (s->pass == 2) {
-                            decode_sb_mem(avctx, row, col, lflvl_ptr,
-                                          yoff2, uvoff2, BL_64X64);
-                        } else {
-                            decode_sb(avctx, row, col, lflvl_ptr,
-                                      yoff2, uvoff2, BL_64X64);
-                        }
-                    }
-                    if (s->pass != 2)
-                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
-                }
-
-                if (s->pass == 1)
-                    continue;
-
-                // backup pre-loopfilter reconstruction data for intra
-                // prediction of next row of sb64s
-                if (row + 8 < s->rows) {
-                    memcpy(s->intra_pred_data[0],
-                           f->data[0] + yoff + 63 * ls_y,
-                           8 * s->cols * bytesperpixel);
-                    memcpy(s->intra_pred_data[1],
-                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
-                           8 * s->cols * bytesperpixel >> s->ss_h);
-                    memcpy(s->intra_pred_data[2],
-                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
-                           8 * s->cols * bytesperpixel >> s->ss_h);
-                }
-
-                // loopfilter one row
-                if (s->s.h.filter.level) {
-                    yoff2 = yoff;
-                    uvoff2 = uvoff;
-                    lflvl_ptr = s->lflvl;
-                    for (col = 0; col < s->cols;
-                         col += 8, yoff2 += 64 * bytesperpixel,
-                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
-                        ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
-                                             yoff2, uvoff2);
-                    }
-                }
-
-                // FIXME maybe we can make this more finegrained by running the
-                // loopfilter per-block instead of after each sbrow
-                // In fact that would also make intra pred left preparation easier?
-                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
-            }
+            ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols);
+        } else
+#endif
+        {
+            ret = decode_tiles(avctx, data, size);
+            if (ret < 0)
+                return ret;
          }
  
+        // Sum all counts fields into td[0].counts for tile threading
+        if (avctx->active_thread_type == FF_THREAD_SLICE)
+            for (i = 1; i < s->s.h.tiling.tile_cols; i++)
+                for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
+                    ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];
+
          if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
              ff_vp9_adapt_probs(s);
              ff_thread_finish_setup(avctx);
@@ -1491,7 +1779,8 @@ AVCodec ff_vp9_decoder = {
      .init                  = vp9_decode_init,
      .close                 = vp9_decode_free,
      .decode                = vp9_decode_frame,
-    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal         = FF_CODEC_CAP_SLICE_THREAD_HAS_MF,
      .flush                 = vp9_decode_flush,
      .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
      .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),