h264: Copy h264chroma dsp context to slice thread copies
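Toward the end of this diff, h264_slice_header_init() builds each extra slice-thread context by memcpy()ing only the embedded MpegEncContext and zeroing the remainder, so any DSP context that lives directly in H264Context has to be copied over explicitly; the hunk adds h->h264chroma (alongside h264dsp and h264qpel) to that copy, which is the change named in the subject line. A condensed sketch of those assignments, pulled from that hunk (not a complete excerpt; the surrounding allocation and error handling are omitted):

    /* sketch: mirror the "master" context's DSP function tables into each
     * slice-thread copy created in h264_slice_header_init() */
    for (i = 1; i < s->slice_context_count; i++) {
        H264Context *c = h->thread_context[i];
        c->h264dsp    = h->h264dsp;    /* H.264-specific DSP functions */
        c->h264qpel   = h->h264qpel;   /* luma quarter-pel motion compensation */
        c->h264chroma = h->h264chroma; /* chroma MC functions (the copy this change adds) */
    }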
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 2d6a08e0322fb2dc460073024db1a9d7e009168f..542070be938436f505216528c72e9777fe3293e2 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
 #include "mpegvideo.h"
 #include "h264.h"
 #include "h264data.h"
+#include "h264chroma.h"
 #include "h264_mvpred.h"
 #include "golomb.h"
 #include "mathops.h"
 #include "rectangle.h"
+#include "svq3.h"
 #include "thread.h"
 #include "vdpau_internal.h"
 #include "libavutil/avassert.h"
@@ -59,12 +61,21 @@ static const uint8_t div6[QP_MAX_NUM + 1] = {
     7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
 };
 
-static const enum PixelFormat hwaccel_pixfmt_list_h264_jpeg_420[] = {
-    PIX_FMT_DXVA2_VLD,
-    PIX_FMT_VAAPI_VLD,
-    PIX_FMT_VDA_VLD,
-    PIX_FMT_YUVJ420P,
-    PIX_FMT_NONE
+static const enum AVPixelFormat hwaccel_pixfmt_list_h264_jpeg_420[] = {
+#if CONFIG_H264_DXVA2_HWACCEL
+    AV_PIX_FMT_DXVA2_VLD,
+#endif
+#if CONFIG_H264_VAAPI_HWACCEL
+    AV_PIX_FMT_VAAPI_VLD,
+#endif
+#if CONFIG_H264_VDA_HWACCEL
+    AV_PIX_FMT_VDA_VLD,
+#endif
+#if CONFIG_H264_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
+    AV_PIX_FMT_YUVJ420P,
+    AV_PIX_FMT_NONE
 };
 
 /**
@@ -175,42 +186,50 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src,
     src++;
     length--;
 
+#define STARTCODE_TEST                                                  \
+        if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {     \
+            if (src[i + 2] != 3) {                                      \
+                /* startcode, so we must be past the end */             \
+                length = i;                                             \
+            }                                                           \
+            break;                                                      \
+        }
 #if HAVE_FAST_UNALIGNED
+#define FIND_FIRST_ZERO                                                 \
+        if (i > 0 && !src[i])                                           \
+            i--;                                                        \
+        while (src[i])                                                  \
+            i++
 #if HAVE_FAST_64BIT
-#define RS 7
     for (i = 0; i + 1 < length; i += 9) {
         if (!((~AV_RN64A(src + i) &
                (AV_RN64A(src + i) - 0x0100010001000101ULL)) &
               0x8000800080008080ULL))
+            continue;
+        FIND_FIRST_ZERO;
+        STARTCODE_TEST;
+        i -= 7;
+    }
 #else
-#define RS 3
     for (i = 0; i + 1 < length; i += 5) {
         if (!((~AV_RN32A(src + i) &
                (AV_RN32A(src + i) - 0x01000101U)) &
               0x80008080U))
-#endif
             continue;
-        if (i > 0 && !src[i])
-            i--;
-        while (src[i])
-            i++;
+        FIND_FIRST_ZERO;
+        STARTCODE_TEST;
+        i -= 3;
+    }
+#endif
 #else
-#define RS 0
     for (i = 0; i + 1 < length; i += 2) {
         if (src[i])
             continue;
         if (i > 0 && src[i - 1] == 0)
             i--;
-#endif
-        if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {
-            if (src[i + 2] != 3) {
-                /* startcode, so we must be past the end */
-                length = i;
-            }
-            break;
-        }
-        i -= RS;
+        STARTCODE_TEST;
     }
+#endif
 
     if (i >= length - 1) { // no escaped 0
         *dst_length = length;
@@ -227,7 +246,6 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src,
     if (dst == NULL)
         return NULL;
 
-    // printf("decoding esc\n");
     memcpy(dst, src, i);
     si = di = i;
     while (si + 2 < length) {
@@ -283,10 +301,11 @@ static inline int get_lowest_part_list_y(H264Context *h, Picture *pic, int n,
                                          int height, int y_offset, int list)
 {
     int raw_my        = h->mv_cache[list][scan8[n]][1];
-    int filter_height = (raw_my & 3) ? 2 : 0;
+    int filter_height_up   = (raw_my & 3) ? 2 : 0;
+    int filter_height_down = (raw_my & 3) ? 3 : 0;
     int full_my       = (raw_my >> 2) + y_offset;
-    int top           = full_my - filter_height;
-    int bottom        = full_my + filter_height + height;
+    int top           = full_my - filter_height_up;
+    int bottom        = full_my + filter_height_down + height;
 
     return FFMAX(abs(top), bottom);
 }
@@ -479,11 +498,11 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
         full_my                <          0 - extra_height ||
         full_mx + 16 /*FIXME*/ > pic_width  + extra_width  ||
         full_my + 16 /*FIXME*/ > pic_height + extra_height) {
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer,
-                                src_y - (2 << pixel_shift) - 2 * h->mb_linesize,
-                                h->mb_linesize,
-                                16 + 5, 16 + 5 /*FIXME*/, full_mx - 2,
-                                full_my - 2, pic_width, pic_height);
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 src_y - (2 << pixel_shift) - 2 * h->mb_linesize,
+                                 h->mb_linesize,
+                                 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2,
+                                 full_my - 2, pic_width, pic_height);
         src_y = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize;
         emu   = 1;
     }
@@ -498,12 +517,12 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
     if (chroma_idc == 3 /* yuv444 */) {
         src_cb = pic->f.data[1] + offset;
         if (emu) {
-            s->dsp.emulated_edge_mc(s->edge_emu_buffer,
-                                    src_cb - (2 << pixel_shift) - 2 * h->mb_linesize,
-                                    h->mb_linesize,
-                                    16 + 5, 16 + 5 /*FIXME*/,
-                                    full_mx - 2, full_my - 2,
-                                    pic_width, pic_height);
+            s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                     src_cb - (2 << pixel_shift) - 2 * h->mb_linesize,
+                                     h->mb_linesize,
+                                     16 + 5, 16 + 5 /*FIXME*/,
+                                     full_mx - 2, full_my - 2,
+                                     pic_width, pic_height);
             src_cb = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize;
         }
         qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); // FIXME try variable height perhaps?
@@ -512,12 +531,12 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
 
         src_cr = pic->f.data[2] + offset;
         if (emu) {
-            s->dsp.emulated_edge_mc(s->edge_emu_buffer,
-                                    src_cr - (2 << pixel_shift) - 2 * h->mb_linesize,
-                                    h->mb_linesize,
-                                    16 + 5, 16 + 5 /*FIXME*/,
-                                    full_mx - 2, full_my - 2,
-                                    pic_width, pic_height);
+            s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                     src_cr - (2 << pixel_shift) - 2 * h->mb_linesize,
+                                     h->mb_linesize,
+                                     16 + 5, 16 + 5 /*FIXME*/,
+                                     full_mx - 2, full_my - 2,
+                                     pic_width, pic_height);
             src_cr = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize;
         }
         qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); // FIXME try variable height perhaps?
@@ -539,9 +558,9 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
              (my >> ysh) * h->mb_uvlinesize;
 
     if (emu) {
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
-                                9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
-                                pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
+                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
         src_cb = s->edge_emu_buffer;
     }
     chroma_op(dest_cb, src_cb, h->mb_uvlinesize,
@@ -549,9 +568,9 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
               mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
 
     if (emu) {
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
-                                9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
-                                pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
+                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
         src_cr = s->edge_emu_buffer;
     }
     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
@@ -645,9 +664,9 @@ static av_always_inline void mc_part_weighted(H264Context *h, int n, int square,
     if (list0 && list1) {
         /* don't optimize for luma-only case, since B-frames usually
          * use implicit weights => chroma too. */
-        uint8_t *tmp_cb = s->obmc_scratchpad;
-        uint8_t *tmp_cr = s->obmc_scratchpad + (16 << pixel_shift);
-        uint8_t *tmp_y  = s->obmc_scratchpad + 16 * h->mb_uvlinesize;
+        uint8_t *tmp_cb = h->bipred_scratchpad;
+        uint8_t *tmp_cr = h->bipred_scratchpad + (16 << pixel_shift);
+        uint8_t *tmp_y  = h->bipred_scratchpad + 16 * h->mb_uvlinesize;
         int refn0       = h->ref_cache[0][scan8[n]];
         int refn1       = h->ref_cache[1][scan8[n]];
 
@@ -714,33 +733,6 @@ static av_always_inline void mc_part_weighted(H264Context *h, int n, int square,
     }
 }
 
-static av_always_inline void mc_part(H264Context *h, int n, int square,
-                                     int height, int delta,
-                                     uint8_t *dest_y, uint8_t *dest_cb,
-                                     uint8_t *dest_cr,
-                                     int x_offset, int y_offset,
-                                     qpel_mc_func *qpix_put,
-                                     h264_chroma_mc_func chroma_put,
-                                     qpel_mc_func *qpix_avg,
-                                     h264_chroma_mc_func chroma_avg,
-                                     h264_weight_func *weight_op,
-                                     h264_biweight_func *weight_avg,
-                                     int list0, int list1,
-                                     int pixel_shift, int chroma_idc)
-{
-    if ((h->use_weight == 2 && list0 && list1 &&
-         (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) ||
-        h->use_weight == 1)
-        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
-                         x_offset, y_offset, qpix_put, chroma_put,
-                         weight_op[0], weight_op[1], weight_avg[0],
-                         weight_avg[1], list0, list1, pixel_shift, chroma_idc);
-    else
-        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
-                    chroma_avg, list0, list1, pixel_shift, chroma_idc);
-}
-
 static av_always_inline void prefetch_motion(H264Context *h, int list,
                                              int pixel_shift, int chroma_idc)
 {
@@ -755,157 +747,17 @@ static av_always_inline void prefetch_motion(H264Context *h, int list,
         int off       = (mx << pixel_shift) +
                         (my + (s->mb_x & 3) * 4) * h->mb_linesize +
                         (64 << pixel_shift);
-        s->dsp.prefetch(src[0] + off, s->linesize, 4);
+        s->vdsp.prefetch(src[0] + off, s->linesize, 4);
         if (chroma_idc == 3 /* yuv444 */) {
-            s->dsp.prefetch(src[1] + off, s->linesize, 4);
-            s->dsp.prefetch(src[2] + off, s->linesize, 4);
+            s->vdsp.prefetch(src[1] + off, s->linesize, 4);
+            s->vdsp.prefetch(src[2] + off, s->linesize, 4);
         } else {
             off = ((mx >> 1) << pixel_shift) +
                   ((my >> 1) + (s->mb_x & 7)) * s->uvlinesize +
                   (64 << pixel_shift);
-            s->dsp.prefetch(src[1] + off, src[2] - src[1], 2);
-        }
-    }
-}
-
-static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y,
-                                       uint8_t *dest_cb, uint8_t *dest_cr,
-                                       qpel_mc_func(*qpix_put)[16],
-                                       h264_chroma_mc_func(*chroma_put),
-                                       qpel_mc_func(*qpix_avg)[16],
-                                       h264_chroma_mc_func(*chroma_avg),
-                                       h264_weight_func *weight_op,
-                                       h264_biweight_func *weight_avg,
-                                       int pixel_shift, int chroma_idc)
-{
-    MpegEncContext *const s = &h->s;
-    const int mb_xy   = h->mb_xy;
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
-
-    assert(IS_INTER(mb_type));
-
-    if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
-        await_references(h);
-    prefetch_motion(h, 0, pixel_shift, chroma_idc);
-
-    if (IS_16X16(mb_type)) {
-        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
-                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
-                weight_op, weight_avg,
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma_idc);
-    } else if (IS_16X8(mb_type)) {
-        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
-                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                weight_op, weight_avg,
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma_idc);
-        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
-                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                weight_op, weight_avg,
-                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma_idc);
-    } else if (IS_8X16(mb_type)) {
-        mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
-                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[1], &weight_avg[1],
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma_idc);
-        mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
-                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[1], &weight_avg[1],
-                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma_idc);
-    } else {
-        int i;
-
-        assert(IS_8X8(mb_type));
-
-        for (i = 0; i < 4; i++) {
-            const int sub_mb_type = h->sub_mb_type[i];
-            const int n  = 4 * i;
-            int x_offset = (i & 1) << 2;
-            int y_offset = (i & 2) << 1;
-
-            if (IS_SUB_8X8(sub_mb_type)) {
-                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr,
-                        x_offset, y_offset,
-                        qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                        &weight_op[1], &weight_avg[1],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-            } else if (IS_SUB_8X4(sub_mb_type)) {
-                mc_part(h, n, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr,
-                        x_offset, y_offset,
-                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                        &weight_op[1], &weight_avg[1],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-                mc_part(h, n + 2, 0, 4, 4 << pixel_shift,
-                        dest_y, dest_cb, dest_cr, x_offset, y_offset + 2,
-                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                        &weight_op[1], &weight_avg[1],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-            } else if (IS_SUB_4X8(sub_mb_type)) {
-                mc_part(h, n, 0, 8, 4 * h->mb_linesize,
-                        dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[2], &weight_avg[2],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-                mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize,
-                        dest_y, dest_cb, dest_cr, x_offset + 2, y_offset,
-                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[2], &weight_avg[2],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-            } else {
-                int j;
-                assert(IS_SUB_4X4(sub_mb_type));
-                for (j = 0; j < 4; j++) {
-                    int sub_x_offset = x_offset + 2 * (j & 1);
-                    int sub_y_offset = y_offset + (j & 2);
-                    mc_part(h, n + j, 1, 4, 0,
-                            dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
-                            qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                            &weight_op[2], &weight_avg[2],
-                            IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                            pixel_shift, chroma_idc);
-                }
-            }
+            s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
         }
     }
-
-    prefetch_motion(h, 1, pixel_shift, chroma_idc);
-}
-
-static av_always_inline void hl_motion_420(H264Context *h, uint8_t *dest_y,
-                                           uint8_t *dest_cb, uint8_t *dest_cr,
-                                           qpel_mc_func(*qpix_put)[16],
-                                           h264_chroma_mc_func(*chroma_put),
-                                           qpel_mc_func(*qpix_avg)[16],
-                                           h264_chroma_mc_func(*chroma_avg),
-                                           h264_weight_func *weight_op,
-                                           h264_biweight_func *weight_avg,
-                                           int pixel_shift)
-{
-    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
-              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
-}
-
-static av_always_inline void hl_motion_422(H264Context *h, uint8_t *dest_y,
-                                           uint8_t *dest_cb, uint8_t *dest_cr,
-                                           qpel_mc_func(*qpix_put)[16],
-                                           h264_chroma_mc_func(*chroma_put),
-                                           qpel_mc_func(*qpix_avg)[16],
-                                           h264_chroma_mc_func(*chroma_avg),
-                                           h264_weight_func *weight_op,
-                                           h264_biweight_func *weight_avg,
-                                           int pixel_shift)
-{
-    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
-              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
 }
 
 static void free_tables(H264Context *h, int free_rbsp)
@@ -933,7 +785,7 @@ static void free_tables(H264Context *h, int free_rbsp)
             continue;
         av_freep(&hx->top_borders[1]);
         av_freep(&hx->top_borders[0]);
-        av_freep(&hx->s.obmc_scratchpad);
+        av_freep(&hx->bipred_scratchpad);
         if (free_rbsp) {
             av_freep(&hx->rbsp_buffer[1]);
             av_freep(&hx->rbsp_buffer[0]);
@@ -1058,8 +910,6 @@ int ff_h264_alloc_tables(H264Context *h)
             h->mb2br_xy[mb_xy] = 8 * (FMO ? mb_xy : (mb_xy % (2 * s->mb_stride)));
         }
 
-    s->obmc_scratchpad = NULL;
-
     if (!h->dequant4_coeff[0])
         init_dequant_tables(h);
 
@@ -1087,7 +937,7 @@ static void clone_tables(H264Context *dst, H264Context *src, int i)
     dst->mvd_table[1]           = src->mvd_table[1] + i * 8 * 2 * s->mb_stride;
     dst->direct_table           = src->direct_table;
     dst->list_counts            = src->list_counts;
-    dst->s.obmc_scratchpad      = NULL;
+    dst->bipred_scratchpad      = NULL;
     ff_h264_pred_init(&dst->hpc, src->s.codec_id, src->sps.bit_depth_luma,
                       src->sps.chroma_format_idc);
 }
@@ -1116,7 +966,8 @@ fail:
     return -1; // free_tables will clean up for us
 }
 
-static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size);
+static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
+                            int parse_extradata);
 
 static av_cold void common_init(H264Context *h)
 {
@@ -1127,6 +978,8 @@ static av_cold void common_init(H264Context *h)
     s->codec_id = s->avctx->codec->id;
 
     ff_h264dsp_init(&h->h264dsp, 8, 1);
+    ff_h264chroma_init(&h->h264chroma, h->sps.bit_depth_chroma);
+    ff_h264qpel_init(&h->h264qpel, 8);
     ff_h264_pred_init(&h->hpc, s->codec_id, 8, 1);
 
     h->dequant_coeff_pps = -1;
@@ -1134,6 +987,7 @@ static av_cold void common_init(H264Context *h)
 
     /* needed so that IDCT permutation is known early */
     ff_dsputil_init(&s->dsp, s->avctx);
+    ff_videodsp_init(&s->vdsp, 8);
 
     memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t));
     memset(h->pps.scaling_matrix8, 16, 2 * 64 * sizeof(uint8_t));
@@ -1163,7 +1017,7 @@ int ff_h264_decode_extradata(H264Context *h)
             nalsize = AV_RB16(p) + 2;
             if (p - avctx->extradata + nalsize > avctx->extradata_size)
                 return -1;
-            if (decode_nal_units(h, p, nalsize) < 0) {
+            if (decode_nal_units(h, p, nalsize, 1) < 0) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Decoding sps %d from avcC failed\n", i);
                 return -1;
@@ -1176,7 +1030,7 @@ int ff_h264_decode_extradata(H264Context *h)
             nalsize = AV_RB16(p) + 2;
             if (p - avctx->extradata + nalsize > avctx->extradata_size)
                 return -1;
-            if (decode_nal_units(h, p, nalsize) < 0) {
+            if (decode_nal_units(h, p, nalsize, 1) < 0) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Decoding pps %d from avcC failed\n", i);
                 return -1;
@@ -1187,7 +1041,7 @@ int ff_h264_decode_extradata(H264Context *h)
         h->nal_length_size = (avctx->extradata[4] & 0x03) + 1;
     } else {
         h->is_avc = 0;
-        if (decode_nal_units(h, avctx->extradata, avctx->extradata_size) < 0)
+        if (decode_nal_units(h, avctx->extradata, avctx->extradata_size, 1) < 0)
             return -1;
     }
     return 0;
@@ -1227,7 +1081,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
     h->prev_poc_msb = 1 << 16;
     h->x264_build   = -1;
     ff_h264_reset_sei(h);
-    if (avctx->codec_id == CODEC_ID_H264) {
+    if (avctx->codec_id == AV_CODEC_ID_H264) {
         if (avctx->ticks_per_frame == 1)
             s->avctx->time_base.den *= 2;
         avctx->ticks_per_frame = 2;
@@ -1287,6 +1141,8 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
     memset(h->sps_buffers, 0, sizeof(h->sps_buffers));
     memset(h->pps_buffers, 0, sizeof(h->pps_buffers));
 
+    h->s.context_initialized = 0;
+
     return 0;
 }
 
@@ -1294,6 +1150,10 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
     memcpy(&to->start_field, &from->start_field,                        \
            (char *)&to->end_field - (char *)&to->start_field)
 
+static int h264_slice_header_init(H264Context *, int);
+
+static int h264_set_parameter_from_sps(H264Context *h);
+
 static int decode_update_thread_context(AVCodecContext *dst,
                                         const AVCodecContext *src)
 {
@@ -1305,11 +1165,42 @@ static int decode_update_thread_context(AVCodecContext *dst,
     if (dst == src || !s1->context_initialized)
         return 0;
 
+    if (inited &&
+        (s->width      != s1->width      ||
+         s->height     != s1->height     ||
+         s->mb_width   != s1->mb_width   ||
+         s->mb_height  != s1->mb_height  ||
+         h->sps.bit_depth_luma    != h1->sps.bit_depth_luma    ||
+         h->sps.chroma_format_idc != h1->sps.chroma_format_idc ||
+         h->sps.colorspace        != h1->sps.colorspace)) {
+
+        av_freep(&h->bipred_scratchpad);
+
+        s->width     = s1->width;
+        s->height    = s1->height;
+        s->mb_height = s1->mb_height;
+        h->b_stride  = h1->b_stride;
+
+        if ((err = h264_slice_header_init(h, 1)) < 0) {
+            av_log(h->s.avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
+            return err;
+        }
+        h->context_reinitialized = 1;
+
+        /* update linesize on resize for h264. The h264 decoder doesn't
+         * necessarily call ff_MPV_frame_start in the new thread */
+        s->linesize   = s1->linesize;
+        s->uvlinesize = s1->uvlinesize;
+
+        /* copy block_offset since frame_start may not be called */
+        memcpy(h->block_offset, h1->block_offset, sizeof(h->block_offset));
+        h264_set_parameter_from_sps(h);
+    }
+
     err = ff_mpeg_update_thread_context(dst, src);
     if (err)
         return err;
 
-    // FIXME handle width/height changing
     if (!inited) {
         for (i = 0; i < MAX_SPS_COUNT; i++)
             av_freep(h->sps_buffers + i);
@@ -1332,17 +1223,19 @@ static int decode_update_thread_context(AVCodecContext *dst,
             h->rbsp_buffer[i]      = NULL;
             h->rbsp_buffer_size[i] = 0;
         }
+        h->bipred_scratchpad = NULL;
 
         h->thread_context[0] = h;
 
-        /* frame_start may not be called for the next thread (if it's decoding
-         * a bottom field) so this has to be allocated here */
-        h->s.obmc_scratchpad = av_malloc(16 * 6 * s->linesize);
-
         s->dsp.clear_blocks(h->mb);
         s->dsp.clear_blocks(h->mb + (24 * 16 << h->pixel_shift));
     }
 
+    /* frame_start may not be called for the next thread (if it's decoding
+     * a bottom field) so this has to be allocated here */
+    if (!h->bipred_scratchpad)
+        h->bipred_scratchpad = av_malloc(16 * 6 * s->linesize);
+
     // extradata/NAL handling
     h->is_avc = h1->is_avc;
 
@@ -1373,7 +1266,7 @@ static int decode_update_thread_context(AVCodecContext *dst,
 
     // reference lists
     copy_fields(h, h1, ref_count, list_count);
-    copy_fields(h, h1, ref_list, intra_gb);
+    copy_fields(h, h1, ref2frm, intra_gb);
     copy_fields(h, h1, short_ref, cabac_init_idc);
 
     copy_picture_range(h->short_ref, h1->short_ref, 32, s, s1);
@@ -1386,7 +1279,7 @@ static int decode_update_thread_context(AVCodecContext *dst,
     if (!s->current_picture_ptr)
         return 0;
 
-    if (!s->dropable) {
+    if (!s->droppable) {
         err = ff_h264_execute_ref_pic_marking(h, h->mmco, h->mmco_index);
         h->prev_poc_msb = h->poc_msb;
         h->prev_poc_lsb = h->poc_lsb;
@@ -1432,8 +1325,8 @@ int ff_h264_frame_start(H264Context *h)
     /* can't be in alloc_tables because linesize isn't known there.
      * FIXME: redo bipred weight to not require extra buffer? */
     for (i = 0; i < s->slice_context_count; i++)
-        if (h->thread_context[i] && !h->thread_context[i]->s.obmc_scratchpad)
-            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16 * 6 * s->linesize);
+        if (h->thread_context[i] && !h->thread_context[i]->bipred_scratchpad)
+            h->thread_context[i]->bipred_scratchpad = av_malloc(16 * 6 * s->linesize);
 
     /* Some macroblocks can be accessed before they're available in case
      * of lost slices, MBAFF or threading. */
@@ -1449,7 +1342,7 @@ int ff_h264_frame_start(H264Context *h)
      * SVQ3 as well as most other codecs have only last/next/current and thus
      * get released even with set reference, besides SVQ3 and others do not
      * mark frames as reference later "naturally". */
-    if (s->codec_id != CODEC_ID_SVQ3)
+    if (s->codec_id != AV_CODEC_ID_SVQ3)
         s->current_picture_ptr->f.reference = 0;
 
     s->current_picture_ptr->field_poc[0]     =
@@ -1525,7 +1418,6 @@ static void decode_postinit(H264Context *h, int setup_finished)
             cur->f.repeat_pict = 1;
             break;
         case SEI_PIC_STRUCT_FRAME_DOUBLING:
-            // Force progressive here, doubling interlaced frame is a bad idea.
             cur->f.repeat_pict = 2;
             break;
         case SEI_PIC_STRUCT_FRAME_TRIPLING:
@@ -1870,7 +1762,7 @@ static av_always_inline void xchg_mb_border(H264Context *h, uint8_t *src_y,
     }
 }
 
-static av_always_inline int dctcoef_get(DCTELEM *mb, int high_bit_depth,
+static av_always_inline int dctcoef_get(int16_t *mb, int high_bit_depth,
                                         int index)
 {
     if (high_bit_depth) {
@@ -1879,7 +1771,7 @@ static av_always_inline int dctcoef_get(DCTELEM *mb, int high_bit_depth,
         return AV_RN16A(mb + index);
 }
 
-static av_always_inline void dctcoef_set(DCTELEM *mb, int high_bit_depth,
+static av_always_inline void dctcoef_set(int16_t *mb, int high_bit_depth,
                                          int index, int value)
 {
     if (high_bit_depth) {
@@ -1898,8 +1790,8 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h,
                                                        uint8_t *dest_y, int p)
 {
     MpegEncContext *const s = &h->s;
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, int16_t *block, int stride);
     int i;
     int qscale = p == 0 ? s->qscale : h->chroma_qp[p - 1];
     block_offset += 16 * p;
@@ -2015,7 +1907,7 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
                                                     uint8_t *dest_y, int p)
 {
     MpegEncContext *const s = &h->s;
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
     int i;
     block_offset += 16 * p;
     if (!IS_INTRA4x4(mb_type)) {
@@ -2077,373 +1969,17 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
     }
 }
 
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple,
-                                                   int pixel_shift)
-{
-    MpegEncContext *const s = &h->s;
-    const int mb_x    = s->mb_x;
-    const int mb_y    = s->mb_y;
-    const int mb_xy   = h->mb_xy;
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
-    uint8_t *dest_y, *dest_cb, *dest_cr;
-    int linesize, uvlinesize /*dct_offset*/;
-    int i, j;
-    int *block_offset = &h->block_offset[0];
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
-    /* is_h264 should always be true if SVQ3 is disabled. */
-    const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    const int block_h   = 16 >> s->chroma_y_shift;
-    const int chroma422 = CHROMA422;
-
-    dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift)     + mb_y * s->linesize)  * 16;
-    dest_cb = s->current_picture.f.data[1] +  (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h;
-    dest_cr = s->current_picture.f.data[2] +  (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h;
-
-    s->dsp.prefetch(dest_y  + (s->mb_x & 3) * 4 * s->linesize   + (64 << pixel_shift), s->linesize,       4);
-    s->dsp.prefetch(dest_cb + (s->mb_x & 7)     * s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2);
-
-    h->list_counts[mb_xy] = h->list_count;
-
-    if (!simple && MB_FIELD) {
-        linesize     = h->mb_linesize = s->linesize * 2;
-        uvlinesize   = h->mb_uvlinesize = s->uvlinesize * 2;
-        block_offset = &h->block_offset[48];
-        if (mb_y & 1) { // FIXME move out of this function?
-            dest_y  -= s->linesize * 15;
-            dest_cb -= s->uvlinesize * (block_h - 1);
-            dest_cr -= s->uvlinesize * (block_h - 1);
-        }
-        if (FRAME_MBAFF) {
-            int list;
-            for (list = 0; list < h->list_count; list++) {
-                if (!USES_LIST(mb_type, list))
-                    continue;
-                if (IS_16X16(mb_type)) {
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
-                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
-                } else {
-                    for (i = 0; i < 16; i += 4) {
-                        int ref = h->ref_cache[list][scan8[i]];
-                        if (ref >= 0)
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
-                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
-                    }
-                }
-            }
-        }
-    } else {
-        linesize   = h->mb_linesize   = s->linesize;
-        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
-        // dct_offset = s->linesize * 16;
-    }
-
-    if (!simple && IS_INTRA_PCM(mb_type)) {
-        if (pixel_shift) {
-            const int bit_depth = h->sps.bit_depth_luma;
-            int j;
-            GetBitContext gb;
-            init_get_bits(&gb, (uint8_t *)h->mb,
-                          ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
-
-            for (i = 0; i < 16; i++) {
-                uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize);
-                for (j = 0; j < 16; j++)
-                    tmp_y[j] = get_bits(&gb, bit_depth);
-            }
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
-                if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = 1 << (bit_depth - 1);
-                    }
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = 1 << (bit_depth - 1);
-                    }
-                } else {
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = get_bits(&gb, bit_depth);
-                    }
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = get_bits(&gb, bit_depth);
-                    }
-                }
-            }
-        } else {
-            for (i = 0; i < 16; i++)
-                memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
-                if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        memset(dest_cb + i * uvlinesize, 128, 8);
-                        memset(dest_cr + i * uvlinesize, 128, 8);
-                    }
-                } else {
-                    uint8_t *src_cb = (uint8_t *)h->mb + 256;
-                    uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8;
-                    for (i = 0; i < block_h; i++) {
-                        memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
-                        memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
-                    }
-                }
-            }
-        }
-    } else {
-        if (IS_INTRA(mb_type)) {
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
-                               uvlinesize, 1, 0, simple, pixel_shift);
-
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
-                h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize);
-                h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize);
-            }
+#define BITS   8
+#define SIMPLE 1
+#include "h264_mb_template.c"
 
-            hl_decode_mb_predict_luma(h, mb_type, is_h264, simple,
-                                      transform_bypass, pixel_shift,
-                                      block_offset, linesize, dest_y, 0);
-
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
-                               uvlinesize, 0, 0, simple, pixel_shift);
-        } else if (is_h264) {
-            if (chroma422) {
-                hl_motion_422(h, dest_y, dest_cb, dest_cr,
-                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                              h->h264dsp.weight_h264_pixels_tab,
-                              h->h264dsp.biweight_h264_pixels_tab,
-                              pixel_shift);
-            } else {
-                hl_motion_420(h, dest_y, dest_cb, dest_cr,
-                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                              h->h264dsp.weight_h264_pixels_tab,
-                              h->h264dsp.biweight_h264_pixels_tab,
-                              pixel_shift);
-            }
-        }
+#undef  BITS
+#define BITS   16
+#include "h264_mb_template.c"
 
-        hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass,
-                               pixel_shift, block_offset, linesize, dest_y, 0);
-
-        if ((simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) &&
-            (h->cbp & 0x30)) {
-            uint8_t *dest[2] = { dest_cb, dest_cr };
-            if (transform_bypass) {
-                if (IS_INTRA(mb_type) && h->sps.profile_idc == 244 &&
-                    (h->chroma_pred_mode == VERT_PRED8x8 ||
-                     h->chroma_pred_mode == HOR_PRED8x8)) {
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0],
-                                                            block_offset + 16,
-                                                            h->mb + (16 * 16 * 1 << pixel_shift),
-                                                            uvlinesize);
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1],
-                                                            block_offset + 32,
-                                                            h->mb + (16 * 16 * 2 << pixel_shift),
-                                                            uvlinesize);
-                } else {
-                    idct_add = s->dsp.add_pixels4;
-                    for (j = 1; j < 3; j++) {
-                        for (i = j * 16; i < j * 16 + 4; i++)
-                            if (h->non_zero_count_cache[scan8[i]] ||
-                                dctcoef_get(h->mb, pixel_shift, i * 16))
-                                idct_add(dest[j - 1] + block_offset[i],
-                                         h->mb + (i * 16 << pixel_shift),
-                                         uvlinesize);
-                        if (chroma422) {
-                            for (i = j * 16 + 4; i < j * 16 + 8; i++)
-                                if (h->non_zero_count_cache[scan8[i + 4]] ||
-                                    dctcoef_get(h->mb, pixel_shift, i * 16))
-                                    idct_add(dest[j - 1] + block_offset[i + 4],
-                                             h->mb + (i * 16 << pixel_shift),
-                                             uvlinesize);
-                        }
-                    }
-                }
-            } else {
-                if (is_h264) {
-                    int qp[2];
-                    if (chroma422) {
-                        qp[0] = h->chroma_qp[0] + 3;
-                        qp[1] = h->chroma_qp[1] + 3;
-                    } else {
-                        qp[0] = h->chroma_qp[0];
-                        qp[1] = h->chroma_qp[1];
-                    }
-                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << pixel_shift),
-                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]);
-                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << pixel_shift),
-                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]);
-                    h->h264dsp.h264_idct_add8(dest, block_offset,
-                                              h->mb, uvlinesize,
-                                              h->non_zero_count_cache);
-                } else if (CONFIG_SVQ3_DECODER) {
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1,
-                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]);
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2,
-                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][h->chroma_qp[1]][0]);
-                    for (j = 1; j < 3; j++) {
-                        for (i = j * 16; i < j * 16 + 4; i++)
-                            if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) {
-                                uint8_t *const ptr = dest[j - 1] + block_offset[i];
-                                ff_svq3_add_idct_c(ptr, h->mb + i * 16,
-                                                   uvlinesize,
-                                                   ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
-                            }
-                    }
-                }
-            }
-        }
-    }
-    if (h->cbp || IS_INTRA(mb_type)) {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift));
-    }
-}
-
-static av_always_inline void hl_decode_mb_444_internal(H264Context *h,
-                                                       int simple,
-                                                       int pixel_shift)
-{
-    MpegEncContext *const s = &h->s;
-    const int mb_x    = s->mb_x;
-    const int mb_y    = s->mb_y;
-    const int mb_xy   = h->mb_xy;
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
-    uint8_t *dest[3];
-    int linesize;
-    int i, j, p;
-    int *block_offset = &h->block_offset[0];
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
-    const int plane_count      = (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1;
-
-    for (p = 0; p < plane_count; p++) {
-        dest[p] = s->current_picture.f.data[p] +
-                  ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
-        s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << pixel_shift),
-                        s->linesize, 4);
-    }
-
-    h->list_counts[mb_xy] = h->list_count;
-
-    if (!simple && MB_FIELD) {
-        linesize     = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
-        block_offset = &h->block_offset[48];
-        if (mb_y & 1) // FIXME move out of this function?
-            for (p = 0; p < 3; p++)
-                dest[p] -= s->linesize * 15;
-        if (FRAME_MBAFF) {
-            int list;
-            for (list = 0; list < h->list_count; list++) {
-                if (!USES_LIST(mb_type, list))
-                    continue;
-                if (IS_16X16(mb_type)) {
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
-                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
-                } else {
-                    for (i = 0; i < 16; i += 4) {
-                        int ref = h->ref_cache[list][scan8[i]];
-                        if (ref >= 0)
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
-                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
-                    }
-                }
-            }
-        }
-    } else {
-        linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize;
-    }
-
-    if (!simple && IS_INTRA_PCM(mb_type)) {
-        if (pixel_shift) {
-            const int bit_depth = h->sps.bit_depth_luma;
-            GetBitContext gb;
-            init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth);
-
-            for (p = 0; p < plane_count; p++)
-                for (i = 0; i < 16; i++) {
-                    uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize);
-                    for (j = 0; j < 16; j++)
-                        tmp[j] = get_bits(&gb, bit_depth);
-                }
-        } else {
-            for (p = 0; p < plane_count; p++)
-                for (i = 0; i < 16; i++)
-                    memcpy(dest[p] + i * linesize,
-                           (uint8_t *)h->mb + p * 256 + i * 16, 16);
-        }
-    } else {
-        if (IS_INTRA(mb_type)) {
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
-                               linesize, 1, 1, simple, pixel_shift);
-
-            for (p = 0; p < plane_count; p++)
-                hl_decode_mb_predict_luma(h, mb_type, 1, simple,
-                                          transform_bypass, pixel_shift,
-                                          block_offset, linesize, dest[p], p);
-
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
-                               linesize, 0, 1, simple, pixel_shift);
-        } else {
-            hl_motion(h, dest[0], dest[1], dest[2],
-                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                      h->h264dsp.weight_h264_pixels_tab,
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
-        }
-
-        for (p = 0; p < plane_count; p++)
-            hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass,
-                                   pixel_shift, block_offset, linesize,
-                                   dest[p], p);
-    }
-    if (h->cbp || IS_INTRA(mb_type)) {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift));
-    }
-}
-
-/**
- * Process a macroblock; this case avoids checks for expensive uncommon cases.
- */
-#define hl_decode_mb_simple(sh, bits)                          \
-static void hl_decode_mb_simple_ ## bits(H264Context *h)       \
-{                                                              \
-    hl_decode_mb_internal(h, 1, sh);                           \
-}
-
-hl_decode_mb_simple(0, 8)
-hl_decode_mb_simple(1, 16)
-
-/**
- * Process a macroblock; this handles edge cases, such as interlacing.
- */
-static av_noinline void hl_decode_mb_complex(H264Context *h)
-{
-    hl_decode_mb_internal(h, 0, h->pixel_shift);
-}
-
-static av_noinline void hl_decode_mb_444_complex(H264Context *h)
-{
-    hl_decode_mb_444_internal(h, 0, h->pixel_shift);
-}
-
-static av_noinline void hl_decode_mb_444_simple(H264Context *h)
-{
-    hl_decode_mb_444_internal(h, 1, 0);
-}
+#undef  SIMPLE
+#define SIMPLE 0
+#include "h264_mb_template.c"
 
 void ff_h264_hl_decode_mb(H264Context *h)
 {
@@ -2456,7 +1992,7 @@ void ff_h264_hl_decode_mb(H264Context *h)
         if (is_complex || h->pixel_shift)
             hl_decode_mb_444_complex(h);
         else
-            hl_decode_mb_444_simple(h);
+            hl_decode_mb_444_simple_8(h);
     } else if (is_complex) {
         hl_decode_mb_complex(h);
     } else if (h->pixel_shift) {
@@ -2608,15 +2144,9 @@ static void idr(H264Context *h)
 }
 
 /* forget old pics after a seek */
-static void flush_dpb(AVCodecContext *avctx)
+static void flush_change(H264Context *h)
 {
-    H264Context *h = avctx->priv_data;
     int i;
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++) {
-        if (h->delayed_pic[i])
-            h->delayed_pic[i]->f.reference = 0;
-        h->delayed_pic[i] = NULL;
-    }
     for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
         h->last_pocs[i] = INT_MIN;
     h->outputed_poc = h->next_outputed_poc = INT_MIN;
@@ -2625,7 +2155,26 @@ static void flush_dpb(AVCodecContext *avctx)
     if (h->s.current_picture_ptr)
         h->s.current_picture_ptr->f.reference = 0;
     h->s.first_field = 0;
+    memset(h->ref_list[0], 0, sizeof(h->ref_list[0]));
+    memset(h->ref_list[1], 0, sizeof(h->ref_list[1]));
+    memset(h->default_ref_list[0], 0, sizeof(h->default_ref_list[0]));
+    memset(h->default_ref_list[1], 0, sizeof(h->default_ref_list[1]));
     ff_h264_reset_sei(h);
+}
+
+/* forget old pics after a seek */
+static void flush_dpb(AVCodecContext *avctx)
+{
+    H264Context *h = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++) {
+        if (h->delayed_pic[i])
+            h->delayed_pic[i]->f.reference = 0;
+        h->delayed_pic[i] = NULL;
+    }
+
+    flush_change(h);
     ff_mpeg_flush(avctx);
 }
 
@@ -2649,7 +2198,6 @@ static int init_poc(H264Context *h)
             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
         else
             h->poc_msb = h->prev_poc_msb;
-        // printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
         field_poc[0] =
         field_poc[1] = h->poc_msb + h->poc_lsb;
         if (s->picture_structure == PICT_FRAME)
@@ -2752,7 +2300,7 @@ static int field_end(H264Context *h, int in_setup)
     int err = 0;
     s->mb_y = 0;
 
-    if (!in_setup && !s->dropable)
+    if (!in_setup && !s->droppable)
         ff_thread_report_progress(&s->current_picture_ptr->f, INT_MAX,
                                   s->picture_structure == PICT_BOTTOM_FIELD);
 
@@ -2761,7 +2309,7 @@ static int field_end(H264Context *h, int in_setup)
         ff_vdpau_h264_set_reference_frames(s);
 
     if (in_setup || !(avctx->active_thread_type & FF_THREAD_FRAME)) {
-        if (!s->dropable) {
+        if (!s->droppable) {
             err = ff_h264_execute_ref_pic_marking(h, h->mmco, h->mmco_index);
             h->prev_poc_msb = h->poc_msb;
             h->prev_poc_lsb = h->poc_lsb;
@@ -2806,8 +2354,10 @@ static int field_end(H264Context *h, int in_setup)
 /**
  * Replicate H264 "master" context to thread contexts.
  */
-static void clone_slice(H264Context *dst, H264Context *src)
+static int clone_slice(H264Context *dst, H264Context *src)
 {
+    int ret;
+
     memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
     dst->s.current_picture_ptr = src->s.current_picture_ptr;
     dst->s.current_picture     = src->s.current_picture;
@@ -2815,6 +2365,13 @@ static void clone_slice(H264Context *dst, H264Context *src)
     dst->s.uvlinesize          = src->s.uvlinesize;
     dst->s.first_field         = src->s.first_field;
 
+    if (!dst->s.edge_emu_buffer &&
+        (ret = ff_mpv_frame_size_alloc(&dst->s, dst->s.linesize))) {
+        av_log(dst->s.avctx, AV_LOG_ERROR,
+               "Failed to allocate scratch buffers\n");
+        return ret;
+    }
+
     dst->prev_poc_msb          = src->prev_poc_msb;
     dst->prev_poc_lsb          = src->prev_poc_lsb;
     dst->prev_frame_num_offset = src->prev_frame_num_offset;
@@ -2824,10 +2381,11 @@ static void clone_slice(H264Context *dst, H264Context *src)
     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
-    memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
 
     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
+
+    return 0;
 }
 
 /**
@@ -2857,6 +2415,178 @@ int ff_h264_get_profile(SPS *sps)
     return profile;
 }
 
+static int h264_set_parameter_from_sps(H264Context *h)
+{
+    MpegEncContext *s = &h->s;
+
+    if (s->flags & CODEC_FLAG_LOW_DELAY ||
+        (h->sps.bitstream_restriction_flag &&
+         !h->sps.num_reorder_frames)) {
+        if (s->avctx->has_b_frames > 1 || h->delayed_pic[0])
+            av_log(h->s.avctx, AV_LOG_WARNING, "Delayed frames seen. "
+                   "Reenabling low delay requires a codec flush.\n");
+        else
+            s->low_delay = 1;
+    }
+
+    if (s->avctx->has_b_frames < 2)
+        s->avctx->has_b_frames = !s->low_delay;
+
+    if (s->avctx->bits_per_raw_sample != h->sps.bit_depth_luma ||
+        h->cur_chroma_format_idc      != h->sps.chroma_format_idc) {
+        if (s->avctx->codec &&
+            s->avctx->codec->capabilities & CODEC_CAP_HWACCEL_VDPAU &&
+            (h->sps.bit_depth_luma != 8 || h->sps.chroma_format_idc > 1)) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "VDPAU decoding does not support video colorspace.\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (h->sps.bit_depth_luma >= 8 && h->sps.bit_depth_luma <= 10) {
+            s->avctx->bits_per_raw_sample = h->sps.bit_depth_luma;
+            h->cur_chroma_format_idc      = h->sps.chroma_format_idc;
+            h->pixel_shift                = h->sps.bit_depth_luma > 8;
+
+            ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma,
+                            h->sps.chroma_format_idc);
+            ff_h264chroma_init(&h->h264chroma, h->sps.bit_depth_chroma);
+            ff_h264qpel_init(&h->h264qpel, h->sps.bit_depth_luma);
+            ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma,
+                              h->sps.chroma_format_idc);
+            s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 32 : 16;
+            ff_dsputil_init(&s->dsp, s->avctx);
+            ff_videodsp_init(&s->vdsp, h->sps.bit_depth_luma);
+        } else {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n",
+                   h->sps.bit_depth_luma);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    return 0;
+}
+
+static enum PixelFormat get_pixel_format(H264Context *h)
+{
+    MpegEncContext *const s  = &h->s;
+    switch (h->sps.bit_depth_luma) {
+    case 9:
+        if (CHROMA444) {
+            if (s->avctx->colorspace == AVCOL_SPC_RGB) {
+                return AV_PIX_FMT_GBRP9;
+            } else
+                return AV_PIX_FMT_YUV444P9;
+        } else if (CHROMA422)
+            return AV_PIX_FMT_YUV422P9;
+        else
+            return AV_PIX_FMT_YUV420P9;
+        break;
+    case 10:
+        if (CHROMA444) {
+            if (s->avctx->colorspace == AVCOL_SPC_RGB) {
+                return AV_PIX_FMT_GBRP10;
+            } else
+                return AV_PIX_FMT_YUV444P10;
+        } else if (CHROMA422)
+            return AV_PIX_FMT_YUV422P10;
+        else
+            return AV_PIX_FMT_YUV420P10;
+        break;
+    case 8:
+        if (CHROMA444) {
+            if (s->avctx->colorspace == AVCOL_SPC_RGB) {
+                return AV_PIX_FMT_GBRP;
+            } else
+                return s->avctx->color_range == AVCOL_RANGE_JPEG ? AV_PIX_FMT_YUVJ444P
+                                                                 : AV_PIX_FMT_YUV444P;
+        } else if (CHROMA422) {
+            return s->avctx->color_range == AVCOL_RANGE_JPEG ? AV_PIX_FMT_YUVJ422P
+                                                             : AV_PIX_FMT_YUV422P;
+        } else {
+            return s->avctx->get_format(s->avctx, s->avctx->codec->pix_fmts ?
+                                        s->avctx->codec->pix_fmts :
+                                        s->avctx->color_range == AVCOL_RANGE_JPEG ?
+                                        hwaccel_pixfmt_list_h264_jpeg_420 :
+                                        ff_hwaccel_pixfmt_list_420);
+        }
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Unsupported bit depth: %d\n", h->sps.bit_depth_luma);
+        return AVERROR_INVALIDDATA;
+    }
+}
+
+static int h264_slice_header_init(H264Context *h, int reinit)
+{
+    MpegEncContext *const s  = &h->s;
+    int i, ret;
+
+    avcodec_set_dimensions(s->avctx, s->width, s->height);
+    s->avctx->sample_aspect_ratio = h->sps.sar;
+    av_assert0(s->avctx->sample_aspect_ratio.den);
+
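+    /* Derive the stream time base from the VUI timing info; the time scale is
+     * doubled to work around non-compliant output from x264 builds before 44. */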
+    if (h->sps.timing_info_present_flag) {
+        int64_t den = h->sps.time_scale;
+        if (h->x264_build < 44U)
+            den *= 2;
+        av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
+                  h->sps.num_units_in_tick, den, 1 << 30);
+    }
+
+    s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
+
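+    /* On a reinit only the frame-size dependent buffers are rebuilt;
+     * the first call performs a full ff_MPV_common_init(). */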
+    if (reinit) {
+        free_tables(h, 0);
+        if ((ret = ff_MPV_common_frame_size_change(s)) < 0) {
+            av_log(h->s.avctx, AV_LOG_ERROR, "ff_MPV_common_frame_size_change() failed.\n");
+            return ret;
+        }
+    } else {
+        if ((ret = ff_MPV_common_init(s)) < 0) {
+            av_log(h->s.avctx, AV_LOG_ERROR, "ff_MPV_common_init() failed.\n");
+            return ret;
+        }
+    }
+    s->first_field = 0;
+    h->prev_interlaced_frame = 1;
+
+    init_scan_tables(h);
+    if (ff_h264_alloc_tables(h) < 0) {
+        av_log(h->s.avctx, AV_LOG_ERROR,
+               "Could not allocate memory for h264\n");
+        return AVERROR(ENOMEM);
+    }
+
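+    /* With slice threading, every worker thread gets its own H264Context copy;
+     * the large lookup tables of the main context are shared via clone_tables(). */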
+    if (!HAVE_THREADS || !(s->avctx->active_thread_type & FF_THREAD_SLICE)) {
+        if (context_init(h) < 0) {
+            av_log(h->s.avctx, AV_LOG_ERROR, "context_init() failed.\n");
+            return -1;
+        }
+    } else {
+        for (i = 1; i < s->slice_context_count; i++) {
+            H264Context *c;
+            c = h->thread_context[i] = av_malloc(sizeof(H264Context));
+            if (!c)
+                return AVERROR(ENOMEM);
+            memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
+            memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
+            c->h264dsp     = h->h264dsp;
+            c->h264qpel    = h->h264qpel;
+            c->h264chroma  = h->h264chroma;
+            c->sps         = h->sps;
+            c->pps         = h->pps;
+            c->pixel_shift = h->pixel_shift;
+            init_scan_tables(c);
+            clone_tables(c, h, i);
+        }
+
+        for (i = 0; i < s->slice_context_count; i++)
+            if (context_init(h->thread_context[i]) < 0) {
+                av_log(h->s.avctx, AV_LOG_ERROR, "context_init() failed.\n");
+                return -1;
+            }
+    }
+
+    return 0;
+}
+
 /**
  * Decode a slice header.
  * This will also call ff_MPV_common_init() and frame_start() as needed.
@@ -2873,20 +2603,14 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
     MpegEncContext *const s0 = &h0->s;
     unsigned int first_mb_in_slice;
     unsigned int pps_id;
-    int num_ref_idx_active_override_flag;
+    int num_ref_idx_active_override_flag, max_refs, ret;
     unsigned int slice_type, tmp, i, j;
     int default_ref_list_done = 0;
-    int last_pic_structure, last_pic_dropable;
+    int last_pic_structure, last_pic_droppable;
+    int needs_reinit = 0;
 
-    /* FIXME: 2tap qpel isn't implemented for high bit depth. */
-    if ((s->avctx->flags2 & CODEC_FLAG2_FAST) &&
-        !h->nal_ref_idc && !h->pixel_shift) {
-        s->me.qpel_put = s->dsp.put_2tap_qpel_pixels_tab;
-        s->me.qpel_avg = s->dsp.avg_2tap_qpel_pixels_tab;
-    } else {
-        s->me.qpel_put = s->dsp.put_h264_qpel_pixels_tab;
-        s->me.qpel_avg = s->dsp.avg_h264_qpel_pixels_tab;
-    }
+    s->me.qpel_put = h->h264qpel.put_h264_qpel_pixels_tab;
+    s->me.qpel_avg = h->h264qpel.avg_h264_qpel_pixels_tab;
 
     first_mb_in_slice = get_ue_golomb(&s->gb);
 
@@ -2897,7 +2621,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
 
         h0->current_slice = 0;
         if (!s0->first_field) {
-            if (s->current_picture_ptr && !s->dropable &&
+            if (s->current_picture_ptr && !s->droppable &&
                 s->current_picture_ptr->owner2 == s) {
                 ff_thread_report_progress(&s->current_picture_ptr->f, INT_MAX,
                                           s->picture_structure == PICT_BOTTOM_FIELD);
@@ -2949,12 +2673,33 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
                h->pps.sps_id);
         return -1;
     }
-    h->sps = *h0->sps_buffers[h->pps.sps_id];
+
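+    /* (Re)activate the SPS if a different one is referenced, the context was
+     * reinitialized, or the SPS itself was updated since it was last used. */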
+    if (h->pps.sps_id != h->current_sps_id ||
+        h->context_reinitialized           ||
+        h0->sps_buffers[h->pps.sps_id]->new) {
+        SPS *new_sps = h0->sps_buffers[h->pps.sps_id];
+
+        h0->sps_buffers[h->pps.sps_id]->new = 0;
+
+        if (h->sps.chroma_format_idc != new_sps->chroma_format_idc ||
+            h->sps.bit_depth_luma    != new_sps->bit_depth_luma)
+            needs_reinit = 1;
+
+        h->current_sps_id = h->pps.sps_id;
+        h->sps            = *h0->sps_buffers[h->pps.sps_id];
+
+        if ((ret = h264_set_parameter_from_sps(h)) < 0)
+            return ret;
+    }
 
     s->avctx->profile = ff_h264_get_profile(&h->sps);
     s->avctx->level   = h->sps.level_idc;
     s->avctx->refs    = h->sps.ref_frame_count;
 
+    if (s->mb_width  != h->sps.mb_width ||
+        s->mb_height != h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag))
+        needs_reinit = 1;
+
     s->mb_width  = h->sps.mb_width;
     s->mb_height = h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
 
@@ -2974,137 +2719,61 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
         s->height = s->avctx->height;
     }
 
-    if (s->context_initialized &&
-        (s->width != s->avctx->width || s->height != s->avctx->height ||
-         av_cmp_q(h->sps.sar, s->avctx->sample_aspect_ratio))) {
-        if (h != h0 || (HAVE_THREADS && h->s.avctx->active_thread_type & FF_THREAD_FRAME)) {
-            av_log_missing_feature(s->avctx,
-                                   "Width/height changing with threads is", 0);
-            return AVERROR_PATCHWELCOME;   // width / height changed during parallelized decoding
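+    /* Export colour range, primaries, transfer and matrix from the VUI;
+     * a colorspace change forces a context reinit. */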
+    if (h->sps.video_signal_type_present_flag) {
+        s->avctx->color_range = h->sps.full_range ? AVCOL_RANGE_JPEG
+                                                  : AVCOL_RANGE_MPEG;
+        if (h->sps.colour_description_present_flag) {
+            if (s->avctx->colorspace != h->sps.colorspace)
+                needs_reinit = 1;
+            s->avctx->color_primaries = h->sps.color_primaries;
+            s->avctx->color_trc       = h->sps.color_trc;
+            s->avctx->colorspace      = h->sps.colorspace;
         }
-        free_tables(h, 0);
-        flush_dpb(s->avctx);
-        ff_MPV_common_end(s);
     }
-    if (!s->context_initialized) {
-        if (h != h0) {
-            av_log(h->s.avctx, AV_LOG_ERROR,
-                   "Cannot (re-)initialize context during parallel decoding.\n");
-            return -1;
-        }
-
-        avcodec_set_dimensions(s->avctx, s->width, s->height);
-        s->avctx->sample_aspect_ratio = h->sps.sar;
-        av_assert0(s->avctx->sample_aspect_ratio.den);
-
-        if (h->sps.video_signal_type_present_flag) {
-            s->avctx->color_range = h->sps.full_range ? AVCOL_RANGE_JPEG
-                                                      : AVCOL_RANGE_MPEG;
-            if (h->sps.colour_description_present_flag) {
-                s->avctx->color_primaries = h->sps.color_primaries;
-                s->avctx->color_trc       = h->sps.color_trc;
-                s->avctx->colorspace      = h->sps.colorspace;
-            }
-        }
 
-        if (h->sps.timing_info_present_flag) {
-            int64_t den = h->sps.time_scale;
-            if (h->x264_build < 44U)
-                den *= 2;
-            av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
-                      h->sps.num_units_in_tick, den, 1 << 30);
-        }
+    if (s->context_initialized &&
+        (s->width  != s->avctx->width   ||
+         s->height != s->avctx->height  ||
+         needs_reinit                   ||
+         av_cmp_q(h->sps.sar, s->avctx->sample_aspect_ratio))) {
 
-        switch (h->sps.bit_depth_luma) {
-        case 9:
-            if (CHROMA444) {
-                if (s->avctx->colorspace == AVCOL_SPC_RGB) {
-                    s->avctx->pix_fmt = PIX_FMT_GBRP9;
-                } else
-                    s->avctx->pix_fmt = PIX_FMT_YUV444P9;
-            } else if (CHROMA422)
-                s->avctx->pix_fmt = PIX_FMT_YUV422P9;
-            else
-                s->avctx->pix_fmt = PIX_FMT_YUV420P9;
-            break;
-        case 10:
-            if (CHROMA444) {
-                if (s->avctx->colorspace == AVCOL_SPC_RGB) {
-                    s->avctx->pix_fmt = PIX_FMT_GBRP10;
-                } else
-                    s->avctx->pix_fmt = PIX_FMT_YUV444P10;
-            } else if (CHROMA422)
-                s->avctx->pix_fmt = PIX_FMT_YUV422P10;
-            else
-                s->avctx->pix_fmt = PIX_FMT_YUV420P10;
-            break;
-        case 8:
-            if (CHROMA444) {
-                if (s->avctx->colorspace == AVCOL_SPC_RGB) {
-                    s->avctx->pix_fmt = PIX_FMT_GBRP;
-                } else
-                    s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ444P
-                                                                                  : PIX_FMT_YUV444P;
-            } else if (CHROMA422) {
-                s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P
-                                                                              : PIX_FMT_YUV422P;
-            } else {
-                s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
-                                                         s->avctx->codec->pix_fmts ?
-                                                         s->avctx->codec->pix_fmts :
-                                                         s->avctx->color_range == AVCOL_RANGE_JPEG ?
-                                                         hwaccel_pixfmt_list_h264_jpeg_420 :
-                                                         ff_hwaccel_pixfmt_list_420);
-            }
-            break;
-        default:
-            av_log(s->avctx, AV_LOG_ERROR,
-                   "Unsupported bit depth: %d\n", h->sps.bit_depth_luma);
+        if (h != h0) {
+            av_log(s->avctx, AV_LOG_ERROR, "changing width/height on "
+                   "slice %d\n", h0->current_slice + 1);
             return AVERROR_INVALIDDATA;
         }
 
-        s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id,
-                                            s->avctx->pix_fmt);
+        flush_change(h);
 
-        if (ff_MPV_common_init(s) < 0) {
-            av_log(h->s.avctx, AV_LOG_ERROR, "ff_MPV_common_init() failed.\n");
-            return -1;
-        }
-        s->first_field = 0;
-        h->prev_interlaced_frame = 1;
+        if ((ret = get_pixel_format(h)) < 0)
+            return ret;
+        s->avctx->pix_fmt = ret;
 
-        init_scan_tables(h);
-        if (ff_h264_alloc_tables(h) < 0) {
+        av_log(h->s.avctx, AV_LOG_INFO, "Reinit context to %dx%d, "
+               "pix_fmt: %d\n", s->width, s->height, s->avctx->pix_fmt);
+
+        if ((ret = h264_slice_header_init(h, 1)) < 0) {
             av_log(h->s.avctx, AV_LOG_ERROR,
-                   "Could not allocate memory for h264\n");
-            return AVERROR(ENOMEM);
+                   "h264_slice_header_init() failed\n");
+            return ret;
+        }
+        h->context_reinitialized = 1;
+    }
+    if (!s->context_initialized) {
+        if (h != h0) {
+            av_log(h->s.avctx, AV_LOG_ERROR,
+                   "Cannot (re-)initialize context during parallel decoding.\n");
+            return -1;
         }
 
-        if (!HAVE_THREADS || !(s->avctx->active_thread_type & FF_THREAD_SLICE)) {
-            if (context_init(h) < 0) {
-                av_log(h->s.avctx, AV_LOG_ERROR, "context_init() failed.\n");
-                return -1;
-            }
-        } else {
-            for (i = 1; i < s->slice_context_count; i++) {
-                H264Context *c;
-                c = h->thread_context[i] = av_malloc(sizeof(H264Context));
-                memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
-                memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
-                c->h264dsp     = h->h264dsp;
-                c->sps         = h->sps;
-                c->pps         = h->pps;
-                c->pixel_shift = h->pixel_shift;
-                init_scan_tables(c);
-                clone_tables(c, h, i);
-            }
+        if ((ret = get_pixel_format(h)) < 0)
+            return ret;
+        s->avctx->pix_fmt = ret;
 
-            for (i = 0; i < s->slice_context_count; i++)
-                if (context_init(h->thread_context[i]) < 0) {
-                    av_log(h->s.avctx, AV_LOG_ERROR,
-                           "context_init() failed.\n");
-                    return -1;
-                }
+        if ((ret = h264_slice_header_init(h, 0)) < 0) {
+            av_log(h->s.avctx, AV_LOG_ERROR,
+                   "h264_slice_header_init() failed\n");
+            return ret;
         }
     }
 
@@ -3118,8 +2787,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
     h->mb_mbaff        = 0;
     h->mb_aff_frame    = 0;
     last_pic_structure = s0->picture_structure;
-    last_pic_dropable  = s->dropable;
-    s->dropable        = h->nal_ref_idc == 0;
+    last_pic_droppable = s0->droppable;
+    s->droppable       = h->nal_ref_idc == 0;
     if (h->sps.frame_mbs_only_flag) {
         s->picture_structure = PICT_FRAME;
     } else {
@@ -3134,12 +2803,17 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
 
     if (h0->current_slice != 0) {
         if (last_pic_structure != s->picture_structure ||
-            last_pic_dropable  != s->dropable) {
+            last_pic_droppable != s->droppable) {
             av_log(h->s.avctx, AV_LOG_ERROR,
                    "Changing field mode (%d -> %d) between slices is not allowed\n",
                    last_pic_structure, s->picture_structure);
             s->picture_structure = last_pic_structure;
-            s->dropable          = last_pic_dropable;
+            s->droppable         = last_pic_droppable;
+            return AVERROR_INVALIDDATA;
+        } else if (!s0->current_picture_ptr) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "current_picture_ptr is unset on slice %d\n",
+                   h0->current_slice + 1);
             return AVERROR_INVALIDDATA;
         }
     } else {
@@ -3172,7 +2846,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
             assert(s0->current_picture_ptr->f.reference != DELAYED_PIC_REF);
 
             /* Mark old field/frame as completed */
-            if (!last_pic_dropable && s0->current_picture_ptr->owner2 == s0) {
+            if (!last_pic_droppable && s0->current_picture_ptr->owner2 == s0) {
                 ff_thread_report_progress(&s0->current_picture_ptr->f, INT_MAX,
                                           last_pic_structure == PICT_BOTTOM_FIELD);
             }
@@ -3181,7 +2855,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
                 /* Previous field is unmatched. Don't display it, but let it
                  * remain for reference if marked as such. */
-                if (!last_pic_dropable && last_pic_structure != PICT_FRAME) {
+                if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
                     ff_thread_report_progress(&s0->current_picture_ptr->f, INT_MAX,
                                               last_pic_structure == PICT_TOP_FIELD);
                 }
@@ -3191,7 +2865,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
                      * different frame_nums. Consider this field first in
                      * pair. Throw away previous field except for reference
                      * purposes. */
-                    if (!last_pic_dropable && last_pic_structure != PICT_FRAME) {
+                    if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
                         ff_thread_report_progress(&s0->current_picture_ptr->f, INT_MAX,
                                                   last_pic_structure == PICT_TOP_FIELD);
                     }
@@ -3205,15 +2879,15 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
                                "Invalid field mode combination %d/%d\n",
                                last_pic_structure, s->picture_structure);
                         s->picture_structure = last_pic_structure;
-                        s->dropable          = last_pic_dropable;
+                        s->droppable         = last_pic_droppable;
                         return AVERROR_INVALIDDATA;
-                    } else if (last_pic_dropable != s->dropable) {
+                    } else if (last_pic_droppable != s->droppable) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "Cannot combine reference and non-reference fields in the same frame\n");
                         av_log_ask_for_sample(s->avctx, NULL);
                         s->picture_structure = last_pic_structure;
-                        s->dropable          = last_pic_dropable;
-                        return AVERROR_INVALIDDATA;
+                        s->droppable         = last_pic_droppable;
+                        return AVERROR_PATCHWELCOME;
                     }
 
                     /* Take ownership of this buffer. Note that if another thread owned
@@ -3239,7 +2913,9 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
             s->current_picture_ptr->frame_num = h->prev_frame_num;
             ff_thread_report_progress(&s->current_picture_ptr->f, INT_MAX, 0);
             ff_thread_report_progress(&s->current_picture_ptr->f, INT_MAX, 1);
-            ff_generate_sliding_window_mmcos(h);
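+            /* Generate sliding-window MMCOs for the dummy frame; failures are
+             * only fatal when AV_EF_EXPLODE error recognition is enabled. */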
+            if ((ret = ff_generate_sliding_window_mmcos(h, 1)) < 0 &&
+                s->avctx->err_recognition & AV_EF_EXPLODE)
+                return ret;
             if (ff_h264_execute_ref_pic_marking(h, h->mmco, h->mmco_index) < 0 &&
                 (s->avctx->err_recognition & AV_EF_EXPLODE))
                 return AVERROR_INVALIDDATA;
@@ -3288,7 +2964,6 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
             }
         } else {
             /* Frame or first field in a potentially complementary pair */
-            assert(!s0->current_picture_ptr);
             s0->first_field = FIELD_PICTURE;
         }
 
@@ -3301,8 +2976,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
             ff_release_unused_pictures(s, 0);
         }
     }
-    if (h != h0)
-        clone_slice(h, h0);
+    if (h != h0 && (ret = clone_slice(h, h0)) < 0)
+        return ret;
 
     s->current_picture_ptr->frame_num = h->frame_num; // FIXME frame_num cleanup
 
@@ -3353,22 +3028,19 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
     h->ref_count[1] = h->pps.ref_count[1];
 
     if (h->slice_type_nos != AV_PICTURE_TYPE_I) {
-        int max_refs = s->picture_structure == PICT_FRAME ? 16 : 32;
-
         if (h->slice_type_nos == AV_PICTURE_TYPE_B)
             h->direct_spatial_mv_pred = get_bits1(&s->gb);
         num_ref_idx_active_override_flag = get_bits1(&s->gb);
 
         if (num_ref_idx_active_override_flag) {
             h->ref_count[0] = get_ue_golomb(&s->gb) + 1;
-            if (h->slice_type_nos == AV_PICTURE_TYPE_B)
+            if (h->ref_count[0] < 1)
+                return AVERROR_INVALIDDATA;
+            if (h->slice_type_nos == AV_PICTURE_TYPE_B) {
                 h->ref_count[1] = get_ue_golomb(&s->gb) + 1;
-        }
-
-        if (h->ref_count[0] > max_refs || h->ref_count[1] > max_refs) {
-            av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
-            h->ref_count[0] = h->ref_count[1] = 1;
-            return AVERROR_INVALIDDATA;
+                if (h->ref_count[1] < 1)
+                    return AVERROR_INVALIDDATA;
+            }
         }
 
         if (h->slice_type_nos == AV_PICTURE_TYPE_B)
@@ -3378,6 +3050,14 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
     } else
         h->list_count = 0;
 
+    max_refs = s->picture_structure == PICT_FRAME ? 16 : 32;
+
+    if (h->ref_count[0] > max_refs || h->ref_count[1] > max_refs) {
+        av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
+        h->ref_count[0] = h->ref_count[1] = 1;
+        return AVERROR_INVALIDDATA;
+    }
+
     if (!default_ref_list_done)
         ff_h264_fill_default_ref_list(h);
 
@@ -3389,10 +3069,12 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
 
     if (h->slice_type_nos != AV_PICTURE_TYPE_I) {
         s->last_picture_ptr = &h->ref_list[0][0];
+        s->last_picture_ptr->owner2 = s;
         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
     }
     if (h->slice_type_nos == AV_PICTURE_TYPE_B) {
         s->next_picture_ptr = &h->ref_list[1][0];
+        s->next_picture_ptr->owner2 = s;
         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
     }
 
@@ -3411,7 +3093,15 @@ static int decode_slice_header(H264Context *h, H264Context *h0)
         }
     }
 
-    if (h->nal_ref_idc && ff_h264_decode_ref_pic_marking(h0, &s->gb) < 0 &&
+    // If frame-mt is enabled, only update mmco tables for the first slice
+    // in a field. Subsequent slices can temporarily clobber h->mmco_index
+    // or h->mmco, which will cause ref list mix-ups and decoding errors
+    // further down the line. This may break decoding if the first slice is
+    // corrupt, so we only do this when frame-mt is enabled.
+    if (h->nal_ref_idc &&
+        ff_h264_decode_ref_pic_marking(h0, &s->gb,
+                            !(s->avctx->active_thread_type & FF_THREAD_FRAME) ||
+                            h0->current_slice == 0) < 0 &&
         (s->avctx->err_recognition & AV_EF_EXPLODE))
         return AVERROR_INVALIDDATA;
 
@@ -3944,7 +3634,7 @@ static void decode_finish_row(H264Context *h)
 
     ff_draw_horiz_band(s, top, height);
 
-    if (s->dropable)
+    if (s->droppable)
         return;
 
     ff_thread_report_progress(&s->current_picture_ptr->f, top + height - 1,
@@ -3962,7 +3652,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
     s->mb_skip_run = -1;
 
     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME ||
-                    s->codec_id != CODEC_ID_H264 ||
+                    s->codec_id != AV_CODEC_ID_H264 ||
                     (CONFIG_GRAY && (s->flags & CODEC_FLAG_GRAY));
 
     if (h->pps.cabac) {
@@ -4085,7 +3775,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                         return 0;
                     } else {
                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y,
-                                        s->mb_x, s->mb_y,
+                                        s->mb_x - 1, s->mb_y,
                                         ER_MB_END & part_mask);
 
                         return -1;
@@ -4147,7 +3837,7 @@ static int execute_decode_slices(H264Context *h, int context_count)
         hx                   = h->thread_context[context_count - 1];
         s->mb_x              = hx->s.mb_x;
         s->mb_y              = hx->s.mb_y;
-        s->dropable          = hx->s.dropable;
+        s->droppable         = hx->s.droppable;
         s->picture_structure = hx->s.picture_structure;
         for (i = 1; i < context_count; i++)
             h->s.error_count += h->thread_context[i]->s.error_count;
@@ -4156,7 +3846,8 @@ static int execute_decode_slices(H264Context *h, int context_count)
     return 0;
 }
 
-static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
+static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
+                            int parse_extradata)
 {
     MpegEncContext *const s     = &h->s;
     AVCodecContext *const avctx = s->avctx;
@@ -4210,8 +3901,10 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
                         buf[buf_index + 2] == 1)
                         break;
 
-                if (buf_index + 3 >= buf_size)
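+                /* Not enough data left for another NAL unit; consume the
+                 * rest of the buffer. */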
+                if (buf_index + 3 >= buf_size) {
+                    buf_index = buf_size;
                     break;
+                }
 
                 buf_index += 3;
                 if (buf_index >= next_avc)
@@ -4262,6 +3955,7 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
                 case NAL_PPS:
                     nals_needed = nal_index;
                     break;
+                case NAL_DPA:
                 case NAL_IDR_SLICE:
                 case NAL_SLICE:
                     init_get_bits(&hx->s.gb, ptr, bit_length);
@@ -4276,12 +3970,23 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
                 continue;
 
 again:
+            /* Ignore every NAL unit type except PPS and SPS during extradata
+             * parsing. Decoding slices is not possible in codec init
+             * with frame-mt. */
+            if (parse_extradata && HAVE_THREADS &&
+                (s->avctx->active_thread_type & FF_THREAD_FRAME) &&
+                (hx->nal_unit_type != NAL_PPS &&
+                 hx->nal_unit_type != NAL_SPS)) {
+                av_log(avctx, AV_LOG_INFO, "Ignoring NAL unit %d during "
+                       "extradata parsing\n", hx->nal_unit_type);
+                hx->nal_unit_type = NAL_FF_IGNORE;
+            }
             err = 0;
             switch (hx->nal_unit_type) {
             case NAL_IDR_SLICE:
                 if (h->nal_unit_type != NAL_IDR_SLICE) {
                     av_log(h->s.avctx, AV_LOG_ERROR,
-                           "Invalid mix of idr and non-idr slices");
+                           "Invalid mix of idr and non-idr slices\n");
                     buf_index = -1;
                     goto end;
                 }
@@ -4357,6 +4062,7 @@ again:
                 if (hx->redundant_pic_count == 0 &&
                     hx->intra_gb_ptr &&
                     hx->s.data_partitioning &&
+                    s->current_picture_ptr &&
                     s->context_initialized &&
                     (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc) &&
                     (avctx->skip_frame < AVDISCARD_BIDIR  ||
@@ -4381,35 +4087,9 @@ again:
                     ff_h264_decode_seq_parameter_set(h);
                 }
 
-                if (s->flags & CODEC_FLAG_LOW_DELAY ||
-                    (h->sps.bitstream_restriction_flag &&
-                     !h->sps.num_reorder_frames))
-                    s->low_delay = 1;
-
-                if (avctx->has_b_frames < 2)
-                    avctx->has_b_frames = !s->low_delay;
-
-                if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma ||
-                    h->cur_chroma_format_idc   != h->sps.chroma_format_idc) {
-                    if (h->sps.bit_depth_luma >= 8 && h->sps.bit_depth_luma <= 10) {
-                        avctx->bits_per_raw_sample = h->sps.bit_depth_luma;
-                        h->cur_chroma_format_idc   = h->sps.chroma_format_idc;
-                        h->pixel_shift             = h->sps.bit_depth_luma > 8;
-
-                        ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma,
-                                        h->sps.chroma_format_idc);
-                        ff_h264_pred_init(&h->hpc, s->codec_id,
-                                          h->sps.bit_depth_luma,
-                                          h->sps.chroma_format_idc);
-                        s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 32 : 16;
-                        ff_dsputil_init(&s->dsp, s->avctx);
-                    } else {
-                        av_log(avctx, AV_LOG_ERROR,
-                               "Unsupported bit depth: %d\n",
-                               h->sps.bit_depth_luma);
-                        buf_index = -1;
-                        goto end;
-                    }
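+                /* Re-derive decoder parameters (bit depth, chroma format,
+                 * DSP functions) from the freshly parsed SPS. */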
+                if (h264_set_parameter_from_sps(h) < 0) {
+                    buf_index = -1;
+                    goto end;
                 }
                 break;
             case NAL_PPS:
@@ -4423,6 +4103,8 @@ again:
             case NAL_SPS_EXT:
             case NAL_AUXILIARY_SLICE:
                 break;
+            case NAL_FF_IGNORE:
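+                /* NAL units demoted to NAL_FF_IGNORE during extradata parsing
+                 * are skipped. */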
+                break;
             default:
                 av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n",
                        hx->nal_unit_type, bit_length);
@@ -4453,7 +4135,7 @@ again:
 end:
     /* clean up */
     if (s->current_picture_ptr && s->current_picture_ptr->owner2 == s &&
-        !s->dropable) {
+        !s->droppable) {
         ff_thread_report_progress(&s->current_picture_ptr->f, INT_MAX,
                                   s->picture_structure == PICT_BOTTOM_FIELD);
     }
@@ -4475,7 +4157,7 @@ static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size)
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data,
-                        int *data_size, AVPacket *avpkt)
+                        int *got_frame, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
@@ -4512,14 +4194,14 @@ out:
             h->delayed_pic[i] = h->delayed_pic[i + 1];
 
         if (out) {
-            *data_size = sizeof(AVFrame);
+            *got_frame = 1;
             *pict      = out->f;
         }
 
         return buf_index;
     }
 
-    buf_index = decode_nal_units(h, buf, buf_size);
+    buf_index = decode_nal_units(h, buf, buf_size, 0);
     if (buf_index < 0)
         return -1;
 
@@ -4541,19 +4223,19 @@ out:
             decode_postinit(h, 1);
 
         field_end(h, 0);
+        h->context_reinitialized = 0;
 
         if (!h->next_output_pic) {
             /* Wait for second field. */
-            *data_size = 0;
+            *got_frame = 0;
         } else {
-            *data_size = sizeof(AVFrame);
+            *got_frame = 1;
             *pict      = h->next_output_pic->f;
         }
     }
 
-    assert(pict->data[0] || !*data_size);
+    assert(pict->data[0] || !*got_frame);
     ff_print_debug_info(s, pict);
-    // printf("out %d\n", (int)pict->data[0]);
 
     return get_consumed_bytes(s, buf_index, buf_size);
 }
@@ -4605,7 +4287,7 @@ static const AVProfile profiles[] = {
 AVCodec ff_h264_decoder = {
     .name                  = "h264",
     .type                  = AVMEDIA_TYPE_VIDEO,
-    .id                    = CODEC_ID_H264,
+    .id                    = AV_CODEC_ID_H264,
     .priv_data_size        = sizeof(H264Context),
     .init                  = ff_h264_decode_init,
     .close                 = h264_decode_end,
@@ -4624,7 +4306,7 @@ AVCodec ff_h264_decoder = {
 AVCodec ff_h264_vdpau_decoder = {
     .name           = "h264_vdpau",
     .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = CODEC_ID_H264,
+    .id             = AV_CODEC_ID_H264,
     .priv_data_size = sizeof(H264Context),
     .init           = ff_h264_decode_init,
     .close          = h264_decode_end,
@@ -4632,8 +4314,8 @@ AVCodec ff_h264_vdpau_decoder = {
     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
     .flush          = flush_dpb,
     .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
-    .pix_fmts       = (const enum PixelFormat[]) { PIX_FMT_VDPAU_H264,
-                                                   PIX_FMT_NONE},
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_VDPAU_H264,
+                                                     AV_PIX_FMT_NONE},
     .profiles       = NULL_IF_CONFIG_SMALL(profiles),
 };
 #endif