Merge another two if() statements to save a few CPU cycles

[ffmpeg] / libavcodec / h264.c
diff --git a/libavcodec/h264.c b/libavcodec/h264.c

index 70ec5d3eaa6fcd7e079dc979ebd7399e60a19e49..ea1be3cb105bc2ae213c0b77aac41d3158b81724 100644 (file)
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -17,7 +17,6 @@
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
   */
  
  /**
@@ -26,11 +25,12 @@
   * @author Michael Niedermayer <michaelni@gmx.at>
   */
  
-#include "common.h"
  #include "dsputil.h"
  #include "avcodec.h"
  #include "mpegvideo.h"
+#include "h264.h"
  #include "h264data.h"
+#include "h264_parser.h"
  #include "golomb.h"
  
  #include "cabac.h"
@@ -38,356 +38,6 @@
  //#undef NDEBUG
  #include <assert.h>
  
-#define interlaced_dct interlaced_dct_is_a_bad_name
-#define mb_intra mb_intra_isnt_initalized_see_mb_type
-
-#define LUMA_DC_BLOCK_INDEX   25
-#define CHROMA_DC_BLOCK_INDEX 26
-
-#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
-#define COEFF_TOKEN_VLC_BITS           8
-#define TOTAL_ZEROS_VLC_BITS           9
-#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
-#define RUN_VLC_BITS                   3
-#define RUN7_VLC_BITS                  6
-
-#define MAX_SPS_COUNT 32
-#define MAX_PPS_COUNT 256
-
-#define MAX_MMCO_COUNT 66
-
-/* Compiling in interlaced support reduces the speed
- * of progressive decoding by about 2%. */
-#define ALLOW_INTERLACE
-
-#ifdef ALLOW_INTERLACE
-#define MB_MBAFF h->mb_mbaff
-#define MB_FIELD h->mb_field_decoding_flag
-#define FRAME_MBAFF h->mb_aff_frame
-#else
-#define MB_MBAFF 0
-#define MB_FIELD 0
-#define FRAME_MBAFF 0
-#undef  IS_INTERLACED
-#define IS_INTERLACED(mb_type) 0
-#endif
-
-/**
- * Sequence parameter set
- */
-typedef struct SPS{
-
-    int profile_idc;
-    int level_idc;
-    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
-    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
-    int poc_type;                      ///< pic_order_cnt_type
-    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
-    int delta_pic_order_always_zero_flag;
-    int offset_for_non_ref_pic;
-    int offset_for_top_to_bottom_field;
-    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
-    int ref_frame_count;               ///< num_ref_frames
-    int gaps_in_frame_num_allowed_flag;
-    int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
-    int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
-    int frame_mbs_only_flag;
-    int mb_aff;                        ///<mb_adaptive_frame_field_flag
-    int direct_8x8_inference_flag;
-    int crop;                   ///< frame_cropping_flag
-    int crop_left;              ///< frame_cropping_rect_left_offset
-    int crop_right;             ///< frame_cropping_rect_right_offset
-    int crop_top;               ///< frame_cropping_rect_top_offset
-    int crop_bottom;            ///< frame_cropping_rect_bottom_offset
-    int vui_parameters_present_flag;
-    AVRational sar;
-    int timing_info_present_flag;
-    uint32_t num_units_in_tick;
-    uint32_t time_scale;
-    int fixed_frame_rate_flag;
-    short offset_for_ref_frame[256]; //FIXME dyn aloc?
-    int bitstream_restriction_flag;
-    int num_reorder_frames;
-    int scaling_matrix_present;
-    uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[2][64];
-}SPS;
-
-/**
- * Picture parameter set
- */
-typedef struct PPS{
-    unsigned int sps_id;
-    int cabac;                  ///< entropy_coding_mode_flag
-    int pic_order_present;      ///< pic_order_present_flag
-    int slice_group_count;      ///< num_slice_groups_minus1 + 1
-    int mb_slice_group_map_type;
-    unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
-    int weighted_pred;          ///< weighted_pred_flag
-    int weighted_bipred_idc;
-    int init_qp;                ///< pic_init_qp_minus26 + 26
-    int init_qs;                ///< pic_init_qs_minus26 + 26
-    int chroma_qp_index_offset;
-    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
-    int constrained_intra_pred; ///< constrained_intra_pred_flag
-    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
-    int transform_8x8_mode;     ///< transform_8x8_mode_flag
-    uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[2][64];
-}PPS;
-
-/**
- * Memory management control operation opcode.
- */
-typedef enum MMCOOpcode{
-    MMCO_END=0,
-    MMCO_SHORT2UNUSED,
-    MMCO_LONG2UNUSED,
-    MMCO_SHORT2LONG,
-    MMCO_SET_MAX_LONG,
-    MMCO_RESET,
-    MMCO_LONG,
-} MMCOOpcode;
-
-/**
- * Memory management control operation.
- */
-typedef struct MMCO{
-    MMCOOpcode opcode;
-    int short_frame_num;
-    int long_index;
-} MMCO;
-
-/**
- * H264Context
- */
-typedef struct H264Context{
-    MpegEncContext s;
-    int nal_ref_idc;
-    int nal_unit_type;
-    uint8_t *rbsp_buffer;
-    unsigned int rbsp_buffer_size;
-
-    /**
-      * Used to parse AVC variant of h264
-      */
-    int is_avc; ///< this flag is != 0 if codec is avc1
-    int got_avcC; ///< flag used to parse avcC data only once
-    int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
-
-    int chroma_qp; //QPc
-
-    int prev_mb_skipped;
-    int next_mb_skipped;
-
-    //prediction stuff
-    int chroma_pred_mode;
-    int intra16x16_pred_mode;
-
-    int top_mb_xy;
-    int left_mb_xy[2];
-
-    int8_t intra4x4_pred_mode_cache[5*8];
-    int8_t (*intra4x4_pred_mode)[8];
-    void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
-    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
-    void (*pred8x8  [4+3])(uint8_t *src, int stride);
-    void (*pred16x16[4+3])(uint8_t *src, int stride);
-    unsigned int topleft_samples_available;
-    unsigned int top_samples_available;
-    unsigned int topright_samples_available;
-    unsigned int left_samples_available;
-    uint8_t (*top_borders[2])[16+2*8];
-    uint8_t left_border[2*(17+2*9)];
-
-    /**
-     * non zero coeff count cache.
-     * is 64 if not available.
-     */
-    DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
-    uint8_t (*non_zero_count)[16];
-
-    /**
-     * Motion vector cache.
-     */
-    DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
-    DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
-#define LIST_NOT_USED -1 //FIXME rename?
-#define PART_NOT_AVAILABLE -2
-
-    /**
-     * is 1 if the specific list MV&references are set to 0,0,-2.
-     */
-    int mv_cache_clean[2];
-
-    /**
-     * number of neighbors (top and/or left) that used 8x8 dct
-     */
-    int neighbor_transform_size;
-
-    /**
-     * block_offset[ 0..23] for frame macroblocks
-     * block_offset[24..47] for field macroblocks
-     */
-    int block_offset[2*(16+8)];
-
-    uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
-    uint32_t *mb2b8_xy;
-    int b_stride; //FIXME use s->b4_stride
-    int b8_stride;
-
-    int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
-    int mb_uvlinesize;
-
-    int emu_edge_width;
-    int emu_edge_height;
-
-    int halfpel_flag;
-    int thirdpel_flag;
-
-    int unknown_svq3_flag;
-    int next_slice_index;
-
-    SPS sps_buffer[MAX_SPS_COUNT];
-    SPS sps; ///< current sps
-
-    PPS pps_buffer[MAX_PPS_COUNT];
-    /**
-     * current pps
-     */
-    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
-
-    uint32_t dequant4_buffer[6][52][16];
-    uint32_t dequant8_buffer[2][52][64];
-    uint32_t (*dequant4_coeff[6])[16];
-    uint32_t (*dequant8_coeff[2])[64];
-    int dequant_coeff_pps;     ///< reinit tables when pps changes
-
-    int slice_num;
-    uint8_t *slice_table_base;
-    uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
-    int slice_type;
-    int slice_type_fixed;
-
-    //interlacing specific flags
-    int mb_aff_frame;
-    int mb_field_decoding_flag;
-    int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
-
-    unsigned int sub_mb_type[4];
-
-    //POC stuff
-    int poc_lsb;
-    int poc_msb;
-    int delta_poc_bottom;
-    int delta_poc[2];
-    int frame_num;
-    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
-    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
-    int frame_num_offset;         ///< for POC type 2
-    int prev_frame_num_offset;    ///< for POC type 2
-    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
-
-    /**
-     * frame_num for frames or 2*frame_num for field pics.
-     */
-    int curr_pic_num;
-
-    /**
-     * max_frame_num or 2*max_frame_num for field pics.
-     */
-    int max_pic_num;
-
-    //Weighted pred stuff
-    int use_weight;
-    int use_weight_chroma;
-    int luma_log2_weight_denom;
-    int chroma_log2_weight_denom;
-    int luma_weight[2][48];
-    int luma_offset[2][48];
-    int chroma_weight[2][48][2];
-    int chroma_offset[2][48][2];
-    int implicit_weight[48][48];
-
-    //deblock
-    int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
-    int slice_alpha_c0_offset;
-    int slice_beta_offset;
-
-    int redundant_pic_count;
-
-    int direct_spatial_mv_pred;
-    int dist_scale_factor[16];
-    int dist_scale_factor_field[32];
-    int map_col_to_list0[2][16];
-    int map_col_to_list0_field[2][32];
-
-    /**
-     * num_ref_idx_l0/1_active_minus1 + 1
-     */
-    unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
-    unsigned int list_count;
-    Picture *short_ref[32];
-    Picture *long_ref[32];
-    Picture default_ref_list[2][32];
-    Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
-    Picture *delayed_pic[18]; //FIXME size?
-    Picture *delayed_output_pic;
-
-    /**
-     * memory management control operations buffer.
-     */
-    MMCO mmco[MAX_MMCO_COUNT];
-    int mmco_index;
-
-    int long_ref_count;  ///< number of actual long term references
-    int short_ref_count; ///< number of actual short term references
-
-    //data partitioning
-    GetBitContext intra_gb;
-    GetBitContext inter_gb;
-    GetBitContext *intra_gb_ptr;
-    GetBitContext *inter_gb_ptr;
-
-    DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
-    DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not to large or ensure that there is some unused stuff after mb
-
-    /**
-     * Cabac
-     */
-    CABACContext cabac;
-    uint8_t      cabac_state[460];
-    int          cabac_init_idc;
-
-    /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
-    uint16_t     *cbp_table;
-    int cbp;
-    int top_cbp;
-    int left_cbp;
-    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
-    uint8_t     *chroma_pred_mode_table;
-    int         last_qscale_diff;
-    int16_t     (*mvd_table[2])[2];
-    DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
-    uint8_t     *direct_table;
-    uint8_t     direct_cache[5*8];
-
-    uint8_t zigzag_scan[16];
-    uint8_t zigzag_scan8x8[64];
-    uint8_t zigzag_scan8x8_cavlc[64];
-    uint8_t field_scan[16];
-    uint8_t field_scan8x8[64];
-    uint8_t field_scan8x8_cavlc[64];
-    const uint8_t *zigzag_scan_q0;
-    const uint8_t *zigzag_scan8x8_q0;
-    const uint8_t *zigzag_scan8x8_cavlc_q0;
-    const uint8_t *field_scan_q0;
-    const uint8_t *field_scan8x8_q0;
-    const uint8_t *field_scan8x8_cavlc_q0;
-
-    int x264_build;
-}H264Context;
-
  static VLC coeff_token_vlc[4];
  static VLC chroma_dc_coeff_token_vlc;
  
@@ -1741,6 +1391,7 @@ static inline void write_back_motion(H264Context *h, int mb_type){
  static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
      int i, si, di;
      uint8_t *dst;
+    int bufidx;
  
  //    src[0]&0x80;                //forbidden bit
      h->nal_ref_idc= src[0]>>5;
@@ -1769,8 +1420,9 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c
          return src;
      }
  
-    h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
-    dst= h->rbsp_buffer;
+    bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
+    h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
+    dst= h->rbsp_buffer[bufidx];
  
      if (dst == NULL){
          return NULL;
@@ -1795,7 +1447,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c
  
      *dst_length= di;
      *consumed= si + 1;//+1 for the header
-//FIXME store exact number of bits in the getbitcontext (its needed for decoding)
+//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
      return dst;
  }
  
@@ -1946,12 +1598,11 @@ static void chroma_dc_dct_c(DCTELEM *block){
  /**
   * gets the chroma qp.
   */
-static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
-
-    return chroma_qp[av_clip(qscale + chroma_qp_index_offset, 0, 51)];
+static inline int get_chroma_qp(H264Context *h, int t, int qscale){
+    return h->pps.chroma_qp_table[t][qscale & 0xff];
  }
  
-//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
+//FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
  //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
  static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
      int i;
@@ -2082,22 +1733,22 @@ static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
  
  
  #define LOAD_TOP_RIGHT_EDGE\
-    const int t4= topright[0];\
-    const int t5= topright[1];\
-    const int t6= topright[2];\
-    const int t7= topright[3];\
+    const int av_unused t4= topright[0];\
+    const int av_unused t5= topright[1];\
+    const int av_unused t6= topright[2];\
+    const int av_unused t7= topright[3];\
  
  #define LOAD_LEFT_EDGE\
-    const int l0= src[-1+0*stride];\
-    const int l1= src[-1+1*stride];\
-    const int l2= src[-1+2*stride];\
-    const int l3= src[-1+3*stride];\
+    const int av_unused l0= src[-1+0*stride];\
+    const int av_unused l1= src[-1+1*stride];\
+    const int av_unused l2= src[-1+2*stride];\
+    const int av_unused l3= src[-1+3*stride];\
  
  #define LOAD_TOP_EDGE\
-    const int t0= src[ 0-1*stride];\
-    const int t1= src[ 1-1*stride];\
-    const int t2= src[ 2-1*stride];\
-    const int t3= src[ 3-1*stride];\
+    const int av_unused t0= src[ 0-1*stride];\
+    const int av_unused t1= src[ 1-1*stride];\
+    const int av_unused t2= src[ 2-1*stride];\
+    const int av_unused t3= src[ 3-1*stride];\
  
  static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
      const int lt= src[-1-1*stride];
@@ -2149,7 +1800,6 @@ static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride
      const int lt= src[-1-1*stride];
      LOAD_TOP_EDGE
      LOAD_LEFT_EDGE
-    const __attribute__((unused)) int unu= l3;
  
      src[0+0*stride]=
      src[1+2*stride]=(lt + t0 + 1)>>1;
@@ -2172,7 +1822,6 @@ static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride
  static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
      LOAD_TOP_EDGE
      LOAD_TOP_RIGHT_EDGE
-    const __attribute__((unused)) int unu= t7;
  
      src[0+0*stride]=(t0 + t1 + 1)>>1;
      src[1+0*stride]=
@@ -2217,7 +1866,6 @@ static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int strid
      const int lt= src[-1-1*stride];
      LOAD_TOP_EDGE
      LOAD_LEFT_EDGE
-    const __attribute__((unused)) int unu= t3;
  
      src[0+0*stride]=
      src[2+1*stride]=(lt + l0 + 1)>>1;
@@ -2284,7 +1932,7 @@ void ff_pred16x16_dc_c(uint8_t *src, int stride){
      }
  }
  
-static void pred16x16_left_dc_c(uint8_t *src, int stride){
+void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
      int i, dc=0;
  
      for(i=0;i<16; i++){
@@ -2301,7 +1949,7 @@ static void pred16x16_left_dc_c(uint8_t *src, int stride){
      }
  }
  
-static void pred16x16_top_dc_c(uint8_t *src, int stride){
+void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
      int i, dc=0;
  
      for(i=0;i<16; i++){
@@ -2401,7 +2049,7 @@ void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
      }
  }
  
-static void pred8x8_left_dc_c(uint8_t *src, int stride){
+void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
      int i;
      int dc0, dc2;
  
@@ -2423,7 +2071,7 @@ static void pred8x8_left_dc_c(uint8_t *src, int stride){
      }
  }
  
-static void pred8x8_top_dc_c(uint8_t *src, int stride){
+void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
      int i;
      int dc0, dc1;
  
@@ -2511,7 +2159,7 @@ void ff_pred8x8_plane_c(uint8_t *src, int stride){
      const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                       + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
      PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
-    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
+    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
  
  #define PT(x) \
      const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
@@ -2519,7 +2167,7 @@ void ff_pred8x8_plane_c(uint8_t *src, int stride){
      const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                       + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
      PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
-    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
+    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                       + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
  
  #define PTR(x) \
@@ -2784,7 +2432,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
          qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
      }
  
-    if(s->flags&CODEC_FLAG_GRAY) return;
+    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
  
      if(MB_MBAFF){
          // chroma offset when predicting from a field of opposite parity
@@ -3029,7 +2677,7 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
      prefetch_motion(h, 1);
  }
  
-static void decode_init_vlc(){
+static void decode_init_vlc(void){
      static int done = 0;
  
      if (!done) {
@@ -3104,20 +2752,21 @@ static void init_pred_ptrs(H264Context *h){
      h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
      h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
      h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
-    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
-    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
+    h->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
+    h->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
      h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
  
      h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
      h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
      h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
      h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
-    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
-    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
+    h->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
+    h->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
      h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
  }
  
  static void free_tables(H264Context *h){
+    int i;
      av_freep(&h->intra4x4_pred_mode);
      av_freep(&h->chroma_pred_mode_table);
      av_freep(&h->cbp_table);
@@ -3134,6 +2783,12 @@ static void free_tables(H264Context *h){
      av_freep(&h->mb2b8_xy);
  
      av_freep(&h->s.obmc_scratchpad);
+
+    for(i = 0; i < MAX_SPS_COUNT; i++)
+        av_freep(h->sps_buffers + i);
+
+    for(i = 0; i < MAX_PPS_COUNT; i++)
+        av_freep(h->pps_buffers + i);
  }
  
  static void init_dequant8_coeff_table(H264Context *h){
@@ -3283,6 +2938,7 @@ static int decode_init(AVCodecContext *avctx){
  
      // set defaults
  //    s->decode_mb= ff_h263_decode_mb;
+    s->quarter_sample = 1;
      s->low_delay= 1;
      avctx->pix_fmt= PIX_FMT_YUV420P;
  
@@ -3333,7 +2989,7 @@ static int frame_start(H264Context *h){
      return 0;
  }
  
-static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
      MpegEncContext * const s = &h->s;
      int i;
  
@@ -3351,7 +3007,7 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
      *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
      *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
  
-    if(!(s->flags&CODEC_FLAG_GRAY)){
+    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
          h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
          h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
          for(i=1; i<9; i++){
@@ -3363,12 +3019,22 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
      }
  }
  
-static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
      MpegEncContext * const s = &h->s;
      int temp8, i;
      uint64_t temp64;
-    int deblock_left = (s->mb_x > 0);
-    int deblock_top  = (s->mb_y > 0);
+    int deblock_left;
+    int deblock_top;
+    int mb_xy;
+
+    if(h->deblocking_filter == 2) {
+        mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
+        deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
+    } else {
+        deblock_left = (s->mb_x > 0);
+        deblock_top =  (s->mb_y > 0);
+    }
  
      src_y  -=   linesize + 1;
      src_cb -= uvlinesize + 1;
@@ -3394,7 +3060,7 @@ b= t;
          }
      }
  
-    if(!(s->flags&CODEC_FLAG_GRAY)){
+    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
          if(deblock_left){
              for(i = !deblock_top; i<9; i++){
                  XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
@@ -3429,7 +3095,7 @@ static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *s
      *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
      *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
  
-    if(!(s->flags&CODEC_FLAG_GRAY)){
+    if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
          h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
          h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
          h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
@@ -3481,7 +3147,7 @@ b= t;
          }
      }
  
-    if(!(s->flags&CODEC_FLAG_GRAY)){
+    if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
          if(deblock_left){
              for(i = (!deblock_top) << 1; i<18; i++){
                  XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
@@ -3497,7 +3163,7 @@ b= t;
      }
  }
  
-static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){
+static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
      MpegEncContext * const s = &h->s;
      const int mb_x= s->mb_x;
      const int mb_y= s->mb_y;
@@ -3601,9 +3267,9 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){
      } else {
          if(IS_INTRA(mb_type)){
              if(h->deblocking_filter && (simple || !FRAME_MBAFF))
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
  
-            if(simple || !(s->flags&CODEC_FLAG_GRAY)){
+            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                  h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                  h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
              }
@@ -3664,7 +3330,7 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){
                      svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
              }
              if(h->deblocking_filter && (simple || !FRAME_MBAFF))
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
          }else if(is_h264){
              hl_motion(h, dest_y, dest_cb, dest_cr,
                        s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
@@ -3704,15 +3370,15 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){
              }
          }
  
-        if(simple || !(s->flags&CODEC_FLAG_GRAY)){
+        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
              uint8_t *dest[2] = {dest_cb, dest_cr};
              if(transform_bypass){
                  idct_add = idct_dc_add = s->dsp.add_pixels4;
              }else{
                  idct_add = s->dsp.h264_idct_add;
                  idct_dc_add = s->dsp.h264_idct_dc_add;
-                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
-                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
+                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
              }
              if(is_h264){
                  for(i=16; i<16+8; i++){
@@ -3754,17 +3420,19 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){
              s->mb_y--;
              tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
              fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
-            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
+            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
+            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
              filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
              // bottom
              s->mb_y++;
              tprintf(h->s.avctx, "call mbaff filter_mb\n");
              fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
-            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
+            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
+            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
              filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
          } else {
              tprintf(h->s.avctx, "call filter_mb\n");
-            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
              fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
              filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
          }
@@ -3791,7 +3459,7 @@ static void hl_decode_mb(H264Context *h){
      const int mb_y= s->mb_y;
      const int mb_xy= mb_x + mb_y*s->mb_stride;
      const int mb_type= s->current_picture.mb_type[mb_xy];
-    int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding;
+    int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
  
      if(!s->decode)
          return;
@@ -4351,13 +4019,13 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
      return 0;
  }
  
-static int decode_ref_pic_marking(H264Context *h){
+static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
      MpegEncContext * const s = &h->s;
      int i;
  
      if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
-        s->broken_link= get_bits1(&s->gb) -1;
-        h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
+        s->broken_link= get_bits1(gb) -1;
+        h->mmco[0].long_index= get_bits1(gb) - 1; // current_long_term_idx
          if(h->mmco[0].long_index == -1)
              h->mmco_index= 0;
          else{
@@ -4365,20 +4033,20 @@ static int decode_ref_pic_marking(H264Context *h){
              h->mmco_index= 1;
          }
      }else{
-        if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
+        if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
              for(i= 0; i<MAX_MMCO_COUNT; i++) {
-                MMCOOpcode opcode= get_ue_golomb(&s->gb);;
+                MMCOOpcode opcode= get_ue_golomb(gb);
  
                  h->mmco[i].opcode= opcode;
                  if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
-                    h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
+                    h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
  /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
                          av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
                          return -1;
                      }*/
                  }
                  if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
-                    unsigned int long_index= get_ue_golomb(&s->gb);
+                    unsigned int long_index= get_ue_golomb(gb);
                      if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
                          av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
                          return -1;
@@ -4498,6 +4166,55 @@ static int init_poc(H264Context *h){
      return 0;
  }
  
+
+/**
+ * Fills the per-context scan tables of h and selects the *_q0 variants.
+ *
+ * As long as the C IDCT functions (ff_h264_idct_add_c / ff_h264_idct8_add_c)
+ * are installed in s->dsp, the static tables are copied unchanged. For any
+ * other IDCT each entry is stored with its low and high bit fields exchanged
+ * (2 bits each for the 16-entry tables, 3 bits each for the 64-entry ones).
+ *
+ * With sps.transform_bypass set, the *_q0 pointers refer to the unmodified
+ * static tables, otherwise to the per-context copies built here.
+ */
+static void init_scan_tables(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    int k;
+
+    /* 16-entry tables */
+    if(s->dsp.h264_idct_add != ff_h264_idct_add_c){ //FIXME little ugly
+        for(k=0; k<16; k++){
+            h->zigzag_scan[k] = (zigzag_scan[k]>>2) | ((zigzag_scan[k]<<2) & 0xF);
+            h-> field_scan[k] = ( field_scan[k]>>2) | (( field_scan[k]<<2) & 0xF);
+        }
+    }else{
+        memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
+        memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
+    }
+
+    /* 64-entry tables */
+    if(s->dsp.h264_idct8_add != ff_h264_idct8_add_c){
+        for(k=0; k<64; k++){
+            h->zigzag_scan8x8[k]       = (zigzag_scan8x8[k]>>3)       | ((zigzag_scan8x8[k]&7)<<3);
+            h->zigzag_scan8x8_cavlc[k] = (zigzag_scan8x8_cavlc[k]>>3) | ((zigzag_scan8x8_cavlc[k]&7)<<3);
+            h->field_scan8x8[k]        = (field_scan8x8[k]>>3)        | ((field_scan8x8[k]&7)<<3);
+            h->field_scan8x8_cavlc[k]  = (field_scan8x8_cavlc[k]>>3)  | ((field_scan8x8_cavlc[k]&7)<<3);
+        }
+    }else{
+        memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
+        memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
+        memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
+        memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
+    }
+
+    /* tables used when the transform is bypassed */
+    if(!h->sps.transform_bypass){ //FIXME same ugly
+        h->zigzag_scan_q0          = h->zigzag_scan;
+        h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
+        h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
+        h->field_scan_q0           = h->field_scan;
+        h->field_scan8x8_q0        = h->field_scan8x8;
+        h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
+    }else{
+        h->zigzag_scan_q0          = zigzag_scan;
+        h->zigzag_scan8x8_q0       = zigzag_scan8x8;
+        h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
+        h->field_scan_q0           = field_scan;
+        h->field_scan8x8_q0        = field_scan8x8;
+        h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
+    }
+}
  /**
   * decodes a slice header.
   * this will allso call MPV_common_init() and frame_start() as needed
@@ -4546,17 +4263,17 @@ static int decode_slice_header(H264Context *h){
          av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
          return -1;
      }
-    h->pps= h->pps_buffer[pps_id];
-    if(h->pps.slice_group_count == 0){
+    if(!h->pps_buffers[pps_id]) {
          av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
          return -1;
      }
+    h->pps= *h->pps_buffers[pps_id];
  
-    h->sps= h->sps_buffer[ h->pps.sps_id ];
-    if(h->sps.log2_max_frame_num == 0){
+    if(!h->sps_buffers[h->pps.sps_id]) {
          av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
          return -1;
      }
+    h->sps = *h->sps_buffers[h->pps.sps_id];
  
      if(h->dequant_coeff_pps != pps_id){
          h->dequant_coeff_pps = pps_id;
@@ -4584,50 +4301,7 @@ static int decode_slice_header(H264Context *h){
          if (MPV_common_init(s) < 0)
              return -1;
  
-        if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
-            memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
-            memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
-        }else{
-            int i;
-            for(i=0; i<16; i++){
-#define T(x) (x>>2) | ((x<<2) & 0xF)
-                h->zigzag_scan[i] = T(zigzag_scan[i]);
-                h-> field_scan[i] = T( field_scan[i]);
-#undef T
-            }
-        }
-        if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
-            memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
-            memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
-            memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
-            memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
-        }else{
-            int i;
-            for(i=0; i<64; i++){
-#define T(x) (x>>3) | ((x&7)<<3)
-                h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
-                h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
-                h->field_scan8x8[i]        = T(field_scan8x8[i]);
-                h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
-#undef T
-            }
-        }
-        if(h->sps.transform_bypass){ //FIXME same ugly
-            h->zigzag_scan_q0          = zigzag_scan;
-            h->zigzag_scan8x8_q0       = zigzag_scan8x8;
-            h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
-            h->field_scan_q0           = field_scan;
-            h->field_scan8x8_q0        = field_scan8x8;
-            h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
-        }else{
-            h->zigzag_scan_q0          = h->zigzag_scan;
-            h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
-            h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
-            h->field_scan_q0           = h->field_scan;
-            h->field_scan8x8_q0        = h->field_scan8x8;
-            h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
-        }
-
+        init_scan_tables(h);
          alloc_tables(h);
  
          s->avctx->width = s->width;
@@ -4755,7 +4429,7 @@ static int decode_slice_header(H264Context *h){
          h->use_weight = 0;
  
      if(s->current_picture.reference)
-        decode_ref_pic_marking(h);
+        decode_ref_pic_marking(h, &s->gb);
  
      if(FRAME_MBAFF)
          fill_mbaff_ref_list(h);
@@ -4776,7 +4450,8 @@ static int decode_slice_header(H264Context *h){
          return -1;
      }
      s->qscale= tmp;
-    h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
+    h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
+    h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
      //FIXME qscale / qp ... stuff
      if(h->slice_type == SP_TYPE){
          get_bits1(&s->gb); /* sp_for_switch_flag */
@@ -5161,7 +4836,7 @@ decode_intra_mb:
      if(IS_INTRA_PCM(mb_type)){
          unsigned int x, y;
  
-        // we assume these blocks are very rare so we dont optimize it
+        // We assume these blocks are very rare so we do not optimize it.
          align_get_bits(&s->gb);
  
          // The pixels are stored in the same order as levels in h->mb array.
@@ -5189,7 +4864,8 @@ decode_intra_mb:
  
          // In deblocking, the quantizer is 0
          s->current_picture.qscale_table[mb_xy]= 0;
-        h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
+        h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
+        h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
          // All coeffs are present
          memset(h->non_zero_count[mb_xy], 16, 16);
  
@@ -5299,8 +4975,6 @@ decode_intra_mb:
              dct8x8_allowed = get_dct8x8_allowed(h);
  
          for(list=0; list<h->list_count; list++){
-            const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
-
              for(i=0; i<4; i++){
                  if(IS_DIRECT(h->sub_mb_type[i])) {
                      h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
@@ -5465,7 +5139,7 @@ decode_intra_mb:
  
      if(cbp || IS_INTRA16x16(mb_type)){
          int i8x8, i4x4, chroma_idx;
-        int chroma_qp, dquant;
+        int dquant;
          GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
          const uint8_t *scan, *scan8x8, *dc_scan;
  
@@ -5494,7 +5168,8 @@ decode_intra_mb:
              else            s->qscale-= 52;
          }
  
-        h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
+        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
+        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
          if(IS_INTRA16x16(mb_type)){
              if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                  return -1; //FIXME continue if partitioned and other return -1 too
@@ -5552,9 +5227,10 @@ decode_intra_mb:
  
          if(cbp&0x20){
              for(chroma_idx=0; chroma_idx<2; chroma_idx++){
+                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                  for(i4x4=0; i4x4<4; i4x4++){
                      const int index= 16 + 4*chroma_idx + i4x4;
-                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
+                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
                          return -1;
                      }
                  }
@@ -5978,7 +5654,7 @@ static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
      return get_cabac_bypass_sign( &h->cabac, -mvd );
  }
  
-static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
+static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
      int nza, nzb;
      int ctx = 0;
  
@@ -6006,7 +5682,7 @@ static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
      return ctx + 4 * cat;
  }
  
-static const __attribute((used)) uint8_t last_coeff_flag_offset_8x8[63] = {
+static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -6039,7 +5715,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
  
      int index[64];
  
-    int last;
+    int av_unused last;
      int coeff_count = 0;
  
      int abslevel1 = 1;
@@ -6111,7 +5787,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
              index[coeff_count++] = last;\
          }
          const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
-#if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
+#if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
          coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
      } else {
          coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
@@ -6187,7 +5863,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
      return 0;
  }
  
-static void inline compute_mb_neighbors(H264Context *h)
+static inline void compute_mb_neighbors(H264Context *h)
  {
      MpegEncContext * const s = &h->s;
      const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
@@ -6304,7 +5980,7 @@ decode_intra_mb:
          const uint8_t *ptr;
          unsigned int x, y;
  
-        // We assume these blocks are very rare so we dont optimize it.
+        // We assume these blocks are very rare so we do not optimize it.
          // FIXME The two following lines get the bitstream position in the cabac
          // decode, I think it should be done by a function in cabac.h (or cabac.c).
          ptr= h->cabac.bytestream;
@@ -6343,7 +6019,8 @@ decode_intra_mb:
          h->chroma_pred_mode_table[mb_xy] = 0;
          // In deblocking, the quantizer is 0
          s->current_picture.qscale_table[mb_xy]= 0;
-        h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
+        h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
+        h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
          // All coeffs are present
          memset(h->non_zero_count[mb_xy], 16, 16);
          s->current_picture.mb_type[mb_xy]= mb_type;
@@ -6399,6 +6076,10 @@ decode_intra_mb:
              if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
                            h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
                  pred_direct_motion(h, &mb_type);
+                h->ref_cache[0][scan8[4]] =
+                h->ref_cache[1][scan8[4]] =
+                h->ref_cache[0][scan8[12]] =
+                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
                  if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
                      for( i = 0; i < 4; i++ )
                          if( IS_DIRECT(h->sub_mb_type[i]) )
@@ -6434,11 +6115,11 @@ decode_intra_mb:
  
          for(list=0; list<h->list_count; list++){
              for(i=0; i<4; i++){
+                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
                  if(IS_DIRECT(h->sub_mb_type[i])){
                      fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
                      continue;
                  }
-                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
  
                  if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
                      const int sub_mb_type= h->sub_mb_type[i];
@@ -6619,7 +6300,8 @@ decode_intra_mb:
              if(s->qscale<0) s->qscale+= 52;
              else            s->qscale-= 52;
          }
-        h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
+        h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
+        h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
  
          if( IS_INTRA16x16( mb_type ) ) {
              int i;
@@ -6671,10 +6353,11 @@ decode_intra_mb:
          if( cbp&0x20 ) {
              int c, i;
              for( c = 0; c < 2; c++ ) {
+                const uint32_t *qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
                  for( i = 0; i < 4; i++ ) {
                      const int index = 16 + 4 * c + i;
                      //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                    if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
+                    if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15) < 0)
                          return -1;
                  }
              }
@@ -7012,20 +6695,23 @@ static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
      int mb_xy, mb_type;
      int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
  
-    if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
+    mb_xy = mb_x + mb_y*s->mb_stride;
+
+    if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
+       (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
+                                      h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
          filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
          return;
      }
      assert(!FRAME_MBAFF);
  
-    mb_xy = mb_x + mb_y*s->mb_stride;
      mb_type = s->current_picture.mb_type[mb_xy];
      qp = s->current_picture.qscale_table[mb_xy];
      qp0 = s->current_picture.qscale_table[mb_xy-1];
      qp1 = s->current_picture.qscale_table[h->top_mb_xy];
-    qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
-    qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
-    qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
+    qpc = get_chroma_qp( h, 0, qp );
+    qpc0 = get_chroma_qp( h, 0, qp0 );
+    qpc1 = get_chroma_qp( h, 0, qp1 );
      qp0 = (qp + qp0 + 1) >> 1;
      qp1 = (qp + qp1 + 1) >> 1;
      qpc0 = (qpc + qpc0 + 1) >> 1;
@@ -7131,7 +6817,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
      //for sufficiently low qp, filtering wouldn't do anything
      //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
      if(!FRAME_MBAFF){
-        int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
+        int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
          int qp = s->current_picture.qscale_table[mb_xy];
          if(qp <= qp_thresh
             && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
@@ -7154,7 +6840,8 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
          const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
          int16_t bS[8];
          int qp[2];
-        int chroma_qp[2];
+        int bqp[2];
+        int rqp[2];
          int mb_qp, mbn0_qp, mbn1_qp;
          int i;
          first_vertical_edge_done = 1;
@@ -7180,18 +6867,22 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
          mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
          mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
          qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
-        chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
-                         get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
+        bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
+                   get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
+        rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
+                   get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
          qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
-        chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
-                         get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
+        bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
+                   get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
+        rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
+                   get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
  
          /* Filter edge */
-        tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
+        tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
          { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
          filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
-        filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
-        filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
+        filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
+        filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
      }
      /* dir : 0 -> vertical edge, 1 -> horizontal edge */
      for( dir = 0; dir < 2; dir++ )
@@ -7229,7 +6920,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
              unsigned int tmp_linesize   = 2 *   linesize;
              unsigned int tmp_uvlinesize = 2 * uvlinesize;
              int mbn_xy = mb_xy - 2 * s->mb_stride;
-            int qp, chroma_qp;
+            int qp;
              int i, j;
              int16_t bS[4];
  
@@ -7253,10 +6944,10 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
                  tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
                  { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                  filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
-                chroma_qp = ( h->chroma_qp +
-                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
-                filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+                filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
+                                  ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+                filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
+                                  ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
              }
  
              start = 1;
@@ -7353,18 +7044,18 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
              if( dir == 0 ) {
                  filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
                  if( (edge&1) == 0 ) {
-                    int chroma_qp = ( h->chroma_qp +
-                                      get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                    filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
-                    filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
+                    filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
+                                      ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+                    filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
+                                      ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                  }
              } else {
                  filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
                  if( (edge&1) == 0 ) {
-                    int chroma_qp = ( h->chroma_qp +
-                                      get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                    filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
-                    filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
+                    filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
+                                      ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+                    filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
+                                      ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                  }
              }
          }
@@ -7421,7 +7112,7 @@ static int decode_slice(H264Context *h){
              eos = get_cabac_terminate( &h->cabac );
  
              if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
-                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
+                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
                  ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
                  return -1;
              }
@@ -7753,6 +7444,26 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
      }
  }
  
+/**
+ * Returns the SPS / PPS structure stored in slot 'id' of the array 'vec',
+ * allocating it with av_mallocz() first if the slot is still empty.
+ *
+ * @param h    context whose avctx is used for error logging
+ * @param vec  array of parameter set pointers, indexed by id
+ * @param id   index of the requested parameter set
+ * @param max  exclusive upper bound for id
+ * @param size size in bytes of a newly allocated structure
+ * @param name name used in the error messages ("sps" / "pps" at the call sites)
+ * @return vec[id], or NULL if id is out of range or the allocation failed
+ */
+static void *
+alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
+                    const size_t size, const char *name)
+{
+    if(id>=max) {
+        // id is unsigned; %u keeps large bogus ids from being logged as negative numbers
+        av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%u) out of range\n", name, id);
+        return NULL;
+    }
+
+    if(!vec[id]) {
+        vec[id] = av_mallocz(size);
+        if(vec[id] == NULL)
+            av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
+    }
+    // an already populated slot is returned as is, so a repeated id reuses its structure
+    return vec[id];
+}
+
  static inline int decode_seq_parameter_set(H264Context *h){
      MpegEncContext * const s = &h->s;
      int profile_idc, level_idc;
@@ -7769,13 +7480,10 @@ static inline int decode_seq_parameter_set(H264Context *h){
      level_idc= get_bits(&s->gb, 8);
      sps_id= get_ue_golomb(&s->gb);
  
-    if (sps_id >= MAX_SPS_COUNT){
-        // ok it has gone out of hand, someone is sending us bad stuff.
-        av_log(h->s.avctx, AV_LOG_ERROR, "illegal sps_id (%d)\n", sps_id);
+    sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
+    if(sps == NULL)
          return -1;
-    }
  
-    sps= &h->sps_buffer[ sps_id ];
      sps->profile_idc= profile_idc;
      sps->level_idc= level_idc;
  
@@ -7880,19 +7588,25 @@ static inline int decode_seq_parameter_set(H264Context *h){
      return 0;
  }
  
+/**
+ * Fills pps->chroma_qp_table[t] for the chroma QP index offset 'index'.
+ * Entry i receives chroma_qp[] looked up at i + index clipped to 0..51.
+ *
+ * @param pps   picture parameter set whose table is written
+ * @param t     which of the two tables to fill (0 or 1 at the call sites)
+ * @param index chroma QP index offset to apply
+ */
+static void
+build_qp_table(PPS *pps, int t, int index)
+{
+    int i;
+    // Write every entry, including the last one: the caller copies 256 bytes
+    // of table 0 into table 1 when both offsets are equal, so the loop must
+    // not stop at 255 (presumably the table holds 256 one-byte entries).
+    for(i = 0; i < 256; i++)
+        pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
+}
+
  static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
      MpegEncContext * const s = &h->s;
      unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
      PPS *pps;
  
-    if(pps_id>=MAX_PPS_COUNT){
-        av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
+    pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
+    if(pps == NULL)
          return -1;
-    }
-    pps = &h->pps_buffer[pps_id];
  
      tmp= get_ue_golomb(&s->gb);
-    if(tmp>=MAX_SPS_COUNT){
+    if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
          av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
          return -1;
      }
@@ -7950,7 +7664,7 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
      pps->weighted_bipred_idc= get_bits(&s->gb, 2);
      pps->init_qp= get_se_golomb(&s->gb) + 26;
      pps->init_qs= get_se_golomb(&s->gb) + 26;
-    pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
+    pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
      pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
      pps->constrained_intra_pred= get_bits1(&s->gb);
      pps->redundant_pic_cnt_present = get_bits1(&s->gb);
@@ -7962,18 +7676,27 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
  
      if(get_bits_count(&s->gb) < bit_length){
          pps->transform_8x8_mode= get_bits1(&s->gb);
-        decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
-        get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
+        decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
+        pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
+    } else {
+        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
      }
  
+    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
+    if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
+        build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
+        h->pps.chroma_qp_diff= 1;
+    } else
+        memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
+
      if(s->avctx->debug&FF_DEBUG_PICT_INFO){
-        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
+        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
                 pps_id, pps->sps_id,
                 pps->cabac ? "CABAC" : "CAVLC",
                 pps->slice_group_count,
                 pps->ref_count[0], pps->ref_count[1],
                 pps->weighted_pred ? "weighted" : "",
-               pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
+               pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
                 pps->deblocking_filter_parameters_present ? "LPAR" : "",
                 pps->constrained_intra_pred ? "CONSTR" : "",
                 pps->redundant_pic_cnt_present ? "REDU" : "",
@@ -7984,116 +7707,6 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
      return 0;
  }
  
-/**
- * finds the end of the current frame in the bitstream.
- * @return the position of the first byte of the next frame, or -1
- */
-static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
-    int i;
-    uint32_t state;
-    ParseContext *pc = &(h->s.parse_context);
-//printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
-//    mb_addr= pc->mb_addr - 1;
-    state= pc->state;
-    if(state>13)
-        state= 7;
-
-    for(i=0; i<buf_size; i++){
-        if(state==7){
-            for(; i<buf_size; i++){
-                if(!buf[i]){
-                    state=2;
-                    break;
-                }
-            }
-        }else if(state<=2){
-            if(buf[i]==1)   state^= 5; //2->7, 1->4, 0->5
-            else if(buf[i]) state = 7;
-            else            state>>=1; //2->1, 1->0, 0->0
-        }else if(state<=5){
-            int v= buf[i] & 0x1F;
-            if(v==7 || v==8 || v==9){
-                if(pc->frame_start_found){
-                    i++;
-found:
-                    pc->state=7;
-                    pc->frame_start_found= 0;
-                    return i-(state&5);
-                }
-            }else if(v==1 || v==2 || v==5){
-                if(pc->frame_start_found){
-                    state+=8;
-                    continue;
-                }else
-                    pc->frame_start_found = 1;
-            }
-            state= 7;
-        }else{
-            if(buf[i] & 0x80)
-                goto found;
-            state= 7;
-        }
-    }
-    pc->state= state;
-    return END_NOT_FOUND;
-}
-
-#ifdef CONFIG_H264_PARSER
-static int h264_parse(AVCodecParserContext *s,
-                      AVCodecContext *avctx,
-                      uint8_t **poutbuf, int *poutbuf_size,
-                      const uint8_t *buf, int buf_size)
-{
-    H264Context *h = s->priv_data;
-    ParseContext *pc = &h->s.parse_context;
-    int next;
-
-    if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
-        next= buf_size;
-    }else{
-    next= find_frame_end(h, buf, buf_size);
-
-    if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
-        *poutbuf = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
-    }
-
-    if(next<0){
-        find_frame_end(h, &pc->buffer[pc->last_index + next], -next); //update state
-    }
-    }
-
-    *poutbuf = (uint8_t *)buf;
-    *poutbuf_size = buf_size;
-    return next;
-}
-
-static int h264_split(AVCodecContext *avctx,
-                      const uint8_t *buf, int buf_size)
-{
-    int i;
-    uint32_t state = -1;
-    int has_sps= 0;
-
-    for(i=0; i<=buf_size; i++){
-        if((state&0xFFFFFF1F) == 0x107)
-            has_sps=1;
-/*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
-        }*/
-        if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
-            if(has_sps){
-                while(i>4 && buf[i-5]==0) i--;
-                return i-4;
-            }
-        }
-        if (i<buf_size)
-            state= (state<<8) | buf[i];
-    }
-    return 0;
-}
-#endif /* CONFIG_H264_PARSER */
-
  static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
      MpegEncContext * const s = &h->s;
      AVCodecContext * const avctx= s->avctx;
@@ -8116,40 +7729,40 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
          uint8_t *ptr;
          int i, nalsize = 0;
  
-      if(h->is_avc) {
-        if(buf_index >= buf_size) break;
-        nalsize = 0;
-        for(i = 0; i < h->nal_length_size; i++)
-            nalsize = (nalsize << 8) | buf[buf_index++];
-        if(nalsize <= 1 || nalsize > buf_size){
-            if(nalsize == 1){
-                buf_index++;
-                continue;
-            }else{
-                av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
-                break;
+        if(h->is_avc) {
+            if(buf_index >= buf_size) break;
+            nalsize = 0;
+            for(i = 0; i < h->nal_length_size; i++)
+                nalsize = (nalsize << 8) | buf[buf_index++];
+            if(nalsize <= 1 || nalsize > buf_size - buf_index){ // compare by subtraction: nalsize+buf_index could overflow int
+                if(nalsize == 1){
+                    buf_index++;
+                    continue;
+                }else{
+                    av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
+                    break;
+                }
+            }
+        } else {
+            // start code prefix search
+            for(; buf_index + 3 < buf_size; buf_index++){
+                // This should always succeed in the first iteration.
+                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
+                    break;
              }
-        }
-      } else {
-        // start code prefix search
-        for(; buf_index + 3 < buf_size; buf_index++){
-            // this should allways succeed in the first iteration
-            if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
-                break;
-        }
  
-        if(buf_index+3 >= buf_size) break;
+            if(buf_index+3 >= buf_size) break;
  
-        buf_index+=3;
-      }
+            buf_index+=3;
+        }
  
          ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
-        if (ptr==NULL || dst_length <= 0){
+        if (ptr==NULL || dst_length < 0){
              return -1;
          }
-        while(ptr[dst_length - 1] == 0 && dst_length > 1)
+        while(dst_length > 0 && ptr[dst_length - 1] == 0) // test the length first so ptr[-1] is never read
              dst_length--;
-        bit_length= 8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1);
+        bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
  
          if(s->avctx->debug&FF_DEBUG_STARTCODE){
              av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
@@ -8160,7 +7773,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
  
          buf_index += consumed;
  
-        if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
+        if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
             ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
              continue;
  
@@ -8257,7 +7870,7 @@ static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
  
          return pos;
      }else{
-        if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
+        if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
          if(pos+10>buf_size) pos=buf_size; // oops ;)
  
          return pos;
@@ -8302,9 +7915,9 @@ static int decode_frame(AVCodecContext *avctx,
      }
  
      if(s->flags&CODEC_FLAG_TRUNCATED){
-        int next= find_frame_end(h, buf, buf_size);
+        int next= ff_h264_find_frame_end(h, buf, buf_size);
  
-        if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
+        if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
              return buf_size;
  //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
      }
@@ -8492,6 +8105,7 @@ static inline void fill_mb_avail(H264Context *h){
  #endif
  
  #if 0 //selftest
+#undef random
  #define COUNT 8000
  #define SIZE (COUNT*40)
  int main(){
@@ -8523,7 +8137,7 @@ int main(){
          START_TIMER
          j= get_ue_golomb(&gb);
          if(j != i){
-            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
+            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
  //            return -1;
          }
          STOP_TIMER("get_ue_golomb");
@@ -8548,7 +8162,7 @@ int main(){
          START_TIMER
          j= get_se_golomb(&gb);
          if(j != i - COUNT/2){
-            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
+            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
  //            return -1;
          }
          STOP_TIMER("get_se_golomb");
@@ -8652,7 +8266,7 @@ int main(){
          }
  
          if(memcmp(bitstream, out, COUNT)){
-            printf("missmatch\n");
+            printf("mismatch\n");
              return -1;
          }
      }
@@ -8670,7 +8284,8 @@ static int decode_end(AVCodecContext *avctx)
      H264Context *h = avctx->priv_data;
      MpegEncContext *s = &h->s;
  
-    av_freep(&h->rbsp_buffer);
+    av_freep(&h->rbsp_buffer[0]);
+    av_freep(&h->rbsp_buffer[1]);
      free_tables(h); //FIXME cleanup init stuff perhaps
      MPV_common_end(s);
  
@@ -8693,15 +8308,4 @@ AVCodec h264_decoder = {
      .flush= flush_dpb,
  };
  
-#ifdef CONFIG_H264_PARSER
-AVCodecParser h264_parser = {
-    { CODEC_ID_H264 },
-    sizeof(H264Context),
-    NULL,
-    h264_parse,
-    ff_parse_close,
-    h264_split,
-};
-#endif
-
  #include "svq3.c"