unused variable

[ffmpeg] / libavcodec / h264.c
diff --git a/libavcodec/h264.c b/libavcodec/h264.c

index 42e7a5bc8c085a0c99816f9ab8f5cb82d377c1da..b923d50c368f57be54910bc8ab8971540d3e96b4 100644 (file)
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -14,7 +14,7 @@
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   *
   */
  
@@ -162,7 +162,7 @@ typedef struct H264Context{
  #define NAL_SPS_EXT             13
  #define NAL_AUXILIARY_SLICE     19
      uint8_t *rbsp_buffer;
-    int rbsp_buffer_size;
+    unsigned int rbsp_buffer_size;
  
      /**
        * Used to parse AVC variant of h264
@@ -199,14 +199,14 @@ typedef struct H264Context{
       * non zero coeff count cache.
       * is 64 if not available.
       */
-    uint8_t non_zero_count_cache[6*8] __align8;
+    DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
      uint8_t (*non_zero_count)[16];
  
      /**
       * Motion vector cache.
       */
-    int16_t mv_cache[2][5*8][2] __align8;
-    int8_t ref_cache[2][5*8] __align8;
+    DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
+    DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
  #define LIST_NOT_USED -1 //FIXME rename?
  #define PART_NOT_AVAILABLE -2
  
@@ -335,7 +335,7 @@ typedef struct H264Context{
      GetBitContext *intra_gb_ptr;
      GetBitContext *inter_gb_ptr;
  
-    DCTELEM mb[16*24] __align8;
+    DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
  
      /**
       * Cabac
@@ -352,14 +352,18 @@ typedef struct H264Context{
      uint8_t     *chroma_pred_mode_table;
      int         last_qscale_diff;
      int16_t     (*mvd_table[2])[2];
-    int16_t     mvd_cache[2][5*8][2] __align8;
+    DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
      uint8_t     *direct_table;
      uint8_t     direct_cache[5*8];
  
      uint8_t zigzag_scan[16];
      uint8_t field_scan[16];
+    uint8_t zigzag_scan8x8[64];
+    uint8_t zigzag_scan8x8_cavlc[64];
      const uint8_t *zigzag_scan_q0;
      const uint8_t *field_scan_q0;
+    const uint8_t *zigzag_scan8x8_q0;
+    const uint8_t *zigzag_scan8x8_cavlc_q0;
  
      int x264_build;
  }H264Context;
@@ -377,7 +381,7 @@ static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  
-static inline uint32_t pack16to32(int a, int b){
+static always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + (a<<16);
  #else
@@ -391,7 +395,7 @@ static inline uint32_t pack16to32(int a, int b){
   * @param w width of the rectangle, should be a constant
   * @param size the size of val (1 or 4), should be a constant
   */
-static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
+static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
      uint8_t *p= (uint8_t*)vp;
      assert(size==1 || size==4);
  
@@ -450,7 +454,7 @@ static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t v
          assert(0);
  }
  
-static inline void fill_caches(H264Context *h, int mb_type, int for_deblock){
+static void fill_caches(H264Context *h, int mb_type, int for_deblock){
      MpegEncContext * const s = &h->s;
      const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
      int topleft_xy, top_xy, topright_xy, left_xy[2];
@@ -1202,7 +1206,7 @@ static inline void direct_ref_list_init(H264Context * const h){
      for(list=0; list<2; list++){
          for(i=0; i<ref1->ref_count[list]; i++){
              const int poc = ref1->ref_poc[list][i];
-            h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE;
+            h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
              for(j=0; j<h->ref_count[list]; j++)
                  if(h->ref_list[list][j].poc == poc){
                      h->map_col_to_list0[list][i] = j;
@@ -2607,7 +2611,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
      const int pic_width  = 16*s->mb_width;
      const int pic_height = 16*s->mb_height;
  
-    assert(pic->data[0]);
+    if(!pic->data[0])
+        return;
  
      if(mx&7) extra_width -= 3;
      if(my&7) extra_height -= 3;
@@ -2709,19 +2714,19 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
          if(h->use_weight == 2){
              int weight0 = h->implicit_weight[refn0][refn1];
              int weight1 = 64 - weight0;
-            luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0, 0);
+            luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0);
          }else{
              luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
                              h->luma_weight[0][refn0], h->luma_weight[1][refn1],
-                            h->luma_offset[0][refn0], h->luma_offset[1][refn1]);
+                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
              chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
                              h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
-                            h->chroma_offset[0][refn0][0], h->chroma_offset[1][refn1][0]);
+                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
              chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
                              h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
-                            h->chroma_offset[0][refn0][1], h->chroma_offset[1][refn1][1]);
+                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
          }
      }else{
          int list = list1 ? 1 : 0;
@@ -2952,6 +2957,7 @@ static void free_tables(H264Context *h){
  
  static void init_dequant8_coeff_table(H264Context *h){
      int i,q,x;
+    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
      h->dequant8_coeff[0] = h->dequant8_buffer[0];
      h->dequant8_coeff[1] = h->dequant8_buffer[1];
  
@@ -2965,8 +2971,9 @@ static void init_dequant8_coeff_table(H264Context *h){
              int shift = div6[q];
              int idx = rem6[q];
              for(x=0; x<64; x++)
-                h->dequant8_coeff[i][q][x] = ((uint32_t)dequant8_coeff_init[idx][
-                    dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * h->pps.scaling_matrix8[i][x]) << shift;
+                h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
+                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
+                    h->pps.scaling_matrix8[i][x]) << shift;
          }
      }
  }
@@ -3314,6 +3321,7 @@ static void hl_decode_mb(H264Context *h){
      const unsigned int bottom = mb_y & 1;
      const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
      void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
  
      if(!s->decode)
          return;
@@ -3337,9 +3345,16 @@ static void hl_decode_mb(H264Context *h){
  //        dct_offset = s->linesize * 16;
      }
  
-    idct_add = transform_bypass
-             ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4
-             : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add;
+    if(transform_bypass){
+        idct_dc_add =
+        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
+    }else if(IS_8x8DCT(mb_type)){
+        idct_dc_add = s->dsp.h264_idct8_dc_add;
+        idct_add = s->dsp.h264_idct8_add;
+    }else{
+        idct_dc_add = s->dsp.h264_idct_dc_add;
+        idct_add = s->dsp.h264_idct_add;
+    }
  
      if (IS_INTRA_PCM(mb_type)) {
          unsigned int x, y;
@@ -3389,17 +3404,22 @@ static void hl_decode_mb(H264Context *h){
                          for(i=0; i<16; i+=4){
                              uint8_t * const ptr= dest_y + block_offset[i];
                              const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                              h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                     (h->topright_samples_available<<(i+1))&0x8000, linesize);
-                            if(h->non_zero_count_cache[ scan8[i] ])
-                                idct_add(ptr, h->mb + i*16, linesize);
+                            if(nnz){
+                                if(nnz == 1 && h->mb[i*16])
+                                    idct_dc_add(ptr, h->mb + i*16, linesize);
+                                else
+                                    idct_add(ptr, h->mb + i*16, linesize);
+                            }
                          }
                      }else
                      for(i=0; i<16; i++){
                          uint8_t * const ptr= dest_y + block_offset[i];
                          uint8_t *topright;
                          const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-                        int tr;
+                        int nnz, tr;
  
                          if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                              const int topright_avail= (h->topright_samples_available<<i)&0x8000;
@@ -3413,10 +3433,14 @@ static void hl_decode_mb(H264Context *h){
                              topright= NULL;
  
                          h->pred4x4[ dir ](ptr, topright, linesize);
-                        if(h->non_zero_count_cache[ scan8[i] ]){
-                            if(s->codec_id == CODEC_ID_H264)
-                                idct_add(ptr, h->mb + i*16, linesize);
-                            else
+                        nnz = h->non_zero_count_cache[ scan8[i] ];
+                        if(nnz){
+                            if(s->codec_id == CODEC_ID_H264){
+                                if(nnz == 1 && h->mb[i*16])
+                                    idct_dc_add(ptr, h->mb + i*16, linesize);
+                                else
+                                    idct_add(ptr, h->mb + i*16, linesize);
+                            }else
                                  svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                          }
                      }
@@ -3453,11 +3477,23 @@ static void hl_decode_mb(H264Context *h){
  
          if(!IS_INTRA4x4(mb_type)){
              if(s->codec_id == CODEC_ID_H264){
-                const int di = IS_8x8DCT(mb_type) ? 4 : 1;
-                for(i=0; i<16; i+=di){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
-                        uint8_t * const ptr= dest_y + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, linesize);
+                if(IS_INTRA16x16(mb_type)){
+                    for(i=0; i<16; i++){
+                        if(h->non_zero_count_cache[ scan8[i] ])
+                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                        else if(h->mb[i*16])
+                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                    }
+                }else{
+                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+                    for(i=0; i<16; i+=di){
+                        int nnz = h->non_zero_count_cache[ scan8[i] ];
+                        if(nnz){
+                            if(nnz==1 && h->mb[i*16])
+                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                            else
+                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                        }
                      }
                  }
              }else{
@@ -3471,34 +3507,26 @@ static void hl_decode_mb(H264Context *h){
          }
  
          if(!(s->flags&CODEC_FLAG_GRAY)){
-            idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add;
-            if(!transform_bypass){
+            uint8_t *dest[2] = {dest_cb, dest_cr};
+            if(transform_bypass){
+                idct_add = idct_dc_add = s->dsp.add_pixels4;
+            }else{
+                idct_add = s->dsp.h264_idct_add;
+                idct_dc_add = s->dsp.h264_idct_dc_add;
                  chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
                  chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
              }
              if(s->codec_id == CODEC_ID_H264){
-                for(i=16; i<16+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cb + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, uvlinesize);
-                    }
-                }
-                for(i=20; i<20+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cr + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, uvlinesize);
-                    }
+                for(i=16; i<16+8; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ])
+                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+                    else if(h->mb[i*16])
+                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                  }
              }else{
-                for(i=16; i<16+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cb + block_offset[i];
-                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
-                    }
-                }
-                for(i=20; i<20+4; i++){
+                for(i=16; i<16+8; i++){
                      if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cr + block_offset[i];
+                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                          svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                      }
                  }
@@ -3887,8 +3915,13 @@ static void idr(H264Context *h){
  static void flush_dpb(AVCodecContext *avctx){
      H264Context *h= avctx->priv_data;
      int i;
-    for(i=0; i<16; i++)
+    for(i=0; i<16; i++) {
+        if(h->delayed_pic[i])
+            h->delayed_pic[i]->reference= 0;
          h->delayed_pic[i]= NULL;
+    }
+    if(h->delayed_output_pic)
+        h->delayed_output_pic->reference= 0;
      h->delayed_output_pic= NULL;
      idr(h);
      if(h->s.current_picture_ptr)
@@ -4263,8 +4296,8 @@ static int decode_slice_header(H264Context *h){
      s->mb_width= h->sps.mb_width;
      s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
  
-    h->b_stride=  s->mb_width*4 + 1;
-    h->b8_stride= s->mb_width*2 + 1;
+    h->b_stride=  s->mb_width*4;
+    h->b8_stride= s->mb_width*2;
  
      s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
      if(h->sps.frame_mbs_only_flag)
@@ -4290,14 +4323,31 @@ static int decode_slice_header(H264Context *h){
  #define T(x) (x>>2) | ((x<<2) & 0xF)
                  h->zigzag_scan[i] = T(zigzag_scan[i]);
                  h-> field_scan[i] = T( field_scan[i]);
+#undef T
+            }
+        }
+        if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
+            memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
+            memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
+        }else{
+            int i;
+            for(i=0; i<64; i++){
+#define T(x) (x>>3) | ((x&7)<<3)
+                h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
+                h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
+#undef T
              }
          }
          if(h->sps.transform_bypass){ //FIXME same ugly
              h->zigzag_scan_q0 = zigzag_scan;
              h->field_scan_q0 = field_scan;
+            h->zigzag_scan8x8_q0 = zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
          }else{
              h->zigzag_scan_q0 = h->zigzag_scan;
              h->field_scan_q0 = h->field_scan;
+            h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
          }
  
          alloc_tables(h);
@@ -4309,7 +4359,11 @@ static int decode_slice_header(H264Context *h){
              s->avctx->sample_aspect_ratio.den = 1;
  
          if(h->sps.timing_info_present_flag){
-            s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
+            s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
+            if(h->x264_build > 0 && h->x264_build < 44)
+                s->avctx->time_base.den *= 2;
+            av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
+                      s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
          }
      }
  
@@ -5070,7 +5124,7 @@ decode_intra_mb:
          int i8x8, i4x4, chroma_idx;
          int chroma_qp, dquant;
          GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
  
  //        fill_non_zero_count_cache(h);
  
@@ -5081,6 +5135,7 @@ decode_intra_mb:
              scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
              dc_scan= luma_dc_zigzag_scan;
          }
+        scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
  
          dquant= get_se_golomb(&s->gb);
  
@@ -5122,12 +5177,12 @@ decode_intra_mb:
                          DCTELEM *buf = &h->mb[64*i8x8];
                          uint8_t *nnz;
                          for(i4x4=0; i4x4<4; i4x4++){
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                  h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                  return -1;
                          }
                          nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                        nnz[0] |= nnz[1] | nnz[8] | nnz[9];
+                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                      }else{
                          for(i4x4=0; i4x4<4; i4x4++){
                              const int index= i4x4 + 4*i8x8;
@@ -5221,19 +5276,11 @@ static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_sl
          return 25;  /* PCM */
  
      mb_type = 1; /* I16x16 */
-    if( get_cabac( &h->cabac, &state[1] ) )
-        mb_type += 12;  /* cbp_luma != 0 */
-
-    if( get_cabac( &h->cabac, &state[2] ) ) {
-        if( get_cabac( &h->cabac, &state[2+intra_slice] ) )
-            mb_type += 4 * 2;   /* cbp_chroma == 2 */
-        else
-            mb_type += 4 * 1;   /* cbp_chroma == 1 */
-    }
-    if( get_cabac( &h->cabac, &state[3+intra_slice] ) )
-        mb_type += 2;
-    if( get_cabac( &h->cabac, &state[3+2*intra_slice] ) )
-        mb_type += 1;
+    mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
+    if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
+        mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
+    mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
+    mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
      return mb_type;
  }
  
@@ -5246,15 +5293,11 @@ static int decode_cabac_mb_type( H264Context *h ) {
          if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
              /* P-type */
              if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
-                if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
-                    return 0; /* P_L0_D16x16; */
-                else
-                    return 3; /* P_8x8; */
+                /* P_L0_D16x16, P_8x8 */
+                return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
              } else {
-                if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
-                    return 2; /* P_L0_D8x16; */
-                else
-                    return 1; /* P_L0_D16x8; */
+                /* P_L0_D8x16, P_L0_D16x8 */
+                return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
              }
          } else {
              return decode_cabac_intra_mb_type(h, 17, 0) + 5;
@@ -5265,11 +5308,9 @@ static int decode_cabac_mb_type( H264Context *h ) {
          int ctx = 0;
          int bits;
  
-        if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )
-                      && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
+        if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
              ctx++;
-        if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] )
-                      && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
+        if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
              ctx++;
  
          if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
@@ -5312,10 +5353,9 @@ static int decode_cabac_mb_skip( H264Context *h) {
      if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
          ctx++;
  
-    if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
-        return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
-    else /* B-frame */
-        return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
+    if( h->slice_type == B_TYPE )
+        ctx += 13;
+    return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
  }
  
  static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
@@ -5372,14 +5412,17 @@ static const uint8_t block_idx_xy[4][4] = {
  };
  
  static int decode_cabac_mb_cbp_luma( H264Context *h) {
-    MpegEncContext * const s = &h->s;
-
      int cbp = 0;
+    int cbp_b = -1;
      int i8x8;
  
+    if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
+        cbp_b = h->top_cbp;
+        tprintf("cbp_b = top_cbp = %x\n", cbp_b);
+    }
+
      for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
          int cbp_a = -1;
-        int cbp_b = -1;
          int x, y;
          int ctx = 0;
  
@@ -5388,17 +5431,13 @@ static int decode_cabac_mb_cbp_luma( H264Context *h) {
  
          if( x > 0 )
              cbp_a = cbp;
-        else if( s->mb_x > 0 && (h->slice_table[h->left_mb_xy[0]] == h->slice_num)) {
+        else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
              cbp_a = h->left_cbp;
              tprintf("cbp_a = left_cbp = %x\n", cbp_a);
          }
  
          if( y > 0 )
              cbp_b = cbp;
-        else if( s->mb_y > 0 && (h->slice_table[h->top_mb_xy] == h->slice_num)) {
-            cbp_b = h->top_cbp;
-            tprintf("cbp_b = top_cbp = %x\n", cbp_b);
-        }
  
          /* No need to test for skip as we put 0 for skip block */
          /* No need to test for IPCM as we put 1 for IPCM block */
@@ -5458,7 +5497,7 @@ static int decode_cabac_mb_dqp( H264Context *h) {
          else
              ctx = 3;
          val++;
-        if(val > 52) //prevent infinite loop
+        if(val > 102) //prevent infinite loop
              return INT_MIN;
      }
  
@@ -5592,7 +5631,7 @@ static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
      return ctx + 4 * cat;
  }
  
-static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
+static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
      const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
      static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
      static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
@@ -5686,7 +5725,7 @@ static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat
          h->non_zero_count_cache[scan8[16+n]] = coeff_count;
      else {
          assert( cat == 5 );
-        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
      }
  
      for( i = coeff_count - 1; i >= 0; i-- ) {
@@ -5737,7 +5776,7 @@ static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat
      return 0;
  }
  
-void inline compute_mb_neighboors(H264Context *h)
+static void inline compute_mb_neighbors(H264Context *h)
  {
      MpegEncContext * const s = &h->s;
      const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
@@ -5797,7 +5836,7 @@ static int decode_mb_cabac(H264Context *h) {
  
      h->prev_mb_skipped = 0;
  
-    compute_mb_neighboors(h);
+    compute_mb_neighbors(h);
      if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
          av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
          return -1;
@@ -6129,7 +6168,7 @@ decode_intra_mb:
      s->current_picture.mb_type[mb_xy]= mb_type;
  
      if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
          int dqp;
  
          if(IS_INTERLACED(mb_type)){
@@ -6139,6 +6178,7 @@ decode_intra_mb:
              scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
              dc_scan= luma_dc_zigzag_scan;
          }
+        scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
  
          h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
          if( dqp == INT_MIN ){
@@ -6172,7 +6212,7 @@ decode_intra_mb:
                  if( cbp & (1<<i8x8) ) {
                      if( IS_8x8DCT(mb_type) ) {
                          if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
-                                zigzag_scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
+                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
                              return -1;
                      } else
                      for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
@@ -6543,6 +6583,18 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
       * frame numbers, not indices. */
      static const int ref2frm[18] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
  
+    //for sufficiently low qp, filtering wouldn't do anything
+    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
+    if(!h->mb_aff_frame){
+        int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
+        int qp = s->current_picture.qscale_table[mb_xy];
+        if(qp <= qp_thresh
+           && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
+           && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
+            return;
+        }
+    }
+
      if (h->mb_aff_frame
              // left mb is in picture
              && h->slice_table[mb_xy-1] != 255
@@ -6613,8 +6665,8 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
          const int mbm_type = s->current_picture.mb_type[mbm_xy];
          int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
  
-        const int edges = ((mb_type & mbm_type) & (MB_TYPE_16x16|MB_TYPE_SKIP))
-                                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
+        const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
+                                  == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
          // how often to recheck mv-based bS when iterating between edges
          const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
                                (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
@@ -6925,7 +6977,7 @@ static int decode_slice(H264Context *h){
              hl_decode_mb(h);
  
              if(ret<0){
-                fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+                av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                  ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
  
                  return -1;
@@ -7469,6 +7521,15 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
          nalsize = 0;
          for(i = 0; i < h->nal_length_size; i++)
              nalsize = (nalsize << 8) | buf[buf_index++];
+        if(nalsize <= 1){
+            if(nalsize == 1){
+                buf_index++;
+                continue;
+            }else{
+                av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
+                break;
+            }
+        }
        } else {
          // start code prefix search
          for(; buf_index + 3 < buf_size; buf_index++){
@@ -7512,6 +7573,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
                  av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
                  break;
              }
+            s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
              if(h->redundant_pic_count==0 && s->hurry_up < 5
                 && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
                 && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
@@ -7581,7 +7643,6 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
  
      s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
      s->current_picture_ptr->pict_type= s->pict_type;
-    s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
  
      h->prev_frame_num_offset= h->frame_num_offset;
      h->prev_frame_num= h->frame_num;
@@ -7737,7 +7798,9 @@ static int decode_frame(AVCodecContext *avctx,
              }
  
          out_of_order = !cross_idr && prev && out->poc < prev->poc;
-        if(prev && pics <= s->avctx->has_b_frames)
+        if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
+            { }
+        else if(prev && pics <= s->avctx->has_b_frames)
              out = prev;
          else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
             || (s->low_delay &&