git.sesse.net Git - ffmpeg/blobdiff - libavcodec/h264.c
don't forget table_size in the decode_frame return value
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index ddecc12aca05505a0ff882352b8779e45b8a11b0..15fdfcfc3abfb24f72d60a6b152da633743fcfbf 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -358,8 +358,12 @@ typedef struct H264Context{
 
     uint8_t zigzag_scan[16];
     uint8_t field_scan[16];
+    uint8_t zigzag_scan8x8[64];
+    uint8_t zigzag_scan8x8_cavlc[64];
     const uint8_t *zigzag_scan_q0;
     const uint8_t *field_scan_q0;
+    const uint8_t *zigzag_scan8x8_q0;
+    const uint8_t *zigzag_scan8x8_cavlc_q0;
 
     int x264_build;
 }H264Context;
@@ -711,7 +715,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             }
             h->mv_cache_clean[list]= 0;
 
-            if(IS_INTER(top_type)){
+            if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
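
This and the following hunks replace the blanket IS_INTER() test with USES_LIST(type, list), so a neighbouring macroblock only contributes motion vectors and reference indices to a prediction list it actually uses. For context, the macro (as defined in mpegvideo.h of this era; quoted from memory, so treat as approximate) is a plain bit test on the mb_type flags:

    #define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0) << (2*(list))))
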
@@ -731,7 +735,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             }
 
             //FIXME unify cleanup or sth
-            if(IS_INTER(left_type[0])){
+            if(USES_LIST(left_type[0], list)){
                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
@@ -745,7 +749,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
-            if(IS_INTER(left_type[1])){
+            if(USES_LIST(left_type[1], list)){
                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
@@ -763,7 +767,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
                 continue;
 
-            if(IS_INTER(topleft_type)){
+            if(USES_LIST(topleft_type, list)){
                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
@@ -773,7 +777,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
-            if(IS_INTER(topright_type)){
+            if(USES_LIST(topright_type, list)){
                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
@@ -797,14 +801,14 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 
             if( h->pps.cabac ) {
                 /* XXX beurk, Load mvd */
-                if(IS_INTER(topleft_type)){
+                if(USES_LIST(topleft_type, list)){
                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
                 }else{
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
                 }
 
-                if(IS_INTER(top_type)){
+                if(USES_LIST(top_type, list)){
                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
@@ -816,7 +820,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                 }
-                if(IS_INTER(left_type[0])){
+                if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
@@ -824,7 +828,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                 }
-                if(IS_INTER(left_type[1])){
+                if(USES_LIST(left_type[1], list)){
                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
@@ -1418,28 +1422,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){
     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
     int list;
 
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
+
     for(list=0; list<2; list++){
         int y;
-        if(!USES_LIST(mb_type, list)){
-            if(1){ //FIXME skip or never read if mb_type doesn't use it
-                for(y=0; y<4; y++){
-                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
-                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
-                }
-                if( h->pps.cabac ) {
-                    /* FIXME needed ? */
-                    for(y=0; y<4; y++){
-                        *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
-                        *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
-                    }
-                }
-                for(y=0; y<2; y++){
-                    s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
-                    s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
-                }
-            }
+        if(!USES_LIST(mb_type, list))
             continue;
-        }
 
         for(y=0; y<4; y++){
             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
@@ -1451,17 +1440,22 @@ static inline void write_back_motion(H264Context *h, int mb_type){
                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
             }
         }
-        for(y=0; y<2; y++){
-            s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
-            s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
+
+        {
+            uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
+            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
+            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
+            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
+            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
         }
     }
 
     if(h->slice_type == B_TYPE && h->pps.cabac){
         if(IS_8X8(mb_type)){
-            h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
-            h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
-            h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
+            uint8_t *direct_table = &h->direct_table[b8_xy];
+            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
+            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
+            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
         }
     }
 }
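
With these two hunks write_back_motion() simply skips lists the macroblock does not use (the old FIXME already suspected the zeroed motion_val/mvd_table values were never read), and the ref_index clearing collapses into the single fill_rectangle() call for list 0 at the top of the function. A minimal sketch of what that byte-sized fill_rectangle(p, 2, 2, stride, val, 1) call amounts to (the real helper in h264.c dispatches on element size and writes wider chunks):

    #include <stdint.h>

    /* illustrative equivalent of fill_rectangle(p, 2, 2, stride, val, 1)
     * for the 1-byte element case used above */
    static void fill_rect_u8(uint8_t *p, int w, int h, int stride, uint8_t val){
        int x, y;
        for(y = 0; y < h; y++)
            for(x = 0; x < w; x++)
                p[x + y*stride] = val;
    }
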
@@ -2761,6 +2755,22 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
 }
 
+static inline void prefetch_motion(H264Context *h, int list){
+    /* fetch pixels for estimated mv 4 macroblocks ahead
+     * optimized for 64byte cache lines */
+    MpegEncContext * const s = &h->s;
+    const int refn = h->ref_cache[list][scan8[0]];
+    if(refn >= 0){
+        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
+        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
+        uint8_t **src= h->ref_list[list][refn].data;
+        int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
+        s->dsp.prefetch(src[0]+off, s->linesize, 4);
+        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+    }
+}
+
 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
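
prefetch_motion() reuses the current motion vector as an estimate for macroblocks a few positions ahead and hands the resulting address to dsp.prefetch so the reference pixels are already in cache when they are needed; the second call covers both chroma planes at once by passing src[2]-src[1] as the stride. A sketch of what such a callback pair could look like, assuming a prefetch hook that takes a pointer, a stride and a line count (illustrative only, not the actual dsputil implementations; the second variant stands in for the MMX2 prefetcht0 code):

    #include <stdint.h>

    /* portable fallback: prefetching is only a hint, so doing nothing is valid */
    static void prefetch_noop(uint8_t *mem, int stride, int h){
        (void)mem; (void)stride; (void)h;
    }

    /* sketch of an optimized variant using the GCC builtin */
    static void prefetch_gcc(uint8_t *mem, int stride, int h){
        int i;
        for(i = 0; i < h; i++){
            __builtin_prefetch(mem, 0, 3); /* read access, keep in all cache levels */
            mem += stride;
        }
    }
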
@@ -2771,6 +2781,8 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
 
     assert(IS_INTER(mb_type));
 
+    prefetch_motion(h, 0);
+
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
@@ -2842,6 +2854,8 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
             }
         }
     }
+
+    prefetch_motion(h, 1);
 }
 
 static void decode_init_vlc(H264Context *h){
@@ -2953,6 +2967,7 @@ static void free_tables(H264Context *h){
 
 static void init_dequant8_coeff_table(H264Context *h){
     int i,q,x;
+    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
     h->dequant8_coeff[0] = h->dequant8_buffer[0];
     h->dequant8_coeff[1] = h->dequant8_buffer[1];
 
@@ -2966,8 +2981,9 @@ static void init_dequant8_coeff_table(H264Context *h){
             int shift = div6[q];
             int idx = rem6[q];
             for(x=0; x<64; x++)
-                h->dequant8_coeff[i][q][x] = ((uint32_t)dequant8_coeff_init[idx][
-                    dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * h->pps.scaling_matrix8[i][x]) << shift;
+                h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
+                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
+                    h->pps.scaling_matrix8[i][x]) << shift;
         }
     }
 }
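
The new index expression (x>>3) | ((x&7)<<3) transposes a raster-order 8x8 index (x = 8*row + col becomes 8*col + row), so when a non-C h264_idct8_add is in use the dequant table is stored in the element order that IDCT expects. A standalone sanity check of the mapping (illustrative):

    #include <assert.h>

    /* raster index x = 8*row + col; the expression swaps row and column */
    static int transpose8x8(int x){
        return (x >> 3) | ((x & 7) << 3);
    }

    int main(void){
        int x;
        for(x = 0; x < 64; x++)
            assert(transpose8x8(transpose8x8(x)) == x); /* an involution */
        assert(transpose8x8(10) == 17); /* (row 1, col 2) -> (row 2, col 1) */
        return 0;
    }
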
@@ -3909,8 +3925,13 @@ static void idr(H264Context *h){
 static void flush_dpb(AVCodecContext *avctx){
     H264Context *h= avctx->priv_data;
     int i;
-    for(i=0; i<16; i++)
+    for(i=0; i<16; i++) {
+        if(h->delayed_pic[i])
+            h->delayed_pic[i]->reference= 0;
         h->delayed_pic[i]= NULL;
+    }
+    if(h->delayed_output_pic)
+        h->delayed_output_pic->reference= 0;
     h->delayed_output_pic= NULL;
     idr(h);
     if(h->s.current_picture_ptr)
@@ -4285,8 +4306,8 @@ static int decode_slice_header(H264Context *h){
     s->mb_width= h->sps.mb_width;
     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
 
-    h->b_stride=  s->mb_width*4 + 1;
-    h->b8_stride= s->mb_width*2 + 1;
+    h->b_stride=  s->mb_width*4;
+    h->b8_stride= s->mb_width*2;
 
     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
     if(h->sps.frame_mbs_only_flag)
@@ -4312,14 +4333,31 @@ static int decode_slice_header(H264Context *h){
 #define T(x) (x>>2) | ((x<<2) & 0xF)
                 h->zigzag_scan[i] = T(zigzag_scan[i]);
                 h-> field_scan[i] = T( field_scan[i]);
+#undef T
+            }
+        }
+        if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
+            memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
+            memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
+        }else{
+            int i;
+            for(i=0; i<64; i++){
+#define T(x) (x>>3) | ((x&7)<<3)
+                h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
+                h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
+#undef T
             }
         }
         if(h->sps.transform_bypass){ //FIXME same ugly
             h->zigzag_scan_q0 = zigzag_scan;
             h->field_scan_q0 = field_scan;
+            h->zigzag_scan8x8_q0 = zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
         }else{
             h->zigzag_scan_q0 = h->zigzag_scan;
             h->field_scan_q0 = h->field_scan;
+            h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
         }
 
         alloc_tables(h);
@@ -5096,7 +5134,7 @@ decode_intra_mb:
         int i8x8, i4x4, chroma_idx;
         int chroma_qp, dquant;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
 
 //        fill_non_zero_count_cache(h);
 
@@ -5107,6 +5145,7 @@ decode_intra_mb:
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
             dc_scan= luma_dc_zigzag_scan;
         }
+        scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
 
         dquant= get_se_golomb(&s->gb);
 
@@ -5148,7 +5187,7 @@ decode_intra_mb:
                         DCTELEM *buf = &h->mb[64*i8x8];
                         uint8_t *nnz;
                         for(i4x4=0; i4x4<4; i4x4++){
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                 return -1;
                         }
@@ -5383,8 +5422,6 @@ static const uint8_t block_idx_xy[4][4] = {
 };
 
 static int decode_cabac_mb_cbp_luma( H264Context *h) {
-    MpegEncContext * const s = &h->s;
-
     int cbp = 0;
     int cbp_b = -1;
     int i8x8;
@@ -6141,7 +6178,7 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
         int dqp;
 
         if(IS_INTERLACED(mb_type)){
@@ -6151,6 +6188,7 @@ decode_intra_mb:
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
             dc_scan= luma_dc_zigzag_scan;
         }
+        scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
 
         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
         if( dqp == INT_MIN ){
@@ -6184,7 +6222,7 @@ decode_intra_mb:
                 if( cbp & (1<<i8x8) ) {
                     if( IS_8x8DCT(mb_type) ) {
                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
-                                zigzag_scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
+                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
                             return -1;
                     } else
                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
@@ -6949,7 +6987,7 @@ static int decode_slice(H264Context *h){
             hl_decode_mb(h);
 
             if(ret<0){
-                fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+                av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
 
                 return -1;
@@ -7076,7 +7114,7 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps){
         if( aspect_ratio_idc == EXTENDED_SAR ) {
             sps->sar.num= get_bits(&s->gb, 16);
             sps->sar.den= get_bits(&s->gb, 16);
-        }else if(aspect_ratio_idc < 16){
+        }else if(aspect_ratio_idc < 14){
             sps->sar=  pixel_aspect[aspect_ratio_idc];
         }else{
             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
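
The bound drops from 16 to 14 because the pixel_aspect[] table only has 14 entries (idc 0 means unspecified, 1..13 follow Table E-1 of the H.264 spec), so idc values 14 and 15 used to read past the end of the array. A self-contained sketch of the safer "derive the bound from the table" idiom, with an assumed copy of the table (the Rational type and the names here are illustrative):

    typedef struct { int num, den; } Rational;

    /* assumed contents, mirroring Table E-1 for idc 0..13 */
    static const Rational sar_table[] = {
        {0,1},{1,1},{12,11},{10,11},{16,11},{40,33},{24,11},
        {20,11},{32,11},{80,33},{18,11},{15,11},{64,33},{160,99},
    };

    #define ARRAY_ELEMS(a) (sizeof(a)/sizeof((a)[0]))

    static Rational sar_from_idc(unsigned idc){
        if(idc < ARRAY_ELEMS(sar_table))
            return sar_table[idc];
        return sar_table[0]; /* out of range: treat as unspecified */
    }
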
@@ -7545,6 +7583,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
                 break;
             }
+            s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
             if(h->redundant_pic_count==0 && s->hurry_up < 5
                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
@@ -7614,7 +7653,6 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
 
     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
     s->current_picture_ptr->pict_type= s->pict_type;
-    s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
 
     h->prev_frame_num_offset= h->frame_num_offset;
     h->prev_frame_num= h->frame_num;