git.sesse.net Git - ffmpeg/blobdiff - libavcodec/h264.c
don't forget table_size in the decode_frame return value
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index ddecc12aca05505a0ff882352b8779e45b8a11b0..15fdfcfc3abfb24f72d60a6b152da633743fcfbf 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -358,8 +358,12 @@ typedef struct H264Context{
 
     uint8_t zigzag_scan[16];
     uint8_t field_scan[16];
+    uint8_t zigzag_scan8x8[64];
+    uint8_t zigzag_scan8x8_cavlc[64];
     const uint8_t *zigzag_scan_q0;
     const uint8_t *field_scan_q0;
+    const uint8_t *zigzag_scan8x8_q0;
+    const uint8_t *zigzag_scan8x8_cavlc_q0;
 
     int x264_build;
 }H264Context;
@@ -711,7 +715,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             }
             h->mv_cache_clean[list]= 0;
 
-            if(IS_INTER(top_type)){
+            if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
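
This and the following hunks replace the blanket IS_INTER() test with USES_LIST(type, list), so a neighbouring macroblock only contributes motion vectors and reference indices to a prediction list it actually uses. For context, the macro (as defined in mpegvideo.h of this era; quoted from memory, so treat as approximate) is a plain bit test on the mb_type flags:

    #define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0) << (2*(list))))
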
@@ -731,7 +735,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             }
 
             //FIXME unify cleanup or sth
-            if(IS_INTER(left_type[0])){
+            if(USES_LIST(left_type[0], list)){
                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
@@ -745,7 +749,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
-            if(IS_INTER(left_type[1])){
+            if(USES_LIST(left_type[1], list)){
                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
@@ -763,7 +767,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
                 continue;
 
-            if(IS_INTER(topleft_type)){
+            if(USES_LIST(topleft_type, list)){
                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
@@ -773,7 +777,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
-            if(IS_INTER(topright_type)){
+            if(USES_LIST(topright_type, list)){
                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
@@ -797,14 +801,14 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 
             if( h->pps.cabac ) {
                 /* XXX beurk, Load mvd */
-                if(IS_INTER(topleft_type)){
+                if(USES_LIST(topleft_type, list)){
                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
                 }else{
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
                 }
 
-                if(IS_INTER(top_type)){
+                if(USES_LIST(top_type, list)){
                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
@@ -816,7 +820,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                 }
-                if(IS_INTER(left_type[0])){
+                if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
@@ -824,7 +828,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                 }
-                if(IS_INTER(left_type[1])){
+                if(USES_LIST(left_type[1], list)){
                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
@@ -1418,28 +1422,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){
     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
     int list;
 
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
+
     for(list=0; list<2; list++){
         int y;
-        if(!USES_LIST(mb_type, list)){
-            if(1){ //FIXME skip or never read if mb_type doesn't use it
-                for(y=0; y<4; y++){
-                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
-                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
-                }
-                if( h->pps.cabac ) {
-                    /* FIXME needed ? */
-                    for(y=0; y<4; y++){
-                        *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
-                        *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
-                    }
-                }
-                for(y=0; y<2; y++){
-                    s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
-                    s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
-                }
-            }
+        if(!USES_LIST(mb_type, list))
             continue;
-        }
 
         for(y=0; y<4; y++){
             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
@@ -1451,17 +1440,22 @@ static inline void write_back_motion(H264Context *h, int mb_type){
                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
             }
         }
-        for(y=0; y<2; y++){
-            s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
-            s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
+
+        {
+            uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
+            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
+            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
+            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
+            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
         }
     }
 
     if(h->slice_type == B_TYPE && h->pps.cabac){
         if(IS_8X8(mb_type)){
-            h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
-            h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
-            h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
+            uint8_t *direct_table = &h->direct_table[b8_xy];
+            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
+            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
+            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
         }
     }
 }
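
With these two hunks write_back_motion() simply skips lists the macroblock does not use (the old FIXME already suspected the zeroed motion_val/mvd_table values were never read), and the ref_index clearing collapses into the single fill_rectangle() call for list 0 at the top of the function. A minimal sketch of what that byte-sized fill_rectangle(p, 2, 2, stride, val, 1) call amounts to (the real helper in h264.c dispatches on element size and writes wider chunks):

    #include <stdint.h>

    /* illustrative equivalent of fill_rectangle(p, 2, 2, stride, val, 1)
     * for the 1-byte element case used above */
    static void fill_rect_u8(uint8_t *p, int w, int h, int stride, uint8_t val){
        int x, y;
        for(y = 0; y < h; y++)
            for(x = 0; x < w; x++)
                p[x + y*stride] = val;
    }
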
@@ -2761,6 +2755,22 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
 }
 
+static inline void prefetch_motion(H264Context *h, int list){
+    /* fetch pixels for estimated mv 4 macroblocks ahead
+     * optimized for 64byte cache lines */
+    MpegEncContext * const s = &h->s;
+    const int refn = h->ref_cache[list][scan8[0]];
+    if(refn >= 0){
+        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
+        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
+        uint8_t **src= h->ref_list[list][refn].data;
+        int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
+        s->dsp.prefetch(src[0]+off, s->linesize, 4);
+        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+    }
+}
+
 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
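
prefetch_motion() reuses the current motion vector as an estimate for macroblocks a few positions ahead and hands the resulting address to dsp.prefetch so the reference pixels are already in cache when they are needed; the second call covers both chroma planes at once by passing src[2]-src[1] as the stride. A sketch of what such a callback pair could look like, assuming a prefetch hook that takes a pointer, a stride and a line count (illustrative only, not the actual dsputil implementations; the second variant stands in for the MMX2 prefetcht0 code):

    #include <stdint.h>

    /* portable fallback: prefetching is only a hint, so doing nothing is valid */
    static void prefetch_noop(uint8_t *mem, int stride, int h){
        (void)mem; (void)stride; (void)h;
    }

    /* sketch of an optimized variant using the GCC builtin */
    static void prefetch_gcc(uint8_t *mem, int stride, int h){
        int i;
        for(i = 0; i < h; i++){
            __builtin_prefetch(mem, 0, 3); /* read access, keep in all cache levels */
            mem += stride;
        }
    }
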
@@ -2771,6 +2781,8 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
 
     assert(IS_INTER(mb_type));
 
+    prefetch_motion(h, 0);
+
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
@@ -2842,6 +2854,8 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
             }
         }
     }
+
+    prefetch_motion(h, 1);
 }
 
 static void decode_init_vlc(H264Context *h){
@@ -2953,6 +2967,7 @@ static void free_tables(H264Context *h){
 
 static void init_dequant8_coeff_table(H264Context *h){
     int i,q,x;
+    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
     h->dequant8_coeff[0] = h->dequant8_buffer[0];
     h->dequant8_coeff[1] = h->dequant8_buffer[1];
 
@@ -2966,8 +2981,9 @@ static void init_dequant8_coeff_table(H264Context *h){
             int shift = div6[q];
             int idx = rem6[q];
             for(x=0; x<64; x++)
-                h->dequant8_coeff[i][q][x] = ((uint32_t)dequant8_coeff_init[idx][
-                    dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * h->pps.scaling_matrix8[i][x]) << shift;
+                h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
+                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
+                    h->pps.scaling_matrix8[i][x]) << shift;
         }
     }
 }
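
The new index expression (x>>3) | ((x&7)<<3) transposes a raster-order 8x8 index (x = 8*row + col becomes 8*col + row), so when a non-C h264_idct8_add is in use the dequant table is stored in the element order that IDCT expects. A standalone sanity check of the mapping (illustrative):

    #include <assert.h>

    /* raster index x = 8*row + col; the expression swaps row and column */
    static int transpose8x8(int x){
        return (x >> 3) | ((x & 7) << 3);
    }

    int main(void){
        int x;
        for(x = 0; x < 64; x++)
            assert(transpose8x8(transpose8x8(x)) == x); /* an involution */
        assert(transpose8x8(10) == 17); /* (row 1, col 2) -> (row 2, col 1) */
        return 0;
    }
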
@@ -3909,8 +3925,13 @@ static void idr(H264Context *h){
 static void flush_dpb(AVCodecContext *avctx){
     H264Context *h= avctx->priv_data;
     int i;
-    for(i=0; i<16; i++)
+    for(i=0; i<16; i++) {
+        if(h->delayed_pic[i])
+            h->delayed_pic[i]->reference= 0;
         h->delayed_pic[i]= NULL;
+    }
+    if(h->delayed_output_pic)
+        h->delayed_output_pic->reference= 0;
     h->delayed_output_pic= NULL;
     idr(h);
     if(h->s.current_picture_ptr)
@@ -4285,8 +4306,8 @@ static int decode_slice_header(H264Context *h){
     s->mb_width= h->sps.mb_width;
     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
 
-    h->b_stride=  s->mb_width*4 + 1;
-    h->b8_stride= s->mb_width*2 + 1;
+    h->b_stride=  s->mb_width*4;
+    h->b8_stride= s->mb_width*2;
 
     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
     if(h->sps.frame_mbs_only_flag)
@@ -4312,14 +4333,31 @@ static int decode_slice_header(H264Context *h){
 #define T(x) (x>>2) | ((x<<2) & 0xF)
                 h->zigzag_scan[i] = T(zigzag_scan[i]);
                 h-> field_scan[i] = T( field_scan[i]);
+#undef T
+            }
+        }
+        if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
+            memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
+            memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
+        }else{
+            int i;
+            for(i=0; i<64; i++){
+#define T(x) (x>>3) | ((x&7)<<3)
+                h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
+                h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
+#undef T
             }
         }
         if(h->sps.transform_bypass){ //FIXME same ugly
             h->zigzag_scan_q0 = zigzag_scan;
             h->field_scan_q0 = field_scan;
+            h->zigzag_scan8x8_q0 = zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
         }else{
             h->zigzag_scan_q0 = h->zigzag_scan;
             h->field_scan_q0 = h->field_scan;
+            h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
         }
 
         alloc_tables(h);
@@ -5096,7 +5134,7 @@ decode_intra_mb:
         int i8x8, i4x4, chroma_idx;
         int chroma_qp, dquant;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
 
 //        fill_non_zero_count_cache(h);
 
@@ -5107,6 +5145,7 @@ decode_intra_mb:
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
             dc_scan= luma_dc_zigzag_scan;
         }
+        scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
 
         dquant= get_se_golomb(&s->gb);
 
@@ -5148,7 +5187,7 @@ decode_intra_mb:
                         DCTELEM *buf = &h->mb[64*i8x8];
                         uint8_t *nnz;
                         for(i4x4=0; i4x4<4; i4x4++){
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                 return -1;
                         }
@@ -5383,8 +5422,6 @@ static const uint8_t block_idx_xy[4][4] = {
 };
 
 static int decode_cabac_mb_cbp_luma( H264Context *h) {
-    MpegEncContext * const s = &h->s;
-
     int cbp = 0;
     int cbp_b = -1;
     int i8x8;
@@ -6141,7 +6178,7 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
         int dqp;
 
         if(IS_INTERLACED(mb_type)){
@@ -6151,6 +6188,7 @@ decode_intra_mb:
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
             dc_scan= luma_dc_zigzag_scan;
         }
+        scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
 
         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
         if( dqp == INT_MIN ){
@@ -6184,7 +6222,7 @@ decode_intra_mb:
                 if( cbp & (1<<i8x8) ) {
                     if( IS_8x8DCT(mb_type) ) {
                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
-                                zigzag_scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
+                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
                             return -1;
                     } else
                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
@@ -6949,7 +6987,7 @@ static int decode_slice(H264Context *h){
             hl_decode_mb(h);
 
             if(ret<0){
-                fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+                av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
 
                 return -1;
@@ -7076,7 +7114,7 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps){
         if( aspect_ratio_idc == EXTENDED_SAR ) {
             sps->sar.num= get_bits(&s->gb, 16);
             sps->sar.den= get_bits(&s->gb, 16);
-        }else if(aspect_ratio_idc < 16){
+        }else if(aspect_ratio_idc < 14){
             sps->sar=  pixel_aspect[aspect_ratio_idc];
         }else{
             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
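
The bound drops from 16 to 14 because the pixel_aspect[] table only has 14 entries (idc 0 means unspecified, 1..13 follow Table E-1 of the H.264 spec), so idc values 14 and 15 used to read past the end of the array. A self-contained sketch of the safer "derive the bound from the table" idiom, with an assumed copy of the table (the Rational type and the names here are illustrative):

    typedef struct { int num, den; } Rational;

    /* assumed contents, mirroring Table E-1 for idc 0..13 */
    static const Rational sar_table[] = {
        {0,1},{1,1},{12,11},{10,11},{16,11},{40,33},{24,11},
        {20,11},{32,11},{80,33},{18,11},{15,11},{64,33},{160,99},
    };

    #define ARRAY_ELEMS(a) (sizeof(a)/sizeof((a)[0]))

    static Rational sar_from_idc(unsigned idc){
        if(idc < ARRAY_ELEMS(sar_table))
            return sar_table[idc];
        return sar_table[0]; /* out of range: treat as unspecified */
    }
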
@@ -7545,6 +7583,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
                 break;
             }
+            s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
             if(h->redundant_pic_count==0 && s->hurry_up < 5
                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
@@ -7614,7 +7653,6 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
 
     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
     s->current_picture_ptr->pict_type= s->pict_type;
-    s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
 
     h->prev_frame_num_offset= h->frame_num_offset;
     h->prev_frame_num= h->frame_num;