CAVLC optimizations

[x264] / common / frame.c
diff --git a/common/frame.c b/common/frame.c

index 43da631b34218577cd2ad47b1c3f6014d4dc0dfa..bd7ad45c2c074d5eff12176ea20baa140263ca06 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -5,6 +5,7 @@
   *
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   *          Loren Merritt <lorenm@u.washington.edu>
+ *          Fiona Glaser <fiona@x264.com>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -34,6 +35,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
      int i_stride, i_width, i_lines;
      int i_padv = PADV << h->param.b_interlaced;
      int luma_plane_size;
+    int chroma_plane_size;
      int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
  
      if( !frame ) return NULL;
@@ -48,23 +50,32 @@ x264_frame_t *x264_frame_new( x264_t *h )
      frame->i_plane = 3;
      for( i = 0; i < 3; i++ )
      {
-        frame->i_stride[i] = i_stride >> !!i;
+        frame->i_stride[i] = ALIGN( i_stride >> !!i, 16 );
          frame->i_width[i] = i_width >> !!i;
          frame->i_lines[i] = i_lines >> !!i;
      }
  
      luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
+    chroma_plane_size = (frame->i_stride[1] * ( frame->i_lines[1] + 2*i_padv ));
      for( i = 1; i < 3; i++ )
      {
-        CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
+        CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
          frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
      }
      /* all 4 luma planes allocated together, since the cacheline split code
       * requires them to be in-phase wrt cacheline alignment. */
-    CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
-    for( i = 0; i < 4; i++ )
-        frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
-    frame->plane[0] = frame->filtered[0];
+    if( h->param.analyse.i_subpel_refine )
+    {
+        CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
+        for( i = 0; i < 4; i++ )
+            frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+        frame->plane[0] = frame->filtered[0];
+    }
+    else
+    {
+        CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
+        frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
+    }
  
      if( h->frames.b_have_lowres )
      {
@@ -90,7 +101,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
      if( h->param.analyse.i_me_method >= X264_ME_ESA )
      {
          CHECKED_MALLOC( frame->buffer[3],
-                        2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
          frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
      }
  
@@ -124,7 +135,11 @@ x264_frame_t *x264_frame_new( x264_t *h )
              CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
  
      if( h->param.rc.i_aq_mode )
+    {
          CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+        if( h->frames.b_have_lowres )
+            CHECKED_MALLOC( frame->i_inv_qscale_factor, h->mb.i_mb_count * sizeof(uint16_t) );
+    }
  
      x264_pthread_mutex_init( &frame->mutex, NULL );
      x264_pthread_cond_init( &frame->cv, NULL );
@@ -152,6 +167,9 @@ void x264_frame_delete( x264_frame_t *frame )
              x264_free( frame->lowres_mvs[j][i] );
              x264_free( frame->lowres_mv_costs[j][i] );
          }
+    x264_free( frame->f_qp_offset );
+    x264_free( frame->i_inv_qscale_factor );
+    x264_free( frame->i_intra_cost );
      x264_free( frame->i_row_bits );
      x264_free( frame->i_row_qp );
      x264_free( frame->mb_type );
@@ -607,6 +625,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
      const int b_interlaced = h->sh.b_mbaff;
      const int mvy_limit = 4 >> b_interlaced;
      const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
+    const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
      int mb_x;
      int stridey   = h->fdec->i_stride[0];
      int stride2y  = stridey << b_interlaced;
@@ -692,24 +711,31 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                      if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
                          h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
                          bS[i] = 2;\
-                    else\
+                    else if(!(i_edge&no_sub8x8))\
                      {\
-                        /* FIXME: A given frame may occupy more than one position in\
-                         * the reference list. So we should compare the frame numbers,\
-                         * not the indices in the ref list.\
-                         * No harm yet, as we don't generate that case.*/\
-                        int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
-                        int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
-                        int i4p= mb_4x4+x+y*s4x4;\
-                        int i4q= mbn_4x4+xn+yn*s4x4;\
-                        for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
-                            if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
-                                abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
-                                abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
+                        if((i&no_sub8x8) && bS[i-1] != 2)\
+                            bS[i] = bS[i-1];\
+                        else\
+                        {\
+                            /* FIXME: A given frame may occupy more than one position in\
+                             * the reference list. So we should compare the frame numbers,\
+                             * not the indices in the ref list.\
+                             * No harm yet, as we don't generate that case.*/\
+                            int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
+                            int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
+                            int i4p= mb_4x4+x+y*s4x4;\
+                            int i4q= mbn_4x4+xn+yn*s4x4;\
+                            if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
+                                abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
+                                abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
+                               (h->sh.i_type == SLICE_TYPE_B &&\
+                               (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
+                                abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
+                                abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
                              {\
                                  bS[i] = 1;\
-                                break;\
                              }\
+                        }\
                      }\
                  }\
              }\
@@ -720,7 +746,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          #define DEBLOCK_DIR(i_dir)\
          {\
              int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
-            int i_qpn, i, l, mbn_xy, mbn_8x8, mbn_4x4;\
+            int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
              DECLARE_ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
              if( i_edge )\
                  i_edge+= b_8x8_transform;\