Store MV deltas as clipped absolute values.
This means CABAC no longer has to calculate absolute values in MV context selection.
This also halves the memory spent on MVDs, speeding up cache_mvd and reducing memory usage by 32*threads*(num macroblocks) bytes.
On a Core i7 encoding 1080p, this is about 3 megabytes saved.
return sum;
}
/* Sum the left/top neighbor MVD magnitudes and map each component to a CABAC
 * context bucket.  MVDs are now stored as clipped absolute values, so the
 * abs() calls below are no-ops on the unsigned inputs (kept for clarity). */
-static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
+static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
/* Bucket each summed magnitude: 0 (<=2), 1 (3..32), 2 (>32). */
amvd0 = (amvd0 > 2) + (amvd0 > 32);
amvd1 = (amvd1 > 2) + (amvd1 > 32);
- return amvd0 + (amvd1<<16);
+ /* Pack: x context in the low byte, y context in the high byte. */
+ return amvd0 + (amvd1<<8);
}
extern const uint8_t x264_exp2_lut[64];
uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */
int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
- int16_t (*mvd[2])[2]; /* mb mv difference with predict. set to 0 if intra. cabac only */
+ uint8_t (*mvd[2])[2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */
int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
/* 0 if not available */
ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
- ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
if( h->param.b_cabac )
{
CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
- CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) );
- CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) );
+ CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) );
+ CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) );
}
for( i=0; i<2; i++ )
if( h->param.b_cabac )
{
if( i_top_type >= 0 )
- {
- const int i8 = x264_scan8[0] - 8;
- const int iv = i_top_4x4;
- CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
- CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
- }
+ CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] );
else
- {
- const int i8 = x264_scan8[0] - 8;
- M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
- M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
- }
+ M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0;
if( i_left_type >= 0 )
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
- CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
- CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
- CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
- CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
}
else
{
const int i8 = x264_scan8[0] - 1;
for( i = 0; i < 4; i++ )
- M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
+ M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
}
}
}
if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
{
for( y = 0; y < 4; y++ )
- {
- CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
- CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
- }
+ CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] );
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
- {
- CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
- CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
- }
+ CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] );
}
else
{
for( y = 0; y < 4; y++ )
- {
- M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
- M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
- }
+ M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0;
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
- {
- M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
- M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
- }
+ M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0;
}
if( h->sh.i_type == SLICE_TYPE_B )
if( height == 4 ) M16( d+6 ) = val2;
}
}
+/* 16-bit analogue of x264_macroblock_cache_rect4: fill a width x height
+ * rectangle of the 8-entry-stride scan8 cache with a packed 8+8-bit value
+ * (one clipped-absolute MVD pair per 4x4 block). */
+static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val )
+{
+ uint16_t *d = dst;
+ /* Replicate the 16-bit value to 32 and 64 bits for wide stores. */
+ uint32_t val32 = val + (val<<16);
+ uint64_t val64 = val32 + ((uint64_t)val32<<32);
+ if( width == 4 )
+ {
+ M64( d+ 0 ) = val64;
+ if( height >= 2 ) M64( d+ 8 ) = val64;
+ /* Rows 2 and 3 only exist for 4-high rectangles. */
+ if( height == 4 ) M64( d+16 ) = val64;
+ if( height == 4 ) M64( d+24 ) = val64;
+ }
+ else if( width == 2 )
+ {
+ M32( d+ 0 ) = val32;
+ if( height >= 2 ) M32( d+ 8 ) = val32;
+ if( height == 4 ) M32( d+16 ) = val32;
+ if( height == 4 ) M32( d+24 ) = val32;
+ }
+ else //if( width == 1 )
+ {
+ M16( d+ 0 ) = val;
+ if( height >= 2 ) M16( d+ 8 ) = val;
+ if( height == 4 ) M16( d+16 ) = val;
+ if( height == 4 ) M16( d+24 ) = val;
+ }
+}
static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
{
int dy;
{
x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
}
/* Broadcast a packed MVD pair (8-bit |x|, 8-bit |y|) into the MVD cache for a
 * width x height partition.  Uses the 16-bit rect fill now that MVDs are bytes. */
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
+static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv )
{
- x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
+ x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
}
static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
{
);
return sum;
}
/* MMX version of x264_cabac_mvd_sum.  Since MVDs are pre-clipped absolute
 * values, the abs/scale trick of the old amvd version is no longer needed:
 * byte adds and two byte compares suffice. */
-#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
-static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
+#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
+static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
{
- static const uint64_t pw_2 = 0x0002000200020002ULL;
- static const uint64_t pw_28 = 0x001C001C001C001CULL;
- static const uint64_t pw_2184 = 0x0888088808880888ULL;
- /* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */
- /* 2184 = fix16(1/30) */
- uint32_t amvd;
+ static const uint64_t pb_2 = 0x0202020202020202ULL;
+ static const uint64_t pb_32 = 0x2020202020202020ULL;
+ int amvd;
asm(
/* Load the two packed 8-bit MVD pairs (only the low 16 bits of each movd
 * are meaningful).  paddb sums per component; each input is <= 33, so the
 * sum is <= 66 and signed byte compares below are safe.
 * pcmpgtb yields 0/-1 masks for (x>2) and (x>32); subtracting both masks
 * from zero produces (x>2)+(x>32) per byte, i.e. the context index. */
- "movd %1, %%mm0 \n"
- "movd %2, %%mm1 \n"
- "pxor %%mm2, %%mm2 \n"
- "pxor %%mm3, %%mm3 \n"
- "psubw %%mm0, %%mm2 \n"
- "psubw %%mm1, %%mm3 \n"
- "pmaxsw %%mm2, %%mm0 \n"
- "pmaxsw %%mm3, %%mm1 \n"
- "paddw %3, %%mm0 \n"
- "paddw %%mm1, %%mm0 \n"
- "pmulhuw %4, %%mm0 \n"
- "pminsw %5, %%mm0 \n"
- "movd %%mm0, %0 \n"
+ "movd %1, %%mm0 \n"
+ "movd %2, %%mm1 \n"
+ "paddb %%mm1, %%mm0 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "movq %%mm0, %%mm1 \n"
+ "pcmpgtb %3, %%mm0 \n"
+ "pcmpgtb %4, %%mm1 \n"
+ "psubb %%mm0, %%mm2 \n"
+ "psubb %%mm1, %%mm2 \n"
+ "movd %%mm2, %0 \n"
:"=r"(amvd)
- :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
- "m"(pw_28),"m"(pw_2184),"m"(pw_2)
+ :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
+ "m"(pb_2),"m"(pb_32)
);
/* Only the low two bytes are meaningful; the uint16_t return truncates the rest. */
return amvd;
}
x264_cabac_encode_decision( cb, 54 + ctx, 0 );
}
-static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
+static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
{
const int i_abs = abs( mvd );
const int ctxbase = l ? 47 : 40;
x264_cabac_encode_bypass( cb, mvd < 0 );
}
#endif
+ /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
+ * This lets us store MVDs as 8-bit values instead of 16-bit. */
+ return X264_MIN( i_abs, 33 );
}
/* Encode the MVD of one partition with CABAC and return the packed 8-bit
 * clipped absolute values (|mdx|,|mdy|, each capped at 33) for the MVD cache. */
-static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
+static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
ALIGNED_4( int16_t mvp[2] );
- uint32_t amvd;
int mdx, mdy;
/* Calculate mvd */
x264_mb_predict_mv( h, i_list, idx, width, mvp );
mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
/* Context selection from the left (scan8 - 1) and top (scan8 - 8) cached MVDs. */
- amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
- h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
+ uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
+ h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
/* encode */
/* _cpn now returns the clipped absolute MVD so it can be stored directly. */
- x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF );
- x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 );
+ mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
+ mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );
- return pack16to32_mask(mdx,mdy);
+ return pack8to16(mdx,mdy);
}
/* Wrapper: encode the MVD, then broadcast the packed clipped-absolute value
 * into the MVD cache for the whole width x height partition. */
#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
do\
{\
- uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
+ uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
} while(0)
m->mv[0] = bmx;
m->mv[1] = bmy;
x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
- x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+ uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33));
+ x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
h->mb.b_skip_mc = 0;
}