int i_max_ref1;
int i_delay; /* Number of frames buffered for B reordering */
int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */
+ int b_have_sub8x8_esa;
} frames;
/* current frame being encoded */
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
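+ /* One 16-bit integral plane; doubled (via the shift below) when
+ * sub-8x8 ESA partitions also need a 4x4 sum plane. */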
CHECKED_MALLOC( frame->buffer[3],
- 2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+ frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
memset( dst, 0, n );
}
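+/* Integral-image prefilter for successive elimination (ESA) motion search:
+ * the *h passes build column-cumulative sums of 4/8-pixel horizontal windows,
+ * one row per call; the *v passes difference rows 4 or 8 apart to turn them
+ * into 4x4 and 8x8 block sums. */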
+static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+{
+ int x, v = pix[0]+pix[1]+pix[2]+pix[3];
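+ /* v slides a 4-pixel window along the row; adding sum[x-stride]
+ * makes the table cumulative down each column */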
+ for( x=0; x<stride-4; x++ )
+ {
+ sum[x] = v + sum[x-stride];
+ v += pix[x+4] - pix[x];
+ }
+}
+
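+/* as integral_init4h, but with an 8-pixel window */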
+static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
+{
+ int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
+ for( x=0; x<stride-8; x++ )
+ {
+ sum[x] = v + sum[x-stride];
+ v += pix[x+8] - pix[x];
+ }
+}
+
+static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+{
+ int x;
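+ /* rows 4 apart differ by exactly one 4x4 block sum */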
+ for( x=0; x<stride-8; x++ )
+ sum4[x] = sum8[x+4*stride] - sum8[x];
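+ /* an 8-wide window is two adjacent 4-wide windows; rows 8 apart
+ * then give the 8x8 sums, written over sum8 in place */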
+ for( x=0; x<stride-8; x++ )
+ sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
+}
+
+static void integral_init8v( uint16_t *sum8, int stride )
+{
+ int x;
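+ /* rows 8 apart differ by one 8x8 block sum (sum8 already holds
+ * 8-wide horizontal sums here) */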
+ for( x=0; x<stride-8; x++ )
+ sum8[x] = sum8[x+8*stride] - sum8[x];
+}
+
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
uint8_t *src = frame->plane[0];
pf->memzero_aligned = memzero_aligned;
pf->frame_init_lowres_core = frame_init_lowres_core;
+ pf->integral_init4h = integral_init4h;
+ pf->integral_init8h = integral_init8h;
+ pf->integral_init4v = integral_init4v;
+ pf->integral_init8v = integral_init8v;
+
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
- int x, y;
+ int y;
if( mb_y & b_interlaced )
return;
height += PADV-8;
for( y = start; y < height; y++ )
{
- uint8_t *ref = frame->plane[0] + y * stride - PADH;
- uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
- uint16_t v = line[0] = 0;
- for( x = 1; x < stride-1; x++ )
- line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
- line -= 8*stride;
- if( y >= 9-PADV )
+ uint8_t *pix = frame->plane[0] + y * stride - PADH;
+ uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+ uint16_t *sum4;
+ if( h->frames.b_have_sub8x8_esa )
+ {
+ h->mc.integral_init4h( sum8, pix, stride );
+ sum8 -= 8*stride;
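+ /* the 4x4 sum plane lives in the second half of the doubled
+ * integral buffer */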
+ sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
+ if( y >= 8-PADV )
+ h->mc.integral_init4v( sum8, sum4, stride );
+ }
+ else
{
- uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2);
- for( x = 1; x < stride-8; x++, line++, sum4++ )
- {
- sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
- line[0] += line[8+8*stride] - line[8] - line[8*stride];
- }
+ h->mc.integral_init8h( sum8, pix, stride );
+ if( y >= 8-PADV )
+ h->mc.integral_init8v( sum8-8*stride, stride );
}
}
}
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
+ /* successive elimination prefilter */
+ void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
+ void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+ void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
+ void (*integral_init8v)( uint16_t *sum8, int stride );
+
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
} x264_mc_functions_t;
+;-----------------------------------------------------------------------------
+; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4h_sse4, 3,4
+ lea r3, [r0+r2*2]
+ add r1, r2
+ neg r2
+ pxor m4, m4
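+ ; with m4 = 0, mpsadbw degenerates to eight sums of 4 consecutive
+ ; pixels, one per output word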
+.loop:
+ movdqa m0, [r1+r2]
+ movdqu m1, [r1+r2+8]
+ mpsadbw m0, m4, 0
+ mpsadbw m1, m4, 0
+ paddw m0, [r0+r2*2]
+ paddw m1, [r0+r2*2+16]
+ movdqa [r3+r2*2 ], m0
+ movdqa [r3+r2*2+16], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+
+cglobal x264_integral_init8h_sse4, 3,4
+ lea r3, [r0+r2*2]
+ add r1, r2
+ neg r2
+ pxor m4, m4
+.loop:
+ movdqa m0, [r1+r2]
+ movdqu m1, [r1+r2+8]
+ movdqa m2, m0
+ movdqa m3, m1
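+ ; imm 0 sums pix[x..x+3] at each position, imm 4 sums pix[x+4..x+7];
+ ; adding the two gives the 8-pixel window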
+ mpsadbw m0, m4, 0
+ mpsadbw m1, m4, 0
+ mpsadbw m2, m4, 4
+ mpsadbw m3, m4, 4
+ paddw m0, [r0+r2*2]
+ paddw m1, [r0+r2*2+16]
+ paddw m0, m2
+ paddw m1, m3
+ movdqa [r3+r2*2 ], m0
+ movdqa [r3+r2*2+16], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+
+%macro INTEGRAL_INIT 1
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4v_%1, 3,5
+ shl r2, 1
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
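+ ; one row per call: [r0+r2], [r3+r2], [r4+r2] walk rows y, y+4, y+8
+ ; of sum8; 4x4 sums go to sum4 (r1), 8x8 sums overwrite sum8 in place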
+.loop:
+ movu m0, [r0+r2+8]
+ mova m2, [r0+r2]
+ movu m1, [r4+r2+8]
+ paddw m0, m2
+ paddw m1, [r4+r2]
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, mmsize
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init8v_%1, 3,3
+ shl r1, 1
+ add r0, r1
+ lea r2, [r0+r1*8]
+ neg r1
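+ ; one row per call: subtract rows 8 apart, two registers per iteration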
+.loop:
+ mova m0, [r2+r1]
+ mova m1, [r2+r1+mmsize]
+ psubw m0, [r0+r1]
+ psubw m1, [r0+r1+mmsize]
+ mova [r0+r1], m0
+ mova [r0+r1+mmsize], m1
+ add r1, 2*mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+INTEGRAL_INIT mmx
+INIT_XMM
+INTEGRAL_INIT sse2
+
%macro FILT8x4 7
mova %3, [r0+%7]
mova %4, [r0+r5+%7]
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
extern void x264_memzero_aligned_mmx( void * dst, int n );
extern void x264_memzero_aligned_sse2( void * dst, int n );
+extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memcpy_aligned = x264_memcpy_aligned_mmx;
pf->memzero_aligned = x264_memzero_aligned_mmx;
+ pf->integral_init4v = x264_integral_init4v_mmx;
+ pf->integral_init8v = x264_integral_init8v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_sse2;
pf->memzero_aligned = x264_memzero_aligned_sse2;
+ pf->integral_init4v = x264_integral_init4v_sse2;
+ pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
if( cpu&X264_CPU_SSE2_IS_SLOW )
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
+
+ if( !(cpu&X264_CPU_SSE4) )
+ return;
+
+ pf->integral_init4h = x264_integral_init4h_sse4;
+ pf->integral_init8h = x264_integral_init8h_sse4;
}
|| h->param.i_bframe_adaptive
|| h->param.b_pre_scenecut );
h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+ h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
h->frames.i_last_idr = - h->param.i_keyint_max;
h->frames.i_input = 0;
// can only twiddle these if they were enabled to begin with:
if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
COPY( analyse.i_me_method );
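+ /* ESA can be toggled by reconfig, but the 4x4 sum plane is only
+ * allocated at init, so sub-8x8 partitions must stay off without it */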
+ if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
+ h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
if( h->pps->b_transform_8x8_mode )
COPY( analyse.b_transform_8x8 );
if( h->frames.i_max_ref1 > 1 )
uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 };
set_func_name( "lowres_init" );
+ ok = 1; used_asm = 1;
for( w=40; w<=48; w+=8 )
- if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+ {
+ int stride = (w+8)&~15;
+ call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+ call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+ for( i=0; i<16; i++)
{
- int stride = (w+8)&~15;
- used_asm = 1;
- call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
- call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
- for( i=0; i<16; i++)
- {
- for( j=0; j<4; j++)
- if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
- {
- ok = 0;
- fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
- for( k=0; k<w; k++ )
- printf( "%d ", dstc[j][k+i*stride] );
- printf("\n");
- for( k=0; k<w; k++ )
- printf( "%d ", dsta[j][k+i*stride] );
- printf("\n");
- break;
- }
- }
+ for( j=0; j<4; j++)
+ if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+ {
+ ok = 0;
+ fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
+ for( k=0; k<w; k++ )
+ printf( "%d ", dstc[j][k+i*stride] );
+ printf("\n");
+ for( k=0; k<w; k++ )
+ printf( "%d ", dsta[j][k+i*stride] );
+ printf("\n");
+ break;
+ }
}
+ }
report( "lowres init :" );
}
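+/* Run the C and asm integral_init on identical copies of the same input
+ * and compare the rows each touches; 'sum' is repointed from buf3 to
+ * buf4 between the two calls. */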
+#define INTEGRAL_INIT( name, size, ... )\
+ if( mc_a.name != mc_ref.name )\
+ {\
+ int stride = 80;\
+ set_func_name( #name );\
+ used_asm = 1;\
+ memcpy( buf3, buf1, size*2*stride );\
+ memcpy( buf4, buf1, size*2*stride );\
+ uint16_t *sum = (uint16_t*)buf3;\
+ call_c1( mc_c.name, __VA_ARGS__ );\
+ sum = (uint16_t*)buf4;\
+ call_a1( mc_a.name, __VA_ARGS__ );\
+ if( memcmp( buf3, buf4, (stride-8)*2 )\
+ || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
+ ok = 0;\
+ call_c2( mc_c.name, __VA_ARGS__ );\
+ call_a2( mc_a.name, __VA_ARGS__ );\
+ }
+ ok = 1; used_asm = 0;
+ INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
+ INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
+ INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
+ INTEGRAL_INIT( integral_init8v, 9, sum, stride );
+ report( "integral init :" );
+
return ret;
}