The risk of overflow grows exponentially with bit depth: the worst-case per-pixel squared difference is (2^b-1)^2, which roughly quadruples with each extra bit. The 8-bit asm versions may still overflow with image widths >= 11008 (or 6604 if interlaced), since they still accumulate in 32-bit lanes internally and only widen the result to 64 bits at the end.
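As a sanity check on those figures, here is a back-of-the-envelope sketch (not part of the patch) that computes how many chroma columns a 32-bit accumulator can absorb before it can wrap, assuming every sample differs by the full range. The strip heights of 12 and 20 chroma rows are the ones cited in the comment removed below; with 8-bit input the sketch reproduces the 11008 and 6604 luma-width bounds, and at 10 bits the safe width collapses to a few hundred columns.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Worst-case bound: every per-pixel difference is (2^bits - 1), so each
 * column of a strip contributes strip_height * (2^bits - 1)^2 to the sum.
 * Returns the number of chroma columns a uint32_t can hold; the luma
 * width bound is twice that for NV12. */
static uint64_t max_safe_chroma_width( int bits, int strip_height )
{
    uint64_t max_diff  = (1ULL << bits) - 1;
    uint64_t per_pixel = max_diff * max_diff;
    return UINT32_MAX / (per_pixel * strip_height);
}

int main( void )
{
    /* 8-bit, 12-row strips: 5504 chroma columns, i.e. luma width 11008. */
    printf( "8-bit,  12 rows: %"PRIu64"\n", max_safe_chroma_width( 8, 12 ) );
    /* 8-bit, 20-row interlaced strips: 3302 columns, i.e. luma width 6604. */
    printf( "8-bit,  20 rows: %"PRIu64"\n", max_safe_chroma_width( 8, 20 ) );
    /* 10-bit: the same accumulator is only safe for ~341 columns. */
    printf( "10-bit, 12 rows: %"PRIu64"\n", max_safe_chroma_width( 10, 12 ) );
    return 0;
}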
return i_ssd;
}
-static uint64_t pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height )
+static void pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
{
- uint32_t ssd_u=0, ssd_v=0;
+ *ssd_u = 0, *ssd_v = 0;
for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 )
for( int x = 0; x < width; x++ )
{
int du = pixuv1[2*x] - pixuv2[2*x];
int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
- ssd_u += du*du;
- ssd_v += dv*dv;
+ *ssd_u += du*du;
+ *ssd_v += dv*dv;
}
- return ssd_u + ((uint64_t)ssd_v<<32);
}
-// SSD in uint32 (i.e. packing two into uint64) can potentially overflow on
-// image widths >= 11008 (or 6604 if interlaced), since this is called on blocks
-// of height up to 12 (resp 20). Though it will probably take significantly more
-// than that at sane distortion levels.
-uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
+void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v )
{
- uint64_t ssd = pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height );
+ pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v );
if( i_width&7 )
- ssd += pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height );
- return ssd;
+ {
+ uint64_t tmp[2];
+ pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height, &tmp[0], &tmp[1] );
+ *ssd_u += tmp[0];
+ *ssd_v += tmp[1];
+ }
}
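The wrapper relies on the identity w == (w & ~7) + (w & 7): the asm core consumes the 8-aligned body and the plain C core mops up the remaining 1-7 column pairs, and the two partial sums simply add because SSD is a sum over disjoint columns. A quick illustrative check of the identity (not part of the patch):

#include <assert.h>

int main( void )
{
    for( int w = 0; w < 1 << 16; w++ )
        /* (w & ~7) is the largest multiple of 8 not above w; (w & 7) is the rest. */
        assert( ((w & ~7) % 8) == 0 && (w & ~7) + (w & 7) == w );
    return 0;
}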
/****************************************************************************
uint64_t (*var[4])( pixel *pix, int stride );
uint64_t (*hadamard_ac[4])( pixel *pix, int stride );
- uint64_t (*ssd_nv12_core)( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width, int height );
+ void (*ssd_nv12_core)( pixel *pixuv1, int stride1,
+ pixel *pixuv2, int stride2, int width, int height,
+ uint64_t *ssd_u, uint64_t *ssd_v );
void (*ssim_4x4x2_core)( const pixel *pix1, int stride1,
const pixel *pix2, int stride2, int sums[2][4] );
float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
-uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
+void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v );
uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf );
mask_1100: times 2 dd 0, -1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+pd_f0: times 4 dd 0xffff0000
+pq_0f: times 2 dd 0xffffffff, 0
+
SECTION .text
cextern pw_1
add r2, r3
dec r5d
jg .loopy
+ mov r3, r6m
+ mov r4, r7m
+ mova m5, [pq_0f]
HADDD m3, m0
HADDD m4, m0
- movd eax, m3
- movd edx, m4
-%ifdef ARCH_X86_64
- shl rdx, 32
- add rax, rdx
-%endif
+ pand m3, m5
+ pand m4, m5
+ movq [r3], m3
+ movq [r4], m4
RET
%endmacro ; SSD_NV12
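A rough C model of what the new epilogue stores (a sketch of the data movement only, not x264 API): HADDD folds the four dword lanes into the low dword, the pq_0f mask clears the adjacent dword, and movq writes the value through the 7th and 8th arguments (r6m/r7m) as a zero-extended 64-bit sum. Note that the accumulation itself is still 32-bit, which is why the 8-bit asm caveat above remains.

#include <stdint.h>

/* lanes_u/lanes_v stand in for the four 32-bit partial sums held in m3/m4. */
void store_ssd_model( const uint32_t lanes_u[4], const uint32_t lanes_v[4],
                      uint64_t *ssd_u, uint64_t *ssd_v )
{
    uint32_t u = 0, v = 0;
    for( int i = 0; i < 4; i++ )
    {
        u += lanes_u[i];   /* HADDD m3, m0 (paddd wraps mod 2^32) */
        v += lanes_v[i];   /* HADDD m4, m0 */
    }
    *ssd_u = u;            /* pand m3, m5 ; movq [r3], m3 */
    *ssd_v = v;            /* pand m4, m5 ; movq [r4], m4 */
}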
void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
-uint64_t x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1,
- uint8_t *pixuv2, int stride2, int width, int height );
-uint64_t x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1,
- uint8_t *pixuv2, int stride2, int width, int height );
+void x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1,
+ uint8_t *pixuv2, int stride2, int width,
+ int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1,
+ uint8_t *pixuv2, int stride2, int width,
+ int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
h->fdec->plane[0] + min_y * h->fdec->i_stride[0], h->fdec->i_stride[0],
h->fenc->plane[0] + min_y * h->fenc->i_stride[0], h->fenc->i_stride[0],
h->param.i_width, max_y-min_y );
- uint64_t ssd_uv = x264_pixel_ssd_nv12( &h->pixf,
+ uint64_t ssd_u, ssd_v;
+ x264_pixel_ssd_nv12( &h->pixf,
h->fdec->plane[1] + (min_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
h->fenc->plane[1] + (min_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
- h->param.i_width>>1, (max_y-min_y)>>1 );
+ h->param.i_width>>1, (max_y-min_y)>>1, &ssd_u, &ssd_v );
h->stat.frame.i_ssd[0] += ssd_y;
- h->stat.frame.i_ssd[1] += (uint32_t)ssd_uv;
- h->stat.frame.i_ssd[2] += ssd_uv>>32;
+ h->stat.frame.i_ssd[1] += ssd_u;
+ h->stat.frame.i_ssd[2] += ssd_v;
}
if( h->param.analyse.b_ssim )
{
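For context, i_ssd[1] and i_ssd[2] ultimately feed the per-plane PSNR statistics, which is why keeping each chroma sum in a full uint64_t matters. A minimal sketch of that conversion (the helper name and arguments are illustrative, not part of this patch):

#include <math.h>
#include <stdint.h>

/* PSNR = 10*log10( peak^2 * N / SSD ), with peak = (1<<bit_depth)-1 and
 * N the number of samples in the plane ((width/2)*(height/2) per chroma
 * plane for 4:2:0 content). */
double plane_psnr( uint64_t ssd, uint64_t count, int pixel_max )
{
    if( !ssd )
        return INFINITY; /* identical planes */
    return 10.0 * log10( (double)pixel_max * pixel_max * count / ssd );
}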
used_asm = 1;
set_func_name( "ssd_nv12" );
- uint64_t res_c = pixel_c.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8 );
- uint64_t res_a = pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8 );
- if( res_c != res_a )
+ uint64_t res_u_c, res_v_c, res_u_a, res_v_a;
+ pixel_c.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
+ pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
+ if( res_u_c != res_u_a || res_v_c != res_v_a )
{
ok = 0;
- fprintf( stderr, "ssd_nv12: %u,%u != %u,%u\n",
- (uint32_t)res_c, (uint32_t)(res_c>>32),
- (uint32_t)res_a, (uint32_t)(res_a>>32) );
+ fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
+ res_u_c, res_v_c, res_u_a, res_v_a );
}
- call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8 );
- call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8 );
+ call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
+ call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
}
report( "ssd_nv12 :" );