* in terms of the cache loads. */
while( i_height > 0 )
{
- for( int x = 0; x < i_width; x += 16 )
+ int x;
+ for( x = 0; x < i_width-8; x += 16 )
w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
+ if( x < i_width )
+ w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
i_height -= 16;
dst += 16 * i_dst_stride;
src += 16 * i_src_stride;
static int x264_nal_end( x264_t *h )
{
x264_nal_t *nal = &h->out.nal[h->out.i_nal];
- nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
+ uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
+ nal->i_payload = end - nal->p_payload;
+ /* nal_escape_mmx reads past the end of the input.
+ * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */
+ memset( end, 0xff, 32 );
if( h->param.nalu_process )
h->param.nalu_process( h, nal );
h->out.i_nal++;
/* Test corner-case sizes */
int test_size = i < 10 ? i+1 : rand() & 0x3fff;
/* Test 8 different probability distributions of zeros */
- for( int j = 0; j < test_size; j++ )
+ for( int j = 0; j < test_size+32; j++ )
input[j] = (rand()&((1 << ((i&7)+1)) - 1)) * rand();
uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
break;
}
}
- for( int j = 0; j < size; j++ )
+ for( int j = 0; j < size+32; j++ )
input[j] = rand();
call_c2( bs_c.nal_escape, output1, input, input+size );
call_a2( bs_a.nal_escape, output2, input, input+size );