From a6cee0ab6d2e6a9fb6580827dc854c09567c74f0 Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Thu, 17 Jul 2008 07:55:24 -0600
Subject: [PATCH] Align lowres planes for improved cacheline split performance

---
 common/frame.c | 33 ++++++++++++++-------------------
 common/frame.h |  4 ++--
 2 files changed, 16 insertions(+), 21 deletions(-)

Notes (ignored when the patch is applied):
  * Adds an ALIGN(x,a) helper that rounds x up to a multiple of a; a must be
    a power of two, since the rounding is done with a bit mask.
  * x264_frame_new() now picks the alignment once (64, 32 or 16 bytes,
    depending on the X264_CPU_CACHELINE_* flags in h->param.cpu) and uses it
    for both the full-resolution stride and the lowres stride. The lowres
    stride was previously always rounded to 16.
  * The frame height is rounded up to 16, or to 32 when b_interlaced is set
    (16<<b_interlaced), matching the two-step rounding it replaces.
  * The four lowres planes are carved out of one allocation held in
    buffer_lowres[0], mirroring how the four filtered luma planes already
    share buffer[0]; buffer_lowres[1..3] are no longer allocated here.
  * buffer[] and buffer_lowres[] become uint8_t* so the plane pointers can be
    computed without casts.

diff --git a/common/frame.c b/common/frame.c
index dd77c897..1d5ef24d 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -23,6 +23,8 @@
 
 #include "common.h"
 
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+
 x264_frame_t *x264_frame_new( x264_t *h )
 {
     x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
@@ -32,22 +34,16 @@ x264_frame_t *x264_frame_new( x264_t *h )
     int i_stride, i_width, i_lines;
     int i_padv = PADV << h->param.b_interlaced;
     int luma_plane_size;
+    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
 
     if( !frame ) return NULL;
 
     memset( frame, 0, sizeof(x264_frame_t) );
 
     /* allocate frame data (+64 for extra data for me) */
-    i_width = ( ( h->param.i_width + 15 ) & -16 );
-    i_stride = i_width + 2*PADH;
-    i_lines = ( ( h->param.i_height + 15 ) & -16 );
-    if( h->param.b_interlaced )
-        i_lines = ( i_lines + 31 ) & -32;
-
-    if( h->param.cpu&X264_CPU_CACHELINE_64 )
-        i_stride = (i_stride + 63) & ~63;
-    else if( h->param.cpu&X264_CPU_CACHELINE_32 )
-        i_stride = (i_stride + 31) & ~31;
+    i_width = ALIGN( h->param.i_width, 16 );
+    i_stride = ALIGN( i_width + 2*PADH, align );
+    i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
 
     frame->i_plane = 3;
     for( i = 0; i < 3; i++ )
@@ -61,27 +57,26 @@ x264_frame_t *x264_frame_new( x264_t *h )
     for( i = 1; i < 3; i++ )
     {
         CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
-        frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
+        frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
     }
     /* all 4 luma planes allocated together, since the cacheline split code
      * requires them to be in-phase wrt cacheline alignment. */
     CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
     for( i = 0; i < 4; i++ )
-        frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+        frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
     frame->plane[0] = frame->filtered[0];
 
     if( h->frames.b_have_lowres )
     {
         frame->i_width_lowres = frame->i_width[0]/2;
-        frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
+        frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
         frame->i_lines_lowres = frame->i_lines[0]/2;
+
+        luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
+
+        CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
         for( i = 0; i < 4; i++ )
-        {
-            CHECKED_MALLOC( frame->buffer_lowres[i],
-                            frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
-            frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
-                                frame->i_stride_lowres * i_padv + PADH;
-        }
+            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
     }
 
     if( h->param.analyse.i_me_method >= X264_ME_ESA )
diff --git a/common/frame.h b/common/frame.h
index 6da740ab..6a0c9282 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -56,8 +56,8 @@ typedef struct
 
     /* for unrestricted mv we allocate more data than needed
      * allocated data are stored in buffer */
-    void *buffer[4];
-    void *buffer_lowres[4];
+    uint8_t *buffer[4];
+    uint8_t *buffer_lowres[4];
 
     /* motion data */
     int8_t *mb_type;
-- 
2.39.2