From a6cee0ab6d2e6a9fb6580827dc854c09567c74f0 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Thu, 17 Jul 2008 07:55:24 -0600
Subject: [PATCH] Align lowres planes for improved cacheline split performance

---
 common/frame.c | 33 ++++++++++++++-------------------
 common/frame.h |  4 ++--
 2 files changed, 16 insertions(+), 21 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index dd77c897..1d5ef24d 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -23,6 +23,8 @@
 
 #include "common.h"
 
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+
 x264_frame_t *x264_frame_new( x264_t *h )
 {
     x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
@@ -32,22 +34,16 @@ x264_frame_t *x264_frame_new( x264_t *h )
     int i_stride, i_width, i_lines;
     int i_padv = PADV << h->param.b_interlaced;
     int luma_plane_size;
+    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
 
     if( !frame ) return NULL;
 
     memset( frame, 0, sizeof(x264_frame_t) );
 
     /* allocate frame data (+64 for extra data for me) */
-    i_width  = ( ( h->param.i_width  + 15 ) & -16 );
-    i_stride = i_width + 2*PADH;
-    i_lines  = ( ( h->param.i_height + 15 ) & -16 );
-    if( h->param.b_interlaced )
-        i_lines = ( i_lines + 31 ) & -32;
-
-    if( h->param.cpu&X264_CPU_CACHELINE_64 )
-        i_stride = (i_stride + 63) & ~63;
-    else if( h->param.cpu&X264_CPU_CACHELINE_32 )
-        i_stride = (i_stride + 31) & ~31;
+    i_width  = ALIGN( h->param.i_width, 16 );
+    i_stride = ALIGN( i_width + 2*PADH, align );
+    i_lines  = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
 
     frame->i_plane = 3;
     for( i = 0; i < 3; i++ )
@@ -61,27 +57,26 @@ x264_frame_t *x264_frame_new( x264_t *h )
     for( i = 1; i < 3; i++ )
     {
         CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
-        frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
+        frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
     }
     /* all 4 luma planes allocated together, since the cacheline split code
      * requires them to be in-phase wrt cacheline alignment. */
     CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
     for( i = 0; i < 4; i++ )
-        frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+        frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
     frame->plane[0] = frame->filtered[0];
 
     if( h->frames.b_have_lowres )
     {
         frame->i_width_lowres = frame->i_width[0]/2;
-        frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
+        frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
         frame->i_lines_lowres = frame->i_lines[0]/2;
+
+        luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
+
+        CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
         for( i = 0; i < 4; i++ )
-        {
-            CHECKED_MALLOC( frame->buffer_lowres[i],
-                            frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
-            frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
-                                frame->i_stride_lowres * i_padv + PADH;
-        }
+            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
     }
 
     if( h->param.analyse.i_me_method >= X264_ME_ESA )
diff --git a/common/frame.h b/common/frame.h
index 6da740ab..6a0c9282 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -56,8 +56,8 @@ typedef struct
 
     /* for unrestricted mv we allocate more data than needed
      * allocated data are stored in buffer */
-    void    *buffer[4];
-    void    *buffer_lowres[4];
+    uint8_t *buffer[4];
+    uint8_t *buffer_lowres[4];
 
     /* motion data */
     int8_t  *mb_type;
-- 
2.39.2