git.sesse.net Git - x264/blobdiff - common/frame.c
Align lowres planes for improved cacheline split performance
[x264] / common / frame.c
index ce8af34d1f211dce12c7ce46384b7bfdd899fc40..1d5ef24dc1f0cd9b4fe85f77b082f8ef5beb769c 100644 (file)
@@ -1,10 +1,10 @@
 /*****************************************************************************
  * frame.c: h264 encoder library
  *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: frame.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ * Copyright (C) 2003-2008 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
 #include "common.h"
 
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) /* round x up to a multiple of a; the mask form is only correct when a is a power of two (a is evaluated twice) */
+
 x264_frame_t *x264_frame_new( x264_t *h )
 {
     x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
@@ -31,74 +33,57 @@ x264_frame_t *x264_frame_new( x264_t *h )
     int i_mb_count = h->mb.i_mb_count;
     int i_stride, i_width, i_lines;
     int i_padv = PADV << h->param.b_interlaced;
+    int luma_plane_size;
+    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
 
     if( !frame ) return NULL;
 
     memset( frame, 0, sizeof(x264_frame_t) );
 
     /* allocate frame data (+64 for extra data for me) */
-    i_width  = ( ( h->param.i_width  + 15 ) & -16 );
-    i_stride = i_width + 2*PADH;
-    i_lines  = ( ( h->param.i_height + 15 ) & -16 );
-    if( h->param.b_interlaced )
-        i_lines = ( i_lines + 31 ) & -32;
-
-    if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
-    {
-        int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
-        i_stride = (i_stride + align-1) & -align;
-    }
+    i_width  = ALIGN( h->param.i_width, 16 );
+    i_stride = ALIGN( i_width + 2*PADH, align );
+    i_lines  = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
 
     frame->i_plane = 3;
     for( i = 0; i < 3; i++ )
     {
-        int i_divh = 1;
-        int i_divw = 1;
-        if( i > 0 )
-        {
-            if( h->param.i_csp == X264_CSP_I420 )
-                i_divh = i_divw = 2;
-            else if( h->param.i_csp == X264_CSP_I422 )
-                i_divw = 2;
-        }
-        frame->i_stride[i] = i_stride / i_divw;
-        frame->i_width[i] = i_width / i_divw;
-        frame->i_lines[i] = i_lines / i_divh;
-        CHECKED_MALLOC( frame->buffer[i],
-                        frame->i_stride[i] * ( frame->i_lines[i] + 2*i_padv / i_divh ) );
-
-        frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
-                          frame->i_stride[i] * i_padv / i_divh + PADH / i_divw;
+        frame->i_stride[i] = i_stride >> !!i;
+        frame->i_width[i] = i_width >> !!i;
+        frame->i_lines[i] = i_lines >> !!i;
     }
 
-    frame->filtered[0] = frame->plane[0];
-    for( i = 0; i < 3; i++ )
+    luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
+    for( i = 1; i < 3; i++ )
     {
-        CHECKED_MALLOC( frame->buffer[4+i],
-                        frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) );
-        frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) +
-                                frame->i_stride[0] * i_padv + PADH;
+        CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
+        frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
     }
+    /* all 4 luma planes allocated together, since the cacheline split code
+     * requires them to be in-phase wrt cacheline alignment. */
+    CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
+    for( i = 0; i < 4; i++ )
+        frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+    frame->plane[0] = frame->filtered[0];
 
     if( h->frames.b_have_lowres )
     {
         frame->i_width_lowres = frame->i_width[0]/2;
-        frame->i_stride_lowres = frame->i_width_lowres + 2*PADH;
+        frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
         frame->i_lines_lowres = frame->i_lines[0]/2;
+
+        luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
+
+        CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
         for( i = 0; i < 4; i++ )
-        {
-            CHECKED_MALLOC( frame->buffer_lowres[i],
-                            frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
-            frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
-                                frame->i_stride_lowres * i_padv + PADH;
-        }
+            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
     }
 
     if( h->param.analyse.i_me_method >= X264_ME_ESA )
     {
-        CHECKED_MALLOC( frame->buffer[7],
+        CHECKED_MALLOC( frame->buffer[3],
                         2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
-        frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH;
+        frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
     }
 
     frame->i_poc = -1;
@@ -142,7 +127,7 @@ fail:
 void x264_frame_delete( x264_frame_t *frame )
 {
     int i, j;
-    for( i = 0; i < 8; i++ )
+    for( i = 0; i < 4; i++ )
         x264_free( frame->buffer[i] );
     for( i = 0; i < 4; i++ )
         x264_free( frame->buffer_lowres[i] );
@@ -161,17 +146,35 @@ void x264_frame_delete( x264_frame_t *frame )
     x264_free( frame );
 }
 
-void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
+int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) /* copies the three planes of src into dst; returns 0 on success, -1 if the colorspace is neither I420 nor YV12 */
 {
     int i_csp = src->img.i_csp & X264_CSP_MASK;
+    int i; /* plane index, 0..2 */
+    if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) /* reject any other colorspace before dst is modified */
+    {
+        x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
+        return -1;
+    }
+
     dst->i_type     = src->i_type;
     dst->i_qpplus1  = src->i_qpplus1;
     dst->i_pts      = src->i_pts;
 
-    if( i_csp <= X264_CSP_NONE  || i_csp >= X264_CSP_MAX )
-        x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
-    else
-        h->csp.convert[i_csp]( &h->mc, dst, &src->img, h->param.i_width, h->param.i_height );
+    for( i=0; i<3; i++ )
+    {
+        int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i; /* source plane index: for YV12, i^3 swaps planes 1 and 2; plane 0 is unchanged */
+        uint8_t *plane = src->img.plane[s];
+        int stride = src->img.i_stride[s];
+        int width = h->param.i_width >> !!i; /* planes 1 and 2 are copied at half the configured width */
+        int height = h->param.i_height >> !!i; /* ...and half the configured height */
+        if( src->img.i_csp & X264_CSP_VFLIP ) /* flag is tested on the unmasked i_csp */
+        {
+            plane += (height-1)*stride; /* start from the last source row... */
+            stride = -stride; /* ...and step backwards, so rows are copied in reverse order */
+        }
+        h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
+    }
+    return 0;
 }
 
 
@@ -229,19 +232,20 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
 
 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 {
-    /* during filtering, 8 extra pixels were filtered on each edge. 
+    /* during filtering, 8 extra pixels were filtered on each edge,
+     * but up to 3 of the horizontal ones may be wrong. 
        we want to expand border from the last filtered pixel */
     int b_start = !mb_y;
     int stride = frame->i_stride[0];
-    int width = 16*h->sps->i_mb_width + 16;
+    int width = 16*h->sps->i_mb_width + 8;
     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
-    int padh = PADH - 8;
+    int padh = PADH - 4;
     int padv = PADV - 8;
     int i;
     for( i = 1; i < 4; i++ )
     {
         // buffer: 8 luma, to match the hpel filter
-        uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 8;
+        uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
         if( h->sh.b_mbaff )
         {
             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
@@ -297,16 +301,16 @@ void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 {
     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
-    int x;
+    int x, nnz;
     for( x=0; x<h->sps->i_mb_width; x++ )
     {
         memcpy( buf+x, src+x, 16 );
         if( transform[x] )
         {
-            if( src[x][0] ) src[x][0] = 0x01010101;
-            if( src[x][1] ) src[x][1] = 0x01010101;
-            if( src[x][2] ) src[x][2] = 0x01010101;
-            if( src[x][3] ) src[x][3] = 0x01010101;
+            nnz = src[x][0] | src[x][1];
+            src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
+            nnz = src[x][2] | src[x][3];
+            src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
         }
     }
 }
@@ -585,7 +589,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
         const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
-        int i_edge, i_dir;
+        int i_edge;
 
         int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x,
                             8*mb_y*h->fdec->i_stride[1] +  8*mb_x,
@@ -601,125 +605,116 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
 
         /* i_dir == 0 -> vertical edge
          * i_dir == 1 -> horizontal edge */
-        for( i_dir = 0; i_dir < 2; i_dir++ )
-        {
-            int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));
-            int i_qp, i_qpn;
-
-            for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )
-            {
-                int mbn_xy, mbn_8x8, mbn_4x4;
-                int bS[4];  /* filtering strength */
-
-                if( b_8x8_transform && (i_edge&1) )
-                    continue;
-
-                mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );
-                mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );
-                mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );
 
-                if( b_interlaced && i_edge == 0 && i_dir == 1 )
-                {
-                    mbn_xy -= h->mb.i_mb_stride;
-                    mbn_8x8 -= 2 * s8x8;
-                    mbn_4x4 -= 4 * s4x4;
-                }
-
-                /* *** Get bS for each 4px for the current edge *** */
-                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )
-                {
-                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );
-                }
-                else
-                {
-                    int i;
-                    for( i = 0; i < 4; i++ )
-                    {
-                        int x  = i_dir == 0 ? i_edge : i;
-                        int y  = i_dir == 0 ? i      : i_edge;
-                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;
-                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;
-
-                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||
-                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )
-                        {
-                            bS[i] = 2;
-                        }
-                        else
-                        {
-                            /* FIXME: A given frame may occupy more than one position in
-                             * the reference list. So we should compare the frame numbers,
-                             * not the indices in the ref list.
-                             * No harm yet, as we don't generate that case.*/
-
-                            int i8p= mb_8x8+(x/2)+(y/2)*s8x8;
-                            int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;
-                            int i4p= mb_4x4+x+y*s4x4;
-                            int i4q= mbn_4x4+xn+yn*s4x4;
-                            int l;
-
-                            bS[i] = 0;
-
-                            for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )
-                            {
-                                if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||
-                                    abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||
-                                    abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )
-                                {
-                                    bS[i] = 1;
-                                    break;
-                                }
-                            }
-                        }
-                    }
-                }
-
-                /* *** filter *** */
-                /* Y plane */
-                i_qp = h->mb.qp[mb_xy];
-                i_qpn= h->mb.qp[mbn_xy];
-
-                if( i_dir == 0 )
-                {
-                    /* vertical edge */
-                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],
-                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,
-                                  h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );
-                    if( !(i_edge & 1) )
-                    {
-                        /* U/V planes */
-                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
-                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
-                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],
-                                      i_stride2[1], bS, i_qpc, 1,
-                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
-                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],
-                                      i_stride2[2], bS, i_qpc, 1,
-                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
-                    }
-                }
-                else
-                {
-                    /* horizontal edge */
-                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],
-                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,
-                                  h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );
-                    /* U/V planes */
-                    if( !(i_edge & 1) )
-                    {
-                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
-                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
-                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],
-                                      i_stride2[1], bS, i_qpc, 1,
-                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
-                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],
-                                      i_stride2[2], bS, i_qpc, 1,
-                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
-                    }
-                }
-            }
+        #define deblock_dir(i_dir)\
+        {\
+            int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
+            int i_qp, i_qpn;\
+            for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\
+            {\
+                int mbn_xy, mbn_8x8, mbn_4x4;\
+                int bS[4];  /* filtering strength */\
+                if( b_8x8_transform && (i_edge&1) )\
+                    continue;\
+                mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );\
+                mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\
+                mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\
+                if( b_interlaced && i_edge == 0 && i_dir == 1 )\
+                {\
+                    mbn_xy -= h->mb.i_mb_stride;\
+                    mbn_8x8 -= 2 * s8x8;\
+                    mbn_4x4 -= 4 * s4x4;\
+                }\
+                /* *** Get bS for each 4px for the current edge *** */\
+                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\
+                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\
+                else\
+                {\
+                    int i;\
+                    for( i = 0; i < 4; i++ )\
+                    {\
+                        int x  = i_dir == 0 ? i_edge : i;\
+                        int y  = i_dir == 0 ? i      : i_edge;\
+                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
+                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
+                        if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
+                            h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
+                        {\
+                            bS[i] = 2;\
+                        }\
+                        else\
+                        {\
+                            /* FIXME: A given frame may occupy more than one position in\
+                             * the reference list. So we should compare the frame numbers,\
+                             * not the indices in the ref list.\
+                             * No harm yet, as we don't generate that case.*/\
+                            int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\
+                            int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\
+                            int i4p= mb_4x4+x+y*s4x4;\
+                            int i4q= mbn_4x4+xn+yn*s4x4;\
+                            int l;\
+                            bS[i] = 0;\
+                            for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
+                            {\
+                                if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
+                                    abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
+                                    abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
+                                {\
+                                    bS[i] = 1;\
+                                    break;\
+                                }\
+                            }\
+                        }\
+                    }\
+                }\
+                /* *** filter *** */\
+                /* Y plane */\
+                i_qp = h->mb.qp[mb_xy];\
+                i_qpn= h->mb.qp[mbn_xy];\
+                if( i_dir == 0 )\
+                {\
+                    /* vertical edge */\
+                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\
+                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
+                                  h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\
+                    if( !(i_edge & 1) )\
+                    {\
+                        /* U/V planes */\
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
+                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\
+                                      i_stride2[1], bS, i_qpc, 1,\
+                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
+                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\
+                                      i_stride2[2], bS, i_qpc, 1,\
+                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
+                    }\
+                }\
+                else\
+                {\
+                    /* horizontal edge */\
+                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\
+                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
+                                  h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\
+                    /* U/V planes */\
+                    if( !(i_edge & 1) )\
+                    {\
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
+                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],\
+                                      i_stride2[1], bS, i_qpc, 1,\
+                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
+                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\
+                                      i_stride2[2], bS, i_qpc, 1,\
+                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
+                    }\
+                }\
+            }\
         }
 
+        deblock_dir(0);
+        deblock_dir(1);
+
         /* next mb */
         if( !b_interlaced || (mb_y&1) )
             mb_x++;
@@ -743,18 +738,26 @@ void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta
 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 
-#ifdef ARCH_X86_64
 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-#else
+void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+#ifdef ARCH_X86
 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 
 void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
     x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
 }
+void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) /* wrapper that calls the v8 intra routine twice with identical stride/alpha/beta */
+{
+    x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta ); /* first call at pix */
+    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta ); /* second call 8 bytes further along */
+}
 #endif
 #endif
 
@@ -781,17 +784,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
-
-#ifdef ARCH_X86_64
-        if( cpu&X264_CPU_SSE2 )
+#ifdef ARCH_X86
+        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
+        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
+        pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
+        pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
+#endif
+        if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
         {
             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
+            pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
+            pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
         }
-#else
-        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
-        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
-#endif
     }
 #endif
 
@@ -806,8 +811,6 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 
 
 /* threading */
-
-#ifdef HAVE_PTHREAD
 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 {
     x264_pthread_mutex_lock( &frame->mutex );
@@ -824,14 +827,6 @@ void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
     x264_pthread_mutex_unlock( &frame->mutex );
 }
 
-#else
-void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
-{}
-void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
-{}
-#endif
-
-
 /* list operators */
 
 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )