git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #include "common.h"
  26
/* Round x up to the next multiple of a.
 * The mask arithmetic is only correct when a is a power of two.
 * Note that a appears twice in the expansion, so it must be free of
 * side effects. */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
  28
  29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  30 {
  31     x264_frame_t *frame;
  32     int i, j;
  33
  34     int i_mb_count = h->mb.i_mb_count;
  35     int i_stride, i_width, i_lines;
  36     int i_padv = PADV << h->param.b_interlaced;
  37     int luma_plane_size;
  38     int chroma_plane_size;
  39     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
  40
  41     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
  42
  43     /* allocate frame data (+64 for extra data for me) */
  44     i_width  = ALIGN( h->param.i_width, 16 );
  45     i_stride = ALIGN( i_width + 2*PADH, align );
  46     i_lines  = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
  47
  48     frame->i_plane = 3;
  49     for( i = 0; i < 3; i++ )
  50     {
  51         frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
  52         frame->i_width[i] = i_width >> !!i;
  53         frame->i_lines[i] = i_lines >> !!i;
  54     }
  55
  56     luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
  57     chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
  58     for( i = 1; i < 3; i++ )
  59     {
  60         CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
  61         frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
  62     }
  63
  64     for( i = 0; i < h->param.i_bframe + 2; i++ )
  65         for( j = 0; j < h->param.i_bframe + 2; j++ )
  66             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
  67
  68     frame->i_poc = -1;
  69     frame->i_type = X264_TYPE_AUTO;
  70     frame->i_qpplus1 = 0;
  71     frame->i_pts = -1;
  72     frame->i_frame = -1;
  73     frame->i_frame_num = -1;
  74     frame->i_lines_completed = -1;
  75     frame->b_fdec = b_fdec;
  76     frame->orig = frame;
  77
  78     /* all 4 luma planes allocated together, since the cacheline split code
  79      * requires them to be in-phase wrt cacheline alignment. */
  80     if( h->param.analyse.i_subpel_refine && b_fdec )
  81     {
  82         CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
  83         for( i = 0; i < 4; i++ )
  84             frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
  85         frame->plane[0] = frame->filtered[0];
  86     }
  87     else
  88     {
  89         CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
  90         frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
  91     }
  92
  93     frame->b_duplicate = 0;
  94
  95     if( b_fdec ) /* fdec frame */
  96     {
  97         CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
  98         CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
  99         CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 100         if( h->param.i_bframe )
 101         {
 102             CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 103             CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 104         }
 105         else
 106         {
 107             frame->mv[1]  = NULL;
 108             frame->ref[1] = NULL;
 109         }
 110         CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 111         CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
 112         if( h->param.analyse.i_me_method >= X264_ME_ESA )
 113         {
 114             CHECKED_MALLOC( frame->buffer[3],
 115                             frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
 116             frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
 117         }
 118     }
 119     else /* fenc frame */
 120     {
 121         if( h->frames.b_have_lowres )
 122         {
 123             frame->i_width_lowres = frame->i_width[0]/2;
 124             frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
 125             frame->i_lines_lowres = frame->i_lines[0]/2;
 126
 127             luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
 128
 129             CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
 130             for( i = 0; i < 4; i++ )
 131                 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
 132
 133             for( j = 0; j <= !!h->param.i_bframe; j++ )
 134                 for( i = 0; i <= h->param.i_bframe; i++ )
 135                 {
 136                     CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
 137                     CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
 138                 }
 139             CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
 140             for( j = 0; j <= h->param.i_bframe+1; j++ )
 141                 for( i = 0; i <= h->param.i_bframe+1; i++ )
 142                 {
 143                     CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
 144                     CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
 145                 }
 146             frame->i_intra_cost = frame->lowres_costs[0][0];
 147             memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
 148         }
 149         if( h->param.rc.i_aq_mode )
 150         {
 151             CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
 152             CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
 153             if( h->frames.b_have_lowres )
 154                 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
 155                 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
 156         }
 157     }
 158
 159     if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
 160         goto fail;
 161     if( x264_pthread_cond_init( &frame->cv, NULL ) )
 162         goto fail;
 163
 164     return frame;
 165
 166 fail:
 167     x264_free( frame );
 168     return NULL;
 169 }
 170
 171 void x264_frame_delete( x264_frame_t *frame )
 172 {
 173     int i, j;
 174     /* Duplicate frames are blank copies of real frames (including pointers),
 175      * so freeing those pointers would cause a double free later. */
 176     if( !frame->b_duplicate )
 177     {
 178         for( i = 0; i < 4; i++ )
 179             x264_free( frame->buffer[i] );
 180         for( i = 0; i < 4; i++ )
 181             x264_free( frame->buffer_lowres[i] );
 182         for( i = 0; i < X264_BFRAME_MAX+2; i++ )
 183             for( j = 0; j < X264_BFRAME_MAX+2; j++ )
 184                 x264_free( frame->i_row_satds[i][j] );
 185         for( j = 0; j < 2; j++ )
 186             for( i = 0; i <= X264_BFRAME_MAX; i++ )
 187             {
 188                 x264_free( frame->lowres_mvs[j][i] );
 189                 x264_free( frame->lowres_mv_costs[j][i] );
 190             }
 191         x264_free( frame->i_propagate_cost );
 192         for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
 193             for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
 194             {
 195                 x264_free( frame->lowres_costs[j][i] );
 196                 x264_free( frame->lowres_inter_types[j][i] );
 197             }
 198         x264_free( frame->f_qp_offset );
 199         x264_free( frame->f_qp_offset_aq );
 200         x264_free( frame->i_inv_qscale_factor );
 201         x264_free( frame->i_row_bits );
 202         x264_free( frame->i_row_qp );
 203         x264_free( frame->mb_type );
 204         x264_free( frame->mv[0] );
 205         x264_free( frame->mv[1] );
 206         x264_free( frame->ref[0] );
 207         x264_free( frame->ref[1] );
 208         x264_pthread_mutex_destroy( &frame->mutex );
 209         x264_pthread_cond_destroy( &frame->cv );
 210     }
 211     x264_free( frame );
 212 }
 213
 214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 215 {
 216     int i_csp = src->img.i_csp & X264_CSP_MASK;
 217     int i;
 218     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
 219     {
 220         x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
 221         return -1;
 222     }
 223
 224     dst->i_type     = src->i_type;
 225     dst->i_qpplus1  = src->i_qpplus1;
 226     dst->i_pts      = dst->i_reordered_pts = src->i_pts;
 227     dst->param      = src->param;
 228
 229     for( i=0; i<3; i++ )
 230     {
 231         int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
 232         uint8_t *plane = src->img.plane[s];
 233         int stride = src->img.i_stride[s];
 234         int width = h->param.i_width >> !!i;
 235         int height = h->param.i_height >> !!i;
 236         if( src->img.i_csp & X264_CSP_VFLIP )
 237         {
 238             plane += (height-1)*stride;
 239             stride = -stride;
 240         }
 241         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
 242     }
 243     return 0;
 244 }
 245
 246
 247
 248 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
 249 {
 250 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
 251     int y;
 252     for( y = 0; y < i_height; y++ )
 253     {
 254         /* left band */
 255         memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
 256         /* right band */
 257         memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
 258     }
 259     /* upper band */
 260     if( b_pad_top )
 261     for( y = 0; y < i_padv; y++ )
 262         memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
 263     /* lower band */
 264     if( b_pad_bottom )
 265     for( y = 0; y < i_padv; y++ )
 266         memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
 267 #undef PPIXEL
 268 }
 269
 270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 271 {
 272     int i;
 273     int b_start = !mb_y;
 274     if( mb_y & h->sh.b_mbaff )
 275         return;
 276     for( i = 0; i < frame->i_plane; i++ )
 277     {
 278         int stride = frame->i_stride[i];
 279         int width = 16*h->sps->i_mb_width >> !!i;
 280         int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
 281         int padh = PADH >> !!i;
 282         int padv = PADV >> !!i;
 283         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 284         uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
 285         if( b_end && !b_start )
 286             height += 4 >> (!!i + h->sh.b_mbaff);
 287         if( h->sh.b_mbaff )
 288         {
 289             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 290             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 291         }
 292         else
 293         {
 294             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 295         }
 296     }
 297 }
 298
 299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 300 {
 301     /* during filtering, 8 extra pixels were filtered on each edge,
 302      * but up to 3 of the horizontal ones may be wrong.
 303        we want to expand border from the last filtered pixel */
 304     int b_start = !mb_y;
 305     int stride = frame->i_stride[0];
 306     int width = 16*h->sps->i_mb_width + 8;
 307     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
 308     int padh = PADH - 4;
 309     int padv = PADV - 8;
 310     int i;
 311     for( i = 1; i < 4; i++ )
 312     {
 313         // buffer: 8 luma, to match the hpel filter
 314         uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
 315         if( h->sh.b_mbaff )
 316         {
 317             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 318             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 319         }
 320         else
 321         {
 322             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 323         }
 324     }
 325 }
 326
 327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 328 {
 329     int i;
 330     for( i = 0; i < 4; i++ )
 331         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 332 }
 333
 334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 335 {
 336     int i, y;
 337     for( i = 0; i < frame->i_plane; i++ )
 338     {
 339         int i_subsample = i ? 1 : 0;
 340         int i_width = h->param.i_width >> i_subsample;
 341         int i_height = h->param.i_height >> i_subsample;
 342         int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
 343         int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
 344
 345         if( i_padx )
 346         {
 347             for( y = 0; y < i_height; y++ )
 348                 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 349                          frame->plane[i][y*frame->i_stride[i] + i_width - 1],
 350                          i_padx );
 351         }
 352         if( i_pady )
 353         {
 354             //FIXME interlace? or just let it pad using the wrong field
 355             for( y = i_height; y < i_height + i_pady; y++ )
 356                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 357                         &frame->plane[i][(i_height-1)*frame->i_stride[i]],
 358                         i_width + i_padx );
 359         }
 360     }
 361 }
 362
 363
 364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 365  * entropy coding, but per 64 coeffs for the purpose of deblocking */
 366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 367 {
 368     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 369     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
 370     int x, nnz;
 371     for( x=0; x<h->sps->i_mb_width; x++ )
 372     {
 373         memcpy( buf+x, src+x, 16 );
 374         if( transform[x] )
 375         {
 376             nnz = src[x][0] | src[x][1];
 377             src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 378             nnz = src[x][2] | src[x][3];
 379             src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 380         }
 381     }
 382 }
 383
 384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 385 {
 386     uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 387     int x;
 388     for( x=0; x<h->sps->i_mb_width; x++ )
 389         memcpy( dst+x, buf+x, 16 );
 390 }
 391
 392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
 393 {
 394     func( h, mb_y, buf );
 395     if( mb_y > 0 )
 396         func( h, mb_y-1, buf + h->sps->i_mb_width );
 397     if( h->sh.b_mbaff )
 398     {
 399         func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
 400         if( mb_y > 0 )
 401             func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
 402     }
 403 }
 404
 405
 406 /* Deblocking filter */
/* Alpha (edge difference) thresholds of the deblocking filter.
 * 52 real entries framed by 12 pad entries on each side that repeat the
 * first/last value, so that the table can be indexed (through alpha_table())
 * with any value from -12 to 63 without clamping.  A value of 0 makes
 * deblock_edge()/deblock_edge_intra() skip the edge entirely.
 * NOTE(review): the values look like the alpha' table of the H.264 spec
 * deblocking section -- confirm against the standard. */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
/* Beta (inner-sample difference) thresholds of the deblocking filter.
 * Same layout as the alpha table: 52 real entries with 12 pad entries on
 * each side repeating the first/last value; accessed through beta_table().
 * A value of 0 makes deblock_edge()/deblock_edge_intra() skip the edge.
 * NOTE(review): presumably the beta' table of the H.264 spec -- confirm. */
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* Clipping values (tc0) of the normal deblocking filter.
 * Rows: 52 real entries plus 12 pad rows on each side repeating the
 * first/last row; accessed through tc0_table().
 * Columns: indexed by the boundary strength bS[] in deblock_edge().
 * Column 0 is always -1: deblock_luma_c() skips a segment whose tc0 is
 * negative, and deblock_chroma_c() skips it once deblock_edge() has added
 * b_chroma (1), because the result is then 0. */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
/* Accessors for the three deblocking tables.  They add 12 to skip the
 * leading pad entries, so x may range from -12 to 63 (the tables hold
 * 52+12*2 entries).  tc0_table(x) yields a row of four int8_t values. */
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
 448
 449 /* From ffmpeg */
 450 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 451 {
 452     int i, d;
 453     for( i = 0; i < 4; i++ )
 454     {
 455         if( tc0[i] < 0 )
 456         {
 457             pix += 4*ystride;
 458             continue;
 459         }
 460         for( d = 0; d < 4; d++ )
 461         {
 462             const int p2 = pix[-3*xstride];
 463             const int p1 = pix[-2*xstride];
 464             const int p0 = pix[-1*xstride];
 465             const int q0 = pix[ 0*xstride];
 466             const int q1 = pix[ 1*xstride];
 467             const int q2 = pix[ 2*xstride];
 468
 469             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 470             {
 471                 int tc = tc0[i];
 472                 int delta;
 473                 if( abs( p2 - p0 ) < beta )
 474                 {
 475                     if( tc0[i] )
 476                         pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
 477                     tc++;
 478                 }
 479                 if( abs( q2 - q0 ) < beta )
 480                 {
 481                     if( tc0[i] )
 482                         pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
 483                     tc++;
 484                 }
 485
 486                 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 487                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 488                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 489             }
 490             pix += ystride;
 491         }
 492     }
 493 }
 494 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 495 {
 496     deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
 497 }
 498 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 499 {
 500     deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
 501 }
 502
 503 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 504 {
 505     int i, d;
 506     for( i = 0; i < 4; i++ )
 507     {
 508         const int tc = tc0[i];
 509         if( tc <= 0 )
 510         {
 511             pix += 2*ystride;
 512             continue;
 513         }
 514         for( d = 0; d < 2; d++ )
 515         {
 516             const int p1 = pix[-2*xstride];
 517             const int p0 = pix[-1*xstride];
 518             const int q0 = pix[ 0*xstride];
 519             const int q1 = pix[ 1*xstride];
 520
 521             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 522             {
 523                 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 524                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 525                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 526             }
 527             pix += ystride;
 528         }
 529     }
 530 }
 531 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 532 {
 533     deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
 534 }
 535 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 536 {
 537     deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
 538 }
 539
 540 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 541 {
 542     int d;
 543     for( d = 0; d < 16; d++ )
 544     {
 545         const int p2 = pix[-3*xstride];
 546         const int p1 = pix[-2*xstride];
 547         const int p0 = pix[-1*xstride];
 548         const int q0 = pix[ 0*xstride];
 549         const int q1 = pix[ 1*xstride];
 550         const int q2 = pix[ 2*xstride];
 551
 552         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 553         {
 554             if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
 555             {
 556                 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
 557                 {
 558                     const int p3 = pix[-4*xstride];
 559                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
 560                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
 561                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
 562                 }
 563                 else /* p0' */
 564                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 565                 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
 566                 {
 567                     const int q3 = pix[3*xstride];
 568                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
 569                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
 570                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
 571                 }
 572                 else /* q0' */
 573                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 574             }
 575             else /* p0', q0' */
 576             {
 577                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 578                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 579             }
 580         }
 581         pix += ystride;
 582     }
 583 }
 584 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 585 {
 586     deblock_luma_intra_c( pix, stride, 1, alpha, beta );
 587 }
 588 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 589 {
 590     deblock_luma_intra_c( pix, 1, stride, alpha, beta );
 591 }
 592
 593 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 594 {
 595     int d;
 596     for( d = 0; d < 8; d++ )
 597     {
 598         const int p1 = pix[-2*xstride];
 599         const int p0 = pix[-1*xstride];
 600         const int q0 = pix[ 0*xstride];
 601         const int q1 = pix[ 1*xstride];
 602
 603         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 604         {
 605             pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
 606             pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
 607         }
 608         pix += ystride;
 609     }
 610 }
 611 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 612 {
 613     deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
 614 }
 615 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 616 {
 617     deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
 618 }
 619
 620 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 621 {
 622     const int index_a = i_qp + h->sh.i_alpha_c0_offset;
 623     const int alpha = alpha_table(index_a);
 624     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 625     int8_t tc[4];
 626
 627     if( !alpha || !beta )
 628         return;
 629
 630     tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
 631     tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
 632     tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
 633     tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
 634
 635     pf_inter( pix1, i_stride, alpha, beta, tc );
 636     if( b_chroma )
 637         pf_inter( pix2, i_stride, alpha, beta, tc );
 638 }
 639
 640 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
 641 {
 642     const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
 643     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 644
 645     if( !alpha || !beta )
 646         return;
 647
 648     pf_intra( pix1, i_stride, alpha, beta );
 649     if( b_chroma )
 650         pf_intra( pix2, i_stride, alpha, beta );
 651 }
 652
 653 void x264_frame_deblock_row( x264_t *h, int mb_y )
 654 {
 655     const int s8x8 = 2 * h->mb.i_mb_stride;
 656     const int s4x4 = 4 * h->mb.i_mb_stride;
 657     const int b_interlaced = h->sh.b_mbaff;
 658     const int mvy_limit = 4 >> b_interlaced;
 659     const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
 660     const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 661     int mb_x;
 662     int stridey   = h->fdec->i_stride[0];
 663     int stride2y  = stridey << b_interlaced;
 664     int strideuv  = h->fdec->i_stride[1];
 665     int stride2uv = strideuv << b_interlaced;
 666     uint8_t (*nnz_backup)[16] = h->scratch_buffer;
 667
 668     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 669         munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
 670
 671     for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
 672     {
 673         const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
 674         const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
 675         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
 676         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
 677         const int i_qp = h->mb.qp[mb_xy];
 678         int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
 679         uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
 680         uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
 681         uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x;
 682         if( b_interlaced && (mb_y&1) )
 683         {
 684             pixy -= 15*stridey;
 685             pixu -=  7*strideuv;
 686             pixv -=  7*strideuv;
 687         }
 688
 689         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
 690
 691         if( i_qp <= qp_thresh )
 692             i_edge_end = 1;
 693
 694         #define FILTER_DIR(intra, i_dir)\
 695         {\
 696             /* Y plane */\
 697             i_qpn= h->mb.qp[mbn_xy];\
 698             if( i_dir == 0 )\
 699             {\
 700                 /* vertical edge */\
 701                 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
 702                               stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
 703                               h->loopf.deblock_h_luma##intra );\
 704                 if( !(i_edge & 1) )\
 705                 {\
 706                     /* U/V planes */\
 707                     int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
 708                     deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
 709                                   stride2uv, bS, i_qpc, 1,\
 710                                   h->loopf.deblock_h_chroma##intra );\
 711                 }\
 712             }\
 713             else\
 714             {\
 715                 /* horizontal edge */\
 716                 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
 717                               stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
 718                               h->loopf.deblock_v_luma##intra );\
 719                 /* U/V planes */\
 720                 if( !(i_edge & 1) )\
 721                 {\
 722                     int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
 723                     deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
 724                                   stride2uv, bS, i_qpc, 1,\
 725                                   h->loopf.deblock_v_chroma##intra );\
 726                 }\
 727             }\
 728         }
 729
 730         #define DEBLOCK_STRENGTH(i_dir)\
 731         {\
 732             /* *** Get bS for each 4px for the current edge *** */\
 733             if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
 734                 M32( bS ) = 0x03030303;\
 735             else\
 736             {\
 737                 M32( bS ) = 0x00000000;\
 738                 for( i = 0; i < 4; i++ )\
 739                 {\
 740                     int x  = i_dir == 0 ? i_edge : i;\
 741                     int y  = i_dir == 0 ? i      : i_edge;\
 742                     int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
 743                     int yn = i_dir == 0 ? y : (y - 1)&0x03;\
 744                     if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
 745                         h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
 746                         bS[i] = 2;\
 747                     else if(!(i_edge&no_sub8x8))\
 748                     {\
 749                         if((i&no_sub8x8) && bS[i-1] != 2)\
 750                             bS[i] = bS[i-1];\
 751                         else\
 752                         {\
 753                             int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
 754                             int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
 755                             int i4p= mb_4x4+x+y*s4x4;\
 756                             int i4q= mbn_4x4+xn+yn*s4x4;\
 757                             int refs_equal;\
 758                             /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
 759                             if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
 760                                 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
 761                             else if( !h->mb.b_interlaced )\
 762                                 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
 763                             else\
 764                                 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
 765                                            && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
 766                             if((!refs_equal ||\
 767                                 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
 768                                 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
 769                                (h->sh.i_type == SLICE_TYPE_B &&\
 770                                (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
 771                                 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
 772                                 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
 773                             {\
 774                                 bS[i] = 1;\
 775                             }\
 776                         }\
 777                     }\
 778                 }\
 779             }\
 780         }
 781
 782         /* i_dir == 0 -> vertical edge
 783          * i_dir == 1 -> horizontal edge */
 784         #define DEBLOCK_DIR(i_dir)\
 785         {\
 786             int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
 787             int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
 788             ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
 789             if( i_edge )\
 790                 i_edge+= b_8x8_transform;\
 791             else\
 792             {\
 793                 mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
 794                 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
 795                 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
 796                 if( b_interlaced && i_dir == 1 )\
 797                 {\
 798                     mbn_xy -= h->mb.i_mb_stride;\
 799                     mbn_8x8 -= 2 * s8x8;\
 800                     mbn_4x4 -= 4 * s4x4;\
 801                 }\
 802                 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
 803                 {\
 804                     FILTER_DIR( _intra, i_dir );\
 805                     goto end##i_dir;\
 806                 }\
 807                 DEBLOCK_STRENGTH(i_dir);\
 808                 if( M32( bS ) )\
 809                     FILTER_DIR( , i_dir);\
 810                 end##i_dir:\
 811                 i_edge += b_8x8_transform+1;\
 812             }\
 813             mbn_xy  = mb_xy;\
 814             mbn_8x8 = mb_8x8;\
 815             mbn_4x4 = mb_4x4;\
 816             for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
 817             {\
 818                 DEBLOCK_STRENGTH(i_dir);\
 819                 if( M32( bS ) )\
 820                     FILTER_DIR( , i_dir);\
 821             }\
 822         }
 823
 824         DEBLOCK_DIR(0);
 825         DEBLOCK_DIR(1);
 826     }
 827
 828     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 829         munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
 830 }
 831
 832 void x264_frame_deblock( x264_t *h )
 833 {
 834     int mb_y;
 835     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
 836         x264_frame_deblock_row( h, mb_y );
 837 }
 838
 839 #ifdef HAVE_MMX
 840 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 841 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 842 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 843 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 844
 845 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 846 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 847 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 848 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 849 #ifdef ARCH_X86
 850 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 851 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 852 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 853 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 854
 855 static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 856 {
 857     x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
 858     x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
 859 }
 860 static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 861 {
 862     x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
 863     x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
 864 }
 865 #endif
 866 #endif
 867
 868 #ifdef ARCH_PPC
 869 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 870 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 871 #endif // ARCH_PPC
 872
 873 #ifdef HAVE_ARMV6
 874 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
 875 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
 876 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
 877 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
 878 #endif
 879
 880 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 881 {
 882     pf->deblock_v_luma = deblock_v_luma_c;
 883     pf->deblock_h_luma = deblock_h_luma_c;
 884     pf->deblock_v_chroma = deblock_v_chroma_c;
 885     pf->deblock_h_chroma = deblock_h_chroma_c;
 886     pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
 887     pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
 888     pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
 889     pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
 890
 891 #ifdef HAVE_MMX
 892     if( cpu&X264_CPU_MMXEXT )
 893     {
 894         pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
 895         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
 896         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
 897         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
 898 #ifdef ARCH_X86
 899         pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
 900         pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
 901         pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
 902         pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
 903 #endif
 904         if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
 905         {
 906             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
 907             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
 908             pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
 909             pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
 910         }
 911     }
 912 #endif
 913
 914 #ifdef ARCH_PPC
 915     if( cpu&X264_CPU_ALTIVEC )
 916     {
 917         pf->deblock_v_luma = x264_deblock_v_luma_altivec;
 918         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
 919    }
 920 #endif // ARCH_PPC
 921
 922 #ifdef HAVE_ARMV6
 923    if( cpu&X264_CPU_NEON )
 924    {
 925         pf->deblock_v_luma   = x264_deblock_v_luma_neon;
 926         pf->deblock_h_luma   = x264_deblock_h_luma_neon;
 927         pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
 928         pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
 929    }
 930 #endif
 931 }
 932
 933
 934 /* threading */
 935 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 936 {
 937     x264_pthread_mutex_lock( &frame->mutex );
 938     frame->i_lines_completed = i_lines_completed;
 939     x264_pthread_cond_broadcast( &frame->cv );
 940     x264_pthread_mutex_unlock( &frame->mutex );
 941 }
 942
 943 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 944 {
 945     x264_pthread_mutex_lock( &frame->mutex );
 946     while( frame->i_lines_completed < i_lines_completed )
 947         x264_pthread_cond_wait( &frame->cv, &frame->mutex );
 948     x264_pthread_mutex_unlock( &frame->mutex );
 949 }
 950
 951 /* list operators */
 952
 953 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 954 {
 955     int i = 0;
 956     while( list[i] ) i++;
 957     list[i] = frame;
 958 }
 959
 960 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 961 {
 962     x264_frame_t *frame;
 963     int i = 0;
 964     assert( list[0] );
 965     while( list[i+1] ) i++;
 966     frame = list[i];
 967     list[i] = NULL;
 968     return frame;
 969 }
 970
 971 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 972 {
 973     int i = 0;
 974     while( list[i] ) i++;
 975     while( i-- )
 976         list[i+1] = list[i];
 977     list[0] = frame;
 978 }
 979
 980 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 981 {
 982     x264_frame_t *frame = list[0];
 983     int i;
 984     for( i = 0; list[i]; i++ )
 985         list[i] = list[i+1];
 986     assert(frame);
 987     return frame;
 988 }
 989
 990 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 991 {
 992     assert( frame->i_reference_count > 0 );
 993     frame->i_reference_count--;
 994     if( frame->i_reference_count == 0 )
 995         x264_frame_push( h->frames.unused[frame->b_fdec], frame );
 996 }
 997
 998 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
 999 {
1000     x264_frame_t *frame;
1001     if( h->frames.unused[b_fdec][0] )
1002         frame = x264_frame_pop( h->frames.unused[b_fdec] );
1003     else
1004         frame = x264_frame_new( h, b_fdec );
1005     if( !frame )
1006         return NULL;
1007     frame->b_last_minigop_bframe = 0;
1008     frame->i_reference_count = 1;
1009     frame->b_intra_calculated = 0;
1010     frame->b_scenecut = 1;
1011     frame->b_keyframe = 0;
1012
1013     memset( frame->weight, 0, sizeof(frame->weight) );
1014     memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1015
1016     return frame;
1017 }
1018
1019 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1020 {
1021     assert( frame->i_reference_count > 0 );
1022     frame->i_reference_count--;
1023     if( frame->i_reference_count == 0 )
1024         x264_frame_push( h->frames.blank_unused, frame );
1025 }
1026
1027 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1028 {
1029     x264_frame_t *frame;
1030     if( h->frames.blank_unused[0] )
1031         frame = x264_frame_pop( h->frames.blank_unused );
1032     else
1033         frame = x264_malloc( sizeof(x264_frame_t) );
1034     if( !frame )
1035         return NULL;
1036     frame->b_duplicate = 1;
1037     frame->i_reference_count = 1;
1038     return frame;
1039 }
1040
1041 void x264_frame_sort( x264_frame_t **list, int b_dts )
1042 {
1043     int i, b_ok;
1044     do {
1045         b_ok = 1;
1046         for( i = 0; list[i+1]; i++ )
1047         {
1048             int dtype = list[i]->i_type - list[i+1]->i_type;
1049             int dtime = list[i]->i_frame - list[i+1]->i_frame;
1050             int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1051                              : dtime > 0;
1052             if( swap )
1053             {
1054                 XCHG( x264_frame_t*, list[i], list[i+1] );
1055                 b_ok = 0;
1056             }
1057         }
1058     } while( !b_ok );
1059 }
1060
1061 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1062                          int i_width, int i_height, x264_weight_t *w )
1063 {
1064     int x;
1065     /* Weight horizontal strips of height 16. This was found to be the optimal height
1066      * in terms of the cache loads. */
1067     while( i_height > 0 )
1068     {
1069         for( x = 0; x < i_width; x += 16 )
1070             w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1071         i_height -= 16;
1072         dst += 16 * i_dst_stride;
1073         src += 16 * i_src_stride;
1074     }
1075 }
1076
1077 void x264_frame_delete_list( x264_frame_t **list )
1078 {
1079     int i = 0;
1080     if( !list )
1081         return;
1082     while( list[i] )
1083         x264_frame_delete( list[i++] );
1084     x264_free( list );
1085 }
1086
1087 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1088 {
1089     if( max_size < 0 )
1090         return -1;
1091     slist->i_max_size = max_size;
1092     slist->i_size = 0;
1093     CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1094     if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1095         x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1096         x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1097         return -1;
1098     return 0;
1099 fail:
1100     return -1;
1101 }
1102
1103 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1104 {
1105     x264_pthread_mutex_destroy( &slist->mutex );
1106     x264_pthread_cond_destroy( &slist->cv_fill );
1107     x264_pthread_cond_destroy( &slist->cv_empty );
1108     x264_frame_delete_list( slist->list );
1109 }
1110
1111 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1112 {
1113     x264_pthread_mutex_lock( &slist->mutex );
1114     while( slist->i_size == slist->i_max_size )
1115         x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1116     slist->list[ slist->i_size++ ] = frame;
1117     x264_pthread_mutex_unlock( &slist->mutex );
1118     x264_pthread_cond_broadcast( &slist->cv_fill );
1119 }