git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #include "common.h"
  26
  27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
  28
  29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  30 {
  31     x264_frame_t *frame;
  32     int i, j;
  33
  34     int i_mb_count = h->mb.i_mb_count;
  35     int i_stride, i_width, i_lines;
  36     int i_padv = PADV << h->param.b_interlaced;
  37     int luma_plane_size;
  38     int chroma_plane_size;
  39     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
  40
  41     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
  42
  43     /* allocate frame data (+64 for extra data for me) */
  44     i_width  = ALIGN( h->param.i_width, 16 );
  45     i_stride = ALIGN( i_width + 2*PADH, align );
  46     i_lines  = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
  47
  48     frame->i_plane = 3;
  49     for( i = 0; i < 3; i++ )
  50     {
  51         frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
  52         frame->i_width[i] = i_width >> !!i;
  53         frame->i_lines[i] = i_lines >> !!i;
  54     }
  55
  56     luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
  57     chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
  58     for( i = 1; i < 3; i++ )
  59     {
  60         CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
  61         frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
  62     }
  63
  64     for( i = 0; i < h->param.i_bframe + 2; i++ )
  65         for( j = 0; j < h->param.i_bframe + 2; j++ )
  66             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
  67
  68     frame->i_poc = -1;
  69     frame->i_type = X264_TYPE_AUTO;
  70     frame->i_qpplus1 = 0;
  71     frame->i_pts = -1;
  72     frame->i_frame = -1;
  73     frame->i_frame_num = -1;
  74     frame->i_lines_completed = -1;
  75     frame->b_fdec = b_fdec;
  76     frame->orig = frame;
  77
  78     /* all 4 luma planes allocated together, since the cacheline split code
  79      * requires them to be in-phase wrt cacheline alignment. */
  80     if( h->param.analyse.i_subpel_refine && b_fdec )
  81     {
  82         CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
  83         for( i = 0; i < 4; i++ )
  84             frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
  85         frame->plane[0] = frame->filtered[0];
  86     }
  87     else
  88     {
  89         CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
  90         frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
  91     }
  92
  93     frame->b_duplicate = 0;
  94
  95     if( b_fdec ) /* fdec frame */
  96     {
  97         CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
  98         CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
  99         CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 100         if( h->param.i_bframe )
 101         {
 102             CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 103             CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 104         }
 105         else
 106         {
 107             frame->mv[1]  = NULL;
 108             frame->ref[1] = NULL;
 109         }
 110         CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 111         CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
 112         if( h->param.analyse.i_me_method >= X264_ME_ESA )
 113         {
 114             CHECKED_MALLOC( frame->buffer[3],
 115                             frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
 116             frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
 117         }
 118     }
 119     else /* fenc frame */
 120     {
 121         if( h->frames.b_have_lowres )
 122         {
 123             frame->i_width_lowres = frame->i_width[0]/2;
 124             frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
 125             frame->i_lines_lowres = frame->i_lines[0]/2;
 126
 127             luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*i_padv);
 128
 129             CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
 130             for( i = 0; i < 4; i++ )
 131                 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
 132
 133             for( j = 0; j <= !!h->param.i_bframe; j++ )
 134                 for( i = 0; i <= h->param.i_bframe; i++ )
 135                 {
 136                     CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
 137                     CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
 138                 }
 139             CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
 140             for( j = 0; j <= h->param.i_bframe+1; j++ )
 141                 for( i = 0; i <= h->param.i_bframe+1; i++ )
 142                 {
 143                     CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
 144                     CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
 145                 }
 146             frame->i_intra_cost = frame->lowres_costs[0][0];
 147             memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
 148         }
 149         if( h->param.rc.i_aq_mode )
 150         {
 151             CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
 152             CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
 153             if( h->frames.b_have_lowres )
 154                 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
 155                 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
 156         }
 157     }
 158
 159     if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
 160         goto fail;
 161     if( x264_pthread_cond_init( &frame->cv, NULL ) )
 162         goto fail;
 163
 164     return frame;
 165
 166 fail:
 167     x264_free( frame );
 168     return NULL;
 169 }
 170
 171 void x264_frame_delete( x264_frame_t *frame )
 172 {
 173     int i, j;
 174     /* Duplicate frames are blank copies of real frames (including pointers),
 175      * so freeing those pointers would cause a double free later. */
 176     if( !frame->b_duplicate )
 177     {
 178         for( i = 0; i < 4; i++ )
 179             x264_free( frame->buffer[i] );
 180         for( i = 0; i < 4; i++ )
 181             x264_free( frame->buffer_lowres[i] );
 182         for( i = 0; i < X264_BFRAME_MAX+2; i++ )
 183             for( j = 0; j < X264_BFRAME_MAX+2; j++ )
 184                 x264_free( frame->i_row_satds[i][j] );
 185         for( j = 0; j < 2; j++ )
 186             for( i = 0; i <= X264_BFRAME_MAX; i++ )
 187             {
 188                 x264_free( frame->lowres_mvs[j][i] );
 189                 x264_free( frame->lowres_mv_costs[j][i] );
 190             }
 191         x264_free( frame->i_propagate_cost );
 192         for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
 193             for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
 194             {
 195                 x264_free( frame->lowres_costs[j][i] );
 196                 x264_free( frame->lowres_inter_types[j][i] );
 197             }
 198         x264_free( frame->f_qp_offset );
 199         x264_free( frame->f_qp_offset_aq );
 200         x264_free( frame->i_inv_qscale_factor );
 201         x264_free( frame->i_row_bits );
 202         x264_free( frame->i_row_qp );
 203         x264_free( frame->mb_type );
 204         x264_free( frame->mv[0] );
 205         x264_free( frame->mv[1] );
 206         x264_free( frame->ref[0] );
 207         x264_free( frame->ref[1] );
 208         x264_pthread_mutex_destroy( &frame->mutex );
 209         x264_pthread_cond_destroy( &frame->cv );
 210     }
 211     x264_free( frame );
 212 }
 213
 214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 215 {
 216     int i_csp = src->img.i_csp & X264_CSP_MASK;
 217     int i;
 218     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
 219     {
 220         x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
 221         return -1;
 222     }
 223
 224     dst->i_type     = src->i_type;
 225     dst->i_qpplus1  = src->i_qpplus1;
 226     dst->i_pts      = dst->i_dts = src->i_pts;
 227     dst->param      = src->param;
 228
 229     for( i=0; i<3; i++ )
 230     {
 231         int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
 232         uint8_t *plane = src->img.plane[s];
 233         int stride = src->img.i_stride[s];
 234         int width = h->param.i_width >> !!i;
 235         int height = h->param.i_height >> !!i;
 236         if( src->img.i_csp & X264_CSP_VFLIP )
 237         {
 238             plane += (height-1)*stride;
 239             stride = -stride;
 240         }
 241         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
 242     }
 243     return 0;
 244 }
 245
 246
 247
 248 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
 249 {
 250 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
 251     int y;
 252     for( y = 0; y < i_height; y++ )
 253     {
 254         /* left band */
 255         memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
 256         /* right band */
 257         memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
 258     }
 259     /* upper band */
 260     if( b_pad_top )
 261     for( y = 0; y < i_padv; y++ )
 262         memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
 263     /* lower band */
 264     if( b_pad_bottom )
 265     for( y = 0; y < i_padv; y++ )
 266         memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
 267 #undef PPIXEL
 268 }
 269
 270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 271 {
 272     int i;
 273     int b_start = !mb_y;
 274     if( mb_y & h->sh.b_mbaff )
 275         return;
 276     for( i = 0; i < frame->i_plane; i++ )
 277     {
 278         int stride = frame->i_stride[i];
 279         int width = 16*h->sps->i_mb_width >> !!i;
 280         int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
 281         int padh = PADH >> !!i;
 282         int padv = PADV >> !!i;
 283         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 284         uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
 285         if( b_end && !b_start )
 286             height += 4 >> (!!i + h->sh.b_mbaff);
 287         if( h->sh.b_mbaff )
 288         {
 289             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 290             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 291         }
 292         else
 293         {
 294             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 295         }
 296     }
 297 }
 298
 299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 300 {
 301     /* during filtering, 8 extra pixels were filtered on each edge,
 302      * but up to 3 of the horizontal ones may be wrong.
 303        we want to expand border from the last filtered pixel */
 304     int b_start = !mb_y;
 305     int stride = frame->i_stride[0];
 306     int width = 16*h->sps->i_mb_width + 8;
 307     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
 308     int padh = PADH - 4;
 309     int padv = PADV - 8;
 310     int i;
 311     for( i = 1; i < 4; i++ )
 312     {
 313         // buffer: 8 luma, to match the hpel filter
 314         uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
 315         if( h->sh.b_mbaff )
 316         {
 317             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 318             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 319         }
 320         else
 321         {
 322             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 323         }
 324     }
 325 }
 326
 327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 328 {
 329     int i;
 330     for( i = 0; i < 4; i++ )
 331         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 332 }
 333
 334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 335 {
 336     int i, y;
 337     for( i = 0; i < frame->i_plane; i++ )
 338     {
 339         int i_subsample = i ? 1 : 0;
 340         int i_width = h->param.i_width >> i_subsample;
 341         int i_height = h->param.i_height >> i_subsample;
 342         int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
 343         int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
 344
 345         if( i_padx )
 346         {
 347             for( y = 0; y < i_height; y++ )
 348                 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 349                          frame->plane[i][y*frame->i_stride[i] + i_width - 1],
 350                          i_padx );
 351         }
 352         if( i_pady )
 353         {
 354             //FIXME interlace? or just let it pad using the wrong field
 355             for( y = i_height; y < i_height + i_pady; y++ )
 356                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 357                         &frame->plane[i][(i_height-1)*frame->i_stride[i]],
 358                         i_width + i_padx );
 359         }
 360     }
 361 }
 362
 363
 364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 365  * entropy coding, but per 64 coeffs for the purpose of deblocking */
 366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 367 {
 368     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 369     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
 370     int x, nnz;
 371     for( x=0; x<h->sps->i_mb_width; x++ )
 372     {
 373         memcpy( buf+x, src+x, 16 );
 374         if( transform[x] )
 375         {
 376             nnz = src[x][0] | src[x][1];
 377             src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 378             nnz = src[x][2] | src[x][3];
 379             src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 380         }
 381     }
 382 }
 383
 384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 385 {
 386     uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 387     int x;
 388     for( x=0; x<h->sps->i_mb_width; x++ )
 389         memcpy( dst+x, buf+x, 16 );
 390 }
 391
 392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
 393 {
 394     func( h, mb_y, buf );
 395     if( mb_y > 0 )
 396         func( h, mb_y-1, buf + h->sps->i_mb_width );
 397     if( h->sh.b_mbaff )
 398     {
 399         func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
 400         if( mb_y > 0 )
 401             func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
 402     }
 403 }
 404
 405
 406 /* Deblocking filter */
 407 static const uint8_t i_alpha_table[52+12*2] =
 408 {
 409      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 410      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 411      0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
 412      7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
 413     25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
 414     80, 90,101,113,127,144,162,182,203,226,
 415    255,255,
 416    255,255,255,255,255,255,255,255,255,255,255,255,
 417 };
 418 static const uint8_t i_beta_table[52+12*2] =
 419 {
 420      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 421      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 422      0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
 423      3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
 424      8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
 425     13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
 426     18, 18,
 427     18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
 428 };
 429 static const int8_t i_tc0_table[52+12*2][4] =
 430 {
 431     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 432     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 433     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 434     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 435     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
 436     {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
 437     {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
 438     {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
 439     {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
 440     {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
 441     {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
 442     {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
 443     {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
 444 };
 445 #define alpha_table(x) i_alpha_table[(x)+12]
 446 #define beta_table(x)  i_beta_table[(x)+12]
 447 #define tc0_table(x)   i_tc0_table[(x)+12]
 448
 449 /* From ffmpeg */
 450 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 451 {
 452     int i, d;
 453     for( i = 0; i < 4; i++ )
 454     {
 455         if( tc0[i] < 0 )
 456         {
 457             pix += 4*ystride;
 458             continue;
 459         }
 460         for( d = 0; d < 4; d++ )
 461         {
 462             const int p2 = pix[-3*xstride];
 463             const int p1 = pix[-2*xstride];
 464             const int p0 = pix[-1*xstride];
 465             const int q0 = pix[ 0*xstride];
 466             const int q1 = pix[ 1*xstride];
 467             const int q2 = pix[ 2*xstride];
 468
 469             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 470             {
 471                 int tc = tc0[i];
 472                 int delta;
 473                 if( abs( p2 - p0 ) < beta )
 474                 {
 475                     pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
 476                     tc++;
 477                 }
 478                 if( abs( q2 - q0 ) < beta )
 479                 {
 480                     pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
 481                     tc++;
 482                 }
 483
 484                 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 485                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 486                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 487             }
 488             pix += ystride;
 489         }
 490     }
 491 }
 492 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 493 {
 494     deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
 495 }
 496 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 497 {
 498     deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
 499 }
 500
 501 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 502 {
 503     int i, d;
 504     for( i = 0; i < 4; i++ )
 505     {
 506         const int tc = tc0[i];
 507         if( tc <= 0 )
 508         {
 509             pix += 2*ystride;
 510             continue;
 511         }
 512         for( d = 0; d < 2; d++ )
 513         {
 514             const int p1 = pix[-2*xstride];
 515             const int p0 = pix[-1*xstride];
 516             const int q0 = pix[ 0*xstride];
 517             const int q1 = pix[ 1*xstride];
 518
 519             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 520             {
 521                 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 522                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 523                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 524             }
 525             pix += ystride;
 526         }
 527     }
 528 }
 529 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 530 {
 531     deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
 532 }
 533 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 534 {
 535     deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
 536 }
 537
 538 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 539 {
 540     int d;
 541     for( d = 0; d < 16; d++ )
 542     {
 543         const int p2 = pix[-3*xstride];
 544         const int p1 = pix[-2*xstride];
 545         const int p0 = pix[-1*xstride];
 546         const int q0 = pix[ 0*xstride];
 547         const int q1 = pix[ 1*xstride];
 548         const int q2 = pix[ 2*xstride];
 549
 550         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 551         {
 552             if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
 553             {
 554                 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
 555                 {
 556                     const int p3 = pix[-4*xstride];
 557                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
 558                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
 559                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
 560                 }
 561                 else /* p0' */
 562                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 563                 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
 564                 {
 565                     const int q3 = pix[3*xstride];
 566                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
 567                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
 568                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
 569                 }
 570                 else /* q0' */
 571                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 572             }
 573             else /* p0', q0' */
 574             {
 575                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 576                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 577             }
 578         }
 579         pix += ystride;
 580     }
 581 }
 582 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 583 {
 584     deblock_luma_intra_c( pix, stride, 1, alpha, beta );
 585 }
 586 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 587 {
 588     deblock_luma_intra_c( pix, 1, stride, alpha, beta );
 589 }
 590
 591 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 592 {
 593     int d;
 594     for( d = 0; d < 8; d++ )
 595     {
 596         const int p1 = pix[-2*xstride];
 597         const int p0 = pix[-1*xstride];
 598         const int q0 = pix[ 0*xstride];
 599         const int q1 = pix[ 1*xstride];
 600
 601         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 602         {
 603             pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
 604             pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
 605         }
 606         pix += ystride;
 607     }
 608 }
 609 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 610 {
 611     deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
 612 }
 613 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 614 {
 615     deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
 616 }
 617
 618 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 619 {
 620     const int index_a = i_qp + h->sh.i_alpha_c0_offset;
 621     const int alpha = alpha_table(index_a);
 622     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 623     int8_t tc[4];
 624
 625     if( !alpha || !beta )
 626         return;
 627
 628     tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
 629     tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
 630     tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
 631     tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
 632
 633     pf_inter( pix1, i_stride, alpha, beta, tc );
 634     if( b_chroma )
 635         pf_inter( pix2, i_stride, alpha, beta, tc );
 636 }
 637
 638 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
 639 {
 640     const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
 641     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 642
 643     if( !alpha || !beta )
 644         return;
 645
 646     pf_intra( pix1, i_stride, alpha, beta );
 647     if( b_chroma )
 648         pf_intra( pix2, i_stride, alpha, beta );
 649 }
 650
 651 void x264_frame_deblock_row( x264_t *h, int mb_y )
 652 {
 653     const int s8x8 = 2 * h->mb.i_mb_stride;
 654     const int s4x4 = 4 * h->mb.i_mb_stride;
 655     const int b_interlaced = h->sh.b_mbaff;
 656     const int mvy_limit = 4 >> b_interlaced;
 657     const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
 658     const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 659     int mb_x;
 660     int stridey   = h->fdec->i_stride[0];
 661     int stride2y  = stridey << b_interlaced;
 662     int strideuv  = h->fdec->i_stride[1];
 663     int stride2uv = strideuv << b_interlaced;
 664
 665     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 666         munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
 667
 668     for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
 669     {
 670         const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
 671         const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
 672         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
 673         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
 674         const int i_qp = h->mb.qp[mb_xy];
 675         int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
 676         uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
 677         uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
 678         uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x;
 679         if( b_interlaced && (mb_y&1) )
 680         {
 681             pixy -= 15*stridey;
 682             pixu -=  7*strideuv;
 683             pixv -=  7*strideuv;
 684         }
 685
 686         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
 687
 688         if( i_qp <= qp_thresh )
 689             i_edge_end = 1;
 690
 691         #define FILTER_DIR(intra, i_dir)\
 692         {\
 693             /* Y plane */\
 694             i_qpn= h->mb.qp[mbn_xy];\
 695             if( i_dir == 0 )\
 696             {\
 697                 /* vertical edge */\
 698                 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
 699                               stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
 700                               h->loopf.deblock_h_luma##intra );\
 701                 if( !(i_edge & 1) )\
 702                 {\
 703                     /* U/V planes */\
 704                     int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
 705                     deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
 706                                   stride2uv, bS, i_qpc, 1,\
 707                                   h->loopf.deblock_h_chroma##intra );\
 708                 }\
 709             }\
 710             else\
 711             {\
 712                 /* horizontal edge */\
 713                 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
 714                               stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
 715                               h->loopf.deblock_v_luma##intra );\
 716                 /* U/V planes */\
 717                 if( !(i_edge & 1) )\
 718                 {\
 719                     int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
 720                     deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
 721                                   stride2uv, bS, i_qpc, 1,\
 722                                   h->loopf.deblock_v_chroma##intra );\
 723                 }\
 724             }\
 725         }
 726
 727         #define DEBLOCK_STRENGTH(i_dir)\
 728         {\
 729             /* *** Get bS for each 4px for the current edge *** */\
 730             if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
 731                 M32( bS ) = 0x03030303;\
 732             else\
 733             {\
 734                 M32( bS ) = 0x00000000;\
 735                 for( i = 0; i < 4; i++ )\
 736                 {\
 737                     int x  = i_dir == 0 ? i_edge : i;\
 738                     int y  = i_dir == 0 ? i      : i_edge;\
 739                     int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
 740                     int yn = i_dir == 0 ? y : (y - 1)&0x03;\
 741                     if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
 742                         h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
 743                         bS[i] = 2;\
 744                     else if(!(i_edge&no_sub8x8))\
 745                     {\
 746                         if((i&no_sub8x8) && bS[i-1] != 2)\
 747                             bS[i] = bS[i-1];\
 748                         else\
 749                         {\
 750                             int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
 751                             int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
 752                             int i4p= mb_4x4+x+y*s4x4;\
 753                             int i4q= mbn_4x4+xn+yn*s4x4;\
 754                             int refs_equal;\
 755                             /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
 756                             if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
 757                                 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
 758                             else if( !h->mb.b_interlaced )\
 759                                 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
 760                             else\
 761                                 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
 762                                            && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
 763                             if((!refs_equal ||\
 764                                 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
 765                                 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
 766                                (h->sh.i_type == SLICE_TYPE_B &&\
 767                                (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
 768                                 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
 769                                 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
 770                             {\
 771                                 bS[i] = 1;\
 772                             }\
 773                         }\
 774                     }\
 775                 }\
 776             }\
 777         }
 778
 779         /* i_dir == 0 -> vertical edge
 780          * i_dir == 1 -> horizontal edge */
 781         #define DEBLOCK_DIR(i_dir)\
 782         {\
 783             int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
 784             int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
 785             ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
 786             if( i_edge )\
 787                 i_edge+= b_8x8_transform;\
 788             else\
 789             {\
 790                 mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
 791                 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
 792                 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
 793                 if( b_interlaced && i_dir == 1 )\
 794                 {\
 795                     mbn_xy -= h->mb.i_mb_stride;\
 796                     mbn_8x8 -= 2 * s8x8;\
 797                     mbn_4x4 -= 4 * s4x4;\
 798                 }\
 799                 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
 800                 {\
 801                     FILTER_DIR( _intra, i_dir );\
 802                     goto end##i_dir;\
 803                 }\
 804                 DEBLOCK_STRENGTH(i_dir);\
 805                 if( M32( bS ) )\
 806                     FILTER_DIR( , i_dir);\
 807                 end##i_dir:\
 808                 i_edge += b_8x8_transform+1;\
 809             }\
 810             mbn_xy  = mb_xy;\
 811             mbn_8x8 = mb_8x8;\
 812             mbn_4x4 = mb_4x4;\
 813             for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
 814             {\
 815                 DEBLOCK_STRENGTH(i_dir);\
 816                 if( M32( bS ) )\
 817                     FILTER_DIR( , i_dir);\
 818             }\
 819         }
 820
 821         DEBLOCK_DIR(0);
 822         DEBLOCK_DIR(1);
 823     }
 824
 825     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 826         munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
 827 }
 828
 829 void x264_frame_deblock( x264_t *h )
 830 {
 831     int mb_y;
 832     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
 833         x264_frame_deblock_row( h, mb_y );
 834 }
 835
 836 #ifdef HAVE_MMX
 837 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 838 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 839 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 840 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 841
 842 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 843 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 844 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 845 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 846 #ifdef ARCH_X86
 847 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 848 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 849 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 850 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 851
 852 static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 853 {
 854     x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
 855     x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
 856 }
 857 static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 858 {
 859     x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
 860     x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
 861 }
 862 #endif
 863 #endif
 864
 865 #ifdef ARCH_PPC
 866 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 867 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 868 #endif // ARCH_PPC
 869
 870 #ifdef HAVE_ARMV6
 871 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
 872 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
 873 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
 874 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
 875 #endif
 876
 877 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 878 {
 879     pf->deblock_v_luma = deblock_v_luma_c;
 880     pf->deblock_h_luma = deblock_h_luma_c;
 881     pf->deblock_v_chroma = deblock_v_chroma_c;
 882     pf->deblock_h_chroma = deblock_h_chroma_c;
 883     pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
 884     pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
 885     pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
 886     pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
 887
 888 #ifdef HAVE_MMX
 889     if( cpu&X264_CPU_MMXEXT )
 890     {
 891         pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
 892         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
 893         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
 894         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
 895 #ifdef ARCH_X86
 896         pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
 897         pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
 898         pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
 899         pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
 900 #endif
 901         if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
 902         {
 903             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
 904             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
 905             pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
 906             pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
 907         }
 908     }
 909 #endif
 910
 911 #ifdef ARCH_PPC
 912     if( cpu&X264_CPU_ALTIVEC )
 913     {
 914         pf->deblock_v_luma = x264_deblock_v_luma_altivec;
 915         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
 916    }
 917 #endif // ARCH_PPC
 918
 919 #ifdef HAVE_ARMV6
 920    if( cpu&X264_CPU_NEON )
 921    {
 922         pf->deblock_v_luma   = x264_deblock_v_luma_neon;
 923         pf->deblock_h_luma   = x264_deblock_h_luma_neon;
 924         pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
 925         pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
 926    }
 927 #endif
 928 }
 929
 930
 931 /* threading */
 932 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 933 {
 934     x264_pthread_mutex_lock( &frame->mutex );
 935     frame->i_lines_completed = i_lines_completed;
 936     x264_pthread_cond_broadcast( &frame->cv );
 937     x264_pthread_mutex_unlock( &frame->mutex );
 938 }
 939
 940 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 941 {
 942     x264_pthread_mutex_lock( &frame->mutex );
 943     while( frame->i_lines_completed < i_lines_completed )
 944         x264_pthread_cond_wait( &frame->cv, &frame->mutex );
 945     x264_pthread_mutex_unlock( &frame->mutex );
 946 }
 947
 948 /* list operators */
 949
 950 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 951 {
 952     int i = 0;
 953     while( list[i] ) i++;
 954     list[i] = frame;
 955 }
 956
 957 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 958 {
 959     x264_frame_t *frame;
 960     int i = 0;
 961     assert( list[0] );
 962     while( list[i+1] ) i++;
 963     frame = list[i];
 964     list[i] = NULL;
 965     return frame;
 966 }
 967
 968 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 969 {
 970     int i = 0;
 971     while( list[i] ) i++;
 972     while( i-- )
 973         list[i+1] = list[i];
 974     list[0] = frame;
 975 }
 976
 977 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 978 {
 979     x264_frame_t *frame = list[0];
 980     int i;
 981     for( i = 0; list[i]; i++ )
 982         list[i] = list[i+1];
 983     assert(frame);
 984     return frame;
 985 }
 986
 987 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 988 {
 989     assert( frame->i_reference_count > 0 );
 990     frame->i_reference_count--;
 991     if( frame->i_reference_count == 0 )
 992         x264_frame_push( h->frames.unused[frame->b_fdec], frame );
 993 }
 994
 995 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
 996 {
 997     x264_frame_t *frame;
 998     if( h->frames.unused[b_fdec][0] )
 999         frame = x264_frame_pop( h->frames.unused[b_fdec] );
1000     else
1001         frame = x264_frame_new( h, b_fdec );
1002     if( !frame )
1003         return NULL;
1004     frame->b_last_minigop_bframe = 0;
1005     frame->i_reference_count = 1;
1006     frame->b_intra_calculated = 0;
1007     frame->b_scenecut = 1;
1008
1009     memset( frame->weight, 0, sizeof(frame->weight) );
1010     memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1011
1012     return frame;
1013 }
1014
1015 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1016 {
1017     assert( frame->i_reference_count > 0 );
1018     frame->i_reference_count--;
1019     if( frame->i_reference_count == 0 )
1020         x264_frame_push( h->frames.blank_unused, frame );
1021 }
1022
1023 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1024 {
1025     x264_frame_t *frame;
1026     if( h->frames.blank_unused[0] )
1027         frame = x264_frame_pop( h->frames.blank_unused );
1028     else
1029         frame = x264_malloc( sizeof(x264_frame_t) );
1030     if( !frame )
1031         return NULL;
1032     frame->b_duplicate = 1;
1033     frame->i_reference_count = 1;
1034     return frame;
1035 }
1036
1037 void x264_frame_sort( x264_frame_t **list, int b_dts )
1038 {
1039     int i, b_ok;
1040     do {
1041         b_ok = 1;
1042         for( i = 0; list[i+1]; i++ )
1043         {
1044             int dtype = list[i]->i_type - list[i+1]->i_type;
1045             int dtime = list[i]->i_frame - list[i+1]->i_frame;
1046             int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1047                              : dtime > 0;
1048             if( swap )
1049             {
1050                 XCHG( x264_frame_t*, list[i], list[i+1] );
1051                 b_ok = 0;
1052             }
1053         }
1054     } while( !b_ok );
1055 }
1056
1057 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1058                          int i_width, int i_height, x264_weight_t *w )
1059 {
1060     int x;
1061     /* Weight horizontal strips of height 16. This was found to be the optimal height
1062      * in terms of the cache loads. */
1063     while( i_height > 0 )
1064     {
1065         for( x = 0; x < i_width; x += 16 )
1066             w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1067         i_height -= 16;
1068         dst += 16 * i_dst_stride;
1069         src += 16 * i_src_stride;
1070     }
1071 }
1072
1073 void x264_frame_delete_list( x264_frame_t **list )
1074 {
1075     int i = 0;
1076     if( !list )
1077         return;
1078     while( list[i] )
1079         x264_frame_delete( list[i++] );
1080     x264_free( list );
1081 }
1082
1083 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1084 {
1085     if( max_size < 0 )
1086         return -1;
1087     slist->i_max_size = max_size;
1088     slist->i_size = 0;
1089     CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1090     if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1091         x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1092         x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1093         return -1;
1094     return 0;
1095 fail:
1096     return -1;
1097 }
1098
1099 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1100 {
1101     x264_pthread_mutex_destroy( &slist->mutex );
1102     x264_pthread_cond_destroy( &slist->cv_fill );
1103     x264_pthread_cond_destroy( &slist->cv_empty );
1104     x264_frame_delete_list( slist->list );
1105 }
1106
1107 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1108 {
1109     x264_pthread_mutex_lock( &slist->mutex );
1110     while( slist->i_size == slist->i_max_size )
1111         x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1112     slist->list[ slist->i_size++ ] = frame;
1113     x264_pthread_mutex_unlock( &slist->mutex );
1114     x264_pthread_cond_broadcast( &slist->cv_fill );
1115 }