1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a; a must be a power of two. */
27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate a frame and all of its per-frame buffers.
 * b_fdec selects reconstructed-frame (fdec) vs source-frame (fenc) layout.
 * NOTE(review): this listing appears to have lines elided relative to
 * upstream (braces, local declarations, the fail: cleanup path and the
 * final return) — reconcile with the original before compiling. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
/* stride alignment follows the detected cacheline size so split loads stay in-phase */
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
/* interlaced coding pads height to a multiple of 32 (one MB pair) */
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* plane 0 = luma, planes 1/2 = half-size chroma (>> !!i) */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
/* plane pointer skips past the top/left padding region */
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* per-row SATD cache, indexed by [fenc b-count][fdec b-count] */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
78 /* all 4 luma planes allocated together, since the cacheline split code
79 * requires them to be in-phase wrt cacheline alignment. */
80 if( h->param.analyse.i_subpel_refine && b_fdec )
82 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
/* filtered[0..3]: full-pel plus the three half-pel interpolated planes */
83 for( i = 0; i < 4; i++ )
84 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
85 frame->plane[0] = frame->filtered[0];
/* no subpel refinement (or fenc frame): a single luma plane suffices */
89 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
90 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
93 frame->b_duplicate = 0;
95 if( b_fdec ) /* fdec frame */
/* per-MB decode-side data: type, motion vectors and reference indices */
97 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
98 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
99 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
100 if( h->param.i_bframe )
102 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
103 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
108 frame->ref[1] = NULL;
110 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
111 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* exhaustive search needs an integral (summed-area) plane */
112 if( h->param.analyse.i_me_method >= X264_ME_ESA )
114 CHECKED_MALLOC( frame->buffer[3],
115 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
116 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
119 else /* fenc frame */
121 if( h->frames.b_have_lowres )
/* half-resolution planes used by lookahead/slicetype decision */
123 frame->i_width_lowres = frame->i_width[0]/2;
124 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
125 frame->i_lines_lowres = frame->i_lines[0]/2;
127 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
129 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
130 for( i = 0; i < 4; i++ )
131 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
/* lowres motion data: [list][ref distance] */
133 for( j = 0; j <= !!h->param.i_bframe; j++ )
134 for( i = 0; i <= h->param.i_bframe; i++ )
136 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
137 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
/* +3 padding so SIMD readers may overread the tail */
139 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
140 for( j = 0; j <= h->param.i_bframe+1; j++ )
141 for( i = 0; i <= h->param.i_bframe+1; i++ )
143 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
144 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
/* intra costs alias the [0][0] cost array; -1 marks "not yet computed" */
146 frame->i_intra_cost = frame->lowres_costs[0][0];
147 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
149 if( h->param.rc.i_aq_mode )
151 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
152 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
153 if( h->frames.b_have_lowres )
154 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
155 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* threaded encoding synchronizes on per-frame mutex/condvar */
159 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
161 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and every buffer x264_frame_new allocated for it.
 * NOTE(review): the final x264_free(frame) and some braces appear elided
 * from this listing — reconcile with upstream before compiling. */
171 void x264_frame_delete( x264_frame_t *frame )
174 /* Duplicate frames are blank copies of real frames (including pointers),
175 * so freeing those pointers would cause a double free later. */
176 if( !frame->b_duplicate )
178 for( i = 0; i < 4; i++ )
179 x264_free( frame->buffer[i] );
180 for( i = 0; i < 4; i++ )
181 x264_free( frame->buffer_lowres[i] );
/* loop bounds use the compile-time maximum, not h->param.i_bframe:
 * unallocated slots are NULL and x264_free(NULL) is a no-op */
182 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
183 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
184 x264_free( frame->i_row_satds[i][j] );
185 for( j = 0; j < 2; j++ )
186 for( i = 0; i <= X264_BFRAME_MAX; i++ )
188 x264_free( frame->lowres_mvs[j][i] );
189 x264_free( frame->lowres_mv_costs[j][i] );
191 x264_free( frame->i_propagate_cost );
192 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
193 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
195 x264_free( frame->lowres_costs[j][i] );
196 x264_free( frame->lowres_inter_types[j][i] );
198 x264_free( frame->f_qp_offset );
199 x264_free( frame->f_qp_offset_aq );
200 x264_free( frame->i_inv_qscale_factor );
201 x264_free( frame->i_row_bits );
202 x264_free( frame->i_row_qp );
203 x264_free( frame->mb_type );
204 x264_free( frame->mv[0] );
205 x264_free( frame->mv[1] );
206 x264_free( frame->ref[0] );
207 x264_free( frame->ref[1] );
208 x264_pthread_mutex_destroy( &frame->mutex );
209 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied picture into an internal frame.
 * Only I420/YV12 input is accepted; returns an error for other CSPs.
 * NOTE(review): the error return, the per-plane for-loop header and the
 * final return appear elided from this listing. */
214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
216 int i_csp = src->img.i_csp & X264_CSP_MASK;
218 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
220 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
/* carry over per-picture metadata (type/QP/PTS/per-frame params) */
224 dst->i_type = src->i_type;
225 dst->i_qpplus1 = src->i_qpplus1;
226 dst->i_pts = dst->i_reordered_pts = src->i_pts;
227 dst->param = src->param;
/* YV12 stores V before U, so swap chroma plane indices (i^3 maps 1<->2) */
231 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
232 uint8_t *plane = src->img.plane[s];
233 int stride = src->img.i_stride[s];
234 int width = h->param.i_width >> !!i;
235 int height = h->param.i_height >> !!i;
/* vertically flipped input: start at the last row and walk upwards
 * (presumably via a negated stride on the elided line — verify) */
236 if( src->img.i_csp & X264_CSP_VFLIP )
238 plane += (height-1)*stride;
241 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate a plane's edge pixels into its padding borders:
 * left/right borders repeat the first/last pixel of each row; top/bottom
 * borders copy the first/last padded row.  b_pad_top/b_pad_bottom gate the
 * vertical padding (their if-checks appear elided from this listing). */
248 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
250 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
252 for( y = 0; y < i_height; y++ )
255 memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
257 memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
/* top border: replicate row 0 (including its freshly-padded sides) */
261 for( y = 0; y < i_padv; y++ )
262 memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* bottom border: replicate the last row */
265 for( y = 0; y < i_padv; y++ )
266 memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
/* Pad the borders of the rows finished so far (up to mb_y); called
 * incrementally as reconstruction proceeds so motion compensation of the
 * next frame can read past the frame edges.
 * NOTE(review): b_start's definition, an early return and the MBAFF
 * if/else braces appear elided from this listing. */
270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
274 if( mb_y & h->sh.b_mbaff )
276 for( i = 0; i < frame->i_plane; i++ )
278 int stride = frame->i_stride[i];
279 int width = 16*h->sps->i_mb_width >> !!i;
280 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
281 int padh = PADH >> !!i;
282 int padv = PADV >> !!i;
283 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
284 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
285 if( b_end && !b_start )
286 height += 4 >> (!!i + h->sh.b_mbaff);
/* MBAFF: pad the two fields separately (stride*2 walks one field) */
289 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
290 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
294 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Pad the borders of the three half-pel filtered planes (filtered[1..3]).
 * NOTE(review): b_start/padh/padv definitions and the interlaced if/else
 * braces appear elided from this listing. */
299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
301 /* during filtering, 8 extra pixels were filtered on each edge,
302 * but up to 3 of the horizontal ones may be wrong.
303 we want to expand border from the last filtered pixel */
305 int stride = frame->i_stride[0];
306 int width = 16*h->sps->i_mb_width + 8;
307 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
311 for( i = 1; i < 4; i++ )
313 // buffer: 8 luma, to match the hpel filter
314 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* MBAFF: pad each field separately */
317 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
318 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
322 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
330 for( i = 0; i < 4; i++ )
331 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the right/bottom of the picture out to the coded (mod-16) size when
 * the input dimensions are not multiples of 16.
 * NOTE(review): the "if( i_padx )" / "if( i_pady )" guards, the memset/memcpy
 * size arguments and closing braces appear elided from this listing. */
334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
337 for( i = 0; i < frame->i_plane; i++ )
339 int i_subsample = i ? 1 : 0;
340 int i_width = h->param.i_width >> i_subsample;
341 int i_height = h->param.i_height >> i_subsample;
342 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
343 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
/* horizontal: replicate each row's last real pixel rightwards */
347 for( y = 0; y < i_height; y++ )
348 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
349 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
/* vertical: replicate the last real row downwards */
354 //FIXME interlace? or just let it pad using the wrong field
355 for( y = i_height; y < i_height + i_pady; y++ )
356 memcpy( &frame->plane[i][y*frame->i_stride[i]],
357 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
365 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save one MB row's nnz values into buf, then rewrite the live array so
 * each 8x8 block reads as all-nonzero iff any of its 4x4 sub-blocks was.
 * NOTE(review): the uint32_t cast over the uint8_t nnz array relies on
 * non-strict aliasing (x264 builds with -fno-strict-aliasing — verify).
 * The "if( !transform[x] ) continue;"-style guard and the nnz declaration
 * appear elided from this listing. */
366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
368 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
369 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
371 for( x=0; x<h->sps->i_mb_width; x++ )
/* back up the 16 luma nnz bytes before munging them */
373 memcpy( buf+x, src+x, 16 );
/* OR the two 4x4 rows of each 8x8; 0x0101/0x01010000 re-marks the halves */
376 nnz = src[x][0] | src[x][1];
377 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
378 nnz = src[x][2] | src[x][3];
379 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
388 for( x=0; x<h->sps->i_mb_width; x++ )
389 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current MB row plus the neighbouring
 * rows that deblocking of this row can touch, each with its own slice of
 * the backup buffer.
 * NOTE(review): the boundary guards (e.g. "if( mb_y > 0 )" before the
 * mb_y-1/-2/+1 calls) appear elided from this listing. */
392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
394 func( h, mb_y, buf );
396 func( h, mb_y-1, buf + h->sps->i_mb_width );
399 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
401 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
/* Deblocking filter */

/* Threshold tables from the H.264 spec (Tables 8-16 and 8-17), indexed by
 * indexA/indexB = clip(QP + offset).  Each table is padded with 12 extra
 * entries on both ends so that QP plus a slice offset in [-12,+12] can
 * index it directly without clamping (see the *_table() macros below).
 * Fix: the listing had dropped the "255,255," and "18,18," rows, leaving
 * the arrays two entries short of their declared 76-entry size — C would
 * zero-fill the tail, wrongly disabling filtering at the highest QPs. */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping values per indexA and boundary strength bS (1..3); the
 * leading -1 column lets bS be used directly as the second index. */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
/* Normal-strength (bS < 4) luma deblocking of one 16-sample edge.
 * xstride steps across the edge, ystride along it.  tc0 holds one clipping
 * limit per group of 4 samples; a negative value skips that group.
 * Fix: restores lines dropped from this listing — the tc/delta locals,
 * the skip-group branch, the tc++ bumps when p1/q1 are filtered, the
 * per-sample pointer advance and the brace structure. */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        if( tc0[i] < 0 )
        {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ )
        {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int tc = tc0[i];
                int delta;
                /* p1' — only when the inner sample p2 is close to p0 */
                if( abs( p2 - p0 ) < beta )
                {
                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;   /* widening the p0/q0 clip range per the spec */
                }
                /* q1' — symmetric test on the q side */
                if( abs( q2 - q0 ) < beta )
                {
                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }
                delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical filtering (horizontal edge): samples across the edge are one
 * full stride apart; samples along it are adjacent. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal filtering (vertical edge): samples across the edge are
 * adjacent; samples along it are one full stride apart. */
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Normal-strength chroma deblocking of one 8-sample edge: only p0/q0 are
 * modified.  tc0 holds one clipping limit per group of 2 samples; tc <= 0
 * skips the group (the table's -1 entry becomes 0 after the +b_chroma
 * adjustment in deblock_edge).
 * Fix: restores lines dropped from this listing — the i/d locals, the
 * skip-group branch, the pointer advance and the brace structure. */
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        const int tc = tc0[i];
        if( tc <= 0 )
        {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ )
        {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical chroma filtering: edge runs horizontally across the plane. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal chroma filtering: edge runs vertically across the plane. */
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Strong (bS == 4, intra) luma deblocking of one 16-sample edge.
 * The strong filter (touching p2..q2) is used only when the edge gradient
 * is well below alpha; otherwise only p0/q0 are smoothed.
 * Fix: restores lines dropped from this listing — the d local, the
 * else-branches and the brace/pointer-advance structure. */
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 16; d++ )
    {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
            {
                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
                {
                    const int p3 = pix[-4*xstride];
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                }
                else /* p0' only */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
                {
                    const int q3 = pix[3*xstride];
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                }
                else /* q0' only */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
            else /* weak path: p0', q0' */
            {
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Strong vertical luma filtering (horizontal edge). */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
/* Strong horizontal luma filtering (vertical edge). */
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
/* Strong (intra) chroma deblocking of one 8-sample edge: only p0/q0 are
 * replaced, with no tc clipping.
 * Fix: restores lines dropped from this listing — the d local, the loop
 * braces and the per-sample pointer advance. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 8; d++ )
    {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Strong vertical chroma filtering (horizontal edge). */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
/* Strong horizontal chroma filtering (vertical edge). */
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
/* Dispatch the inter (bS < 4) deblocking filter on one edge.
 * pix2 is the second chroma plane (NULL for luma).
 * NOTE(review): the "int tc[4];" declaration, the early return after the
 * alpha/beta check and the "if( pix2 )" guard appear elided from this
 * listing — reconcile with upstream before compiling. */
618 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
620 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
621 const int alpha = alpha_table(index_a);
622 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
/* alpha or beta of 0 disables filtering for this edge */
625 if( !alpha || !beta )
/* per-4-sample-group clipping limits; +1 for chroma per the spec */
628 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
629 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
630 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
631 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
633 pf_inter( pix1, i_stride, alpha, beta, tc );
635 pf_inter( pix2, i_stride, alpha, beta, tc );
/* Dispatch the strong (bS == 4) intra deblocking filter on one edge.
 * NOTE(review): the early return after the alpha/beta check and the
 * "if( pix2 )" guard appear elided from this listing. */
638 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
640 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
641 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
643 if( !alpha || !beta )
646 pf_intra( pix1, i_stride, alpha, beta );
648 pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one macroblock row (a MB pair row when MBAFF): for each MB,
 * compute boundary strengths and filter the vertical then horizontal
 * 4-pixel edges of luma and chroma.  The FILTER_DIR / DEBLOCK_STRENGTH /
 * DEBLOCK_DIR macros below are local to this function.
 * NOTE(review): this listing has many lines elided (loop braces, several
 * macro-body lines, the MBAFF special case after line 680, the
 * DEBLOCK_DIR(0)/DEBLOCK_DIR(1) invocations) — reconcile with upstream
 * before compiling. */
651 void x264_frame_deblock_row( x264_t *h, int mb_y )
653 const int s8x8 = 2 * h->mb.i_mb_stride;
654 const int s4x4 = 4 * h->mb.i_mb_stride;
655 const int b_interlaced = h->sh.b_mbaff;
656 const int mvy_limit = 4 >> b_interlaced;
657 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
658 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
660 int stridey = h->fdec->i_stride[0];
661 int stride2y = stridey << b_interlaced;
662 int strideuv = h->fdec->i_stride[1];
663 int stride2uv = strideuv << b_interlaced;
664 uint8_t (*nnz_backup)[16] = h->scratch_buffer;
666 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
667 munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
669 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
671 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
672 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
673 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
674 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
675 const int i_qp = h->mb.qp[mb_xy];
676 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
677 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
678 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
679 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
680 if( b_interlaced && (mb_y&1) )
687 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
689 if( i_qp <= qp_thresh )
/* FILTER_DIR: run the (intra or inter) edge filter on luma + both chroma
 * planes for one edge; i_dir 0 = vertical edge, 1 = horizontal edge. */
692 #define FILTER_DIR(intra, i_dir)\
695 i_qpn= h->mb.qp[mbn_xy];\
699 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
700 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
701 h->loopf.deblock_h_luma##intra );\
705 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
706 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
707 stride2uv, bS, i_qpc, 1,\
708 h->loopf.deblock_h_chroma##intra );\
713 /* horizontal edge */\
714 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
715 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
716 h->loopf.deblock_v_luma##intra );\
720 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
721 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
722 stride2uv, bS, i_qpc, 1,\
723 h->loopf.deblock_v_chroma##intra );\
728 #define DEBLOCK_STRENGTH(i_dir)\
730 /* *** Get bS for each 4px for the current edge *** */\
731 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
732 M32( bS ) = 0x03030303;\
735 M32( bS ) = 0x00000000;\
736 for( i = 0; i < 4; i++ )\
738 int x = i_dir == 0 ? i_edge : i;\
739 int y = i_dir == 0 ? i : i_edge;\
740 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
741 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
742 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
743 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
745 else if(!(i_edge&no_sub8x8))\
747 if((i&no_sub8x8) && bS[i-1] != 2)\
751 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
752 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
753 int i4p= mb_4x4+x+y*s4x4;\
754 int i4q= mbn_4x4+xn+yn*s4x4;\
756 /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
757 if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
758 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
759 else if( !h->mb.b_interlaced )\
760 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
762 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
763 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
765 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
766 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
767 (h->sh.i_type == SLICE_TYPE_B &&\
768 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
769 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
770 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
780 /* i_dir == 0 -> vertical edge
781 * i_dir == 1 -> horizontal edge */
782 #define DEBLOCK_DIR(i_dir)\
784 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
785 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
786 ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
788 i_edge+= b_8x8_transform;\
791 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
792 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
793 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
794 if( b_interlaced && i_dir == 1 )\
796 mbn_xy -= h->mb.i_mb_stride;\
797 mbn_8x8 -= 2 * s8x8;\
798 mbn_4x4 -= 4 * s4x4;\
800 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
802 FILTER_DIR( _intra, i_dir );\
805 DEBLOCK_STRENGTH(i_dir);\
807 FILTER_DIR( , i_dir);\
809 i_edge += b_8x8_transform+1;\
814 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
816 DEBLOCK_STRENGTH(i_dir);\
818 FILTER_DIR( , i_dir);\
826 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
827 munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
830 void x264_frame_deblock( x264_t *h )
833 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
834 x264_frame_deblock_row( h, mb_y );
838 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
839 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
840 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
841 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
843 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
844 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
845 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
846 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
848 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
849 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
850 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
851 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* Build a 16-wide vertical luma deblock from two 8-wide mmxext kernels;
 * the second half uses the next two tc0 entries. */
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    x264_deblock_v8_luma_mmxext( pix,     stride, alpha, beta, tc0     );
    x264_deblock_v8_luma_mmxext( pix + 8, stride, alpha, beta, tc0 + 2 );
}
/* Build a 16-wide strong vertical luma deblock from two 8-wide mmxext
 * kernels. */
static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
    x264_deblock_v8_luma_intra_mmxext( pix,     stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmxext( pix + 8, stride, alpha, beta );
}
867 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
868 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
872 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
873 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
874 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
875 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Fill the deblocking function table: C fallbacks first, then override
 * with the fastest implementation the detected CPU supports.
 * NOTE(review): the #ifdef HAVE_MMX / ARCH_PPC / HAVE_ARMV6 guards and
 * closing braces appear elided from this listing — reconcile with
 * upstream before compiling. */
878 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
880 pf->deblock_v_luma = deblock_v_luma_c;
881 pf->deblock_h_luma = deblock_h_luma_c;
882 pf->deblock_v_chroma = deblock_v_chroma_c;
883 pf->deblock_h_chroma = deblock_h_chroma_c;
884 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
885 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
886 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
887 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
890 if( cpu&X264_CPU_MMXEXT )
892 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
893 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
894 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
895 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
897 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
898 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
899 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
900 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
/* SSE2 luma kernels need a 16-aligned stack; STACK_MOD4 rules them out */
902 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
904 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
905 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
906 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
907 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
913 if( cpu&X264_CPU_ALTIVEC )
915 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
916 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
921 if( cpu&X264_CPU_NEON )
923 pf->deblock_v_luma = x264_deblock_v_luma_neon;
924 pf->deblock_h_luma = x264_deblock_h_luma_neon;
925 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
926 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
933 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
935 x264_pthread_mutex_lock( &frame->mutex );
936 frame->i_lines_completed = i_lines_completed;
937 x264_pthread_cond_broadcast( &frame->cv );
938 x264_pthread_mutex_unlock( &frame->mutex );
941 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
943 x264_pthread_mutex_lock( &frame->mutex );
944 while( frame->i_lines_completed < i_lines_completed )
945 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
946 x264_pthread_mutex_unlock( &frame->mutex );
951 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
954 while( list[i] ) i++;
958 x264_frame_t *x264_frame_pop( x264_frame_t **list )
963 while( list[i+1] ) i++;
969 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
972 while( list[i] ) i++;
978 x264_frame_t *x264_frame_shift( x264_frame_t **list )
980 x264_frame_t *frame = list[0];
982 for( i = 0; list[i]; i++ )
988 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
990 assert( frame->i_reference_count > 0 );
991 frame->i_reference_count--;
992 if( frame->i_reference_count == 0 )
993 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
/* Get a frame for use: recycle one from the unused pool if available,
 * otherwise allocate a new one; then reset its per-use state.
 * NOTE(review): the frame declaration, the else branch, the NULL check
 * after allocation and the final return appear elided from this listing. */
996 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
999 if( h->frames.unused[b_fdec][0] )
1000 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1002 frame = x264_frame_new( h, b_fdec );
/* reset per-use state; buffers themselves are reused as-is */
1005 frame->b_last_minigop_bframe = 0;
1006 frame->i_reference_count = 1;
1007 frame->b_intra_calculated = 0;
1008 frame->b_scenecut = 1;
1009 frame->b_keyframe = 0;
1011 memset( frame->weight, 0, sizeof(frame->weight) );
1012 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1017 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1019 assert( frame->i_reference_count > 0 );
1020 frame->i_reference_count--;
1021 if( frame->i_reference_count == 0 )
1022 x264_frame_push( h->frames.blank_unused, frame );
/* Get a blank frame shell (no buffers of its own — it will alias a real
 * frame's pointers, hence b_duplicate): recycle from the blank pool or
 * allocate a bare struct.
 * NOTE(review): the else branch, the allocation-failure check and the
 * final return appear elided from this listing. */
1025 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1027 x264_frame_t *frame;
1028 if( h->frames.blank_unused[0] )
1029 frame = x264_frame_pop( h->frames.blank_unused );
1031 frame = x264_malloc( sizeof(x264_frame_t) );
/* mark as duplicate so x264_frame_delete won't free aliased buffers */
1034 frame->b_duplicate = 1;
1035 frame->i_reference_count = 1;
/* Bubble-sort a NULL-terminated frame list, by (type, frame number) when
 * b_dts is set, otherwise by display order.
 * NOTE(review): the outer do/while-not-sorted loop, the ternary's else arm
 * and the "if( swap )" guard appear elided from this listing. */
1039 void x264_frame_sort( x264_frame_t **list, int b_dts )
1044 for( i = 0; list[i+1]; i++ )
1046 int dtype = list[i]->i_type - list[i+1]->i_type;
1047 int dtime = list[i]->i_frame - list[i+1]->i_frame;
1048 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1052 XCHG( x264_frame_t*, list[i], list[i+1] );
1059 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1060 int i_width, int i_height, x264_weight_t *w )
1063 /* Weight horizontal strips of height 16. This was found to be the optimal height
1064 * in terms of the cache loads. */
1065 while( i_height > 0 )
1067 for( x = 0; x < i_width; x += 16 )
1068 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1070 dst += 16 * i_dst_stride;
1071 src += 16 * i_src_stride;
1075 void x264_frame_delete_list( x264_frame_t **list )
1081 x264_frame_delete( list[i++] );
/* Initialize a synchronized (bounded, thread-safe) frame list with room
 * for max_size entries plus the NULL terminator.  Returns 0 on success.
 * NOTE(review): the negative-max_size guard, the i_size reset, the error
 * return on init failure, the fail: path and the success return appear
 * elided from this listing. */
1085 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1089 slist->i_max_size = max_size;
/* +1 slot keeps the list NULL-terminated even when full */
1091 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1092 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1093 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1094 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1101 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1103 x264_pthread_mutex_destroy( &slist->mutex );
1104 x264_pthread_cond_destroy( &slist->cv_fill );
1105 x264_pthread_cond_destroy( &slist->cv_empty );
1106 x264_frame_delete_list( slist->list );
1109 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1111 x264_pthread_mutex_lock( &slist->mutex );
1112 while( slist->i_size == slist->i_max_size )
1113 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1114 slist->list[ slist->i_size++ ] = frame;
1115 x264_pthread_mutex_unlock( &slist->mutex );
1116 x264_pthread_cond_broadcast( &slist->cv_fill );