git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #include "common.h"
  26
  27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
  28
  29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  30 {
  31     x264_frame_t *frame;
  32     int i, j;
  33
  34     int i_mb_count = h->mb.i_mb_count;
  35     int i_stride, i_width, i_lines;
  36     int i_padv = PADV << h->param.b_interlaced;
  37     int luma_plane_size;
  38     int chroma_plane_size;
  39     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
  40
  41     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
  42
  43     /* allocate frame data (+64 for extra data for me) */
  44     i_width  = ALIGN( h->param.i_width, 16 );
  45     i_stride = ALIGN( i_width + 2*PADH, align );
  46     i_lines  = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
  47
  48     frame->i_plane = 3;
  49     for( i = 0; i < 3; i++ )
  50     {
  51         frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
  52         frame->i_width[i] = i_width >> !!i;
  53         frame->i_lines[i] = i_lines >> !!i;
  54     }
  55
  56     luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
  57     chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
  58     for( i = 1; i < 3; i++ )
  59     {
  60         CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
  61         frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
  62     }
  63
  64     for( i = 0; i < h->param.i_bframe + 2; i++ )
  65         for( j = 0; j < h->param.i_bframe + 2; j++ )
  66             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
  67
  68     frame->i_poc = -1;
  69     frame->i_type = X264_TYPE_AUTO;
  70     frame->i_qpplus1 = 0;
  71     frame->i_pts = -1;
  72     frame->i_frame = -1;
  73     frame->i_frame_num = -1;
  74     frame->i_lines_completed = -1;
  75     frame->b_fdec = b_fdec;
  76     frame->i_pic_struct = PIC_STRUCT_AUTO;
  77     frame->i_field_cnt = -1;
  78     frame->i_duration =
  79     frame->i_cpb_duration =
  80     frame->i_dpb_output_delay =
  81     frame->i_cpb_delay = 0;
  82     frame->i_coded_fields_lookahead =
  83     frame->i_cpb_delay_lookahead = -1;
  84
  85     frame->orig = frame;
  86
  87     /* all 4 luma planes allocated together, since the cacheline split code
  88      * requires them to be in-phase wrt cacheline alignment. */
  89     if( h->param.analyse.i_subpel_refine && b_fdec )
  90     {
  91         CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
  92         for( i = 0; i < 4; i++ )
  93             frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
  94         frame->plane[0] = frame->filtered[0];
  95     }
  96     else
  97     {
  98         CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
  99         frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
 100     }
 101
 102     frame->b_duplicate = 0;
 103
 104     if( b_fdec ) /* fdec frame */
 105     {
 106         CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
 107         CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
 108         CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
 109         CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 110         if( h->param.i_bframe )
 111         {
 112             CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 113             CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 114         }
 115         else
 116         {
 117             frame->mv[1]  = NULL;
 118             frame->ref[1] = NULL;
 119         }
 120         CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 121         CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
 122         if( h->param.analyse.i_me_method >= X264_ME_ESA )
 123         {
 124             CHECKED_MALLOC( frame->buffer[3],
 125                             frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
 126             frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
 127         }
 128     }
 129     else /* fenc frame */
 130     {
 131         if( h->frames.b_have_lowres )
 132         {
 133             frame->i_width_lowres = frame->i_width[0]/2;
 134             frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
 135             frame->i_lines_lowres = frame->i_lines[0]/2;
 136
 137             luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
 138
 139             CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
 140             for( i = 0; i < 4; i++ )
 141                 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
 142
 143             for( j = 0; j <= !!h->param.i_bframe; j++ )
 144                 for( i = 0; i <= h->param.i_bframe; i++ )
 145                 {
 146                     CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
 147                     CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
 148                 }
 149             CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
 150             for( j = 0; j <= h->param.i_bframe+1; j++ )
 151                 for( i = 0; i <= h->param.i_bframe+1; i++ )
 152                 {
 153                     CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
 154                     CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
 155                 }
 156             frame->i_intra_cost = frame->lowres_costs[0][0];
 157             memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
 158         }
 159         if( h->param.rc.i_aq_mode )
 160         {
 161             CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
 162             CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
 163             if( h->frames.b_have_lowres )
 164                 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
 165                 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
 166         }
 167     }
 168
 169     if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
 170         goto fail;
 171     if( x264_pthread_cond_init( &frame->cv, NULL ) )
 172         goto fail;
 173
 174     return frame;
 175
 176 fail:
 177     x264_free( frame );
 178     return NULL;
 179 }
 180
 181 void x264_frame_delete( x264_frame_t *frame )
 182 {
 183     int i, j;
 184     /* Duplicate frames are blank copies of real frames (including pointers),
 185      * so freeing those pointers would cause a double free later. */
 186     if( !frame->b_duplicate )
 187     {
 188         for( i = 0; i < 4; i++ )
 189             x264_free( frame->buffer[i] );
 190         for( i = 0; i < 4; i++ )
 191             x264_free( frame->buffer_lowres[i] );
 192         for( i = 0; i < X264_BFRAME_MAX+2; i++ )
 193             for( j = 0; j < X264_BFRAME_MAX+2; j++ )
 194                 x264_free( frame->i_row_satds[i][j] );
 195         for( j = 0; j < 2; j++ )
 196             for( i = 0; i <= X264_BFRAME_MAX; i++ )
 197             {
 198                 x264_free( frame->lowres_mvs[j][i] );
 199                 x264_free( frame->lowres_mv_costs[j][i] );
 200             }
 201         x264_free( frame->i_propagate_cost );
 202         for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
 203             for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
 204             {
 205                 x264_free( frame->lowres_costs[j][i] );
 206                 x264_free( frame->lowres_inter_types[j][i] );
 207             }
 208         x264_free( frame->f_qp_offset );
 209         x264_free( frame->f_qp_offset_aq );
 210         x264_free( frame->i_inv_qscale_factor );
 211         x264_free( frame->i_row_bits );
 212         x264_free( frame->i_row_qp );
 213         x264_free( frame->mb_type );
 214         x264_free( frame->mb_partition );
 215         x264_free( frame->mv[0] );
 216         x264_free( frame->mv[1] );
 217         x264_free( frame->ref[0] );
 218         x264_free( frame->ref[1] );
 219         x264_pthread_mutex_destroy( &frame->mutex );
 220         x264_pthread_cond_destroy( &frame->cv );
 221     }
 222     x264_free( frame );
 223 }
 224
 225 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 226 {
 227     int i_csp = src->img.i_csp & X264_CSP_MASK;
 228     int i;
 229     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
 230     {
 231         x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
 232         return -1;
 233     }
 234
 235     dst->i_type     = src->i_type;
 236     dst->i_qpplus1  = src->i_qpplus1;
 237     dst->i_pts      = dst->i_reordered_pts = src->i_pts;
 238     dst->param      = src->param;
 239     dst->i_pic_struct = src->i_pic_struct;
 240
 241     for( i=0; i<3; i++ )
 242     {
 243         int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
 244         uint8_t *plane = src->img.plane[s];
 245         int stride = src->img.i_stride[s];
 246         int width = h->param.i_width >> !!i;
 247         int height = h->param.i_height >> !!i;
 248         if( src->img.i_csp & X264_CSP_VFLIP )
 249         {
 250             plane += (height-1)*stride;
 251             stride = -stride;
 252         }
 253         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
 254     }
 255     return 0;
 256 }
 257
 258
 259
 260 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
 261 {
 262 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
 263     int y;
 264     for( y = 0; y < i_height; y++ )
 265     {
 266         /* left band */
 267         memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
 268         /* right band */
 269         memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
 270     }
 271     /* upper band */
 272     if( b_pad_top )
 273     for( y = 0; y < i_padv; y++ )
 274         memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
 275     /* lower band */
 276     if( b_pad_bottom )
 277     for( y = 0; y < i_padv; y++ )
 278         memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
 279 #undef PPIXEL
 280 }
 281
 282 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 283 {
 284     int i;
 285     int b_start = !mb_y;
 286     if( mb_y & h->sh.b_mbaff )
 287         return;
 288     for( i = 0; i < frame->i_plane; i++ )
 289     {
 290         int stride = frame->i_stride[i];
 291         int width = 16*h->sps->i_mb_width >> !!i;
 292         int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
 293         int padh = PADH >> !!i;
 294         int padv = PADV >> !!i;
 295         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 296         uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
 297         if( b_end && !b_start )
 298             height += 4 >> (!!i + h->sh.b_mbaff);
 299         if( h->sh.b_mbaff )
 300         {
 301             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 302             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 303         }
 304         else
 305         {
 306             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 307         }
 308     }
 309 }
 310
 311 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 312 {
 313     /* during filtering, 8 extra pixels were filtered on each edge,
 314      * but up to 3 of the horizontal ones may be wrong.
 315        we want to expand border from the last filtered pixel */
 316     int b_start = !mb_y;
 317     int stride = frame->i_stride[0];
 318     int width = 16*h->sps->i_mb_width + 8;
 319     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
 320     int padh = PADH - 4;
 321     int padv = PADV - 8;
 322     int i;
 323     for( i = 1; i < 4; i++ )
 324     {
 325         // buffer: 8 luma, to match the hpel filter
 326         uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
 327         if( h->sh.b_mbaff )
 328         {
 329             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 330             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 331         }
 332         else
 333         {
 334             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 335         }
 336     }
 337 }
 338
 339 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 340 {
 341     int i;
 342     for( i = 0; i < 4; i++ )
 343         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 344 }
 345
 346 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 347 {
 348     int i, y;
 349     for( i = 0; i < frame->i_plane; i++ )
 350     {
 351         int i_subsample = i ? 1 : 0;
 352         int i_width = h->param.i_width >> i_subsample;
 353         int i_height = h->param.i_height >> i_subsample;
 354         int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
 355         int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
 356
 357         if( i_padx )
 358         {
 359             for( y = 0; y < i_height; y++ )
 360                 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 361                          frame->plane[i][y*frame->i_stride[i] + i_width - 1],
 362                          i_padx );
 363         }
 364         if( i_pady )
 365         {
 366             for( y = i_height; y < i_height + i_pady; y++ )
 367                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 368                         &frame->plane[i][(i_height-(~y&h->param.b_interlaced)-1)*frame->i_stride[i]],
 369                         i_width + i_padx );
 370         }
 371     }
 372 }
 373
 374
 375 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 376  * entropy coding, but per 64 coeffs for the purpose of deblocking */
 377 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 378 {
 379     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 380     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
 381     int x, nnz;
 382     for( x=0; x<h->sps->i_mb_width; x++ )
 383     {
 384         memcpy( buf+x, src+x, 16 );
 385         if( transform[x] )
 386         {
 387             nnz = src[x][0] | src[x][1];
 388             src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 389             nnz = src[x][2] | src[x][3];
 390             src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 391         }
 392     }
 393 }
 394
 395 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 396 {
 397     uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 398     int x;
 399     for( x=0; x<h->sps->i_mb_width; x++ )
 400         memcpy( dst+x, buf+x, 16 );
 401 }
 402
 403 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
 404 {
 405     func( h, mb_y, buf );
 406     if( mb_y > 0 )
 407         func( h, mb_y-1, buf + h->sps->i_mb_width );
 408     if( h->sh.b_mbaff )
 409     {
 410         func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
 411         if( mb_y > 0 )
 412             func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
 413     }
 414 }
 415
 416
 417 /* Deblocking filter */
 418 static const uint8_t i_alpha_table[52+12*2] =
 419 {
 420      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 421      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 422      0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
 423      7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
 424     25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
 425     80, 90,101,113,127,144,162,182,203,226,
 426    255,255,
 427    255,255,255,255,255,255,255,255,255,255,255,255,
 428 };
 429 static const uint8_t i_beta_table[52+12*2] =
 430 {
 431      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 432      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 433      0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
 434      3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
 435      8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
 436     13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
 437     18, 18,
 438     18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
 439 };
 440 static const int8_t i_tc0_table[52+12*2][4] =
 441 {
 442     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 443     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 444     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 445     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
 446     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
 447     {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
 448     {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
 449     {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
 450     {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
 451     {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
 452     {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
 453     {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
 454     {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
 455 };
 456 #define alpha_table(x) i_alpha_table[(x)+12]
 457 #define beta_table(x)  i_beta_table[(x)+12]
 458 #define tc0_table(x)   i_tc0_table[(x)+12]
 459
 460 /* From ffmpeg */
 461 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 462 {
 463     int i, d;
 464     for( i = 0; i < 4; i++ )
 465     {
 466         if( tc0[i] < 0 )
 467         {
 468             pix += 4*ystride;
 469             continue;
 470         }
 471         for( d = 0; d < 4; d++ )
 472         {
 473             const int p2 = pix[-3*xstride];
 474             const int p1 = pix[-2*xstride];
 475             const int p0 = pix[-1*xstride];
 476             const int q0 = pix[ 0*xstride];
 477             const int q1 = pix[ 1*xstride];
 478             const int q2 = pix[ 2*xstride];
 479
 480             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 481             {
 482                 int tc = tc0[i];
 483                 int delta;
 484                 if( abs( p2 - p0 ) < beta )
 485                 {
 486                     if( tc0[i] )
 487                         pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
 488                     tc++;
 489                 }
 490                 if( abs( q2 - q0 ) < beta )
 491                 {
 492                     if( tc0[i] )
 493                         pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
 494                     tc++;
 495                 }
 496
 497                 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 498                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 499                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 500             }
 501             pix += ystride;
 502         }
 503     }
 504 }
 505 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 506 {
 507     deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
 508 }
 509 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 510 {
 511     deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
 512 }
 513
 514 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 515 {
 516     int i, d;
 517     for( i = 0; i < 4; i++ )
 518     {
 519         const int tc = tc0[i];
 520         if( tc <= 0 )
 521         {
 522             pix += 2*ystride;
 523             continue;
 524         }
 525         for( d = 0; d < 2; d++ )
 526         {
 527             const int p1 = pix[-2*xstride];
 528             const int p0 = pix[-1*xstride];
 529             const int q0 = pix[ 0*xstride];
 530             const int q1 = pix[ 1*xstride];
 531
 532             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 533             {
 534                 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 535                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 536                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 537             }
 538             pix += ystride;
 539         }
 540     }
 541 }
 542 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 543 {
 544     deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
 545 }
 546 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 547 {
 548     deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
 549 }
 550
 551 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 552 {
 553     int d;
 554     for( d = 0; d < 16; d++ )
 555     {
 556         const int p2 = pix[-3*xstride];
 557         const int p1 = pix[-2*xstride];
 558         const int p0 = pix[-1*xstride];
 559         const int q0 = pix[ 0*xstride];
 560         const int q1 = pix[ 1*xstride];
 561         const int q2 = pix[ 2*xstride];
 562
 563         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 564         {
 565             if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
 566             {
 567                 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
 568                 {
 569                     const int p3 = pix[-4*xstride];
 570                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
 571                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
 572                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
 573                 }
 574                 else /* p0' */
 575                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 576                 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
 577                 {
 578                     const int q3 = pix[3*xstride];
 579                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
 580                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
 581                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
 582                 }
 583                 else /* q0' */
 584                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 585             }
 586             else /* p0', q0' */
 587             {
 588                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 589                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 590             }
 591         }
 592         pix += ystride;
 593     }
 594 }
 595 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 596 {
 597     deblock_luma_intra_c( pix, stride, 1, alpha, beta );
 598 }
 599 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 600 {
 601     deblock_luma_intra_c( pix, 1, stride, alpha, beta );
 602 }
 603
 604 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 605 {
 606     int d;
 607     for( d = 0; d < 8; d++ )
 608     {
 609         const int p1 = pix[-2*xstride];
 610         const int p0 = pix[-1*xstride];
 611         const int q0 = pix[ 0*xstride];
 612         const int q1 = pix[ 1*xstride];
 613
 614         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 615         {
 616             pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
 617             pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
 618         }
 619         pix += ystride;
 620     }
 621 }
 622 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 623 {
 624     deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
 625 }
 626 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 627 {
 628     deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
 629 }
 630
 631 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 632 {
 633     const int index_a = i_qp + h->sh.i_alpha_c0_offset;
 634     const int alpha = alpha_table(index_a);
 635     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 636     int8_t tc[4];
 637
 638     if( !alpha || !beta )
 639         return;
 640
 641     tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
 642     tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
 643     tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
 644     tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
 645
 646     pf_inter( pix1, i_stride, alpha, beta, tc );
 647     if( b_chroma )
 648         pf_inter( pix2, i_stride, alpha, beta, tc );
 649 }
 650
 651 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
 652 {
 653     const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
 654     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 655
 656     if( !alpha || !beta )
 657         return;
 658
 659     pf_intra( pix1, i_stride, alpha, beta );
 660     if( b_chroma )
 661         pf_intra( pix2, i_stride, alpha, beta );
 662 }
 663
 664 void x264_frame_deblock_row( x264_t *h, int mb_y )
 665 {
 666     const int s8x8 = 2 * h->mb.i_mb_stride;
 667     const int s4x4 = 4 * h->mb.i_mb_stride;
 668     const int b_interlaced = h->sh.b_mbaff;
 669     const int mvy_limit = 4 >> b_interlaced;
 670     const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
 671     const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 672     int mb_x;
 673     int stridey   = h->fdec->i_stride[0];
 674     int stride2y  = stridey << b_interlaced;
 675     int strideuv  = h->fdec->i_stride[1];
 676     int stride2uv = strideuv << b_interlaced;
 677     uint8_t (*nnz_backup)[16] = h->scratch_buffer;
 678
 679     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 680         munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
 681
 682     for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
 683     {
 684         const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
 685         const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
 686         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
 687         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
 688         const int i_qp = h->mb.qp[mb_xy];
 689         int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
 690         uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
 691         uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
 692         uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x;
 693         if( b_interlaced && (mb_y&1) )
 694         {
 695             pixy -= 15*stridey;
 696             pixu -=  7*strideuv;
 697             pixv -=  7*strideuv;
 698         }
 699
 700         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
 701
 702         if( i_qp <= qp_thresh )
 703             i_edge_end = 1;
 704
 705         #define FILTER_DIR(intra, i_dir)\
 706         {\
 707             /* Y plane */\
 708             i_qpn= h->mb.qp[mbn_xy];\
 709             if( i_dir == 0 )\
 710             {\
 711                 /* vertical edge */\
 712                 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
 713                               stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
 714                               h->loopf.deblock_h_luma##intra );\
 715                 if( !(i_edge & 1) )\
 716                 {\
 717                     /* U/V planes */\
 718                     int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
 719                     deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
 720                                   stride2uv, bS, i_qpc, 1,\
 721                                   h->loopf.deblock_h_chroma##intra );\
 722                 }\
 723             }\
 724             else\
 725             {\
 726                 /* horizontal edge */\
 727                 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
 728                               stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
 729                               h->loopf.deblock_v_luma##intra );\
 730                 /* U/V planes */\
 731                 if( !(i_edge & 1) )\
 732                 {\
 733                     int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
 734                     deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
 735                                   stride2uv, bS, i_qpc, 1,\
 736                                   h->loopf.deblock_v_chroma##intra );\
 737                 }\
 738             }\
 739         }
 740
 741         #define DEBLOCK_STRENGTH(i_dir)\
 742         {\
 743             /* *** Get bS for each 4px for the current edge *** */\
 744             if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
 745                 M32( bS ) = 0x03030303;\
 746             else\
 747             {\
 748                 M32( bS ) = 0x00000000;\
 749                 for( i = 0; i < 4; i++ )\
 750                 {\
 751                     int x  = i_dir == 0 ? i_edge : i;\
 752                     int y  = i_dir == 0 ? i      : i_edge;\
 753                     int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
 754                     int yn = i_dir == 0 ? y : (y - 1)&0x03;\
 755                     if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
 756                         h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
 757                         bS[i] = 2;\
 758                     else if(!(i_edge&no_sub8x8))\
 759                     {\
 760                         if((i&no_sub8x8) && bS[i-1] != 2)\
 761                             bS[i] = bS[i-1];\
 762                         else\
 763                         {\
 764                             int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
 765                             int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
 766                             int i4p= mb_4x4+x+y*s4x4;\
 767                             int i4q= mbn_4x4+xn+yn*s4x4;\
 768                             int refs_equal;\
 769                             /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
 770                             if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
 771                                 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
 772                             else if( !h->mb.b_interlaced )\
 773                                 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
 774                             else\
 775                                 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
 776                                            && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
 777                             if((!refs_equal ||\
 778                                 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
 779                                 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
 780                                (h->sh.i_type == SLICE_TYPE_B &&\
 781                                (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
 782                                 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
 783                                 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
 784                             {\
 785                                 bS[i] = 1;\
 786                             }\
 787                         }\
 788                     }\
 789                 }\
 790             }\
 791         }
 792
 793         /* i_dir == 0 -> vertical edge
 794          * i_dir == 1 -> horizontal edge */
 795         #define DEBLOCK_DIR(i_dir)\
 796         {\
 797             int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
 798             int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
 799             ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
 800             if( i_edge )\
 801                 i_edge+= b_8x8_transform;\
 802             else\
 803             {\
 804                 mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
 805                 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
 806                 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
 807                 if( b_interlaced && i_dir == 1 )\
 808                 {\
 809                     mbn_xy -= h->mb.i_mb_stride;\
 810                     mbn_8x8 -= 2 * s8x8;\
 811                     mbn_4x4 -= 4 * s4x4;\
 812                 }\
 813                 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
 814                 {\
 815                     FILTER_DIR( _intra, i_dir );\
 816                     goto end##i_dir;\
 817                 }\
 818                 DEBLOCK_STRENGTH(i_dir);\
 819                 if( M32( bS ) )\
 820                     FILTER_DIR( , i_dir);\
 821                 end##i_dir:\
 822                 i_edge += b_8x8_transform+1;\
 823             }\
 824             mbn_xy  = mb_xy;\
 825             mbn_8x8 = mb_8x8;\
 826             mbn_4x4 = mb_4x4;\
 827             for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
 828             {\
 829                 DEBLOCK_STRENGTH(i_dir);\
 830                 if( M32( bS ) )\
 831                     FILTER_DIR( , i_dir);\
 832             }\
 833         }
 834
 835         DEBLOCK_DIR(0);
 836         DEBLOCK_DIR(1);
 837     }
 838
 839     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 840         munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
 841 }
 842
 843 void x264_frame_deblock( x264_t *h )
 844 {
 845     int mb_y;
 846     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
 847         x264_frame_deblock_row( h, mb_y );
 848 }
 849
 850 #ifdef HAVE_MMX
 851 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 852 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 853 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 854 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 855
 856 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 857 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 858 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 859 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 860 #ifdef ARCH_X86
 861 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 862 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 863 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 864 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 865
 866 static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 867 {
 868     x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
 869     x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
 870 }
 871 static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 872 {
 873     x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
 874     x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
 875 }
 876 #endif
 877 #endif
 878
 879 #ifdef ARCH_PPC
 880 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 881 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 882 #endif // ARCH_PPC
 883
 884 #ifdef HAVE_ARMV6
 885 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
 886 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
 887 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
 888 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
 889 #endif
 890
 891 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 892 {
 893     pf->deblock_v_luma = deblock_v_luma_c;
 894     pf->deblock_h_luma = deblock_h_luma_c;
 895     pf->deblock_v_chroma = deblock_v_chroma_c;
 896     pf->deblock_h_chroma = deblock_h_chroma_c;
 897     pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
 898     pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
 899     pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
 900     pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
 901
 902 #ifdef HAVE_MMX
 903     if( cpu&X264_CPU_MMXEXT )
 904     {
 905         pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
 906         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
 907         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
 908         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
 909 #ifdef ARCH_X86
 910         pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
 911         pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
 912         pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
 913         pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
 914 #endif
 915         if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
 916         {
 917             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
 918             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
 919             pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
 920             pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
 921         }
 922     }
 923 #endif
 924
 925 #ifdef HAVE_ALTIVEC
 926     if( cpu&X264_CPU_ALTIVEC )
 927     {
 928         pf->deblock_v_luma = x264_deblock_v_luma_altivec;
 929         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
 930    }
 931 #endif // HAVE_ALTIVEC
 932
 933 #ifdef HAVE_ARMV6
 934    if( cpu&X264_CPU_NEON )
 935    {
 936         pf->deblock_v_luma   = x264_deblock_v_luma_neon;
 937         pf->deblock_h_luma   = x264_deblock_h_luma_neon;
 938         pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
 939         pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
 940    }
 941 #endif
 942 }
 943
 944
 945 /* threading */
 946 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 947 {
 948     x264_pthread_mutex_lock( &frame->mutex );
 949     frame->i_lines_completed = i_lines_completed;
 950     x264_pthread_cond_broadcast( &frame->cv );
 951     x264_pthread_mutex_unlock( &frame->mutex );
 952 }
 953
 954 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 955 {
 956     x264_pthread_mutex_lock( &frame->mutex );
 957     while( frame->i_lines_completed < i_lines_completed )
 958         x264_pthread_cond_wait( &frame->cv, &frame->mutex );
 959     x264_pthread_mutex_unlock( &frame->mutex );
 960 }
 961
 962 /* list operators */
 963
 964 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 965 {
 966     int i = 0;
 967     while( list[i] ) i++;
 968     list[i] = frame;
 969 }
 970
 971 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 972 {
 973     x264_frame_t *frame;
 974     int i = 0;
 975     assert( list[0] );
 976     while( list[i+1] ) i++;
 977     frame = list[i];
 978     list[i] = NULL;
 979     return frame;
 980 }
 981
 982 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 983 {
 984     int i = 0;
 985     while( list[i] ) i++;
 986     while( i-- )
 987         list[i+1] = list[i];
 988     list[0] = frame;
 989 }
 990
 991 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 992 {
 993     x264_frame_t *frame = list[0];
 994     int i;
 995     for( i = 0; list[i]; i++ )
 996         list[i] = list[i+1];
 997     assert(frame);
 998     return frame;
 999 }
1000
1001 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
1002 {
1003     assert( frame->i_reference_count > 0 );
1004     frame->i_reference_count--;
1005     if( frame->i_reference_count == 0 )
1006         x264_frame_push( h->frames.unused[frame->b_fdec], frame );
1007 }
1008
1009 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
1010 {
1011     x264_frame_t *frame;
1012     if( h->frames.unused[b_fdec][0] )
1013         frame = x264_frame_pop( h->frames.unused[b_fdec] );
1014     else
1015         frame = x264_frame_new( h, b_fdec );
1016     if( !frame )
1017         return NULL;
1018     frame->b_last_minigop_bframe = 0;
1019     frame->i_reference_count = 1;
1020     frame->b_intra_calculated = 0;
1021     frame->b_scenecut = 1;
1022     frame->b_keyframe = 0;
1023
1024     memset( frame->weight, 0, sizeof(frame->weight) );
1025     memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1026
1027     return frame;
1028 }
1029
1030 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1031 {
1032     assert( frame->i_reference_count > 0 );
1033     frame->i_reference_count--;
1034     if( frame->i_reference_count == 0 )
1035         x264_frame_push( h->frames.blank_unused, frame );
1036 }
1037
1038 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1039 {
1040     x264_frame_t *frame;
1041     if( h->frames.blank_unused[0] )
1042         frame = x264_frame_pop( h->frames.blank_unused );
1043     else
1044         frame = x264_malloc( sizeof(x264_frame_t) );
1045     if( !frame )
1046         return NULL;
1047     frame->b_duplicate = 1;
1048     frame->i_reference_count = 1;
1049     return frame;
1050 }
1051
1052 void x264_frame_sort( x264_frame_t **list, int b_dts )
1053 {
1054     int i, b_ok;
1055     do {
1056         b_ok = 1;
1057         for( i = 0; list[i+1]; i++ )
1058         {
1059             int dtype = list[i]->i_type - list[i+1]->i_type;
1060             int dtime = list[i]->i_frame - list[i+1]->i_frame;
1061             int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1062                              : dtime > 0;
1063             if( swap )
1064             {
1065                 XCHG( x264_frame_t*, list[i], list[i+1] );
1066                 b_ok = 0;
1067             }
1068         }
1069     } while( !b_ok );
1070 }
1071
1072 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1073                          int i_width, int i_height, x264_weight_t *w )
1074 {
1075     int x;
1076     /* Weight horizontal strips of height 16. This was found to be the optimal height
1077      * in terms of the cache loads. */
1078     while( i_height > 0 )
1079     {
1080         for( x = 0; x < i_width; x += 16 )
1081             w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1082         i_height -= 16;
1083         dst += 16 * i_dst_stride;
1084         src += 16 * i_src_stride;
1085     }
1086 }
1087
1088 void x264_frame_delete_list( x264_frame_t **list )
1089 {
1090     int i = 0;
1091     if( !list )
1092         return;
1093     while( list[i] )
1094         x264_frame_delete( list[i++] );
1095     x264_free( list );
1096 }
1097
1098 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1099 {
1100     if( max_size < 0 )
1101         return -1;
1102     slist->i_max_size = max_size;
1103     slist->i_size = 0;
1104     CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1105     if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1106         x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1107         x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1108         return -1;
1109     return 0;
1110 fail:
1111     return -1;
1112 }
1113
1114 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1115 {
1116     x264_pthread_mutex_destroy( &slist->mutex );
1117     x264_pthread_cond_destroy( &slist->cv_fill );
1118     x264_pthread_cond_destroy( &slist->cv_empty );
1119     x264_frame_delete_list( slist->list );
1120 }
1121
1122 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1123 {
1124     x264_pthread_mutex_lock( &slist->mutex );
1125     while( slist->i_size == slist->i_max_size )
1126         x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1127     slist->list[ slist->i_size++ ] = frame;
1128     x264_pthread_mutex_unlock( &slist->mutex );
1129     x264_pthread_cond_broadcast( &slist->cv_fill );
1130 }