1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate and initialize a new encoder frame (fdec = reconstructed frame,
 * fenc = source frame).  Planes are padded by PADH/PADV (doubled vertically
 * when interlaced) and strides are rounded up to the CPU cacheline size.
 * Returns NULL on allocation failure (CHECKED_MALLOC* jumps to an elided
 * "fail:" cleanup path).
 * NOTE(review): this listing is extraction-mangled — stray line numbers,
 * elided braces, elided declarations (frame, i, j, luma_plane_size) and the
 * fail/cleanup path; verify against upstream x264 frame.c before building. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* Per-plane geometry: plane 0 is luma, planes 1/2 are half-size chroma. */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
/* Chroma planes: offset skips the top/left padding (halved horizontally). */
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* Per-row SATD tables for every (list,ref) pair used by ratecontrol. */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
78 /* all 4 luma planes allocated together, since the cacheline split code
79 * requires them to be in-phase wrt cacheline alignment. */
80 if( h->param.analyse.i_subpel_refine && b_fdec )
82 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
83 for( i = 0; i < 4; i++ )
84 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
85 frame->plane[0] = frame->filtered[0];
/* No subpel refine: a single luma plane serves as both plane and filtered[0]. */
89 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
90 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
93 frame->b_duplicate = 0;
95 if( b_fdec ) /* fdec frame */
97 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
98 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
99 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
100 if( h->param.i_bframe )
102 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
103 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
108 frame->ref[1] = NULL;
110 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
111 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* ESA/TESA motion estimation needs an integral-sum plane (doubled for sub8x8). */
112 if( h->param.analyse.i_me_method >= X264_ME_ESA )
114 CHECKED_MALLOC( frame->buffer[3],
115 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
116 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
119 else /* fenc frame */
121 if( h->frames.b_have_lowres )
123 frame->i_width_lowres = frame->i_width[0]/2;
124 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
125 frame->i_lines_lowres = frame->i_lines[0]/2;
127 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*i_padv);
/* 4 half-pel interpolated lowres planes packed into one buffer. */
129 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
130 for( i = 0; i < 4; i++ )
131 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
/* Lookahead motion vectors / costs per list and per B-frame distance. */
133 for( j = 0; j <= !!h->param.i_bframe; j++ )
134 for( i = 0; i <= h->param.i_bframe; i++ )
136 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
137 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
139 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
140 for( j = 0; j <= h->param.i_bframe+1; j++ )
141 for( i = 0; i <= h->param.i_bframe+1; i++ )
143 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
144 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
146 frame->i_intra_cost = frame->lowres_costs[0][0];
147 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
149 if( h->param.rc.i_aq_mode )
151 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
152 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
153 if( h->frames.b_have_lowres )
154 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
155 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* Per-frame synchronization for sliced/threaded encoding. */
159 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
161 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and everything it owns.  Mirrors every allocation made in
 * x264_frame_new; x264_free(NULL) is assumed safe for never-allocated members.
 * NOTE(review): loop-variable declarations, braces and the final
 * x264_free(frame) appear elided from this listing — verify upstream. */
171 void x264_frame_delete( x264_frame_t *frame )
174 /* Duplicate frames are blank copies of real frames (including pointers),
175 * so freeing those pointers would cause a double free later. */
176 if( !frame->b_duplicate )
178 for( i = 0; i < 4; i++ )
179 x264_free( frame->buffer[i] );
180 for( i = 0; i < 4; i++ )
181 x264_free( frame->buffer_lowres[i] );
182 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
183 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
184 x264_free( frame->i_row_satds[i][j] );
185 for( j = 0; j < 2; j++ )
186 for( i = 0; i <= X264_BFRAME_MAX; i++ )
188 x264_free( frame->lowres_mvs[j][i] );
189 x264_free( frame->lowres_mv_costs[j][i] );
191 x264_free( frame->i_propagate_cost );
192 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
193 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
195 x264_free( frame->lowres_costs[j][i] );
196 x264_free( frame->lowres_inter_types[j][i] );
198 x264_free( frame->f_qp_offset );
199 x264_free( frame->f_qp_offset_aq );
200 x264_free( frame->i_inv_qscale_factor );
201 x264_free( frame->i_row_bits );
202 x264_free( frame->i_row_qp );
203 x264_free( frame->mb_type );
204 x264_free( frame->mv[0] );
205 x264_free( frame->mv[1] );
206 x264_free( frame->ref[0] );
207 x264_free( frame->ref[1] );
208 x264_pthread_mutex_destroy( &frame->mutex );
209 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied x264_picture_t into an internal frame, converting
 * YV12 plane order to I420 (U/V swap via i^3) and handling vertical flip.
 * Returns 0 on success, nonzero on unsupported colorspace (the return
 * statements are elided from this listing).
 * NOTE(review): the per-plane `for( i = 0; i < 3; i++ )` header, braces and
 * the `stride = -stride;` for VFLIP appear elided — verify upstream. */
214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
216 int i_csp = src->img.i_csp & X264_CSP_MASK;
218 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
220 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
224 dst->i_type = src->i_type;
225 dst->i_qpplus1 = src->i_qpplus1;
226 dst->i_pts = src->i_pts;
227 dst->param = src->param;
231 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
232 uint8_t *plane = src->img.plane[s];
233 int stride = src->img.i_stride[s];
234 int width = h->param.i_width >> !!i;
235 int height = h->param.i_height >> !!i;
236 if( src->img.i_csp & X264_CSP_VFLIP )
238 plane += (height-1)*stride;
241 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate the edge pixels of a plane into its padding bands: i_padh pixels
 * left/right of every row, and (optionally) i_padv full padded rows above and
 * below.  pix points at the first real pixel; the border surrounds it.
 * b_pad_top/b_pad_bottom gate the vertical bands so a partially-decoded frame
 * can be padded incrementally. */
static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
    int y;
    for( y = 0; y < i_height; y++ )
    {
        /* left band: replicate the first pixel of the row */
        memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
        /* right band: replicate the last pixel of the row */
        memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
    }
    /* upper band: duplicate the first (already horizontally padded) row */
    if( b_pad_top )
        for( y = 0; y < i_padv; y++ )
            memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
    /* lower band: duplicate the last (already horizontally padded) row */
    if( b_pad_bottom )
        for( y = 0; y < i_padv; y++ )
            memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
#undef PPIXEL
}
/* Pad the reconstructed planes around the rows belonging to macroblock row
 * mb_y (b_end = last row of the frame).  For MBAFF, strides are doubled and
 * each field is padded separately.
 * NOTE(review): elided from this listing: `int i; int b_start = !mb_y;`, the
 * early return for odd MBAFF rows, braces, and the `if( b_interlaced ) ...
 * else` around the two padding variants — verify upstream. */
270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
274 if( mb_y & h->sh.b_mbaff )
276 for( i = 0; i < frame->i_plane; i++ )
278 int stride = frame->i_stride[i];
279 int width = 16*h->sps->i_mb_width >> !!i;
280 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
281 int padh = PADH >> !!i;
282 int padv = PADV >> !!i;
283 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
284 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
285 if( b_end && !b_start )
286 height += 4 >> (!!i + h->sh.b_mbaff);
/* interlaced: pad the two fields independently with doubled stride */
289 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
290 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
/* progressive: single call per plane */
294 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Pad the three half-pel filtered luma planes (filtered[1..3]) for the rows
 * around macroblock row mb_y, starting from the last correctly-filtered
 * pixel (4 px inside the 8-px filter overrun).
 * NOTE(review): elided: `int i; int b_start = !mb_y;` and the padh/padv
 * locals, braces, and the interlaced-vs-progressive `if/else` — verify
 * upstream. */
299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
301 /* during filtering, 8 extra pixels were filtered on each edge,
302 * but up to 3 of the horizontal ones may be wrong.
303 we want to expand border from the last filtered pixel */
305 int stride = frame->i_stride[0];
306 int width = 16*h->sps->i_mb_width + 8;
307 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
311 for( i = 1; i < 4; i++ )
313 // buffer: 8 luma, to match the hpel filter
314 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* interlaced: each field padded separately */
317 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
318 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
/* progressive */
322 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
330 for( i = 0; i < 4; i++ )
331 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the input picture out to mod-16 dimensions by replicating the right
 * column and bottom row, so the encoder can treat it as whole macroblocks.
 * NOTE(review): elided from this listing: `int i, y;`, braces, the
 * `if( i_padx )` / `if( i_pady )` guards, and the trailing size arguments of
 * the memset/memcpy calls (i_padx / i_width+i_padx) — verify upstream. */
334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
337 for( i = 0; i < frame->i_plane; i++ )
339 int i_subsample = i ? 1 : 0;
340 int i_width = h->param.i_width >> i_subsample;
341 int i_height = h->param.i_height >> i_subsample;
342 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
343 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
/* horizontal mod16 padding: replicate last real pixel of each row */
347 for( y = 0; y < i_height; y++ )
348 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
349 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
/* vertical mod16 padding: replicate last real row */
354 //FIXME interlace? or just let it pad using the wrong field
355 for( y = i_height; y < i_height + i_pady; y++ )
356 memcpy( &frame->plane[i][y*frame->i_stride[i]],
357 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
365 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save the per-16 nnz values of one MB row into buf, then collapse them to
 * per-64 granularity in place (each 8x8 block becomes all-0 or all-1) for
 * 8x8-transform macroblocks, as the deblocker expects.
 * NOTE(review): elided from this listing: `int x; uint32_t nnz;`, braces, and
 * the `if( transform[x] )` guard that restricts the rewrite to 8x8-transform
 * MBs (without it `transform` is unused) — verify upstream. */
366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
368 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
369 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
371 for( x=0; x<h->sps->i_mb_width; x++ )
373 memcpy( buf+x, src+x, 16 );
/* low halfword = top 8x8 pair, high halfword = bottom pair of each dword */
376 nnz = src[x][0] | src[x][1];
377 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
378 nnz = src[x][2] | src[x][3];
379 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
388 for( x=0; x<h->sps->i_mb_width; x++ )
389 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current MB row and to its deblocking
 * neighbours, each using its own slice of the backup buffer.
 * NOTE(review): the guards around the neighbour calls (`if( mb_y > 0 )`, and
 * the MBAFF block enclosing the mb_y+1 / mb_y-2 calls) appear elided from
 * this listing — as written the calls would run unconditionally; verify
 * upstream. */
392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
394 func( h, mb_y, buf );
396 func( h, mb_y-1, buf + h->sps->i_mb_width );
399 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
401 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
/* Deblocking filter */

/* H.264 deblocking thresholds (Table 8-16 of the spec), extended by 12
 * entries on each side so that qp + alpha/beta offsets in [-12,+12] can be
 * looked up without clamping; the *_table(x) macros below add the +12 bias.
 * Note: the two entries for qp 50-51 (255,255 / 18,18) were missing from the
 * mangled listing and are restored here — without them every lookup above
 * qp 49 reads the wrong row. */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping values indexed by [biased qp][bS]; bS==0 column is -1 (unused). */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
/* Normal (bS < 4) luma deblocking of one 16-sample edge.  xstride steps
 * across the edge, ystride along it.  tc0 holds one clipping value per group
 * of 4 samples; a negative tc0[i] disables filtering for that group.
 * Note: the elided control flow (skip-group `continue`, and the tc++
 * extension when p1/q1 are also filtered) is restored per H.264 8.7.2.3. */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        if( tc0[i] < 0 )
        {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ )
        {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int tc = tc0[i];
                int delta;
                if( abs( p2 - p0 ) < beta )
                {
                    /* p1' : and extend the clipping range for the p0/q0 delta */
                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( abs( q2 - q0 ) < beta )
                {
                    /* q1' */
                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical filtering of a horizontal edge: samples cross the edge stride-wise. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal filtering of a vertical edge: samples cross the edge pixel-wise. */
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Normal (bS < 4) chroma deblocking of one 8-sample edge; only p0/q0 are
 * modified.  tc0 already includes the +1 chroma bias, so tc <= 0 (bS == 0)
 * skips the group of 2 samples. */
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        const int tc = tc0[i];
        if( tc <= 0 )
        {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ )
        {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical chroma filtering of a horizontal edge. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal chroma filtering of a vertical edge. */
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Strong (bS == 4, intra) luma deblocking of one 16-sample edge.  Uses the
 * stronger 3-tap/5-tap filters when abs(p0-q0) is well below alpha, else the
 * weak 3-tap p0/q0-only filter.  Elided else-branches and braces restored per
 * H.264 8.7.2.4. */
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 16; d++ )
    {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
            {
                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
                {
                    const int p3 = pix[-4*xstride];
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                }
                else /* p0' only */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
                {
                    const int q3 = pix[3*xstride];
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                }
                else /* q0' only */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
            else /* weak filter: p0', q0' */
            {
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Vertical strong luma filtering of a horizontal edge. */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
/* Horizontal strong luma filtering of a vertical edge. */
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
/* Strong (bS == 4, intra) chroma deblocking of one 8-sample edge; only
 * p0/q0 are replaced, by the weak 3-tap filter. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 8; d++ )
    {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Vertical strong chroma filtering of a horizontal edge. */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
/* Horizontal strong chroma filtering of a vertical edge. */
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
618 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
620 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
621 const int alpha = alpha_table(index_a);
622 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
625 if( !alpha || !beta )
628 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
629 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
630 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
631 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
633 pf_inter( pix1, i_stride, alpha, beta, tc );
635 pf_inter( pix2, i_stride, alpha, beta, tc );
638 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
640 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
641 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
643 if( !alpha || !beta )
646 pf_intra( pix1, i_stride, alpha, beta );
648 pf_intra( pix2, i_stride, alpha, beta );
651 void x264_frame_deblock_row( x264_t *h, int mb_y )
/* Deblock one macroblock row (one MB pair row when MBAFF): compute boundary
 * strength bS per edge, then filter vertical and horizontal edges of luma
 * and chroma.  NOTE(review): this listing is extraction-mangled — braces,
 * several macro-internal lines (closing braces, `bS[i] = 2/1/0` assignments,
 * the refs_equal declaration and the `if( refs_equal ...` line of
 * DEBLOCK_STRENGTH, the MBAFF field-pair adjustments after line 679, and the
 * DEBLOCK_DIR(0)/DEBLOCK_DIR(1) invocations) are elided; verify against
 * upstream x264 before building. */
653 const int s8x8 = 2 * h->mb.i_mb_stride;
654 const int s4x4 = 4 * h->mb.i_mb_stride;
655 const int b_interlaced = h->sh.b_mbaff;
656 const int mvy_limit = 4 >> b_interlaced;
657 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
658 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
660 int stridey = h->fdec->i_stride[0];
661 int stride2y = stridey << b_interlaced;
662 int strideuv = h->fdec->i_stride[1];
663 int stride2uv = strideuv << b_interlaced;
665 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
666 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
/* Iterate MBs of the row; under MBAFF this walks both fields of each pair. */
668 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
670 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
671 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
672 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
673 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
674 const int i_qp = h->mb.qp[mb_xy];
675 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
676 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
677 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
678 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
679 if( b_interlaced && (mb_y&1) )
686 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
688 if( i_qp <= qp_thresh )
/* FILTER_DIR: filter one edge (luma + chroma when aligned) in direction
 * i_dir, using intra or inter variants per the `intra` token. */
691 #define FILTER_DIR(intra, i_dir)\
694 i_qpn= h->mb.qp[mbn_xy];\
698 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
699 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
700 h->loopf.deblock_h_luma##intra );\
704 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
705 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
706 stride2uv, bS, i_qpc, 1,\
707 h->loopf.deblock_h_chroma##intra );\
712 /* horizontal edge */\
713 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
714 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
715 h->loopf.deblock_v_luma##intra );\
719 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
720 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
721 stride2uv, bS, i_qpc, 1,\
722 h->loopf.deblock_v_chroma##intra );\
727 #define DEBLOCK_STRENGTH(i_dir)\
729 /* *** Get bS for each 4px for the current edge *** */\
730 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
731 M32( bS ) = 0x03030303;\
734 M32( bS ) = 0x00000000;\
735 for( i = 0; i < 4; i++ )\
737 int x = i_dir == 0 ? i_edge : i;\
738 int y = i_dir == 0 ? i : i_edge;\
739 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
740 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
741 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
742 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
744 else if(!(i_edge&no_sub8x8))\
746 if((i&no_sub8x8) && bS[i-1] != 2)\
750 /* FIXME: A given frame may occupy more than one position in\
751 * the reference list. So we should compare the frame numbers,\
752 * not the indices in the ref list.\
753 * No harm yet, as we don't generate that case.*/\
754 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
755 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
756 int i4p= mb_4x4+x+y*s4x4;\
757 int i4q= mbn_4x4+xn+yn*s4x4;\
759 if( h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
760 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
761 else if( !h->mb.b_interlaced )\
762 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
764 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
765 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
767 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
768 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
769 (h->sh.i_type == SLICE_TYPE_B &&\
770 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
771 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
772 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
782 /* i_dir == 0 -> vertical edge
783 * i_dir == 1 -> horizontal edge */
/* DEBLOCK_DIR: walk all edges of the MB in one direction, using the intra
 * path for the MB-boundary edge when either side is intra. */
784 #define DEBLOCK_DIR(i_dir)\
786 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
787 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
788 ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
790 i_edge+= b_8x8_transform;\
793 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
794 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
795 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
796 if( b_interlaced && i_dir == 1 )\
798 mbn_xy -= h->mb.i_mb_stride;\
799 mbn_8x8 -= 2 * s8x8;\
800 mbn_4x4 -= 4 * s4x4;\
802 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
804 FILTER_DIR( _intra, i_dir );\
807 DEBLOCK_STRENGTH(i_dir);\
809 FILTER_DIR( , i_dir);\
811 i_edge += b_8x8_transform+1;\
816 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
818 DEBLOCK_STRENGTH(i_dir);\
820 FILTER_DIR( , i_dir);\
828 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
829 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
832 void x264_frame_deblock( x264_t *h )
835 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
836 x264_frame_deblock_row( h, mb_y );
/* Prototypes for x86 assembly deblocking implementations.
 * NOTE(review): the #ifdef HAVE_MMX (and ARCH_X86 around the v8 variants)
 * guards appear to have been elided from this listing — verify upstream. */
840 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
841 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
842 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
843 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
845 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
846 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
847 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
848 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
850 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
851 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
852 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
853 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* MMX only handles 8 pixels at a time, so a 16-pixel vertical luma edge is
 * filtered as two 8-pixel halves (tc0 advances by 2 for the second half). */
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0 );
    x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
}
/* Same 8+8 split for the intra (strong) vertical luma filter. */
static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
    x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
}
/* Prototypes for AltiVec (PPC) and NEON (ARM) deblocking implementations.
 * NOTE(review): the ARCH_PPC / HAVE_ARMV6 #ifdef guards appear elided from
 * this listing — verify upstream. */
869 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
870 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
874 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
875 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
876 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
877 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Fill the deblocking function table: C fallbacks first, then progressively
 * override with the best SIMD implementations the CPU supports.
 * NOTE(review): the #ifdef HAVE_MMX / ARCH_PPC / HAVE_ARMV6 guards and
 * braces appear elided from this listing — as written all blocks would
 * compile unconditionally; verify upstream. */
880 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
882 pf->deblock_v_luma = deblock_v_luma_c;
883 pf->deblock_h_luma = deblock_h_luma_c;
884 pf->deblock_v_chroma = deblock_v_chroma_c;
885 pf->deblock_h_chroma = deblock_h_chroma_c;
886 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
887 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
888 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
889 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
/* x86: MMXEXT, then SSE2 for luma (skipped when stack may be misaligned) */
892 if( cpu&X264_CPU_MMXEXT )
894 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
895 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
896 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
897 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
899 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
900 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
901 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
902 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
904 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
906 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
907 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
908 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
909 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
/* PPC AltiVec: luma only */
915 if( cpu&X264_CPU_ALTIVEC )
917 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
918 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
/* ARM NEON */
923 if( cpu&X264_CPU_NEON )
925 pf->deblock_v_luma = x264_deblock_v_luma_neon;
926 pf->deblock_h_luma = x264_deblock_h_luma_neon;
927 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
928 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
935 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
937 x264_pthread_mutex_lock( &frame->mutex );
938 frame->i_lines_completed = i_lines_completed;
939 x264_pthread_cond_broadcast( &frame->cv );
940 x264_pthread_mutex_unlock( &frame->mutex );
943 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
945 x264_pthread_mutex_lock( &frame->mutex );
946 while( frame->i_lines_completed < i_lines_completed )
947 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
948 x264_pthread_mutex_unlock( &frame->mutex );
953 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
956 while( list[i] ) i++;
960 x264_frame_t *x264_frame_pop( x264_frame_t **list )
965 while( list[i+1] ) i++;
971 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
974 while( list[i] ) i++;
980 x264_frame_t *x264_frame_shift( x264_frame_t **list )
982 x264_frame_t *frame = list[0];
984 for( i = 0; list[i]; i++ )
990 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
992 assert( frame->i_reference_count > 0 );
993 frame->i_reference_count--;
994 if( frame->i_reference_count == 0 )
995 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
998 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
1000 x264_frame_t *frame;
1001 if( h->frames.unused[b_fdec][0] )
1002 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1004 frame = x264_frame_new( h, b_fdec );
1007 frame->b_last_minigop_bframe = 0;
1008 frame->i_reference_count = 1;
1009 frame->b_intra_calculated = 0;
1010 frame->b_scenecut = 1;
1012 memset( frame->weight, 0, sizeof(frame->weight) );
1013 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1018 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1020 assert( frame->i_reference_count > 0 );
1021 frame->i_reference_count--;
1022 if( frame->i_reference_count == 0 )
1023 x264_frame_push( h->frames.blank_unused, frame );
1026 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1028 x264_frame_t *frame;
1029 if( h->frames.blank_unused[0] )
1030 frame = x264_frame_pop( h->frames.blank_unused );
1032 frame = x264_malloc( sizeof(x264_frame_t) );
1035 frame->b_duplicate = 1;
1036 frame->i_reference_count = 1;
1040 void x264_frame_sort( x264_frame_t **list, int b_dts )
1045 for( i = 0; list[i+1]; i++ )
1047 int dtype = list[i]->i_type - list[i+1]->i_type;
1048 int dtime = list[i]->i_frame - list[i+1]->i_frame;
1049 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1053 XCHG( x264_frame_t*, list[i], list[i+1] );
1060 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1061 int i_width, int i_height, x264_weight_t *w )
1064 /* Weight horizontal strips of height 16. This was found to be the optimal height
1065 * in terms of the cache loads. */
1066 while( i_height > 0 )
1068 for( x = 0; x < i_width; x += 16 )
1069 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1071 dst += 16 * i_dst_stride;
1072 src += 16 * i_src_stride;
1076 void x264_frame_delete_list( x264_frame_t **list )
1082 x264_frame_delete( list[i++] );
1086 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1090 slist->i_max_size = max_size;
1092 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1093 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1094 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1095 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1102 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1104 x264_pthread_mutex_destroy( &slist->mutex );
1105 x264_pthread_cond_destroy( &slist->cv_fill );
1106 x264_pthread_cond_destroy( &slist->cv_empty );
1107 x264_frame_delete_list( slist->list );
1110 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1112 x264_pthread_mutex_lock( &slist->mutex );
1113 while( slist->i_size == slist->i_max_size )
1114 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1115 slist->list[ slist->i_size++ ] = frame;
1116 x264_pthread_mutex_unlock( &slist->mutex );
1117 x264_pthread_cond_broadcast( &slist->cv_fill );