git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25
  26 x264_frame_t *x264_frame_new( x264_t *h )
  27 {
  28     x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
  29     int i, j;
  30
  31     int i_mb_count = h->mb.i_mb_count;
  32     int i_stride, i_width, i_lines;
  33     int i_padv = PADV << h->param.b_interlaced;
  34     int luma_plane_size;
  35
  36     if( !frame ) return NULL;
  37
  38     memset( frame, 0, sizeof(x264_frame_t) );
  39
  40     /* allocate frame data (+64 for extra data for me) */
  41     i_width  = ( ( h->param.i_width  + 15 ) & -16 );
  42     i_stride = i_width + 2*PADH;
  43     i_lines  = ( ( h->param.i_height + 15 ) & -16 );
  44     if( h->param.b_interlaced )
  45         i_lines = ( i_lines + 31 ) & -32;
  46
  47     if( h->param.cpu&X264_CPU_CACHELINE_64 )
  48         i_stride = (i_stride + 63) & ~63;
  49     else if( h->param.cpu&X264_CPU_CACHELINE_32 )
  50         i_stride = (i_stride + 31) & ~31;
  51
  52     frame->i_plane = 3;
  53     for( i = 0; i < 3; i++ )
  54     {
  55         frame->i_stride[i] = i_stride >> !!i;
  56         frame->i_width[i] = i_width >> !!i;
  57         frame->i_lines[i] = i_lines >> !!i;
  58     }
  59
  60     luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
  61     for( i = 1; i < 3; i++ )
  62     {
  63         CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
  64         frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
  65     }
  66     /* all 4 luma planes allocated together, since the cacheline split code
  67      * requires them to be in-phase wrt cacheline alignment. */
  68     CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
  69     for( i = 0; i < 4; i++ )
  70         frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
  71     frame->plane[0] = frame->filtered[0];
  72
  73     if( h->frames.b_have_lowres )
  74     {
  75         frame->i_width_lowres = frame->i_width[0]/2;
  76         frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
  77         frame->i_lines_lowres = frame->i_lines[0]/2;
  78         for( i = 0; i < 4; i++ )
  79         {
  80             CHECKED_MALLOC( frame->buffer_lowres[i],
  81                             frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
  82             frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
  83                                 frame->i_stride_lowres * i_padv + PADH;
  84         }
  85     }
  86
  87     if( h->param.analyse.i_me_method >= X264_ME_ESA )
  88     {
  89         CHECKED_MALLOC( frame->buffer[3],
  90                         2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
  91         frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
  92     }
  93
  94     frame->i_poc = -1;
  95     frame->i_type = X264_TYPE_AUTO;
  96     frame->i_qpplus1 = 0;
  97     frame->i_pts = -1;
  98     frame->i_frame = -1;
  99     frame->i_frame_num = -1;
 100     frame->i_lines_completed = -1;
 101
 102     CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
 103     CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
 104     CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 105     if( h->param.i_bframe )
 106     {
 107         CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 108         CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 109     }
 110     else
 111     {
 112         frame->mv[1]  = NULL;
 113         frame->ref[1] = NULL;
 114     }
 115
 116     CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 117     CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
 118     for( i = 0; i < h->param.i_bframe + 2; i++ )
 119         for( j = 0; j < h->param.i_bframe + 2; j++ )
 120             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 121
 122     x264_pthread_mutex_init( &frame->mutex, NULL );
 123     x264_pthread_cond_init( &frame->cv, NULL );
 124
 125     return frame;
 126
 127 fail:
 128     x264_frame_delete( frame );
 129     return NULL;
 130 }
 131
 132 void x264_frame_delete( x264_frame_t *frame )
 133 {
 134     int i, j;
 135     for( i = 0; i < 4; i++ )
 136         x264_free( frame->buffer[i] );
 137     for( i = 0; i < 4; i++ )
 138         x264_free( frame->buffer_lowres[i] );
 139     for( i = 0; i < X264_BFRAME_MAX+2; i++ )
 140         for( j = 0; j < X264_BFRAME_MAX+2; j++ )
 141             x264_free( frame->i_row_satds[i][j] );
 142     x264_free( frame->i_row_bits );
 143     x264_free( frame->i_row_qp );
 144     x264_free( frame->mb_type );
 145     x264_free( frame->mv[0] );
 146     x264_free( frame->mv[1] );
 147     x264_free( frame->ref[0] );
 148     x264_free( frame->ref[1] );
 149     x264_pthread_mutex_destroy( &frame->mutex );
 150     x264_pthread_cond_destroy( &frame->cv );
 151     x264_free( frame );
 152 }
 153
 154 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 155 {
 156     int i_csp = src->img.i_csp & X264_CSP_MASK;
 157     int i;
 158     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
 159     {
 160         x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
 161         return -1;
 162     }
 163
 164     dst->i_type     = src->i_type;
 165     dst->i_qpplus1  = src->i_qpplus1;
 166     dst->i_pts      = src->i_pts;
 167
 168     for( i=0; i<3; i++ )
 169     {
 170         int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
 171         uint8_t *plane = src->img.plane[s];
 172         int stride = src->img.i_stride[s];
 173         int width = h->param.i_width >> !!i;
 174         int height = h->param.i_height >> !!i;
 175         if( src->img.i_csp & X264_CSP_VFLIP )
 176         {
 177             plane += (height-1)*stride;
 178             stride = -stride;
 179         }
 180         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
 181     }
 182     return 0;
 183 }
 184
 185
 186
 187 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
 188 {
 189 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
 190     int y;
 191     for( y = 0; y < i_height; y++ )
 192     {
 193         /* left band */
 194         memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
 195         /* right band */
 196         memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
 197     }
 198     /* upper band */
 199     if( b_pad_top )
 200     for( y = 0; y < i_padv; y++ )
 201         memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
 202     /* lower band */
 203     if( b_pad_bottom )
 204     for( y = 0; y < i_padv; y++ )
 205         memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
 206 #undef PPIXEL
 207 }
 208
 209 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 210 {
 211     int i;
 212     int b_start = !mb_y;
 213     if( mb_y & h->sh.b_mbaff )
 214         return;
 215     for( i = 0; i < frame->i_plane; i++ )
 216     {
 217         int stride = frame->i_stride[i];
 218         int width = 16*h->sps->i_mb_width >> !!i;
 219         int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
 220         int padh = PADH >> !!i;
 221         int padv = PADV >> !!i;
 222         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 223         uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
 224         if( b_end && !b_start )
 225             height += 4 >> (!!i + h->sh.b_mbaff);
 226         if( h->sh.b_mbaff )
 227         {
 228             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 229             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 230         }
 231         else
 232         {
 233             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 234         }
 235     }
 236 }
 237
 238 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 239 {
 240     /* during filtering, 8 extra pixels were filtered on each edge,
 241      * but up to 3 of the horizontal ones may be wrong.
 242        we want to expand border from the last filtered pixel */
 243     int b_start = !mb_y;
 244     int stride = frame->i_stride[0];
 245     int width = 16*h->sps->i_mb_width + 8;
 246     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
 247     int padh = PADH - 4;
 248     int padv = PADV - 8;
 249     int i;
 250     for( i = 1; i < 4; i++ )
 251     {
 252         // buffer: 8 luma, to match the hpel filter
 253         uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
 254         if( h->sh.b_mbaff )
 255         {
 256             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 257             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 258         }
 259         else
 260         {
 261             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 262         }
 263     }
 264 }
 265
 266 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 267 {
 268     int i;
 269     for( i = 0; i < 4; i++ )
 270         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 271 }
 272
 273 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 274 {
 275     int i, y;
 276     for( i = 0; i < frame->i_plane; i++ )
 277     {
 278         int i_subsample = i ? 1 : 0;
 279         int i_width = h->param.i_width >> i_subsample;
 280         int i_height = h->param.i_height >> i_subsample;
 281         int i_padx = ( h->sps->i_mb_width * 16 - h->param.i_width ) >> i_subsample;
 282         int i_pady = ( h->sps->i_mb_height * 16 - h->param.i_height ) >> i_subsample;
 283
 284         if( i_padx )
 285         {
 286             for( y = 0; y < i_height; y++ )
 287                 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 288                          frame->plane[i][y*frame->i_stride[i] + i_width - 1],
 289                          i_padx );
 290         }
 291         if( i_pady )
 292         {
 293             //FIXME interlace? or just let it pad using the wrong field
 294             for( y = i_height; y < i_height + i_pady; y++ )
 295                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 296                         &frame->plane[i][(i_height-1)*frame->i_stride[i]],
 297                         i_width + i_padx );
 298         }
 299     }
 300 }
 301
 302
 303 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 304  * entropy coding, but per 64 coeffs for the purpose of deblocking */
 305 void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 306 {
 307     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 308     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
 309     int x, nnz;
 310     for( x=0; x<h->sps->i_mb_width; x++ )
 311     {
 312         memcpy( buf+x, src+x, 16 );
 313         if( transform[x] )
 314         {
 315             nnz = src[x][0] | src[x][1];
 316             src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 317             nnz = src[x][2] | src[x][3];
 318             src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 319         }
 320     }
 321 }
 322
 323 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 324 {
 325     uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 326     int x;
 327     for( x=0; x<h->sps->i_mb_width; x++ )
 328         memcpy( dst+x, buf+x, 16 );
 329 }
 330
 331 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
 332 {
 333     func( h, mb_y, buf );
 334     if( mb_y > 0 )
 335         func( h, mb_y-1, buf + h->sps->i_mb_width );
 336     if( h->sh.b_mbaff )
 337     {
 338         func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
 339         if( mb_y > 0 )
 340             func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
 341     }
 342 }
 343
 344
 345 /* Deblocking filter */
 346
 347 static const int i_alpha_table[52] =
 348 {
 349      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 350      0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
 351      7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
 352     25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
 353     80, 90,101,113,127,144,162,182,203,226,
 354     255, 255
 355 };
 356 static const int i_beta_table[52] =
 357 {
 358      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 359      0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
 360      3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
 361      8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
 362     13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
 363     18, 18
 364 };
 365 static const int i_tc0_table[52][3] =
 366 {
 367     { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
 368     { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
 369     { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
 370     { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
 371     { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
 372     { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
 373     { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
 374     { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
 375     { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
 376 };
 377
 378 /* From ffmpeg */
 379 static inline int clip_uint8( int a )
 380 {
 381     if (a&(~255))
 382         return (-a)>>31;
 383     else
 384         return a;
 385 }
 386
 387 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 388 {
 389     int i, d;
 390     for( i = 0; i < 4; i++ ) {
 391         if( tc0[i] < 0 ) {
 392             pix += 4*ystride;
 393             continue;
 394         }
 395         for( d = 0; d < 4; d++ ) {
 396             const int p2 = pix[-3*xstride];
 397             const int p1 = pix[-2*xstride];
 398             const int p0 = pix[-1*xstride];
 399             const int q0 = pix[ 0*xstride];
 400             const int q1 = pix[ 1*xstride];
 401             const int q2 = pix[ 2*xstride];
 402
 403             if( abs( p0 - q0 ) < alpha &&
 404                 abs( p1 - p0 ) < beta &&
 405                 abs( q1 - q0 ) < beta ) {
 406
 407                 int tc = tc0[i];
 408                 int delta;
 409
 410                 if( abs( p2 - p0 ) < beta ) {
 411                     pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
 412                     tc++;
 413                 }
 414                 if( abs( q2 - q0 ) < beta ) {
 415                     pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
 416                     tc++;
 417                 }
 418
 419                 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 420                 pix[-1*xstride] = clip_uint8( p0 + delta );    /* p0' */
 421                 pix[ 0*xstride] = clip_uint8( q0 - delta );    /* q0' */
 422             }
 423             pix += ystride;
 424         }
 425     }
 426 }
 427 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 428 {
 429     deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
 430 }
 431 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 432 {
 433     deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
 434 }
 435
 436 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 437 {
 438     int i, d;
 439     for( i = 0; i < 4; i++ ) {
 440         const int tc = tc0[i];
 441         if( tc <= 0 ) {
 442             pix += 2*ystride;
 443             continue;
 444         }
 445         for( d = 0; d < 2; d++ ) {
 446             const int p1 = pix[-2*xstride];
 447             const int p0 = pix[-1*xstride];
 448             const int q0 = pix[ 0*xstride];
 449             const int q1 = pix[ 1*xstride];
 450
 451             if( abs( p0 - q0 ) < alpha &&
 452                 abs( p1 - p0 ) < beta &&
 453                 abs( q1 - q0 ) < beta ) {
 454
 455                 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 456                 pix[-1*xstride] = clip_uint8( p0 + delta );    /* p0' */
 457                 pix[ 0*xstride] = clip_uint8( q0 - delta );    /* q0' */
 458             }
 459             pix += ystride;
 460         }
 461     }
 462 }
 463 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 464 {
 465     deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
 466 }
 467 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 468 {
 469     deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
 470 }
 471
 472 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 473 {
 474     int d;
 475     for( d = 0; d < 16; d++ ) {
 476         const int p2 = pix[-3*xstride];
 477         const int p1 = pix[-2*xstride];
 478         const int p0 = pix[-1*xstride];
 479         const int q0 = pix[ 0*xstride];
 480         const int q1 = pix[ 1*xstride];
 481         const int q2 = pix[ 2*xstride];
 482
 483         if( abs( p0 - q0 ) < alpha &&
 484             abs( p1 - p0 ) < beta &&
 485             abs( q1 - q0 ) < beta ) {
 486
 487             if(abs( p0 - q0 ) < ((alpha >> 2) + 2) ){
 488                 if( abs( p2 - p0 ) < beta)
 489                 {
 490                     const int p3 = pix[-4*xstride];
 491                     /* p0', p1', p2' */
 492                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
 493                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
 494                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
 495                 } else {
 496                     /* p0' */
 497                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 498                 }
 499                 if( abs( q2 - q0 ) < beta)
 500                 {
 501                     const int q3 = pix[3*xstride];
 502                     /* q0', q1', q2' */
 503                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
 504                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
 505                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
 506                 } else {
 507                     /* q0' */
 508                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 509                 }
 510             }else{
 511                 /* p0', q0' */
 512                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 513                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 514             }
 515         }
 516         pix += ystride;
 517     }
 518 }
 519 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 520 {
 521     deblock_luma_intra_c( pix, stride, 1, alpha, beta );
 522 }
 523 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 524 {
 525     deblock_luma_intra_c( pix, 1, stride, alpha, beta );
 526 }
 527
 528 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 529 {
 530     int d;
 531     for( d = 0; d < 8; d++ ) {
 532         const int p1 = pix[-2*xstride];
 533         const int p0 = pix[-1*xstride];
 534         const int q0 = pix[ 0*xstride];
 535         const int q1 = pix[ 1*xstride];
 536
 537         if( abs( p0 - q0 ) < alpha &&
 538             abs( p1 - p0 ) < beta &&
 539             abs( q1 - q0 ) < beta ) {
 540
 541             pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
 542             pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
 543         }
 544
 545         pix += ystride;
 546     }
 547 }
 548 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 549 {
 550     deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
 551 }
 552 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 553 {
 554     deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
 555 }
 556
 557 static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4], int i_qp, int b_chroma,
 558                                  x264_deblock_inter_t pf_inter, x264_deblock_intra_t pf_intra )
 559 {
 560     int i;
 561     const int index_a = x264_clip3( i_qp + h->sh.i_alpha_c0_offset, 0, 51 );
 562     const int alpha = i_alpha_table[index_a];
 563     const int beta  = i_beta_table[x264_clip3( i_qp + h->sh.i_beta_offset, 0, 51 )];
 564
 565     if( bS[0] < 4 ) {
 566         int8_t tc[4];
 567         for(i=0; i<4; i++)
 568             tc[i] = (bS[i] ? i_tc0_table[index_a][bS[i] - 1] : -1) + b_chroma;
 569         pf_inter( pix, i_stride, alpha, beta, tc );
 570     } else {
 571         pf_intra( pix, i_stride, alpha, beta );
 572     }
 573 }
 574
 575 void x264_frame_deblock_row( x264_t *h, int mb_y )
 576 {
         /* Deblock every macroblock of row mb_y in the reconstructed frame
          * h->fdec, filtering the vertical edges of each macroblock first
          * and then its horizontal edges.
          * With mbaff (b_interlaced below) the call covers the row pair
          * mb_y / mb_y^1: line strides are doubled and the loop alternates
          * between the two rows before moving on to the next column. */
 577     const int s8x8 = 2 * h->mb.i_mb_stride;
 578     const int s4x4 = 4 * h->mb.i_mb_stride;
 579     const int b_interlaced = h->sh.b_mbaff;
         /* vertical mv difference that triggers bS = 1: 4, or 2 with mbaff */
 580     const int mvy_limit = 4 >> b_interlaced;
 581     int mb_x;
 582
 583     int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
 584                          h->fdec->i_stride[1] << b_interlaced,
 585                          h->fdec->i_stride[2] << b_interlaced };
 586
         /* cavlc with the 8x8 transform enabled: temporarily rewrite the nnz
          * arrays of this row and its neighbours; undone at the end */
 587     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 588         munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
 589
 590     for( mb_x = 0; mb_x < h->sps->i_mb_width; )
 591     {
 592         const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
 593         const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
 594         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
 595         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
             /* P_SKIP macroblocks only get their edge 0 filtered */
 596         const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
 597         int i_edge;
 598
             /* offset of this macroblock's top-left pixel in each plane */
 599         int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x,
 600                             8*mb_y*h->fdec->i_stride[1] +  8*mb_x,
 601                             8*mb_y*h->fdec->i_stride[2] +  8*mb_x };
             /* odd row of an mbaff pair: start on the second line of the
              * pair instead (16-15 = 1 luma line, 8-7 = 1 chroma line) */
 602         if( b_interlaced && (mb_y&1) )
 603         {
 604             i_pix_y[0] -= 15*h->fdec->i_stride[0];
 605             i_pix_y[1] -=  7*h->fdec->i_stride[1];
 606             i_pix_y[2] -=  7*h->fdec->i_stride[2];
 607         }
 608
 609         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
 610
 611         /* i_dir == 0 -> vertical edge
 612          * i_dir == 1 -> horizontal edge */
             /* For each edge the macro picks the neighbouring macroblock
              * (mbn_*: the current one for inner edges), derives bS for the
              * four segments (4 or 3 if either side is intra, 2 if either
              * side has nonzero coefficients, 1 if refs differ or mvs differ
              * by the limits above, else 0), then filters luma and, on even
              * edges only, both chroma planes.  Edge 0 is skipped on the
              * left frame border and on the top row(s); odd edges are
              * skipped for 8x8-transform macroblocks. */
 613
 614         #define deblock_dir(i_dir)\
 615         {\
 616             int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
 617             int i_qp, i_qpn;\
 618             for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\
 619             {\
 620                 int mbn_xy, mbn_8x8, mbn_4x4;\
 621                 int bS[4];  /* filtering strength */\
 622                 if( b_8x8_transform && (i_edge&1) )\
 623                     continue;\
 624                 mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );\
 625                 mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\
 626                 mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\
 627                 if( b_interlaced && i_edge == 0 && i_dir == 1 )\
 628                 {\
 629                     mbn_xy -= h->mb.i_mb_stride;\
 630                     mbn_8x8 -= 2 * s8x8;\
 631                     mbn_4x4 -= 4 * s4x4;\
 632                 }\
 633                 /* *** Get bS for each 4px for the current edge *** */\
 634                 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\
 635                     bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\
 636                 else\
 637                 {\
 638                     int i;\
 639                     for( i = 0; i < 4; i++ )\
 640                     {\
 641                         int x  = i_dir == 0 ? i_edge : i;\
 642                         int y  = i_dir == 0 ? i      : i_edge;\
 643                         int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
 644                         int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
 645                         if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
 646                             h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
 647                         {\
 648                             bS[i] = 2;\
 649                         }\
 650                         else\
 651                         {\
 652                             /* FIXME: A given frame may occupy more than one position in\
 653                              * the reference list. So we should compare the frame numbers,\
 654                              * not the indices in the ref list.\
 655                              * No harm yet, as we don't generate that case.*/\
 656                             int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\
 657                             int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\
 658                             int i4p= mb_4x4+x+y*s4x4;\
 659                             int i4q= mbn_4x4+xn+yn*s4x4;\
 660                             int l;\
 661                             bS[i] = 0;\
 662                             for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
 663                             {\
 664                                 if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
 665                                     abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
 666                                     abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
 667                                 {\
 668                                     bS[i] = 1;\
 669                                     break;\
 670                                 }\
 671                             }\
 672                         }\
 673                     }\
 674                 }\
 675                 /* *** filter *** */\
 676                 /* Y plane */\
 677                 i_qp = h->mb.qp[mb_xy];\
 678                 i_qpn= h->mb.qp[mbn_xy];\
 679                 if( i_dir == 0 )\
 680                 {\
 681                     /* vertical edge */\
 682                     deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\
 683                                   i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
 684                                   h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\
 685                     if( !(i_edge & 1) )\
 686                     {\
 687                         /* U/V planes */\
 688                         int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
 689                                       i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
 690                         deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\
 691                                       i_stride2[1], bS, i_qpc, 1,\
 692                                       h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
 693                         deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\
 694                                       i_stride2[2], bS, i_qpc, 1,\
 695                                       h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
 696                     }\
 697                 }\
 698                 else\
 699                 {\
 700                     /* horizontal edge */\
 701                     deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\
 702                                   i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
 703                                   h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\
 704                     /* U/V planes */\
 705                     if( !(i_edge & 1) )\
 706                     {\
 707                         int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
 708                                       i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
 709                         deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],\
 710                                       i_stride2[1], bS, i_qpc, 1,\
 711                                       h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
 712                         deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\
 713                                       i_stride2[2], bS, i_qpc, 1,\
 714                                       h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
 715                     }\
 716                 }\
 717             }\
 718         }
 719
 720         deblock_dir(0);
 721         deblock_dir(1);
 722
 723         /* next mb */
             /* without mbaff: simply the next column.  with mbaff: flip
              * between the two rows of the pair and advance the column only
              * after the odd row, so mb_y is back at its entry value once
              * the loop ends */
 724         if( !b_interlaced || (mb_y&1) )
 725             mb_x++;
 726         mb_y ^= b_interlaced;
 727     }
 728
         /* put back the nnz values saved before the loop */
 729     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 730         munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
 731 }
 732
 733 void x264_frame_deblock( x264_t *h )
 734 {
 735     int mb_y;
 736     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
 737         x264_frame_deblock_row( h, mb_y );
 738 }
 739
 740 #ifdef HAVE_MMX
     /* Prototypes only: the bodies of these routines are not in the visible
      * part of this file.  The inter variants take a tc0 array, the intra
      * variants do not, mirroring the C versions above. */
 741 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 742 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 743 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 744 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 745
 746 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 747 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 748 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 749 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 750 #ifdef ARCH_X86
     /* Only declared when ARCH_X86 is defined.  The two v8 routines are
      * each called twice (at pix and pix+8) by the wrappers that follow. */
 751 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 752 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 753 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 754 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 755
 /* Luma filter assembled from the 8-column mmxext routine: the routine is
  * invoked once at pix and once at pix+8, with tc0 advanced by two entries
  * for the second half. stride, alpha and beta are forwarded unchanged. */
 void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     int i_half;
     for( i_half = 0; i_half < 2; i_half++ )
         x264_deblock_v8_luma_mmxext( pix + 8*i_half, stride, alpha, beta, tc0 + 2*i_half );
 }
 /* Intra luma filter assembled from the 8-column mmxext routine: the routine
  * is invoked once at pix and once at pix+8 with identical stride, alpha and
  * beta arguments. */
 void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 {
     int i_half;
     for( i_half = 0; i_half < 2; i_half++ )
         x264_deblock_v8_luma_intra_mmxext( pix + 8*i_half, stride, alpha, beta );
 }
 766 #endif
 767 #endif
 768
 769 #ifdef ARCH_PPC
 770 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 771 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 772 #endif // ARCH_PPC
 773
 774 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 775 {
 776     pf->deblock_v_luma = deblock_v_luma_c;
 777     pf->deblock_h_luma = deblock_h_luma_c;
 778     pf->deblock_v_chroma = deblock_v_chroma_c;
 779     pf->deblock_h_chroma = deblock_h_chroma_c;
 780     pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
 781     pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
 782     pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
 783     pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
 784
 785 #ifdef HAVE_MMX
 786     if( cpu&X264_CPU_MMXEXT )
 787     {
 788         pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
 789         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
 790         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
 791         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
 792 #ifdef ARCH_X86
 793         pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
 794         pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
 795         pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
 796         pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
 797 #endif
 798         if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
 799         {
 800             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
 801             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
 802             pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
 803             pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
 804         }
 805     }
 806 #endif
 807
 808 #ifdef ARCH_PPC
 809     if( cpu&X264_CPU_ALTIVEC )
 810     {
 811         pf->deblock_v_luma = x264_deblock_v_luma_altivec;
 812         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
 813    }
 814 #endif // ARCH_PPC
 815 }
 816
 817
 818 /* threading */
 819 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 820 {
 821     x264_pthread_mutex_lock( &frame->mutex );
 822     frame->i_lines_completed = i_lines_completed;
 823     x264_pthread_cond_broadcast( &frame->cv );
 824     x264_pthread_mutex_unlock( &frame->mutex );
 825 }
 826
 827 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 828 {
 829     x264_pthread_mutex_lock( &frame->mutex );
 830     while( frame->i_lines_completed < i_lines_completed )
 831         x264_pthread_cond_wait( &frame->cv, &frame->mutex );
 832     x264_pthread_mutex_unlock( &frame->mutex );
 833 }
 834
 835 /* list operators */
 836
 837 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 838 {
 839     int i = 0;
 840     while( list[i] ) i++;
 841     list[i] = frame;
 842 }
 843
 844 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 845 {
 846     x264_frame_t *frame;
 847     int i = 0;
 848     assert( list[0] );
 849     while( list[i+1] ) i++;
 850     frame = list[i];
 851     list[i] = NULL;
 852     return frame;
 853 }
 854
 855 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 856 {
 857     int i = 0;
 858     while( list[i] ) i++;
 859     while( i-- )
 860         list[i+1] = list[i];
 861     list[0] = frame;
 862 }
 863
 864 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 865 {
 866     x264_frame_t *frame = list[0];
 867     int i;
 868     for( i = 0; list[i]; i++ )
 869         list[i] = list[i+1];
 870     assert(frame);
 871     return frame;
 872 }
 873
 874 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 875 {
 876     assert( frame->i_reference_count > 0 );
 877     frame->i_reference_count--;
 878     if( frame->i_reference_count == 0 )
 879         x264_frame_push( h->frames.unused, frame );
 880     assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
 881 }
 882
 883 x264_frame_t *x264_frame_pop_unused( x264_t *h )
 884 {
 885     x264_frame_t *frame;
 886     if( h->frames.unused[0] )
 887         frame = x264_frame_pop( h->frames.unused );
 888     else
 889         frame = x264_frame_new( h );
 890     assert( frame->i_reference_count == 0 );
 891     frame->i_reference_count = 1;
 892     return frame;
 893 }
 894
 895 void x264_frame_sort( x264_frame_t **list, int b_dts )
 896 {
 897     int i, b_ok;
 898     do {
 899         b_ok = 1;
 900         for( i = 0; list[i+1]; i++ )
 901         {
 902             int dtype = list[i]->i_type - list[i+1]->i_type;
 903             int dtime = list[i]->i_frame - list[i+1]->i_frame;
 904             int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
 905                              : dtime > 0;
 906             if( swap )
 907             {
 908                 XCHG( x264_frame_t*, list[i], list[i+1] );
 909                 b_ok = 0;
 910             }
 911         }
 912     } while( !b_ok );
 913 }