git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: frame.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25
  26 x264_frame_t *x264_frame_new( x264_t *h )
  27 {
  28     x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
  29     int i, j;
  30
  31     int i_mb_count = h->mb.i_mb_count;
  32     int i_stride, i_width, i_lines;
  33     int i_padv = PADV << h->param.b_interlaced;
  34     int luma_plane_size;
  35
  36     if( !frame ) return NULL;
  37
  38     memset( frame, 0, sizeof(x264_frame_t) );
  39
  40     /* allocate frame data (+64 for extra data for me) */
  41     i_width  = ( ( h->param.i_width  + 15 ) & -16 );
  42     i_stride = i_width + 2*PADH;
  43     i_lines  = ( ( h->param.i_height + 15 ) & -16 );
  44     if( h->param.b_interlaced )
  45         i_lines = ( i_lines + 31 ) & -32;
  46
  47     if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
  48     {
  49         int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
  50         i_stride = (i_stride + align-1) & -align;
  51     }
  52
  53     frame->i_plane = 3;
  54     for( i = 0; i < 3; i++ )
  55     {
  56         frame->i_stride[i] = i_stride >> !!i;
  57         frame->i_width[i] = i_width >> !!i;
  58         frame->i_lines[i] = i_lines >> !!i;
  59     }
  60
  61     luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
  62     for( i = 1; i < 3; i++ )
  63     {
  64         CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
  65         frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
  66     }
  67     /* all 4 luma planes allocated together, since the cacheline split code
  68      * requires them to be in-phase wrt cacheline alignment. */
  69     CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
  70     for( i = 0; i < 4; i++ )
  71         frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
  72     frame->plane[0] = frame->filtered[0];
  73
  74     if( h->frames.b_have_lowres )
  75     {
  76         frame->i_width_lowres = frame->i_width[0]/2;
  77         frame->i_stride_lowres = frame->i_width_lowres + 2*PADH;
  78         frame->i_lines_lowres = frame->i_lines[0]/2;
  79         for( i = 0; i < 4; i++ )
  80         {
  81             CHECKED_MALLOC( frame->buffer_lowres[i],
  82                             frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
  83             frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
  84                                 frame->i_stride_lowres * i_padv + PADH;
  85         }
  86     }
  87
  88     if( h->param.analyse.i_me_method >= X264_ME_ESA )
  89     {
  90         CHECKED_MALLOC( frame->buffer[3],
  91                         2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
  92         frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
  93     }
  94
  95     frame->i_poc = -1;
  96     frame->i_type = X264_TYPE_AUTO;
  97     frame->i_qpplus1 = 0;
  98     frame->i_pts = -1;
  99     frame->i_frame = -1;
 100     frame->i_frame_num = -1;
 101     frame->i_lines_completed = -1;
 102
 103     CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
 104     CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
 105     CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 106     if( h->param.i_bframe )
 107     {
 108         CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 109         CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 110     }
 111     else
 112     {
 113         frame->mv[1]  = NULL;
 114         frame->ref[1] = NULL;
 115     }
 116
 117     CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 118     CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
 119     for( i = 0; i < h->param.i_bframe + 2; i++ )
 120         for( j = 0; j < h->param.i_bframe + 2; j++ )
 121             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 122
 123     x264_pthread_mutex_init( &frame->mutex, NULL );
 124     x264_pthread_cond_init( &frame->cv, NULL );
 125
 126     return frame;
 127
 128 fail:
 129     x264_frame_delete( frame );
 130     return NULL;
 131 }
 132
 133 void x264_frame_delete( x264_frame_t *frame )
 134 {
 135     int i, j;
 136     for( i = 0; i < 4; i++ )
 137         x264_free( frame->buffer[i] );
 138     for( i = 0; i < 4; i++ )
 139         x264_free( frame->buffer_lowres[i] );
 140     for( i = 0; i < X264_BFRAME_MAX+2; i++ )
 141         for( j = 0; j < X264_BFRAME_MAX+2; j++ )
 142             x264_free( frame->i_row_satds[i][j] );
 143     x264_free( frame->i_row_bits );
 144     x264_free( frame->i_row_qp );
 145     x264_free( frame->mb_type );
 146     x264_free( frame->mv[0] );
 147     x264_free( frame->mv[1] );
 148     x264_free( frame->ref[0] );
 149     x264_free( frame->ref[1] );
 150     x264_pthread_mutex_destroy( &frame->mutex );
 151     x264_pthread_cond_destroy( &frame->cv );
 152     x264_free( frame );
 153 }
 154
 155 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 156 {
 157     int i_csp = src->img.i_csp & X264_CSP_MASK;
 158     int i;
 159     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
 160     {
 161         x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
 162         return -1;
 163     }
 164
 165     dst->i_type     = src->i_type;
 166     dst->i_qpplus1  = src->i_qpplus1;
 167     dst->i_pts      = src->i_pts;
 168
 169     for( i=0; i<3; i++ )
 170     {
 171         int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
 172         uint8_t *plane = src->img.plane[s];
 173         int stride = src->img.i_stride[s];
 174         int width = h->param.i_width >> !!i;
 175         int height = h->param.i_height >> !!i;
 176         if( src->img.i_csp & X264_CSP_VFLIP )
 177         {
 178             plane += (height-1)*stride;
 179             stride = -stride;
 180         }
 181         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
 182     }
 183     return 0;
 184 }
 185
 186
 187
 /* Replicate the edge pixels of a plane into its padding.
  *
  * pix points at the top-left pixel of an i_width x i_height area whose rows
  * are i_stride bytes apart. Each row gets i_padh copies of its first pixel
  * written to its left and i_padh copies of its last pixel to its right.
  * When b_pad_top / b_pad_bottom is set, the (already padded) first / last
  * row is then copied into the i_padv rows above / below the area. */
 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
 {
     const int padded_width = i_width + 2*i_padh;
     int line;

     /* left and right bands */
     for( line = 0; line < i_height; line++ )
     {
         uint8_t *row = pix + line*i_stride;
         memset( row - i_padh, row[0], i_padh );
         memset( row + i_width, row[i_width-1], i_padh );
     }

     /* upper band: rows -1 .. -i_padv are copies of padded row 0 */
     if( b_pad_top )
     {
         for( line = 1; line <= i_padv; line++ )
             memcpy( pix - i_padh - line*i_stride, pix - i_padh, padded_width );
     }

     /* lower band: rows i_height .. i_height+i_padv-1 are copies of the last row */
     if( b_pad_bottom )
     {
         for( line = i_height; line < i_height + i_padv; line++ )
             memcpy( pix - i_padh + line*i_stride, pix - i_padh + (i_height-1)*i_stride, padded_width );
     }
 }
 209
 210 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 211 {
 212     int i;
 213     int b_start = !mb_y;
 214     if( mb_y & h->sh.b_mbaff )
 215         return;
 216     for( i = 0; i < frame->i_plane; i++ )
 217     {
 218         int stride = frame->i_stride[i];
 219         int width = 16*h->sps->i_mb_width >> !!i;
 220         int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
 221         int padh = PADH >> !!i;
 222         int padv = PADV >> !!i;
 223         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 224         uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
 225         if( b_end && !b_start )
 226             height += 4 >> (!!i + h->sh.b_mbaff);
 227         if( h->sh.b_mbaff )
 228         {
 229             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 230             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 231         }
 232         else
 233         {
 234             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 235         }
 236     }
 237 }
 238
 239 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 240 {
 241     /* during filtering, 8 extra pixels were filtered on each edge,
 242      * but up to 3 of the horizontal ones may be wrong.
 243        we want to expand border from the last filtered pixel */
 244     int b_start = !mb_y;
 245     int stride = frame->i_stride[0];
 246     int width = 16*h->sps->i_mb_width + 8;
 247     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
 248     int padh = PADH - 4;
 249     int padv = PADV - 8;
 250     int i;
 251     for( i = 1; i < 4; i++ )
 252     {
 253         // buffer: 8 luma, to match the hpel filter
 254         uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
 255         if( h->sh.b_mbaff )
 256         {
 257             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 258             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 259         }
 260         else
 261         {
 262             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 263         }
 264     }
 265 }
 266
 267 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 268 {
 269     int i;
 270     for( i = 0; i < 4; i++ )
 271         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 272 }
 273
 274 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 275 {
 276     int i, y;
 277     for( i = 0; i < frame->i_plane; i++ )
 278     {
 279         int i_subsample = i ? 1 : 0;
 280         int i_width = h->param.i_width >> i_subsample;
 281         int i_height = h->param.i_height >> i_subsample;
 282         int i_padx = ( h->sps->i_mb_width * 16 - h->param.i_width ) >> i_subsample;
 283         int i_pady = ( h->sps->i_mb_height * 16 - h->param.i_height ) >> i_subsample;
 284
 285         if( i_padx )
 286         {
 287             for( y = 0; y < i_height; y++ )
 288                 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 289                          frame->plane[i][y*frame->i_stride[i] + i_width - 1],
 290                          i_padx );
 291         }
 292         if( i_pady )
 293         {
 294             //FIXME interlace? or just let it pad using the wrong field
 295             for( y = i_height; y < i_height + i_pady; y++ )
 296                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 297                         &frame->plane[i][(i_height-1)*frame->i_stride[i]],
 298                         i_width + i_padx );
 299         }
 300     }
 301 }
 302
 303
 304 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 305  * entropy coding, but per 64 coeffs for the purpose of deblocking */
 306 void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 307 {
 308     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 309     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
 310     int x;
 311     for( x=0; x<h->sps->i_mb_width; x++ )
 312     {
 313         memcpy( buf+x, src+x, 16 );
 314         if( transform[x] )
 315         {
 316             if( src[x][0] ) src[x][0] = 0x01010101;
 317             if( src[x][1] ) src[x][1] = 0x01010101;
 318             if( src[x][2] ) src[x][2] = 0x01010101;
 319             if( src[x][3] ) src[x][3] = 0x01010101;
 320         }
 321     }
 322 }
 323
 324 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 325 {
 326     uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 327     int x;
 328     for( x=0; x<h->sps->i_mb_width; x++ )
 329         memcpy( dst+x, buf+x, 16 );
 330 }
 331
 332 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
 333 {
 334     func( h, mb_y, buf );
 335     if( mb_y > 0 )
 336         func( h, mb_y-1, buf + h->sps->i_mb_width );
 337     if( h->sh.b_mbaff )
 338     {
 339         func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
 340         if( mb_y > 0 )
 341             func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
 342     }
 343 }
 344
 345
 346 /* Deblocking filter */
 347
 /* alpha threshold, indexed by a value clipped to 0..51 */
 static const int i_alpha_table[52] =
 {
     /*  0.. 7 */   0,   0,   0,   0,   0,   0,   0,   0,
     /*  8..15 */   0,   0,   0,   0,   0,   0,   0,   0,
     /* 16..23 */   4,   4,   5,   6,   7,   8,   9,  10,
     /* 24..31 */  12,  13,  15,  17,  20,  22,  25,  28,
     /* 32..39 */  32,  36,  40,  45,  50,  56,  63,  71,
     /* 40..47 */  80,  90, 101, 113, 127, 144, 162, 182,
     /* 48..51 */ 203, 226, 255, 255
 };
 /* beta threshold, indexed by a value clipped to 0..51 */
 static const int i_beta_table[52] =
 {
     /*  0.. 7 */  0,  0,  0,  0,  0,  0,  0,  0,
     /*  8..15 */  0,  0,  0,  0,  0,  0,  0,  0,
     /* 16..23 */  2,  2,  2,  3,  3,  3,  3,  4,
     /* 24..31 */  4,  4,  6,  6,  7,  7,  8,  8,
     /* 32..39 */  9,  9, 10, 10, 11, 11, 12, 12,
     /* 40..47 */ 13, 13, 14, 14, 15, 15, 16, 16,
     /* 48..51 */ 17, 17, 18, 18
 };
 /* tc0 clipping value; first index is clipped to 0..51, second is bS-1 */
 static const int i_tc0_table[52][3] =
 {
     /*  0.. 3 */ {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 },
     /*  4.. 7 */ {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 },
     /*  8..11 */ {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 },
     /* 12..15 */ {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 }, {  0,  0,  0 },
     /* 16..19 */ {  0,  0,  0 }, {  0,  0,  1 }, {  0,  0,  1 }, {  0,  0,  1 },
     /* 20..23 */ {  0,  0,  1 }, {  0,  1,  1 }, {  0,  1,  1 }, {  1,  1,  1 },
     /* 24..27 */ {  1,  1,  1 }, {  1,  1,  1 }, {  1,  1,  1 }, {  1,  1,  2 },
     /* 28..31 */ {  1,  1,  2 }, {  1,  1,  2 }, {  1,  1,  2 }, {  1,  2,  3 },
     /* 32..35 */ {  1,  2,  3 }, {  2,  2,  3 }, {  2,  2,  4 }, {  2,  3,  4 },
     /* 36..39 */ {  2,  3,  4 }, {  3,  3,  5 }, {  3,  4,  6 }, {  3,  4,  6 },
     /* 40..43 */ {  4,  5,  7 }, {  4,  5,  8 }, {  4,  6,  9 }, {  5,  7, 10 },
     /* 44..47 */ {  6,  8, 11 }, {  6,  8, 13 }, {  7, 10, 14 }, {  8, 11, 16 },
     /* 48..51 */ {  9, 12, 18 }, { 10, 13, 20 }, { 11, 15, 23 }, { 13, 17, 25 }
 };
 378
 379 /* From ffmpeg */
 /* Clamp a to the range 0..255.
  *
  * Values already in range are returned unchanged; negative values give 0
  * and values above 255 give 255. The saturated result is spelled out
  * instead of being derived from a right shift of a negative number, whose
  * result is implementation-defined. */
 static inline int clip_uint8( int a )
 {
     if( a & ~255 )
         return a < 0 ? 0 : 255;
     return a;
 }
 387
 /* Normal (non-intra) luma edge filter over 16 positions along an edge.
  *
  * pix points at the first q0 sample; xstride steps across the edge and
  * ystride along it. The 16 positions are handled as four groups of four,
  * group i using tc0[i]; a group with a negative tc0 is skipped entirely. */
 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 {
     int group, n;

     for( group = 0; group < 4; group++, pix += 4*ystride )
     {
         const int tc_base = tc0[group];
         uint8_t *edge = pix;

         if( tc_base < 0 )
             continue;

         for( n = 0; n < 4; n++, edge += ystride )
         {
             const int p2 = edge[-3*xstride];
             const int p1 = edge[-2*xstride];
             const int p0 = edge[-1*xstride];
             const int q0 = edge[ 0*xstride];
             const int q1 = edge[ 1*xstride];
             const int q2 = edge[ 2*xstride];
             int tc = tc_base;
             int delta;

             if( abs( p0 - q0 ) >= alpha ||
                 abs( p1 - p0 ) >= beta ||
                 abs( q1 - q0 ) >= beta )
                 continue;

             /* each side that is smooth enough gets its second sample
              * adjusted and widens the clipping range for p0/q0 by one */
             if( abs( p2 - p0 ) < beta )
             {
                 edge[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc_base, tc_base );
                 tc++;
             }
             if( abs( q2 - q0 ) < beta )
             {
                 edge[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc_base, tc_base );
                 tc++;
             }

             delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
             edge[-1*xstride] = clip_uint8( p0 + delta );    /* p0' */
             edge[ 0*xstride] = clip_uint8( q0 - delta );    /* q0' */
         }
     }
 }
 /* deblock_luma_c with `stride` as the step across the edge and 1 along it. */
 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     const int across = stride;
     const int along = 1;
     deblock_luma_c( pix, across, along, alpha, beta, tc0 );
 }
 /* deblock_luma_c with 1 as the step across the edge and `stride` along it. */
 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     const int across = 1;
     const int along = stride;
     deblock_luma_c( pix, across, along, alpha, beta, tc0 );
 }
 436
 /* Normal (non-intra) chroma edge filter over 8 positions along an edge.
  *
  * pix points at the first q0 sample; xstride steps across the edge and
  * ystride along it. The 8 positions are handled as four groups of two,
  * group i using tc0[i]; a group whose tc0 is <= 0 is skipped. */
 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 {
     int group, n;

     for( group = 0; group < 4; group++, pix += 2*ystride )
     {
         const int tc = tc0[group];
         uint8_t *edge = pix;

         if( tc <= 0 )
             continue;

         for( n = 0; n < 2; n++, edge += ystride )
         {
             const int p1 = edge[-2*xstride];
             const int p0 = edge[-1*xstride];
             const int q0 = edge[ 0*xstride];
             const int q1 = edge[ 1*xstride];
             int delta;

             if( abs( p0 - q0 ) >= alpha ||
                 abs( p1 - p0 ) >= beta ||
                 abs( q1 - q0 ) >= beta )
                 continue;

             delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
             edge[-1*xstride] = clip_uint8( p0 + delta );    /* p0' */
             edge[ 0*xstride] = clip_uint8( q0 - delta );    /* q0' */
         }
     }
 }
 /* deblock_chroma_c with `stride` as the step across the edge and 1 along it. */
 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     const int across = stride;
     const int along = 1;
     deblock_chroma_c( pix, across, along, alpha, beta, tc0 );
 }
 /* deblock_chroma_c with 1 as the step across the edge and `stride` along it. */
 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
     const int across = 1;
     const int along = stride;
     deblock_chroma_c( pix, across, along, alpha, beta, tc0 );
 }
 472
 /* Intra luma edge filter over 16 positions along an edge.
  *
  * pix points at the first q0 sample; xstride steps across the edge and
  * ystride along it. When |p0-q0| is below (alpha>>2)+2, a side whose
  * outer sample is within beta of its edge sample has three samples
  * rewritten; in every other filtered case only p0 / q0 are rewritten. */
 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 {
     int n;

     for( n = 0; n < 16; n++, pix += ystride )
     {
         const int p2 = pix[-3*xstride];
         const int p1 = pix[-2*xstride];
         const int p0 = pix[-1*xstride];
         const int q0 = pix[ 0*xstride];
         const int q1 = pix[ 1*xstride];
         const int q2 = pix[ 2*xstride];
         int b_small_step;

         if( abs( p0 - q0 ) >= alpha ||
             abs( p1 - p0 ) >= beta ||
             abs( q1 - q0 ) >= beta )
             continue;

         b_small_step = abs( p0 - q0 ) < ((alpha >> 2) + 2);

         if( b_small_step && abs( p2 - p0 ) < beta )
         {
             const int p3 = pix[-4*xstride];
             /* p0', p1', p2' */
             pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
             pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
             pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
         }
         else
         {
             /* p0' */
             pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
         }

         if( b_small_step && abs( q2 - q0 ) < beta )
         {
             const int q3 = pix[3*xstride];
             /* q0', q1', q2' */
             pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
             pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
             pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
         }
         else
         {
             /* q0' */
             pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
         }
     }
 }
 /* deblock_luma_intra_c with `stride` as the step across the edge and 1 along it. */
 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 {
     const int across = stride;
     const int along = 1;
     deblock_luma_intra_c( pix, across, along, alpha, beta );
 }
 /* deblock_luma_intra_c with 1 as the step across the edge and `stride` along it. */
 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 {
     const int across = 1;
     const int along = stride;
     deblock_luma_intra_c( pix, across, along, alpha, beta );
 }
 528
 /* Intra chroma edge filter over 8 positions along an edge.
  *
  * pix points at the first q0 sample; xstride steps across the edge and
  * ystride along it. Where the alpha/beta tests pass, p0 and q0 are
  * replaced by a weighted average of their neighbours. */
 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 {
     int n;

     for( n = 0; n < 8; n++, pix += ystride )
     {
         const int p1 = pix[-2*xstride];
         const int p0 = pix[-1*xstride];
         const int q0 = pix[ 0*xstride];
         const int q1 = pix[ 1*xstride];

         if( abs( p0 - q0 ) >= alpha ||
             abs( p1 - p0 ) >= beta ||
             abs( q1 - q0 ) >= beta )
             continue;

         pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
         pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
     }
 }
 /* deblock_chroma_intra_c with `stride` as the step across the edge and 1 along it. */
 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 {
     const int across = stride;
     const int along = 1;
     deblock_chroma_intra_c( pix, across, along, alpha, beta );
 }
 /* deblock_chroma_intra_c with 1 as the step across the edge and `stride` along it. */
 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
 {
     const int across = 1;
     const int along = stride;
     deblock_chroma_intra_c( pix, across, along, alpha, beta );
 }
 557
 558 static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4], int i_qp, int b_chroma,
 559                                  x264_deblock_inter_t pf_inter, x264_deblock_intra_t pf_intra )
 560 {
 561     int i;
 562     const int index_a = x264_clip3( i_qp + h->sh.i_alpha_c0_offset, 0, 51 );
 563     const int alpha = i_alpha_table[index_a];
 564     const int beta  = i_beta_table[x264_clip3( i_qp + h->sh.i_beta_offset, 0, 51 )];
 565
 566     if( bS[0] < 4 ) {
 567         int8_t tc[4];
 568         for(i=0; i<4; i++)
 569             tc[i] = (bS[i] ? i_tc0_table[index_a][bS[i] - 1] : -1) + b_chroma;
 570         pf_inter( pix, i_stride, alpha, beta, tc );
 571     } else {
 572         pf_intra( pix, i_stride, alpha, beta );
 573     }
 574 }
 575
 576 void x264_frame_deblock_row( x264_t *h, int mb_y )
 577 {
     /* Deblock the macroblocks of row mb_y in h->fdec. When h->sh.b_mbaff is
      * set, the loop below alternates between rows mb_y and mb_y+1 for each
      * mb_x, so one call covers a pair of rows.
      *
      * For each macroblock, vertical edges (i_dir 0) are filtered before
      * horizontal edges (i_dir 1). For each edge a strength bS is computed
      * per 4 pixels, then the luma plane and, on even edges, both chroma
      * planes are filtered through deblock_edge(). */
 578     const int s8x8 = 2 * h->mb.i_mb_stride;
 579     const int s4x4 = 4 * h->mb.i_mb_stride;
 580     const int b_interlaced = h->sh.b_mbaff;
 581     const int mvy_limit = 4 >> b_interlaced;
 582     int mb_x;
 583
     /* plane strides, doubled when b_interlaced is set */
 584     int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
 585                          h->fdec->i_stride[1] << b_interlaced,
 586                          h->fdec->i_stride[2] << b_interlaced };
 587
     /* cavlc + 8x8 transform: rewrite the nnz rows for deblocking; the saved
      * values in h->mb.nnz_backup are put back at the end of this function */
 588     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 589         munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
 590
     /* mb_x is advanced at the bottom of the loop body, not here */
 591     for( mb_x = 0; mb_x < h->sps->i_mb_width; )
 592     {
 593         const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
 594         const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
 595         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
 596         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
         /* P_SKIP macroblocks only have edge 0 considered */
 597         const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
 598         int i_edge, i_dir;
 599
         /* offset of this macroblock's top-left pixel in each plane */
 600         int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x,
 601                             8*mb_y*h->fdec->i_stride[1] +  8*mb_x,
 602                             8*mb_y*h->fdec->i_stride[2] +  8*mb_x };
         /* odd row with b_interlaced: step back 15 (luma) / 7 (chroma) lines,
          * i.e. to one line below the start of row mb_y-1; the doubled
          * strides in i_stride2 then step over every second line */
 603         if( b_interlaced && (mb_y&1) )
 604         {
 605             i_pix_y[0] -= 15*h->fdec->i_stride[0];
 606             i_pix_y[1] -=  7*h->fdec->i_stride[1];
 607             i_pix_y[2] -=  7*h->fdec->i_stride[2];
 608         }
 609
 610         x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
 611
 612         /* i_dir == 0 -> vertical edge
 613          * i_dir == 1 -> horizontal edge */
 614         for( i_dir = 0; i_dir < 2; i_dir++ )
 615         {
             /* start at edge 1 (skipping the macroblock boundary) in the
              * first column for vertical edges, and for horizontal edges
              * when mb_y <= b_interlaced */
 616             int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));
 617             int i_qp, i_qpn;
 618
 619             for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )
 620             {
 621                 int mbn_xy, mbn_8x8, mbn_4x4;
 622                 int bS[4];  /* filtering strength */
 623
                 /* with the 8x8 transform flag set, odd edges are not filtered */
 624                 if( b_8x8_transform && (i_edge&1) )
 625                     continue;
 626
                 /* neighbour indices: the macroblock itself for inner edges,
                  * otherwise the macroblock to the left (i_dir 0) or above (i_dir 1) */
 627                 mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );
 628                 mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );
 629                 mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );
 630
                 /* with b_interlaced, the neighbour for the top edge is one more row up */
 631                 if( b_interlaced && i_edge == 0 && i_dir == 1 )
 632                 {
 633                     mbn_xy -= h->mb.i_mb_stride;
 634                     mbn_8x8 -= 2 * s8x8;
 635                     mbn_4x4 -= 4 * s4x4;
 636                 }
 637
 638                 /* *** Get bS for each 4px for the current edge *** */
 639                 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )
 640                 {
                     /* 4 on a macroblock boundary (except the top edge when
                      * b_interlaced is set), 3 otherwise */
 641                     bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );
 642                 }
 643                 else
 644                 {
 645                     int i;
 646                     for( i = 0; i < 4; i++ )
 647                     {
                         /* (x,y): 4x4 block on this side of the edge,
                          * (xn,yn): the block on the other side, wrapped to 0..3 */
 648                         int x  = i_dir == 0 ? i_edge : i;
 649                         int y  = i_dir == 0 ? i      : i_edge;
 650                         int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;
 651                         int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;
 652
                         /* a non-zero nnz entry on either side gives bS 2 */
 653                         if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||
 654                             h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )
 655                         {
 656                             bS[i] = 2;
 657                         }
 658                         else
 659                         {
 660                             /* FIXME: A given frame may occupy more than one position in
 661                              * the reference list. So we should compare the frame numbers,
 662                              * not the indices in the ref list.
 663                              * No harm yet, as we don't generate that case.*/
 664
 665                             int i8p= mb_8x8+(x/2)+(y/2)*s8x8;
 666                             int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;
 667                             int i4p= mb_4x4+x+y*s4x4;
 668                             int i4q= mbn_4x4+xn+yn*s4x4;
 669                             int l;
 670
 671                             bS[i] = 0;
 672
                             /* bS 1 when, in any list checked (two lists for B
                              * slices, else one), the refs differ or an mv
                              * component differs by at least 4 (x) / mvy_limit (y) */
 673                             for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )
 674                             {
 675                                 if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||
 676                                     abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||
 677                                     abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )
 678                                 {
 679                                     bS[i] = 1;
 680                                     break;
 681                                 }
 682                             }
 683                         }
 684                     }
 685                 }
 686
 687                 /* *** filter *** */
 688                 /* Y plane */
 689                 i_qp = h->mb.qp[mb_xy];
 690                 i_qpn= h->mb.qp[mbn_xy];
 691
 692                 if( i_dir == 0 )
 693                 {
 694                     /* vertical edge */
 695                     deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],
 696                                   i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,
 697                                   h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );
 698                     if( !(i_edge & 1) )
 699                     {
 700                         /* U/V planes */
                         /* rounded average of the two macroblocks' chroma qp */
 701                         int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
 702                                       i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
 703                         deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],
 704                                       i_stride2[1], bS, i_qpc, 1,
 705                                       h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
 706                         deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],
 707                                       i_stride2[2], bS, i_qpc, 1,
 708                                       h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
 709                     }
 710                 }
 711                 else
 712                 {
 713                     /* horizontal edge */
 714                     deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],
 715                                   i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,
 716                                   h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );
 717                     /* U/V planes */
 718                     if( !(i_edge & 1) )
 719                     {
 720                         int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
 721                                       i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
 722                         deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],
 723                                       i_stride2[1], bS, i_qpc, 1,
 724                                       h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
 725                         deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],
 726                                       i_stride2[2], bS, i_qpc, 1,
 727                                       h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
 728                     }
 729                 }
 730             }
 731         }
 732
 733         /* next mb */
         /* with b_interlaced, mb_y toggles between the two rows of the pair
          * and mb_x only advances after the odd row; mb_y is therefore back
          * at its entry value when the loop ends */
 734         if( !b_interlaced || (mb_y&1) )
 735             mb_x++;
 736         mb_y ^= b_interlaced;
 737     }
 738
     /* put the original cavlc nnz values back */
 739     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
 740         munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
 741 }
 742
 743 void x264_frame_deblock( x264_t *h )
 744 {
 745     int mb_y;
 746     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
 747         x264_frame_deblock_row( h, mb_y );
 748 }
 749
 750 #ifdef HAVE_MMX
 751 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 752 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 753 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 754 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 755
 756 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 757 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 758 #ifdef ARCH_X86
 759 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 760 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 761
 762 void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 763 {
 764     x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
 765     x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
 766 }
 767 #endif
 768 #endif
 769
#ifdef ARCH_PPC
/* AltiVec luma deblocking routines.  Only declared here; x264_deblock_init()
 * installs them when the cpu flags include X264_CPU_ALTIVEC. */
void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC
 774
 775 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 776 {
 777     pf->deblock_v_luma = deblock_v_luma_c;
 778     pf->deblock_h_luma = deblock_h_luma_c;
 779     pf->deblock_v_chroma = deblock_v_chroma_c;
 780     pf->deblock_h_chroma = deblock_h_chroma_c;
 781     pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
 782     pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
 783     pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
 784     pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
 785
 786 #ifdef HAVE_MMX
 787     if( cpu&X264_CPU_MMXEXT )
 788     {
 789         pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
 790         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
 791         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
 792         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
 793 #ifdef ARCH_X86
 794         pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
 795         pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
 796 #endif
 797         if( cpu&X264_CPU_SSE2 )
 798         {
 799             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
 800             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
 801         }
 802     }
 803 #endif
 804
 805 #ifdef ARCH_PPC
 806     if( cpu&X264_CPU_ALTIVEC )
 807     {
 808         pf->deblock_v_luma = x264_deblock_v_luma_altivec;
 809         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
 810    }
 811 #endif // ARCH_PPC
 812 }
 813
 814
 815 /* threading */
 816
 817 #ifdef HAVE_PTHREAD
 818 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 819 {
 820     x264_pthread_mutex_lock( &frame->mutex );
 821     frame->i_lines_completed = i_lines_completed;
 822     x264_pthread_cond_broadcast( &frame->cv );
 823     x264_pthread_mutex_unlock( &frame->mutex );
 824 }
 825
 826 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 827 {
 828     x264_pthread_mutex_lock( &frame->mutex );
 829     while( frame->i_lines_completed < i_lines_completed )
 830         x264_pthread_cond_wait( &frame->cv, &frame->mutex );
 831     x264_pthread_mutex_unlock( &frame->mutex );
 832 }
 833
 834 #else
 835 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 836 {}
 837 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 838 {}
 839 #endif
 840
 841
 842 /* list operators */
 843
 844 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 845 {
 846     int i = 0;
 847     while( list[i] ) i++;
 848     list[i] = frame;
 849 }
 850
 851 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 852 {
 853     x264_frame_t *frame;
 854     int i = 0;
 855     assert( list[0] );
 856     while( list[i+1] ) i++;
 857     frame = list[i];
 858     list[i] = NULL;
 859     return frame;
 860 }
 861
 862 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 863 {
 864     int i = 0;
 865     while( list[i] ) i++;
 866     while( i-- )
 867         list[i+1] = list[i];
 868     list[0] = frame;
 869 }
 870
 871 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 872 {
 873     x264_frame_t *frame = list[0];
 874     int i;
 875     for( i = 0; list[i]; i++ )
 876         list[i] = list[i+1];
 877     assert(frame);
 878     return frame;
 879 }
 880
 881 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 882 {
 883     assert( frame->i_reference_count > 0 );
 884     frame->i_reference_count--;
 885     if( frame->i_reference_count == 0 )
 886         x264_frame_push( h->frames.unused, frame );
 887     assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
 888 }
 889
 890 x264_frame_t *x264_frame_pop_unused( x264_t *h )
 891 {
 892     x264_frame_t *frame;
 893     if( h->frames.unused[0] )
 894         frame = x264_frame_pop( h->frames.unused );
 895     else
 896         frame = x264_frame_new( h );
 897     assert( frame->i_reference_count == 0 );
 898     frame->i_reference_count = 1;
 899     return frame;
 900 }
 901
 902 void x264_frame_sort( x264_frame_t **list, int b_dts )
 903 {
 904     int i, b_ok;
 905     do {
 906         b_ok = 1;
 907         for( i = 0; list[i+1]; i++ )
 908         {
 909             int dtype = list[i]->i_type - list[i+1]->i_type;
 910             int dtime = list[i]->i_frame - list[i+1]->i_frame;
 911             int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
 912                              : dtime > 0;
 913             if( swap )
 914             {
 915                 XCHG( x264_frame_t*, list[i], list[i+1] );
 916                 b_ok = 0;
 917             }
 918         }
 919     } while( !b_ok );
 920 }