git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25
/* Round x up to the next multiple of a.  The mask trick only rounds correctly
 * when a is a power of two. */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
  27
  28 x264_frame_t *x264_frame_new( x264_t *h )
  29 {
  30     x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
  31     int i, j;
  32
  33     int i_mb_count = h->mb.i_mb_count;
  34     int i_stride, i_width, i_lines;
  35     int i_padv = PADV << h->param.b_interlaced;
  36     int luma_plane_size;
  37     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
  38
  39     if( !frame ) return NULL;
  40
  41     memset( frame, 0, sizeof(x264_frame_t) );
  42
  43     /* allocate frame data (+64 for extra data for me) */
  44     i_width  = ALIGN( h->param.i_width, 16 );
  45     i_stride = ALIGN( i_width + 2*PADH, align );
  46     i_lines  = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
  47
  48     frame->i_plane = 3;
  49     for( i = 0; i < 3; i++ )
  50     {
  51         frame->i_stride[i] = i_stride >> !!i;
  52         frame->i_width[i] = i_width >> !!i;
  53         frame->i_lines[i] = i_lines >> !!i;
  54     }
  55
  56     luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
  57     for( i = 1; i < 3; i++ )
  58     {
  59         CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
  60         frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
  61     }
  62     /* all 4 luma planes allocated together, since the cacheline split code
  63      * requires them to be in-phase wrt cacheline alignment. */
  64     CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
  65     for( i = 0; i < 4; i++ )
  66         frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
  67     frame->plane[0] = frame->filtered[0];
  68
  69     if( h->frames.b_have_lowres )
  70     {
  71         frame->i_width_lowres = frame->i_width[0]/2;
  72         frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
  73         frame->i_lines_lowres = frame->i_lines[0]/2;
  74
  75         luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
  76
  77         CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
  78         for( i = 0; i < 4; i++ )
  79             frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
  80     }
  81
  82     if( h->param.analyse.i_me_method >= X264_ME_ESA )
  83     {
  84         CHECKED_MALLOC( frame->buffer[3],
  85                         2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
  86         frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
  87     }
  88
  89     frame->i_poc = -1;
  90     frame->i_type = X264_TYPE_AUTO;
  91     frame->i_qpplus1 = 0;
  92     frame->i_pts = -1;
  93     frame->i_frame = -1;
  94     frame->i_frame_num = -1;
  95     frame->i_lines_completed = -1;
  96
  97     CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
  98     CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
  99     CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 100     if( h->param.i_bframe )
 101     {
 102         CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 103         CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 104     }
 105     else
 106     {
 107         frame->mv[1]  = NULL;
 108         frame->ref[1] = NULL;
 109     }
 110
 111     CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 112     CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
 113     for( i = 0; i < h->param.i_bframe + 2; i++ )
 114         for( j = 0; j < h->param.i_bframe + 2; j++ )
 115             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 116
 117     x264_pthread_mutex_init( &frame->mutex, NULL );
 118     x264_pthread_cond_init( &frame->cv, NULL );
 119
 120     return frame;
 121
 122 fail:
 123     x264_frame_delete( frame );
 124     return NULL;
 125 }
 126
 127 void x264_frame_delete( x264_frame_t *frame )
 128 {
 129     int i, j;
 130     for( i = 0; i < 4; i++ )
 131         x264_free( frame->buffer[i] );
 132     for( i = 0; i < 4; i++ )
 133         x264_free( frame->buffer_lowres[i] );
 134     for( i = 0; i < X264_BFRAME_MAX+2; i++ )
 135         for( j = 0; j < X264_BFRAME_MAX+2; j++ )
 136             x264_free( frame->i_row_satds[i][j] );
 137     x264_free( frame->i_row_bits );
 138     x264_free( frame->i_row_qp );
 139     x264_free( frame->mb_type );
 140     x264_free( frame->mv[0] );
 141     x264_free( frame->mv[1] );
 142     x264_free( frame->ref[0] );
 143     x264_free( frame->ref[1] );
 144     x264_pthread_mutex_destroy( &frame->mutex );
 145     x264_pthread_cond_destroy( &frame->cv );
 146     x264_free( frame );
 147 }
 148
 149 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 150 {
 151     int i_csp = src->img.i_csp & X264_CSP_MASK;
 152     int i;
 153     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
 154     {
 155         x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
 156         return -1;
 157     }
 158
 159     dst->i_type     = src->i_type;
 160     dst->i_qpplus1  = src->i_qpplus1;
 161     dst->i_pts      = src->i_pts;
 162
 163     for( i=0; i<3; i++ )
 164     {
 165         int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
 166         uint8_t *plane = src->img.plane[s];
 167         int stride = src->img.i_stride[s];
 168         int width = h->param.i_width >> !!i;
 169         int height = h->param.i_height >> !!i;
 170         if( src->img.i_csp & X264_CSP_VFLIP )
 171         {
 172             plane += (height-1)*stride;
 173             stride = -stride;
 174         }
 175         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
 176     }
 177     return 0;
 178 }
 179
 180
 181
 182 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
 183 {
 184 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
 185     int y;
 186     for( y = 0; y < i_height; y++ )
 187     {
 188         /* left band */
 189         memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
 190         /* right band */
 191         memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
 192     }
 193     /* upper band */
 194     if( b_pad_top )
 195     for( y = 0; y < i_padv; y++ )
 196         memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
 197     /* lower band */
 198     if( b_pad_bottom )
 199     for( y = 0; y < i_padv; y++ )
 200         memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
 201 #undef PPIXEL
 202 }
 203
 204 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 205 {
 206     int i;
 207     int b_start = !mb_y;
 208     if( mb_y & h->sh.b_mbaff )
 209         return;
 210     for( i = 0; i < frame->i_plane; i++ )
 211     {
 212         int stride = frame->i_stride[i];
 213         int width = 16*h->sps->i_mb_width >> !!i;
 214         int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
 215         int padh = PADH >> !!i;
 216         int padv = PADV >> !!i;
 217         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 218         uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
 219         if( b_end && !b_start )
 220             height += 4 >> (!!i + h->sh.b_mbaff);
 221         if( h->sh.b_mbaff )
 222         {
 223             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 224             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 225         }
 226         else
 227         {
 228             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 229         }
 230     }
 231 }
 232
 233 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 234 {
 235     /* during filtering, 8 extra pixels were filtered on each edge,
 236      * but up to 3 of the horizontal ones may be wrong.
 237        we want to expand border from the last filtered pixel */
 238     int b_start = !mb_y;
 239     int stride = frame->i_stride[0];
 240     int width = 16*h->sps->i_mb_width + 8;
 241     int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
 242     int padh = PADH - 4;
 243     int padv = PADV - 8;
 244     int i;
 245     for( i = 1; i < 4; i++ )
 246     {
 247         // buffer: 8 luma, to match the hpel filter
 248         uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
 249         if( h->sh.b_mbaff )
 250         {
 251             plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
 252             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
 253         }
 254         else
 255         {
 256             plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
 257         }
 258     }
 259 }
 260
 261 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 262 {
 263     int i;
 264     for( i = 0; i < 4; i++ )
 265         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 266 }
 267
 268 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 269 {
 270     int i, y;
 271     for( i = 0; i < frame->i_plane; i++ )
 272     {
 273         int i_subsample = i ? 1 : 0;
 274         int i_width = h->param.i_width >> i_subsample;
 275         int i_height = h->param.i_height >> i_subsample;
 276         int i_padx = ( h->sps->i_mb_width * 16 - h->param.i_width ) >> i_subsample;
 277         int i_pady = ( h->sps->i_mb_height * 16 - h->param.i_height ) >> i_subsample;
 278
 279         if( i_padx )
 280         {
 281             for( y = 0; y < i_height; y++ )
 282                 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 283                          frame->plane[i][y*frame->i_stride[i] + i_width - 1],
 284                          i_padx );
 285         }
 286         if( i_pady )
 287         {
 288             //FIXME interlace? or just let it pad using the wrong field
 289             for( y = i_height; y < i_height + i_pady; y++ )
 290                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 291                         &frame->plane[i][(i_height-1)*frame->i_stride[i]],
 292                         i_width + i_padx );
 293         }
 294     }
 295 }
 296
 297
 298 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 299  * entropy coding, but per 64 coeffs for the purpose of deblocking */
 300 void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 301 {
 302     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 303     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
 304     int x, nnz;
 305     for( x=0; x<h->sps->i_mb_width; x++ )
 306     {
 307         memcpy( buf+x, src+x, 16 );
 308         if( transform[x] )
 309         {
 310             nnz = src[x][0] | src[x][1];
 311             src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 312             nnz = src[x][2] | src[x][3];
 313             src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
 314         }
 315     }
 316 }
 317
 318 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 319 {
 320     uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
 321     int x;
 322     for( x=0; x<h->sps->i_mb_width; x++ )
 323         memcpy( dst+x, buf+x, 16 );
 324 }
 325
 326 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
 327 {
 328     func( h, mb_y, buf );
 329     if( mb_y > 0 )
 330         func( h, mb_y-1, buf + h->sps->i_mb_width );
 331     if( h->sh.b_mbaff )
 332     {
 333         func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
 334         if( mb_y > 0 )
 335             func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
 336     }
 337 }
 338
 339
 340 /* Deblocking filter */
/* Threshold tables for the deblocking filter.
 *
 * Each table holds 52 entries framed by 12 guard entries on either side; the
 * accessor macros at the bottom add 12 to the index, so any index in
 * [-12, 63] can be looked up without clamping (deblock_edge() indexes with a
 * QP plus a slice-level offset).  The low guard entries repeat the first
 * value, the high guard entries repeat the last one. */

/* alpha threshold */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
/* beta threshold */
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping value; the second index is the boundary strength bS.
 * Column 0 is -1 so that a segment with bS == 0 is skipped by the filters:
 * deblock_luma_c() skips tc0 < 0, and deblock_chroma_c() skips tc <= 0 after
 * deblock_edge() has added 1 for chroma. */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
/* biased accessors: x may be anywhere in [-12, 63] */
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
 382
 383 /* From ffmpeg */
 384 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 385 {
 386     int i, d;
 387     for( i = 0; i < 4; i++ )
 388     {
 389         if( tc0[i] < 0 )
 390         {
 391             pix += 4*ystride;
 392             continue;
 393         }
 394         for( d = 0; d < 4; d++ )
 395         {
 396             const int p2 = pix[-3*xstride];
 397             const int p1 = pix[-2*xstride];
 398             const int p0 = pix[-1*xstride];
 399             const int q0 = pix[ 0*xstride];
 400             const int q1 = pix[ 1*xstride];
 401             const int q2 = pix[ 2*xstride];
 402
 403             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 404             {
 405                 int tc = tc0[i];
 406                 int delta;
 407                 if( abs( p2 - p0 ) < beta )
 408                 {
 409                     pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
 410                     tc++;
 411                 }
 412                 if( abs( q2 - q0 ) < beta )
 413                 {
 414                     pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
 415                     tc++;
 416                 }
 417
 418                 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 419                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 420                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 421             }
 422             pix += ystride;
 423         }
 424     }
 425 }
/* Luma filter with the p/q taps `stride` bytes apart (vertical neighbours) and
 * one byte between successive samples, i.e. it filters across a horizontal
 * edge.  pix is the first sample of the row just below the edge. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Luma filter with the p/q taps one byte apart (horizontal neighbours) and
 * `stride` bytes between successive samples, i.e. it filters across a vertical
 * edge.  pix is the first sample of the column just right of the edge. */
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
 434
 435 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
 436 {
 437     int i, d;
 438     for( i = 0; i < 4; i++ )
 439     {
 440         const int tc = tc0[i];
 441         if( tc <= 0 )
 442         {
 443             pix += 2*ystride;
 444             continue;
 445         }
 446         for( d = 0; d < 2; d++ )
 447         {
 448             const int p1 = pix[-2*xstride];
 449             const int p0 = pix[-1*xstride];
 450             const int q0 = pix[ 0*xstride];
 451             const int q1 = pix[ 1*xstride];
 452
 453             if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 454             {
 455                 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 456                 pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
 457                 pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
 458             }
 459             pix += ystride;
 460         }
 461     }
 462 }
/* Chroma filter across a horizontal edge: taps are `stride` bytes apart,
 * successive samples one byte apart. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Chroma filter across a vertical edge: taps are one byte apart, successive
 * samples `stride` bytes apart. */
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
 471
 472 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 473 {
 474     int d;
 475     for( d = 0; d < 16; d++ )
 476     {
 477         const int p2 = pix[-3*xstride];
 478         const int p1 = pix[-2*xstride];
 479         const int p0 = pix[-1*xstride];
 480         const int q0 = pix[ 0*xstride];
 481         const int q1 = pix[ 1*xstride];
 482         const int q2 = pix[ 2*xstride];
 483
 484         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 485         {
 486             if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
 487             {
 488                 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
 489                 {
 490                     const int p3 = pix[-4*xstride];
 491                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
 492                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
 493                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
 494                 }
 495                 else /* p0' */
 496                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 497                 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
 498                 {
 499                     const int q3 = pix[3*xstride];
 500                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
 501                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
 502                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
 503                 }
 504                 else /* q0' */
 505                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 506             }
 507             else /* p0', q0' */
 508             {
 509                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
 510                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 511             }
 512         }
 513         pix += ystride;
 514     }
 515 }
/* Intra luma filter across a horizontal edge: taps are `stride` bytes apart,
 * successive samples one byte apart. */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
/* Intra luma filter across a vertical edge: taps are one byte apart,
 * successive samples `stride` bytes apart. */
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
 524
 525 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
 526 {
 527     int d;
 528     for( d = 0; d < 8; d++ )
 529     {
 530         const int p1 = pix[-2*xstride];
 531         const int p0 = pix[-1*xstride];
 532         const int q0 = pix[ 0*xstride];
 533         const int q1 = pix[ 1*xstride];
 534
 535         if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
 536         {
 537             pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
 538             pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
 539         }
 540         pix += ystride;
 541     }
 542 }
/* Intra chroma filter across a horizontal edge: taps are `stride` bytes apart,
 * successive samples one byte apart. */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
/* Intra chroma filter across a vertical edge: taps are one byte apart,
 * successive samples `stride` bytes apart. */
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
 551
 552 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 553 {
 554     const int index_a = i_qp + h->sh.i_alpha_c0_offset;
 555     const int alpha = alpha_table(index_a);
 556     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 557     int8_t tc[4];
 558
 559     if( !alpha || !beta )
 560         return;
 561
 562     tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
 563     tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
 564     tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
 565     tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
 566
 567     pf_inter( pix1, i_stride, alpha, beta, tc );
 568     if( b_chroma )
 569         pf_inter( pix2, i_stride, alpha, beta, tc );
 570 }
 571
 572 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
 573 {
 574     const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
 575     const int beta  = beta_table(i_qp + h->sh.i_beta_offset);
 576
 577     if( !alpha || !beta )
 578         return;
 579
 580     pf_intra( pix1, i_stride, alpha, beta );
 581     if( b_chroma )
 582         pf_intra( pix2, i_stride, alpha, beta );
 583 }
 584
/* Deblock every macroblock of row mb_y in the reconstructed frame h->fdec.
 * With MBAFF (h->sh.b_mbaff) the call covers the row pair mb_y / mb_y+1.
 *
 * For each macroblock the vertical edges are filtered first (DEBLOCK_DIR(0)),
 * then the horizontal ones (DEBLOCK_DIR(1)).  Within a direction the edge
 * shared with the neighbouring macroblock comes first, followed by the
 * interior edges. */
void x264_frame_deblock_row( x264_t *h, int mb_y )
{
    const int s8x8 = 2 * h->mb.i_mb_stride;
    const int s4x4 = 4 * h->mb.i_mb_stride;
    const int b_interlaced = h->sh.b_mbaff;
    const int mvy_limit = 4 >> b_interlaced;
    /* alpha_table() and beta_table() are zero for every index <= 15, so below
     * this QP the luma interior edges cannot be filtered.
     * NOTE(review): the chroma_qp_offset term presumably extends the same
     * guarantee to chroma - confirm. */
    const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
    int mb_x;
    int stridey   = h->fdec->i_stride[0];
    int stride2y  = stridey << b_interlaced;
    int strideuv  = h->fdec->i_stride[1];
    int stride2uv = strideuv << b_interlaced;

    /* CAVLC with 8x8 transform: temporarily rewrite the nnz counts into the
     * form the strength computation below expects; undone at the end. */
    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
        munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );

    /* Progressive: mb_x advances every iteration and mb_y never changes.
     * MBAFF: mb_y toggles between the two rows of the pair and mb_x advances
     * only after the odd (second) row, so mb_y is back at its initial value
     * when the loop ends. */
    for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
    {
        const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
        const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
        const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
        const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
        const int i_qp = h->mb.qp[mb_xy];
        /* skipped macroblocks only get their outer edge filtered */
        int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
        uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
        uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
        uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x;
        /* second macroblock of an MBAFF pair: start on the second line of the
         * pair; together with the doubled strides this addresses every other
         * line of the pair */
        if( b_interlaced && (mb_y&1) )
        {
            pixy -= 15*stridey;
            pixu -=  7*strideuv;
            pixv -=  7*strideuv;
        }

        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );

        /* see qp_thresh above: nothing to do on the interior edges */
        if( i_qp <= qp_thresh )
            i_edge_end = 1;

        /* Filter edge number i_edge of the current macroblock in direction
         * i_dir.  `intra` is either empty or _intra and is token-pasted onto
         * deblock_edge and onto the h->loopf function pointer names.  Chroma
         * is filtered on even luma edges only. */
        #define FILTER_DIR(intra, i_dir)\
        {\
            /* Y plane */\
            i_qpn= h->mb.qp[mbn_xy];\
            if( i_dir == 0 )\
            {\
                /* vertical edge */\
                deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
                              stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
                              h->loopf.deblock_h_luma##intra );\
                if( !(i_edge & 1) )\
                {\
                    /* U/V planes */\
                    int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
                    deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
                                  stride2uv, bS, i_qpc, 1,\
                                  h->loopf.deblock_h_chroma##intra );\
                }\
            }\
            else\
            {\
                /* horizontal edge */\
                deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
                              stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
                              h->loopf.deblock_v_luma##intra );\
                /* U/V planes */\
                if( !(i_edge & 1) )\
                {\
                    int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
                    deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
                                  stride2uv, bS, i_qpc, 1,\
                                  h->loopf.deblock_v_chroma##intra );\
                }\
            }\
        }

        /* Compute the four boundary strengths of the current edge into bS:
         * 3 if either macroblock is intra; otherwise, per 4-sample segment,
         * 2 if either adjoining block has non-zero coefficients, 1 if the
         * references differ or a motion vector component differs by at least
         * 4 (mvy_limit vertically), and 0 otherwise. */
        #define DEBLOCK_STRENGTH(i_dir)\
        {\
            /* *** Get bS for each 4px for the current edge *** */\
            if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
                *(uint32_t*)bS = 0x03030303;\
            else\
            {\
                *(uint32_t*)bS = 0x00000000;\
                for( i = 0; i < 4; i++ )\
                {\
                    int x  = i_dir == 0 ? i_edge : i;\
                    int y  = i_dir == 0 ? i      : i_edge;\
                    int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
                    int yn = i_dir == 0 ? y : (y - 1)&0x03;\
                    if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
                        h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
                        bS[i] = 2;\
                    else\
                    {\
                        /* FIXME: A given frame may occupy more than one position in\
                         * the reference list. So we should compare the frame numbers,\
                         * not the indices in the ref list.\
                         * No harm yet, as we don't generate that case.*/\
                        int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
                        int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
                        int i4p= mb_4x4+x+y*s4x4;\
                        int i4q= mbn_4x4+xn+yn*s4x4;\
                        for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
                            if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
                                abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
                                abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
                            {\
                                bS[i] = 1;\
                                break;\
                            }\
                    }\
                }\
            }\
        }

        /* Filter all edges of the current macroblock in one direction.
         * Edge 0 (shared with the left/top neighbour) is skipped at the frame
         * border; otherwise it is filtered against that neighbour, using the
         * intra filter when either side is intra (except for MBAFF horizontal
         * edges, which always go through DEBLOCK_STRENGTH).  The interior
         * edges follow, stepping by 2 when the 8x8 transform is in use. */
        /* i_dir == 0 -> vertical edge
         * i_dir == 1 -> horizontal edge */
        #define DEBLOCK_DIR(i_dir)\
        {\
            int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
            int i_qpn, i, l, mbn_xy, mbn_8x8, mbn_4x4;\
            DECLARE_ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
            if( i_edge )\
                i_edge+= b_8x8_transform;\
            else\
            {\
                mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
                mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
                mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
                if( b_interlaced && i_dir == 1 )\
                {\
                    mbn_xy -= h->mb.i_mb_stride;\
                    mbn_8x8 -= 2 * s8x8;\
                    mbn_4x4 -= 4 * s4x4;\
                }\
                else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
                {\
                    FILTER_DIR( _intra, i_dir );\
                    goto end##i_dir;\
                }\
                DEBLOCK_STRENGTH(i_dir);\
                if( *(uint32_t*)bS )\
                    FILTER_DIR( , i_dir);\
                end##i_dir:\
                i_edge += b_8x8_transform+1;\
            }\
            mbn_xy  = mb_xy;\
            mbn_8x8 = mb_8x8;\
            mbn_4x4 = mb_4x4;\
            for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
            {\
                DEBLOCK_STRENGTH(i_dir);\
                if( *(uint32_t*)bS )\
                    FILTER_DIR( , i_dir);\
            }\
        }

        DEBLOCK_DIR(0);
        DEBLOCK_DIR(1);
    }

    /* put the original nnz counts back */
    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
        munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
}
 749
 750 void x264_frame_deblock( x264_t *h )
 751 {
 752     int mb_y;
 753     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
 754         x264_frame_deblock_row( h, mb_y );
 755 }
 756
 757 #ifdef HAVE_MMX
/* MMXEXT chroma deblocking routines.  Only the prototypes are given here;
 * the bodies are defined elsewhere (presumably in assembly -- confirm).
 * The non-intra variants take a tc0 pointer, the intra variants do not.
 * x264_deblock_init() installs these when cpu has X264_CPU_MMXEXT. */
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );

/* SSE2 luma deblocking routines (prototypes only), with the same parameter
 * layout as the chroma routines above.  x264_deblock_init() installs these
 * when cpu has X264_CPU_SSE2 and does not have X264_CPU_STACK_MOD4. */
void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
 767 #ifdef ARCH_X86
/* MMXEXT luma deblocking routines, declared only when ARCH_X86 is defined
 * (prototypes only; bodies are defined elsewhere).  The "v8" variants are
 * not installed directly: the C wrappers that follow call each of them
 * twice, at pix and at pix+8. */
void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
 772
 773 void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 774 {
 775     x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
 776     x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
 777 }
 778 void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 779 {
 780     x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
 781     x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
 782 }
 783 #endif
 784 #endif
 785
 786 #ifdef ARCH_PPC
/* AltiVec luma deblocking routines (prototypes only; bodies are defined
 * elsewhere).  x264_deblock_init() installs these when cpu has
 * X264_CPU_ALTIVEC. */
void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 789 #endif // ARCH_PPC
 790
 791 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 792 {
 793     pf->deblock_v_luma = deblock_v_luma_c;
 794     pf->deblock_h_luma = deblock_h_luma_c;
 795     pf->deblock_v_chroma = deblock_v_chroma_c;
 796     pf->deblock_h_chroma = deblock_h_chroma_c;
 797     pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
 798     pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
 799     pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
 800     pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
 801
 802 #ifdef HAVE_MMX
 803     if( cpu&X264_CPU_MMXEXT )
 804     {
 805         pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
 806         pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
 807         pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
 808         pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
 809 #ifdef ARCH_X86
 810         pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
 811         pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
 812         pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
 813         pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
 814 #endif
 815         if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
 816         {
 817             pf->deblock_v_luma = x264_deblock_v_luma_sse2;
 818             pf->deblock_h_luma = x264_deblock_h_luma_sse2;
 819             pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
 820             pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
 821         }
 822     }
 823 #endif
 824
 825 #ifdef ARCH_PPC
 826     if( cpu&X264_CPU_ALTIVEC )
 827     {
 828         pf->deblock_v_luma = x264_deblock_v_luma_altivec;
 829         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
 830    }
 831 #endif // ARCH_PPC
 832 }
 833
 834
 835 /* threading */
/* Publish progress on a frame: store i_lines_completed into
 * frame->i_lines_completed and wake every thread waiting on frame->cv.
 * The store and the broadcast both happen while frame->mutex is held, so
 * a waiter in x264_frame_cond_wait() cannot test the old value and then
 * miss the wakeup. */
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
{
    x264_pthread_mutex_lock( &frame->mutex );
    frame->i_lines_completed = i_lines_completed;
    x264_pthread_cond_broadcast( &frame->cv );
    x264_pthread_mutex_unlock( &frame->mutex );
}
 843
/* Block until frame->i_lines_completed is at least i_lines_completed.
 * Returns immediately (after taking and releasing frame->mutex) if that
 * is already the case. */
void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
{
    x264_pthread_mutex_lock( &frame->mutex );
    /* The predicate is re-tested after every wakeup, so a wakeup that does
     * not bring the counter up to the requested value just waits again. */
    while( frame->i_lines_completed < i_lines_completed )
        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
    x264_pthread_mutex_unlock( &frame->mutex );
}
 851
 852 /* list operators */
 853
 854 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 855 {
 856     int i = 0;
 857     while( list[i] ) i++;
 858     list[i] = frame;
 859 }
 860
 861 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 862 {
 863     x264_frame_t *frame;
 864     int i = 0;
 865     assert( list[0] );
 866     while( list[i+1] ) i++;
 867     frame = list[i];
 868     list[i] = NULL;
 869     return frame;
 870 }
 871
 872 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 873 {
 874     int i = 0;
 875     while( list[i] ) i++;
 876     while( i-- )
 877         list[i+1] = list[i];
 878     list[0] = frame;
 879 }
 880
 881 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 882 {
 883     x264_frame_t *frame = list[0];
 884     int i;
 885     for( i = 0; list[i]; i++ )
 886         list[i] = list[i+1];
 887     assert(frame);
 888     return frame;
 889 }
 890
 891 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 892 {
 893     assert( frame->i_reference_count > 0 );
 894     frame->i_reference_count--;
 895     if( frame->i_reference_count == 0 )
 896         x264_frame_push( h->frames.unused, frame );
 897     assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
 898 }
 899
 900 x264_frame_t *x264_frame_pop_unused( x264_t *h )
 901 {
 902     x264_frame_t *frame;
 903     if( h->frames.unused[0] )
 904         frame = x264_frame_pop( h->frames.unused );
 905     else
 906         frame = x264_frame_new( h );
 907     assert( frame->i_reference_count == 0 );
 908     frame->i_reference_count = 1;
 909     return frame;
 910 }
 911
 912 void x264_frame_sort( x264_frame_t **list, int b_dts )
 913 {
 914     int i, b_ok;
 915     do {
 916         b_ok = 1;
 917         for( i = 0; list[i+1]; i++ )
 918         {
 919             int dtype = list[i]->i_type - list[i+1]->i_type;
 920             int dtime = list[i]->i_frame - list[i+1]->i_frame;
 921             int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
 922                              : dtime > 0;
 923             if( swap )
 924             {
 925                 XCHG( x264_frame_t*, list[i], list[i+1] );
 926                 b_ok = 0;
 927             }
 928         }
 929     } while( !b_ok );
 930 }