git.sesse.net Git - x264/blob - common/frame.c

   1 /*****************************************************************************
   2  * frame.c: frame handling
   3  *****************************************************************************
   4  * Copyright (C) 2003-2016 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *
  24  * This program is also available under a commercial proprietary license.
  25  * For more information, contact us at licensing@x264.com.
  26  *****************************************************************************/
  27
  28 #include "common.h"
  29
  30 static int align_stride( int x, int align, int disalign )
  31 {
  32     x = ALIGN( x, align );
  33     if( !(x&(disalign-1)) )
  34         x += align;
  35     return x;
  36 }
  37
  38 static int align_plane_size( int x, int disalign )
  39 {
  40     if( !(x&(disalign-1)) )
  41         x += 128;
  42     return x;
  43 }
  44
  45 static int x264_frame_internal_csp( int external_csp )
  46 {
  47     switch( external_csp & X264_CSP_MASK )
  48     {
  49         case X264_CSP_NV12:
  50         case X264_CSP_NV21:
  51         case X264_CSP_I420:
  52         case X264_CSP_YV12:
  53             return X264_CSP_NV12;
  54         case X264_CSP_NV16:
  55         case X264_CSP_I422:
  56         case X264_CSP_YV16:
  57         case X264_CSP_V210:
  58             return X264_CSP_NV16;
  59         case X264_CSP_I444:
  60         case X264_CSP_YV24:
  61         case X264_CSP_BGR:
  62         case X264_CSP_BGRA:
  63         case X264_CSP_RGB:
  64             return X264_CSP_I444;
  65         default:
  66             return X264_CSP_NONE;
  67     }
  68 }
  69
  70 static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  71 {
  72     x264_frame_t *frame;
  73     int i_csp = x264_frame_internal_csp( h->param.i_csp );
  74     int i_mb_count = h->mb.i_mb_count;
  75     int i_stride, i_width, i_lines, luma_plane_count;
  76     int i_padv = PADV << PARAM_INTERLACED;
  77     int align = 16;
  78 #if ARCH_X86 || ARCH_X86_64
  79     if( h->param.cpu&X264_CPU_CACHELINE_64 )
  80         align = 64;
  81     else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
  82         align = 32;
  83 #endif
  84 #if ARCH_PPC
  85     int disalign = 1<<9;
  86 #else
  87     int disalign = 1<<10;
  88 #endif
  89
  90     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
  91     PREALLOC_INIT
  92
  93     /* allocate frame data (+64 for extra data for me) */
  94     i_width  = h->mb.i_mb_width*16;
  95     i_lines  = h->mb.i_mb_height*16;
  96     i_stride = align_stride( i_width + 2*PADH, align, disalign );
  97
  98     if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
  99     {
 100         luma_plane_count = 1;
 101         frame->i_plane = 2;
 102         for( int i = 0; i < 2; i++ )
 103         {
 104             frame->i_width[i] = i_width >> i;
 105             frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
 106             frame->i_stride[i] = i_stride;
 107         }
 108     }
 109     else if( i_csp == X264_CSP_I444 )
 110     {
 111         luma_plane_count = 3;
 112         frame->i_plane = 3;
 113         for( int i = 0; i < 3; i++ )
 114         {
 115             frame->i_width[i] = i_width;
 116             frame->i_lines[i] = i_lines;
 117             frame->i_stride[i] = i_stride;
 118         }
 119     }
 120     else
 121         goto fail;
 122
 123     frame->i_csp = i_csp;
 124     frame->i_width_lowres = frame->i_width[0]/2;
 125     frame->i_lines_lowres = frame->i_lines[0]/2;
 126     frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );
 127
 128     for( int i = 0; i < h->param.i_bframe + 2; i++ )
 129         for( int j = 0; j < h->param.i_bframe + 2; j++ )
 130             PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 131
 132     frame->i_poc = -1;
 133     frame->i_type = X264_TYPE_AUTO;
 134     frame->i_qpplus1 = X264_QP_AUTO;
 135     frame->i_pts = -1;
 136     frame->i_frame = -1;
 137     frame->i_frame_num = -1;
 138     frame->i_lines_completed = -1;
 139     frame->b_fdec = b_fdec;
 140     frame->i_pic_struct = PIC_STRUCT_AUTO;
 141     frame->i_field_cnt = -1;
 142     frame->i_duration =
 143     frame->i_cpb_duration =
 144     frame->i_dpb_output_delay =
 145     frame->i_cpb_delay = 0;
 146     frame->i_coded_fields_lookahead =
 147     frame->i_cpb_delay_lookahead = -1;
 148
 149     frame->orig = frame;
 150
 151     if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
 152     {
 153         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
 154         int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
 155         PREALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
 156         if( PARAM_INTERLACED )
 157             PREALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
 158     }
 159
 160     /* all 4 luma planes allocated together, since the cacheline split code
 161      * requires them to be in-phase wrt cacheline alignment. */
 162
 163     for( int p = 0; p < luma_plane_count; p++ )
 164     {
 165         int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
 166         if( h->param.analyse.i_subpel_refine && b_fdec )
 167         {
 168             /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
 169             PREALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
 170             if( PARAM_INTERLACED )
 171                 PREALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
 172         }
 173         else
 174         {
 175             PREALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
 176             if( PARAM_INTERLACED )
 177                 PREALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
 178         }
 179     }
 180
 181     frame->b_duplicate = 0;
 182
 183     if( b_fdec ) /* fdec frame */
 184     {
 185         PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
 186         PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
 187         PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
 188         PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
 189         PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
 190         if( h->param.i_bframe )
 191         {
 192             PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
 193             PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
 194         }
 195         else
 196         {
 197             frame->mv[1]  = NULL;
 198             frame->ref[1] = NULL;
 199         }
 200         PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
 201         PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
 202         PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
 203         if( h->param.analyse.i_me_method >= X264_ME_ESA )
 204             PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
 205         if( PARAM_INTERLACED )
 206             PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
 207         if( h->param.analyse.b_mb_info )
 208             PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
 209     }
 210     else /* fenc frame */
 211     {
 212         if( h->frames.b_have_lowres )
 213         {
 214             int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
 215
 216             PREALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
 217
 218             for( int j = 0; j <= !!h->param.i_bframe; j++ )
 219                 for( int i = 0; i <= h->param.i_bframe; i++ )
 220                 {
 221                     PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
 222                     PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
 223                 }
 224             PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
 225             for( int j = 0; j <= h->param.i_bframe+1; j++ )
 226                 for( int i = 0; i <= h->param.i_bframe+1; i++ )
 227                     PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
 228
 229         }
 230         if( h->param.rc.i_aq_mode )
 231         {
 232             PREALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
 233             PREALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
 234             if( h->frames.b_have_lowres )
 235                 PREALLOC( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
 236         }
 237     }
 238
 239     PREALLOC_END( frame->base );
 240
 241     if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
 242     {
 243         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
 244         frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
 245         if( PARAM_INTERLACED )
 246             frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
 247     }
 248
 249     for( int p = 0; p < luma_plane_count; p++ )
 250     {
 251         int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
 252         if( h->param.analyse.i_subpel_refine && b_fdec )
 253         {
 254             for( int i = 0; i < 4; i++ )
 255             {
 256                 frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
 257                 frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
 258             }
 259             frame->plane[p] = frame->filtered[p][0];
 260             frame->plane_fld[p] = frame->filtered_fld[p][0];
 261         }
 262         else
 263         {
 264             frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
 265             frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
 266         }
 267     }
 268
 269     if( b_fdec )
 270     {
 271         M32( frame->mv16x16[0] ) = 0;
 272         frame->mv16x16++;
 273
 274         if( h->param.analyse.i_me_method >= X264_ME_ESA )
 275             frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
 276     }
 277     else
 278     {
 279         if( h->frames.b_have_lowres )
 280         {
 281             int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
 282             for( int i = 0; i < 4; i++ )
 283                 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
 284
 285             for( int j = 0; j <= !!h->param.i_bframe; j++ )
 286                 for( int i = 0; i <= h->param.i_bframe; i++ )
 287                     memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
 288
 289             frame->i_intra_cost = frame->lowres_costs[0][0];
 290             memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
 291
 292             if( h->param.rc.i_aq_mode )
 293                 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
 294                 memset( frame->i_inv_qscale_factor, 0, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
 295         }
 296     }
 297
 298     if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
 299         goto fail;
 300     if( x264_pthread_cond_init( &frame->cv, NULL ) )
 301         goto fail;
 302
 303 #if HAVE_OPENCL
 304     frame->opencl.ocl = h->opencl.ocl;
 305 #endif
 306
 307     return frame;
 308
 309 fail:
 310     x264_free( frame );
 311     return NULL;
 312 }
 313
 314 void x264_frame_delete( x264_frame_t *frame )
 315 {
 316     /* Duplicate frames are blank copies of real frames (including pointers),
 317      * so freeing those pointers would cause a double free later. */
 318     if( !frame->b_duplicate )
 319     {
 320         x264_free( frame->base );
 321
 322         if( frame->param && frame->param->param_free )
 323             frame->param->param_free( frame->param );
 324         if( frame->mb_info_free )
 325             frame->mb_info_free( frame->mb_info );
 326         if( frame->extra_sei.sei_free )
 327         {
 328             for( int i = 0; i < frame->extra_sei.num_payloads; i++ )
 329                 frame->extra_sei.sei_free( frame->extra_sei.payloads[i].payload );
 330             frame->extra_sei.sei_free( frame->extra_sei.payloads );
 331         }
 332         x264_pthread_mutex_destroy( &frame->mutex );
 333         x264_pthread_cond_destroy( &frame->cv );
 334 #if HAVE_OPENCL
 335         x264_opencl_frame_delete( frame );
 336 #endif
 337     }
 338     x264_free( frame );
 339 }
 340
 341 static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
 342 {
 343     int width = h->param.i_width >> xshift;
 344     int height = h->param.i_height >> yshift;
 345     *pix = src->img.plane[plane];
 346     *stride = src->img.i_stride[plane];
 347     if( src->img.i_csp & X264_CSP_VFLIP )
 348     {
 349         *pix += (height-1) * *stride;
 350         *stride = -*stride;
 351     }
 352     if( width > abs(*stride) )
 353     {
 354         x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride );
 355         return -1;
 356     }
 357     return 0;
 358 }
 359
 360 #define get_plane_ptr(...) do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0)
 361
 362 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 363 {
 364     int i_csp = src->img.i_csp & X264_CSP_MASK;
 365     if( dst->i_csp != x264_frame_internal_csp( i_csp ) )
 366     {
 367         x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
 368         return -1;
 369     }
 370
 371 #if HIGH_BIT_DEPTH
 372     if( !(src->img.i_csp & X264_CSP_HIGH_DEPTH) )
 373     {
 374         x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" );
 375         return -1;
 376     }
 377 #else
 378     if( src->img.i_csp & X264_CSP_HIGH_DEPTH )
 379     {
 380         x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" );
 381         return -1;
 382     }
 383 #endif
 384
 385     if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
 386     {
 387         x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
 388         return -1;
 389     }
 390
 391     if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME )
 392     {
 393         x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input );
 394         dst->i_forced_type = X264_TYPE_AUTO;
 395     }
 396     else
 397         dst->i_forced_type = src->i_type;
 398
 399     dst->i_type     = dst->i_forced_type;
 400     dst->i_qpplus1  = src->i_qpplus1;
 401     dst->i_pts      = dst->i_reordered_pts = src->i_pts;
 402     dst->param      = src->param;
 403     dst->i_pic_struct = src->i_pic_struct;
 404     dst->extra_sei  = src->extra_sei;
 405     dst->opaque     = src->opaque;
 406     dst->mb_info    = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL;
 407     dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL;
 408
 409     uint8_t *pix[3];
 410     int stride[3];
 411     if( i_csp == X264_CSP_V210 )
 412     {
 413          stride[0] = src->img.i_stride[0];
 414          pix[0] = src->img.plane[0];
 415
 416          h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
 417                                              dst->plane[1], dst->i_stride[1],
 418                                              (uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height );
 419     }
 420     else if( i_csp >= X264_CSP_BGR )
 421     {
 422          stride[0] = src->img.i_stride[0];
 423          pix[0] = src->img.plane[0];
 424          if( src->img.i_csp & X264_CSP_VFLIP )
 425          {
 426              pix[0] += (h->param.i_height-1) * stride[0];
 427              stride[0] = -stride[0];
 428          }
 429          int b = i_csp==X264_CSP_RGB;
 430          h->mc.plane_copy_deinterleave_rgb( dst->plane[1+b], dst->i_stride[1+b],
 431                                             dst->plane[0], dst->i_stride[0],
 432                                             dst->plane[2-b], dst->i_stride[2-b],
 433                                             (pixel*)pix[0], stride[0]/sizeof(pixel), i_csp==X264_CSP_BGRA ? 4 : 3, h->param.i_width, h->param.i_height );
 434     }
 435     else
 436     {
 437         int v_shift = CHROMA_V_SHIFT;
 438         get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
 439         h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
 440                           stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
 441         if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
 442         {
 443             get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
 444             h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
 445                               stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift );
 446         }
 447         else if( i_csp == X264_CSP_NV21 )
 448         {
 449             get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
 450             h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
 451                                    stride[1]/sizeof(pixel), h->param.i_width>>1, h->param.i_height>>v_shift );
 452         }
 453         else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
 454         {
 455             int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
 456             get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
 457             get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
 458             h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
 459                                          (pixel*)pix[1], stride[1]/sizeof(pixel),
 460                                          (pixel*)pix[2], stride[2]/sizeof(pixel),
 461                                          h->param.i_width>>1, h->param.i_height>>v_shift );
 462         }
 463         else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
 464         {
 465             get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 );
 466             get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 );
 467             h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
 468                               stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height );
 469             h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
 470                               stride[2]/sizeof(pixel), h->param.i_width, h->param.i_height );
 471         }
 472     }
 473     return 0;
 474 }
 475
 476 static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
 477 {
 478     uint8_t *dstp = (uint8_t*)dst;
 479     uint32_t v1 = *src;
 480     uint32_t v2 = size == 1 ? v1 + (v1 <<  8) : M16( src );
 481     uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
 482     int i = 0;
 483     len *= size;
 484
 485     /* Align the input pointer if it isn't already */
 486     if( (intptr_t)dstp & (WORD_SIZE - 1) )
 487     {
 488         if( size <= 2 && ((intptr_t)dstp & 3) )
 489         {
 490             if( size == 1 && ((intptr_t)dstp & 1) )
 491                 dstp[i++] = v1;
 492             if( (intptr_t)dstp & 2 )
 493             {
 494                 M16( dstp+i ) = v2;
 495                 i += 2;
 496             }
 497         }
 498         if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
 499         {
 500             M32( dstp+i ) = v4;
 501             i += 4;
 502         }
 503     }
 504
 505     /* Main copy loop */
 506     if( WORD_SIZE == 8 )
 507     {
 508         uint64_t v8 = v4 + ((uint64_t)v4<<32);
 509         for( ; i < len - 7; i+=8 )
 510             M64( dstp+i ) = v8;
 511     }
 512     for( ; i < len - 3; i+=4 )
 513         M32( dstp+i ) = v4;
 514
 515     /* Finish up the last few bytes */
 516     if( size <= 2 )
 517     {
 518         if( i < len - 1 )
 519         {
 520             M16( dstp+i ) = v2;
 521             i += 2;
 522         }
 523         if( size == 1 && i != len )
 524             dstp[i] = v1;
 525     }
 526 }
 527
 528 static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
 529 {
 530 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
 531     for( int y = 0; y < i_height; y++ )
 532     {
 533         /* left band */
 534         pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
 535         /* right band */
 536         pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1-b_chroma, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
 537     }
 538     /* upper band */
 539     if( b_pad_top )
 540         for( int y = 0; y < i_padv; y++ )
 541             memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), (i_width+2*i_padh) * sizeof(pixel) );
 542     /* lower band */
 543     if( b_pad_bottom )
 544         for( int y = 0; y < i_padv; y++ )
 545             memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), (i_width+2*i_padh) * sizeof(pixel) );
 546 #undef PPIXEL
 547 }
 548
 549 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
 550 {
 551     int pad_top = mb_y == 0;
 552     int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
 553     int b_start = mb_y == h->i_threadslice_start;
 554     int b_end   = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
 555     if( mb_y & SLICE_MBAFF )
 556         return;
 557     for( int i = 0; i < frame->i_plane; i++ )
 558     {
 559         int h_shift = i && CHROMA_H_SHIFT;
 560         int v_shift = i && CHROMA_V_SHIFT;
 561         int stride = frame->i_stride[i];
 562         int width = 16*h->mb.i_mb_width;
 563         int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
 564         int padh = PADH;
 565         int padv = PADV >> v_shift;
 566         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
 567         if( b_end && !b_start )
 568             height += 4 >> (v_shift + SLICE_MBAFF);
 569         pixel *pix;
 570         int starty = 16*mb_y - 4*!b_start;
 571         if( SLICE_MBAFF )
 572         {
 573             // border samples for each field are extended separately
 574             pix = frame->plane_fld[i] + (starty*stride >> v_shift);
 575             plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
 576             plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
 577
 578             height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
 579             if( b_end && !b_start )
 580                 height += 4 >> v_shift;
 581             pix = frame->plane[i] + (starty*stride >> v_shift);
 582             plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
 583         }
 584         else
 585         {
 586             pix = frame->plane[i] + (starty*stride >> v_shift);
 587             plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
 588         }
 589     }
 590 }
 591
 592 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 593 {
 594     /* during filtering, 8 extra pixels were filtered on each edge,
 595      * but up to 3 of the horizontal ones may be wrong.
 596        we want to expand border from the last filtered pixel */
 597     int b_start = !mb_y;
 598     int width = 16*h->mb.i_mb_width + 8;
 599     int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
 600     int padh = PADH - 4;
 601     int padv = PADV - 8;
 602     for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
 603         for( int i = 1; i < 4; i++ )
 604         {
 605             int stride = frame->i_stride[p];
 606             // buffer: 8 luma, to match the hpel filter
 607             pixel *pix;
 608             if( SLICE_MBAFF )
 609             {
 610                 pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
 611                 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
 612                 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
 613             }
 614
 615             pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
 616             plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
 617         }
 618 }
 619
 620 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 621 {
 622     for( int i = 0; i < 4; i++ )
 623         plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 );
 624 }
 625
 626 void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
 627 {
 628     int v_shift = CHROMA_V_SHIFT;
 629     plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
 630                          PADH, PADV>>v_shift, 1, 1, CHROMA_H_SHIFT );
 631 }
 632
 633 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 634 {
 635     for( int i = 0; i < frame->i_plane; i++ )
 636     {
 637         int i_width = h->param.i_width;
 638         int h_shift = i && CHROMA_H_SHIFT;
 639         int v_shift = i && CHROMA_V_SHIFT;
 640         int i_height = h->param.i_height >> v_shift;
 641         int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
 642         int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
 643
 644         if( i_padx )
 645         {
 646             for( int y = 0; y < i_height; y++ )
 647                 pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
 648                               &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
 649                               i_padx>>h_shift, sizeof(pixel)<<h_shift );
 650         }
 651         if( i_pady )
 652         {
 653             for( int y = i_height; y < i_height + i_pady; y++ )
 654                 memcpy( &frame->plane[i][y*frame->i_stride[i]],
 655                         &frame->plane[i][(i_height-(~y&PARAM_INTERLACED)-1)*frame->i_stride[i]],
 656                         (i_width + i_padx) * sizeof(pixel) );
 657         }
 658     }
 659 }
 660
 661 void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
 662 {
 663     for( int i = 0; i < h->fenc->i_plane; i++ )
 664     {
 665         int v_shift = i && CHROMA_V_SHIFT;
 666         int stride = h->fenc->i_stride[i];
 667         int height = h->param.i_height >> v_shift;
 668         int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
 669         pixel *fenc = h->fenc->plane[i] + 16*mb_x;
 670         for( int y = height; y < height + pady; y++ )
 671             memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
 672     }
 673 }
 674
 675 /* threading */
 676 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 677 {
 678     x264_pthread_mutex_lock( &frame->mutex );
 679     frame->i_lines_completed = i_lines_completed;
 680     x264_pthread_cond_broadcast( &frame->cv );
 681     x264_pthread_mutex_unlock( &frame->mutex );
 682 }
 683
 684 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 685 {
 686     x264_pthread_mutex_lock( &frame->mutex );
 687     while( frame->i_lines_completed < i_lines_completed )
 688         x264_pthread_cond_wait( &frame->cv, &frame->mutex );
 689     x264_pthread_mutex_unlock( &frame->mutex );
 690 }
 691
 692 void x264_threadslice_cond_broadcast( x264_t *h, int pass )
 693 {
 694     x264_pthread_mutex_lock( &h->mutex );
 695     h->i_threadslice_pass = pass;
 696     if( pass > 0 )
 697         x264_pthread_cond_broadcast( &h->cv );
 698     x264_pthread_mutex_unlock( &h->mutex );
 699 }
 700
 701 void x264_threadslice_cond_wait( x264_t *h, int pass )
 702 {
 703     x264_pthread_mutex_lock( &h->mutex );
 704     while( h->i_threadslice_pass < pass )
 705         x264_pthread_cond_wait( &h->cv, &h->mutex );
 706     x264_pthread_mutex_unlock( &h->mutex );
 707 }
 708
 709 int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
 710 {
 711     if( h->param.i_slice_count_max )
 712     {
 713         int slice_count;
 714         if( h->param.b_sliced_threads )
 715             slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );
 716         else
 717             slice_count = frame->i_slice_count++;
 718         if( slice_count >= h->param.i_slice_count_max )
 719             return -1;
 720     }
 721     return 0;
 722 }
 723
 724 /* list operators */
 725
 726 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 727 {
 728     int i = 0;
 729     while( list[i] ) i++;
 730     list[i] = frame;
 731 }
 732
 733 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 734 {
 735     x264_frame_t *frame;
 736     int i = 0;
 737     assert( list[0] );
 738     while( list[i+1] ) i++;
 739     frame = list[i];
 740     list[i] = NULL;
 741     return frame;
 742 }
 743
 744 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 745 {
 746     int i = 0;
 747     while( list[i] ) i++;
 748     while( i-- )
 749         list[i+1] = list[i];
 750     list[0] = frame;
 751 }
 752
 753 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 754 {
 755     x264_frame_t *frame = list[0];
 756     int i;
 757     for( i = 0; list[i]; i++ )
 758         list[i] = list[i+1];
 759     assert(frame);
 760     return frame;
 761 }
 762
 763 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 764 {
 765     assert( frame->i_reference_count > 0 );
 766     frame->i_reference_count--;
 767     if( frame->i_reference_count == 0 )
 768         x264_frame_push( h->frames.unused[frame->b_fdec], frame );
 769 }
 770
 771 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
 772 {
 773     x264_frame_t *frame;
 774     if( h->frames.unused[b_fdec][0] )
 775         frame = x264_frame_pop( h->frames.unused[b_fdec] );
 776     else
 777         frame = x264_frame_new( h, b_fdec );
 778     if( !frame )
 779         return NULL;
 780     frame->b_last_minigop_bframe = 0;
 781     frame->i_reference_count = 1;
 782     frame->b_intra_calculated = 0;
 783     frame->b_scenecut = 1;
 784     frame->b_keyframe = 0;
 785     frame->b_corrupt = 0;
 786     frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1;
 787
 788     memset( frame->weight, 0, sizeof(frame->weight) );
 789     memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
 790
 791     return frame;
 792 }
 793
 794 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
 795 {
 796     assert( frame->i_reference_count > 0 );
 797     frame->i_reference_count--;
 798     if( frame->i_reference_count == 0 )
 799         x264_frame_push( h->frames.blank_unused, frame );
 800 }
 801
 802 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
 803 {
 804     x264_frame_t *frame;
 805     if( h->frames.blank_unused[0] )
 806         frame = x264_frame_pop( h->frames.blank_unused );
 807     else
 808         frame = x264_malloc( sizeof(x264_frame_t) );
 809     if( !frame )
 810         return NULL;
 811     frame->b_duplicate = 1;
 812     frame->i_reference_count = 1;
 813     return frame;
 814 }
 815
 816 void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
 817                               int i_width, int i_height, x264_weight_t *w )
 818 {
 819     /* Weight horizontal strips of height 16. This was found to be the optimal height
 820      * in terms of the cache loads. */
 821     while( i_height > 0 )
 822     {
 823         int x;
 824         for( x = 0; x < i_width-8; x += 16 )
 825             w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
 826         if( x < i_width )
 827             w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
 828         i_height -= 16;
 829         dst += 16 * i_dst_stride;
 830         src += 16 * i_src_stride;
 831     }
 832 }
 833
 834 void x264_frame_delete_list( x264_frame_t **list )
 835 {
 836     int i = 0;
 837     if( !list )
 838         return;
 839     while( list[i] )
 840         x264_frame_delete( list[i++] );
 841     x264_free( list );
 842 }
 843
 844 int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size )
 845 {
 846     if( max_size < 0 )
 847         return -1;
 848     slist->i_max_size = max_size;
 849     slist->i_size = 0;
 850     CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
 851     if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
 852         x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
 853         x264_pthread_cond_init( &slist->cv_empty, NULL ) )
 854         return -1;
 855     return 0;
 856 fail:
 857     return -1;
 858 }
 859
 860 void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist )
 861 {
 862     x264_pthread_mutex_destroy( &slist->mutex );
 863     x264_pthread_cond_destroy( &slist->cv_fill );
 864     x264_pthread_cond_destroy( &slist->cv_empty );
 865     x264_frame_delete_list( slist->list );
 866 }
 867
 868 void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame )
 869 {
 870     x264_pthread_mutex_lock( &slist->mutex );
 871     while( slist->i_size == slist->i_max_size )
 872         x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
 873     slist->list[ slist->i_size++ ] = frame;
 874     x264_pthread_mutex_unlock( &slist->mutex );
 875     x264_pthread_cond_broadcast( &slist->cv_fill );
 876 }
 877
 878 x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist )
 879 {
 880     x264_frame_t *frame;
 881     x264_pthread_mutex_lock( &slist->mutex );
 882     while( !slist->i_size )
 883         x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex );
 884     frame = slist->list[ --slist->i_size ];
 885     slist->list[ slist->i_size ] = NULL;
 886     x264_pthread_cond_broadcast( &slist->cv_empty );
 887     x264_pthread_mutex_unlock( &slist->mutex );
 888     return frame;
 889 }