1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a; a must be a power of two. */
27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate a frame and all of its per-frame buffers.
 * b_fdec selects reconstructed-frame (fdec) vs source-frame (fenc) layout.
 * NOTE(review): this listing appears to have lines elided relative to
 * upstream (braces, local declarations, the fail: cleanup path and the
 * final return) — reconcile with the original before compiling. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
/* stride alignment follows the detected cacheline size so split loads stay in-phase */
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
/* interlaced coding pads height to a multiple of 32 (one MB pair) */
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* plane 0 = luma, planes 1/2 = half-size chroma (>> !!i) */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
/* plane pointer skips past the top/left padding region */
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* per-row SATD cache, indexed by [fenc b-count][fdec b-count] */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
78 /* all 4 luma planes allocated together, since the cacheline split code
79 * requires them to be in-phase wrt cacheline alignment. */
80 if( h->param.analyse.i_subpel_refine && b_fdec )
82 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
/* filtered[0..3]: full-pel plus the three half-pel interpolated planes */
83 for( i = 0; i < 4; i++ )
84 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
85 frame->plane[0] = frame->filtered[0];
/* no subpel refinement (or fenc frame): a single luma plane suffices */
89 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
90 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
93 frame->b_duplicate = 0;
95 if( b_fdec ) /* fdec frame */
/* per-MB decode-side data: type, motion vectors and reference indices */
97 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
98 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
99 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
100 if( h->param.i_bframe )
102 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
103 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
108 frame->ref[1] = NULL;
110 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
111 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* exhaustive search needs an integral (summed-area) plane */
112 if( h->param.analyse.i_me_method >= X264_ME_ESA )
114 CHECKED_MALLOC( frame->buffer[3],
115 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
116 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
119 else /* fenc frame */
121 if( h->frames.b_have_lowres )
/* half-resolution planes used by lookahead/slicetype decision */
123 frame->i_width_lowres = frame->i_width[0]/2;
124 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
125 frame->i_lines_lowres = frame->i_lines[0]/2;
127 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
129 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
130 for( i = 0; i < 4; i++ )
131 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
/* lowres motion data: [list][ref distance] */
133 for( j = 0; j <= !!h->param.i_bframe; j++ )
134 for( i = 0; i <= h->param.i_bframe; i++ )
136 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
137 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
/* +3 padding so SIMD readers may overread the tail */
139 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
140 for( j = 0; j <= h->param.i_bframe+1; j++ )
141 for( i = 0; i <= h->param.i_bframe+1; i++ )
143 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
144 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
/* intra costs alias the [0][0] cost array; -1 marks "not yet computed" */
146 frame->i_intra_cost = frame->lowres_costs[0][0];
147 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
149 if( h->param.rc.i_aq_mode )
151 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
152 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
153 if( h->frames.b_have_lowres )
154 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
155 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* threaded encoding synchronizes on per-frame mutex/condvar */
159 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
161 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and every buffer x264_frame_new allocated for it.
 * NOTE(review): the final x264_free(frame) and some braces appear elided
 * from this listing — reconcile with upstream before compiling. */
171 void x264_frame_delete( x264_frame_t *frame )
174 /* Duplicate frames are blank copies of real frames (including pointers),
175 * so freeing those pointers would cause a double free later. */
176 if( !frame->b_duplicate )
178 for( i = 0; i < 4; i++ )
179 x264_free( frame->buffer[i] );
180 for( i = 0; i < 4; i++ )
181 x264_free( frame->buffer_lowres[i] );
/* loop bounds use the compile-time maximum, not h->param.i_bframe:
 * unallocated slots are NULL and x264_free(NULL) is a no-op */
182 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
183 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
184 x264_free( frame->i_row_satds[i][j] );
185 for( j = 0; j < 2; j++ )
186 for( i = 0; i <= X264_BFRAME_MAX; i++ )
188 x264_free( frame->lowres_mvs[j][i] );
189 x264_free( frame->lowres_mv_costs[j][i] );
191 x264_free( frame->i_propagate_cost );
192 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
193 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
195 x264_free( frame->lowres_costs[j][i] );
196 x264_free( frame->lowres_inter_types[j][i] );
198 x264_free( frame->f_qp_offset );
199 x264_free( frame->f_qp_offset_aq );
200 x264_free( frame->i_inv_qscale_factor );
201 x264_free( frame->i_row_bits );
202 x264_free( frame->i_row_qp );
203 x264_free( frame->mb_type );
204 x264_free( frame->mv[0] );
205 x264_free( frame->mv[1] );
206 x264_free( frame->ref[0] );
207 x264_free( frame->ref[1] );
208 x264_pthread_mutex_destroy( &frame->mutex );
209 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied picture into an internal frame.
 * Only I420/YV12 input is accepted; returns an error for other CSPs.
 * NOTE(review): the error return, the per-plane for-loop header and the
 * final return appear elided from this listing. */
214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
216 int i_csp = src->img.i_csp & X264_CSP_MASK;
218 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
220 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
/* carry over per-picture metadata (type/QP/PTS/per-frame params) */
224 dst->i_type = src->i_type;
225 dst->i_qpplus1 = src->i_qpplus1;
226 dst->i_pts = dst->i_reordered_pts = src->i_pts;
227 dst->param = src->param;
/* YV12 stores V before U, so swap chroma plane indices (i^3 maps 1<->2) */
231 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
232 uint8_t *plane = src->img.plane[s];
233 int stride = src->img.i_stride[s];
234 int width = h->param.i_width >> !!i;
235 int height = h->param.i_height >> !!i;
/* vertically flipped input: start at the last row and walk upwards
 * (presumably via a negated stride on the elided line — verify) */
236 if( src->img.i_csp & X264_CSP_VFLIP )
238 plane += (height-1)*stride;
241 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate a plane's edge pixels into its padding borders:
 * left/right borders repeat the first/last pixel of each row; top/bottom
 * borders copy the first/last padded row.  b_pad_top/b_pad_bottom gate the
 * vertical padding (their if-checks appear elided from this listing). */
248 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
250 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
252 for( y = 0; y < i_height; y++ )
255 memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
257 memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
/* top border: replicate row 0 (including its freshly-padded sides) */
261 for( y = 0; y < i_padv; y++ )
262 memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* bottom border: replicate the last row */
265 for( y = 0; y < i_padv; y++ )
266 memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
/* Pad the borders of the rows finished so far (up to mb_y); called
 * incrementally as reconstruction proceeds so motion compensation of the
 * next frame can read past the frame edges.
 * NOTE(review): b_start's definition, an early return and the MBAFF
 * if/else braces appear elided from this listing. */
270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
274 if( mb_y & h->sh.b_mbaff )
276 for( i = 0; i < frame->i_plane; i++ )
278 int stride = frame->i_stride[i];
279 int width = 16*h->sps->i_mb_width >> !!i;
280 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
281 int padh = PADH >> !!i;
282 int padv = PADV >> !!i;
283 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
284 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
285 if( b_end && !b_start )
286 height += 4 >> (!!i + h->sh.b_mbaff);
/* MBAFF: pad the two fields separately (stride*2 walks one field) */
289 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
290 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
294 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Pad the borders of the three half-pel filtered planes (filtered[1..3]).
 * NOTE(review): b_start/padh/padv definitions and the interlaced if/else
 * braces appear elided from this listing. */
299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
301 /* during filtering, 8 extra pixels were filtered on each edge,
302 * but up to 3 of the horizontal ones may be wrong.
303 we want to expand border from the last filtered pixel */
305 int stride = frame->i_stride[0];
306 int width = 16*h->sps->i_mb_width + 8;
307 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
311 for( i = 1; i < 4; i++ )
313 // buffer: 8 luma, to match the hpel filter
314 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* MBAFF: pad each field separately */
317 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
318 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
322 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
330 for( i = 0; i < 4; i++ )
331 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the right/bottom of the picture out to the coded (mod-16) size when
 * the input dimensions are not multiples of 16.
 * NOTE(review): the "if( i_padx )" / "if( i_pady )" guards, the memset/memcpy
 * size arguments and closing braces appear elided from this listing. */
334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
337 for( i = 0; i < frame->i_plane; i++ )
339 int i_subsample = i ? 1 : 0;
340 int i_width = h->param.i_width >> i_subsample;
341 int i_height = h->param.i_height >> i_subsample;
342 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
343 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
/* horizontal: replicate each row's last real pixel rightwards */
347 for( y = 0; y < i_height; y++ )
348 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
349 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
/* vertical: replicate the last real row downwards */
354 //FIXME interlace? or just let it pad using the wrong field
355 for( y = i_height; y < i_height + i_pady; y++ )
356 memcpy( &frame->plane[i][y*frame->i_stride[i]],
357 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
365 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save one MB row's nnz values into buf, then rewrite the live array so
 * each 8x8 block reads as all-nonzero iff any of its 4x4 sub-blocks was.
 * NOTE(review): the uint32_t cast over the uint8_t nnz array relies on
 * non-strict aliasing (x264 builds with -fno-strict-aliasing — verify).
 * The "if( !transform[x] ) continue;"-style guard and the nnz declaration
 * appear elided from this listing. */
366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
368 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
369 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
371 for( x=0; x<h->sps->i_mb_width; x++ )
/* back up the 16 luma nnz bytes before munging them */
373 memcpy( buf+x, src+x, 16 );
/* OR the two 4x4 rows of each 8x8; 0x0101/0x01010000 re-marks the halves */
376 nnz = src[x][0] | src[x][1];
377 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
378 nnz = src[x][2] | src[x][3];
379 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
388 for( x=0; x<h->sps->i_mb_width; x++ )
389 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current MB row plus the neighbouring
 * rows that deblocking of this row can touch, each with its own slice of
 * the backup buffer.
 * NOTE(review): the boundary guards (e.g. "if( mb_y > 0 )" before the
 * mb_y-1/-2/+1 calls) appear elided from this listing. */
392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
394 func( h, mb_y, buf );
396 func( h, mb_y-1, buf + h->sps->i_mb_width );
399 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
401 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
/* Deblocking filter */

/* Threshold tables from the H.264 spec (Tables 8-16 and 8-17), indexed by
 * indexA/indexB = clip(QP + offset).  Each table is padded with 12 extra
 * entries on both ends so that QP plus a slice offset in [-12,+12] can
 * index it directly without clamping (see the *_table() macros below).
 * Fix: the listing had dropped the "255,255," and "18,18," rows, leaving
 * the arrays two entries short of their declared 76-entry size — C would
 * zero-fill the tail, wrongly disabling filtering at the highest QPs. */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping values per indexA and boundary strength bS (1..3); the
 * leading -1 column lets bS be used directly as the second index. */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
/* Normal-strength (bS < 4) luma deblocking of one 16-sample edge.
 * xstride steps across the edge, ystride along it.  tc0 holds one clipping
 * limit per group of 4 samples; a negative value skips that group.
 * Fix: restores lines dropped from this listing — the tc/delta locals,
 * the skip-group branch, the tc++ bumps when p1/q1 are filtered, the
 * per-sample pointer advance and the brace structure. */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        if( tc0[i] < 0 )
        {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ )
        {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int tc = tc0[i];
                int delta;
                /* p1' — only when the inner sample p2 is close to p0 */
                if( abs( p2 - p0 ) < beta )
                {
                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;   /* widening the p0/q0 clip range per the spec */
                }
                /* q1' — symmetric test on the q side */
                if( abs( q2 - q0 ) < beta )
                {
                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }
                delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical filtering (horizontal edge): samples across the edge are one
 * full stride apart; samples along it are adjacent. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal filtering (vertical edge): samples across the edge are
 * adjacent; samples along it are one full stride apart. */
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Normal-strength chroma deblocking of one 8-sample edge: only p0/q0 are
 * modified.  tc0 holds one clipping limit per group of 2 samples; tc <= 0
 * skips the group (the table's -1 entry becomes 0 after the +b_chroma
 * adjustment in deblock_edge).
 * Fix: restores lines dropped from this listing — the i/d locals, the
 * skip-group branch, the pointer advance and the brace structure. */
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        const int tc = tc0[i];
        if( tc <= 0 )
        {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ )
        {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical chroma filtering: edge runs horizontally across the plane. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal chroma filtering: edge runs vertically across the plane. */
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Strong (bS == 4, intra) luma deblocking of one 16-sample edge.
 * The strong filter (touching p2..q2) is used only when the edge gradient
 * is well below alpha; otherwise only p0/q0 are smoothed.
 * Fix: restores lines dropped from this listing — the d local, the
 * else-branches and the brace/pointer-advance structure. */
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 16; d++ )
    {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
            {
                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
                {
                    const int p3 = pix[-4*xstride];
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                }
                else /* p0' only */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
                {
                    const int q3 = pix[3*xstride];
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                }
                else /* q0' only */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
            else /* weak path: p0', q0' */
            {
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Strong vertical luma filtering (horizontal edge). */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
/* Strong horizontal luma filtering (vertical edge). */
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
/* Strong (intra) chroma deblocking of one 8-sample edge: only p0/q0 are
 * replaced, with no tc clipping.
 * Fix: restores lines dropped from this listing — the d local, the loop
 * braces and the per-sample pointer advance. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 8; d++ )
    {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Strong vertical chroma filtering (horizontal edge). */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
/* Strong horizontal chroma filtering (vertical edge). */
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
/* Dispatch the inter (bS < 4) deblocking filter on one edge.
 * pix2 is the second chroma plane (NULL for luma).
 * NOTE(review): the "int tc[4];" declaration, the early return after the
 * alpha/beta check and the "if( pix2 )" guard appear elided from this
 * listing — reconcile with upstream before compiling. */
618 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
620 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
621 const int alpha = alpha_table(index_a);
622 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
/* alpha or beta of 0 disables filtering for this edge */
625 if( !alpha || !beta )
/* per-4-sample-group clipping limits; +1 for chroma per the spec */
628 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
629 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
630 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
631 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
633 pf_inter( pix1, i_stride, alpha, beta, tc );
635 pf_inter( pix2, i_stride, alpha, beta, tc );
/* Dispatch the strong (bS == 4) intra deblocking filter on one edge.
 * NOTE(review): the early return after the alpha/beta check and the
 * "if( pix2 )" guard appear elided from this listing. */
638 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
640 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
641 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
643 if( !alpha || !beta )
646 pf_intra( pix1, i_stride, alpha, beta );
648 pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one macroblock row (a MB pair row when MBAFF): for each MB,
 * compute boundary strengths and filter the vertical then horizontal
 * 4-pixel edges of luma and chroma.  The FILTER_DIR / DEBLOCK_STRENGTH /
 * DEBLOCK_DIR macros below are local to this function.
 * NOTE(review): this listing has many lines elided (loop braces, several
 * macro-body lines, the MBAFF special case after line 680, the
 * DEBLOCK_DIR(0)/DEBLOCK_DIR(1) invocations) — reconcile with upstream
 * before compiling. */
651 void x264_frame_deblock_row( x264_t *h, int mb_y )
653 const int s8x8 = 2 * h->mb.i_mb_stride;
654 const int s4x4 = 4 * h->mb.i_mb_stride;
655 const int b_interlaced = h->sh.b_mbaff;
656 const int mvy_limit = 4 >> b_interlaced;
657 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
658 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
660 int stridey = h->fdec->i_stride[0];
661 int stride2y = stridey << b_interlaced;
662 int strideuv = h->fdec->i_stride[1];
663 int stride2uv = strideuv << b_interlaced;
664 uint8_t (*nnz_backup)[16] = h->scratch_buffer;
666 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
667 munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
669 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
671 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
672 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
673 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
674 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
675 const int i_qp = h->mb.qp[mb_xy];
676 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
677 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
678 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
679 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
680 if( b_interlaced && (mb_y&1) )
687 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
689 if( i_qp <= qp_thresh )
/* FILTER_DIR: run the (intra or inter) edge filter on luma + both chroma
 * planes for one edge; i_dir 0 = vertical edge, 1 = horizontal edge. */
692 #define FILTER_DIR(intra, i_dir)\
695 i_qpn= h->mb.qp[mbn_xy];\
699 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
700 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
701 h->loopf.deblock_h_luma##intra );\
705 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
706 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
707 stride2uv, bS, i_qpc, 1,\
708 h->loopf.deblock_h_chroma##intra );\
713 /* horizontal edge */\
714 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
715 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
716 h->loopf.deblock_v_luma##intra );\
720 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
721 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
722 stride2uv, bS, i_qpc, 1,\
723 h->loopf.deblock_v_chroma##intra );\
728 #define DEBLOCK_STRENGTH(i_dir)\
730 /* *** Get bS for each 4px for the current edge *** */\
731 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
732 M32( bS ) = 0x03030303;\
735 M32( bS ) = 0x00000000;\
736 for( i = 0; i < 4; i++ )\
738 int x = i_dir == 0 ? i_edge : i;\
739 int y = i_dir == 0 ? i : i_edge;\
740 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
741 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
742 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
743 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
745 else if(!(i_edge&no_sub8x8))\
747 if((i&no_sub8x8) && bS[i-1] != 2)\
751 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
752 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
753 int i4p= mb_4x4+x+y*s4x4;\
754 int i4q= mbn_4x4+xn+yn*s4x4;\
756 /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
757 if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
758 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
759 else if( !h->mb.b_interlaced )\
760 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
762 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
763 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
765 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
766 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
767 (h->sh.i_type == SLICE_TYPE_B &&\
768 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
769 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
770 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
780 /* i_dir == 0 -> vertical edge
781 * i_dir == 1 -> horizontal edge */
782 #define DEBLOCK_DIR(i_dir)\
784 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
785 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
786 ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
788 i_edge+= b_8x8_transform;\
791 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
792 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
793 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
794 if( b_interlaced && i_dir == 1 )\
796 mbn_xy -= h->mb.i_mb_stride;\
797 mbn_8x8 -= 2 * s8x8;\
798 mbn_4x4 -= 4 * s4x4;\
800 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
802 FILTER_DIR( _intra, i_dir );\
805 DEBLOCK_STRENGTH(i_dir);\
807 FILTER_DIR( , i_dir);\
809 i_edge += b_8x8_transform+1;\
814 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
816 DEBLOCK_STRENGTH(i_dir);\
818 FILTER_DIR( , i_dir);\
826 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
827 munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
830 void x264_frame_deblock( x264_t *h )
833 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
834 x264_frame_deblock_row( h, mb_y );
838 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
839 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
840 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
841 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
843 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
844 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
845 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
846 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
848 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
849 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
850 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
851 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* Build a 16-wide vertical luma deblock from two 8-wide mmxext kernels;
 * the second half uses the next two tc0 entries. */
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    x264_deblock_v8_luma_mmxext( pix,     stride, alpha, beta, tc0     );
    x264_deblock_v8_luma_mmxext( pix + 8, stride, alpha, beta, tc0 + 2 );
}
/* Build a 16-wide strong vertical luma deblock from two 8-wide mmxext
 * kernels. */
static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
    x264_deblock_v8_luma_intra_mmxext( pix,     stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmxext( pix + 8, stride, alpha, beta );
}
867 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
868 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
872 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
873 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
874 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
875 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Fill the deblocking function table: C fallbacks first, then override
 * with the fastest implementation the detected CPU supports.
 * NOTE(review): the #ifdef HAVE_MMX / ARCH_PPC / HAVE_ARMV6 guards and
 * closing braces appear elided from this listing — reconcile with
 * upstream before compiling. */
878 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
880 pf->deblock_v_luma = deblock_v_luma_c;
881 pf->deblock_h_luma = deblock_h_luma_c;
882 pf->deblock_v_chroma = deblock_v_chroma_c;
883 pf->deblock_h_chroma = deblock_h_chroma_c;
884 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
885 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
886 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
887 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
890 if( cpu&X264_CPU_MMXEXT )
892 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
893 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
894 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
895 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
897 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
898 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
899 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
900 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
/* SSE2 luma kernels need a 16-aligned stack; STACK_MOD4 rules them out */
902 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
904 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
905 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
906 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
907 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
913 if( cpu&X264_CPU_ALTIVEC )
915 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
916 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
921 if( cpu&X264_CPU_NEON )
923 pf->deblock_v_luma = x264_deblock_v_luma_neon;
924 pf->deblock_h_luma = x264_deblock_h_luma_neon;
925 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
926 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
933 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
935 x264_pthread_mutex_lock( &frame->mutex );
936 frame->i_lines_completed = i_lines_completed;
937 x264_pthread_cond_broadcast( &frame->cv );
938 x264_pthread_mutex_unlock( &frame->mutex );
941 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
943 x264_pthread_mutex_lock( &frame->mutex );
944 while( frame->i_lines_completed < i_lines_completed )
945 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
946 x264_pthread_mutex_unlock( &frame->mutex );
951 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
954 while( list[i] ) i++;
958 x264_frame_t *x264_frame_pop( x264_frame_t **list )
963 while( list[i+1] ) i++;
969 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
972 while( list[i] ) i++;
978 x264_frame_t *x264_frame_shift( x264_frame_t **list )
980 x264_frame_t *frame = list[0];
982 for( i = 0; list[i]; i++ )
988 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
990 assert( frame->i_reference_count > 0 );
991 frame->i_reference_count--;
992 if( frame->i_reference_count == 0 )
993 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
/* Get a frame for use: recycle one from the unused pool if available,
 * otherwise allocate a new one; then reset its per-use state.
 * NOTE(review): the frame declaration, the else branch, the NULL check
 * after allocation and the final return appear elided from this listing. */
996 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
999 if( h->frames.unused[b_fdec][0] )
1000 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1002 frame = x264_frame_new( h, b_fdec );
/* reset per-use state; buffers themselves are reused as-is */
1005 frame->b_last_minigop_bframe = 0;
1006 frame->i_reference_count = 1;
1007 frame->b_intra_calculated = 0;
1008 frame->b_scenecut = 1;
1009 frame->b_keyframe = 0;
1011 memset( frame->weight, 0, sizeof(frame->weight) );
1012 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1017 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1019 assert( frame->i_reference_count > 0 );
1020 frame->i_reference_count--;
1021 if( frame->i_reference_count == 0 )
1022 x264_frame_push( h->frames.blank_unused, frame );
/* Get a blank frame shell (no buffers of its own — it will alias a real
 * frame's pointers, hence b_duplicate): recycle from the blank pool or
 * allocate a bare struct.
 * NOTE(review): the else branch, the allocation-failure check and the
 * final return appear elided from this listing. */
1025 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1027 x264_frame_t *frame;
1028 if( h->frames.blank_unused[0] )
1029 frame = x264_frame_pop( h->frames.blank_unused );
1031 frame = x264_malloc( sizeof(x264_frame_t) );
/* mark as duplicate so x264_frame_delete won't free aliased buffers */
1034 frame->b_duplicate = 1;
1035 frame->i_reference_count = 1;
/* Bubble-sort a NULL-terminated frame list, by (type, frame number) when
 * b_dts is set, otherwise by display order.
 * NOTE(review): the outer do/while-not-sorted loop, the ternary's else arm
 * and the "if( swap )" guard appear elided from this listing. */
1039 void x264_frame_sort( x264_frame_t **list, int b_dts )
1044 for( i = 0; list[i+1]; i++ )
1046 int dtype = list[i]->i_type - list[i+1]->i_type;
1047 int dtime = list[i]->i_frame - list[i+1]->i_frame;
1048 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1052 XCHG( x264_frame_t*, list[i], list[i+1] );
1059 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1060 int i_width, int i_height, x264_weight_t *w )
1063 /* Weight horizontal strips of height 16. This was found to be the optimal height
1064 * in terms of the cache loads. */
1065 while( i_height > 0 )
1067 for( x = 0; x < i_width; x += 16 )
1068 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1070 dst += 16 * i_dst_stride;
1071 src += 16 * i_src_stride;
1075 void x264_frame_delete_list( x264_frame_t **list )
1081 x264_frame_delete( list[i++] );
/* Initialize a synchronized (bounded, thread-safe) frame list with room
 * for max_size entries plus the NULL terminator.  Returns 0 on success.
 * NOTE(review): the negative-max_size guard, the i_size reset, the error
 * return on init failure, the fail: path and the success return appear
 * elided from this listing. */
1085 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1089 slist->i_max_size = max_size;
/* +1 slot keeps the list NULL-terminated even when full */
1091 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1092 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1093 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1094 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1101 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1103 x264_pthread_mutex_destroy( &slist->mutex );
1104 x264_pthread_cond_destroy( &slist->cv_fill );
1105 x264_pthread_cond_destroy( &slist->cv_empty );
1106 x264_frame_delete_list( slist->list );
1109 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1111 x264_pthread_mutex_lock( &slist->mutex );
1112 while( slist->i_size == slist->i_max_size )
1113 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1114 slist->list[ slist->i_size++ ] = frame;
1115 x264_pthread_mutex_unlock( &slist->mutex );
1116 x264_pthread_cond_broadcast( &slist->cv_fill );