1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a; a must be a power of two.
 * (Macro evaluates `a` twice — only use with side-effect-free arguments.) */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate a frame and all of its pixel/analysis buffers.
 * b_fdec selects the reconstructed-frame (fdec) layout vs. the source-frame
 * (fenc) layout, which carry different side buffers.
 * NOTE(review): this view is missing some lines (local declarations, braces
 * and the CHECKED_MALLOC error path); comments describe only visible code. */
x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
    int i_mb_count = h->mb.i_mb_count;
    int i_stride, i_width, i_lines;
    /* Interlaced coding doubles the vertical padding (per-field access). */
    int i_padv = PADV << h->param.b_interlaced;
    int chroma_plane_size;
    /* Stride alignment matches the detected cacheline size. */
    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
    /* allocate frame data (+64 for extra data for me) */
    i_width = ALIGN( h->param.i_width, 16 );
    i_stride = ALIGN( i_width + 2*PADH, align );
    /* Round height up to a whole macroblock (pair, if interlaced). */
    i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
    /* Plane 0 is luma; planes 1/2 are chroma at half resolution (>> !!i). */
    for( int i = 0; i < 3; i++ )
        frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
        frame->i_width[i] = i_width >> !!i;
        frame->i_lines[i] = i_lines >> !!i;
    luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
    chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
    /* Chroma planes (1 and 2) get their own buffers; luma is handled below. */
    for( int i = 1; i < 3; i++ )
        CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
        frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
    /* Per-row SATD stats, indexed by [p0 distance][p1 distance]. */
    for( int i = 0; i < h->param.i_bframe + 2; i++ )
        for( int j = 0; j < h->param.i_bframe + 2; j++ )
            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
    frame->i_type = X264_TYPE_AUTO;
    frame->i_frame_num = -1;
    frame->i_lines_completed = -1;
    frame->b_fdec = b_fdec;
    frame->i_pic_struct = PIC_STRUCT_AUTO;
    frame->i_field_cnt = -1;
    /* Timing/HRD state: durations start at 0, lookahead copies at -1 (unset). */
    frame->i_cpb_duration =
    frame->i_dpb_output_delay =
    frame->i_cpb_delay = 0;
    frame->i_coded_fields_lookahead =
    frame->i_cpb_delay_lookahead = -1;
    /* all 4 luma planes allocated together, since the cacheline split code
     * requires them to be in-phase wrt cacheline alignment. */
    if( h->param.analyse.i_subpel_refine && b_fdec )
        CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
        /* filtered[0..3]: full-pel plus the three half-pel interpolations. */
        for( int i = 0; i < 4; i++ )
            frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
        frame->plane[0] = frame->filtered[0];
    /* else: single luma plane, no subpel buffers (NOTE(review): the `else`
     * keyword itself is not visible in this view). */
        CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
        frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
    frame->b_duplicate = 0;
    if( b_fdec ) /* fdec frame */
        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
        CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
        /* 16 4x4 motion vectors (x,y) per macroblock for list 0. */
        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
        if( h->param.i_bframe )
            /* List-1 MV/ref buffers are only needed when B-frames are enabled. */
            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
        frame->ref[1] = NULL;
        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
        CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            /* Integral image for exhaustive search; doubled if sub-8x8 ESA. */
            CHECKED_MALLOC( frame->buffer[3],
                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
    else /* fenc frame */
        if( h->frames.b_have_lowres )
            frame->i_width_lowres = frame->i_width[0]/2;
            frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
            frame->i_lines_lowres = frame->i_lines[0]/2;
            luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
            /* 4 half-res planes share one buffer (same in-phase trick as above). */
            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
            for( int i = 0; i < 4; i++ )
                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
            /* Lookahead motion data per [list][B-distance]. */
            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
            /* +3 padding allows 4-at-a-time SIMD access past the end. */
            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
            for( int j = 0; j <= h->param.i_bframe+1; j++ )
                for( int i = 0; i <= h->param.i_bframe+1; i++ )
                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
                    CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
            /* Intra cost aliases the [0][0] cost array; -1 marks "not computed". */
            frame->i_intra_cost = frame->lowres_costs[0][0];
            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
        if( h->param.rc.i_aq_mode )
            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
            if( h->frames.b_have_lowres )
                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
    /* Threading primitives used by x264_frame_cond_wait/broadcast below. */
    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
    if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and everything x264_frame_new allocated for it.
 * x264_free( NULL ) is assumed safe here, since many buffers are only
 * allocated for one of the fdec/fenc layouts. */
void x264_frame_delete( x264_frame_t *frame )
    /* Duplicate frames are blank copies of real frames (including pointers),
     * so freeing those pointers would cause a double free later. */
    if( !frame->b_duplicate )
        for( int i = 0; i < 4; i++ )
            x264_free( frame->buffer[i] );
        for( int i = 0; i < 4; i++ )
            x264_free( frame->buffer_lowres[i] );
        /* Free over the compile-time maximum; unallocated slots are NULL. */
        for( int i = 0; i < X264_BFRAME_MAX+2; i++ )
            for( int j = 0; j < X264_BFRAME_MAX+2; j++ )
                x264_free( frame->i_row_satds[i][j] );
        for( int j = 0; j < 2; j++ )
            for( int i = 0; i <= X264_BFRAME_MAX; i++ )
                x264_free( frame->lowres_mvs[j][i] );
                x264_free( frame->lowres_mv_costs[j][i] );
        x264_free( frame->i_propagate_cost );
        for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
            for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
                x264_free( frame->lowres_costs[j][i] );
                x264_free( frame->lowres_inter_types[j][i] );
        x264_free( frame->f_qp_offset );
        x264_free( frame->f_qp_offset_aq );
        x264_free( frame->i_inv_qscale_factor );
        x264_free( frame->i_row_bits );
        x264_free( frame->i_row_qp );
        x264_free( frame->mb_type );
        x264_free( frame->mb_partition );
        x264_free( frame->mv[0] );
        x264_free( frame->mv[1] );
        x264_free( frame->ref[0] );
        x264_free( frame->ref[1] );
        x264_pthread_mutex_destroy( &frame->mutex );
        x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied x264_picture_t into an internal frame.
 * Only I420 and YV12 input are accepted; YV12 swaps the U/V plane order.
 * Returns an error for unsupported colorspaces (NOTE(review): the return
 * statements are not visible in this view). */
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
    int i_csp = src->img.i_csp & X264_CSP_MASK;
    if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
        x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
    /* Carry per-picture metadata across; reordered PTS starts equal to PTS. */
    dst->i_type = src->i_type;
    dst->i_qpplus1 = src->i_qpplus1;
    dst->i_pts = dst->i_reordered_pts = src->i_pts;
    dst->param = src->param;
    dst->i_pic_struct = src->i_pic_struct;
    for( int i = 0; i < 3; i++ )
        /* YV12 stores V before U: swap source planes 1<->2 (i^3 maps 1->2, 2->1). */
        int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
        uint8_t *plane = src->img.plane[s];
        int stride = src->img.i_stride[s];
        int width = h->param.i_width >> !!i;
        int height = h->param.i_height >> !!i;
        if( src->img.i_csp & X264_CSP_VFLIP )
            /* Vertically flipped input: start at the last row (stride is
             * presumably negated on a dropped line — TODO confirm). */
            plane += (height-1)*stride;
        h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate a plane's edge pixels into its padding border:
 * left/right padding by per-row memset of the edge pixel, top/bottom padding
 * by copying the first/last padded row. NOTE(review): the b_pad_top /
 * b_pad_bottom guards around the two vertical loops are not visible in this
 * view — presumably dropped lines; confirm against the full source. */
static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
    for( int y = 0; y < i_height; y++ )
        /* Left border: replicate first pixel of the row. */
        memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
        /* Right border: replicate last pixel of the row. */
        memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
    /* Top border: duplicate the (already horizontally padded) first row. */
    for( int y = 0; y < i_padv; y++ )
        memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
    /* Bottom border: duplicate the last row. */
    for( int y = 0; y < i_padv; y++ )
        memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
/* Expand the padding border around the rows of macroblocks just finished
 * (sliced-threads/row-based operation): called per mb row, with b_end set on
 * the final row. In MBAFF mode rows are processed in pairs and each field is
 * padded separately with doubled stride. */
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
    /* Odd rows of an MBAFF pair are handled together with the even row. */
    if( mb_y & h->sh.b_mbaff )
    for( int i = 0; i < frame->i_plane; i++ )
        int stride = frame->i_stride[i];
        int width = 16*h->sps->i_mb_width >> !!i;
        int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
        int padh = PADH >> !!i;
        int padv = PADV >> !!i;
        // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
        uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
        if( b_end && !b_start )
            /* Include the 4-pixel deblock margin above this row in the pad. */
            height += 4 >> (!!i + h->sh.b_mbaff);
        /* Interlaced: pad each field separately (stride*2, offset by one row).
         * NOTE(review): the `if( h->param.b_interlaced )` / `else` around these
         * two cases is not visible in this view. */
            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Expand the border of the three half-pel filtered luma planes
 * (filtered[1..3]); filtered[0] is the full-pel plane padded elsewhere. */
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
    /* during filtering, 8 extra pixels were filtered on each edge,
     * but up to 3 of the horizontal ones may be wrong.
     we want to expand border from the last filtered pixel */
    int stride = frame->i_stride[0];
    int width = 16*h->sps->i_mb_width + 8;
    int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
    /* NOTE(review): declarations of b_start/padh/padv are not visible here. */
    for( int i = 1; i < 4; i++ )
        // buffer: 8 luma, to match the hpel filter
        uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
        /* Interlaced pads each field separately, as in x264_frame_expand_border. */
            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
331 void x264_frame_expand_border_lowres( x264_frame_t *frame )
333 for( int i = 0; i < 4; i++ )
334 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad a frame whose dimensions are not multiples of 16 out to whole
 * macroblocks, by replicating the right column and bottom row(s). */
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
    for( int i = 0; i < frame->i_plane; i++ )
        int i_subsample = i ? 1 : 0;
        int i_width = h->param.i_width >> i_subsample;
        int i_height = h->param.i_height >> i_subsample;
        /* Distance from the coded size (whole MBs) to the display size. */
        int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
        int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
        /* Right padding: replicate the last real pixel of each row.
         * NOTE(review): the `if( i_padx )` guard and the memset length argument
         * (presumably i_padx) are on lines not visible in this view. */
        for( int y = 0; y < i_height; y++ )
            memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
                    frame->plane[i][y*frame->i_stride[i] + i_width - 1],
        /* Bottom padding: replicate the last real row; in interlaced mode the
         * last row of the matching field parity (~y&b_interlaced). */
        for( int y = i_height; y < i_height + i_pady; y++ )
            memcpy( &frame->plane[i][y*frame->i_stride[i]],
                    &frame->plane[i][(i_height-(~y&h->param.b_interlaced)-1)*frame->i_stride[i]],
/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save one row of nnz values into buf, then rewrite each 8x8-transform MB's
 * nnz so a whole 8x8 block reads as nonzero if any of its 4x4 subblocks is.
 * restore_cavlc_nnz_row undoes this from buf after deblocking. */
static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
    /* Treat the 24-byte per-MB nnz array as 6 uint32 words for fast OR-ing. */
    uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
    int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
    for( int x = 0; x<h->sps->i_mb_width; x++ )
        /* Snapshot the 16 luma nnz bytes before munging.
         * NOTE(review): the `if( transform[x] )` guard around the rewrite is
         * presumably on a dropped line — only 8x8-transform MBs are munged. */
        memcpy( buf+x, src+x, 16 );
        /* Spread "any nonzero in this 8x8" across all its 4x4 entries:
         * low halfword -> 0x0101, high halfword -> 0x01010000. */
        int nnz = src[x][0] | src[x][1];
        src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
        nnz = src[x][2] | src[x][3];
        src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
387 for( int x = 0; x < h->sps->i_mb_width; x++ )
388 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current row and its deblocking
 * neighbours, each saved into its own stripe of buf.
 * NOTE(review): the conditional guards (e.g. mb_y > 0 for the row above,
 * bounds checks for mb_y+1 / mb_y-2 in MBAFF) are on lines not visible in
 * this view. */
static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
    func( h, mb_y, buf );
    func( h, mb_y-1, buf + h->sps->i_mb_width );
    func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
    func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
/* Deblocking filter */
/* H.264 deblocking thresholds (alpha, beta) and clipping values (tC0),
 * indexed by average QP plus the slice alpha/beta offset. Each table is
 * padded with 12 entries on both ends so indexAB = qp + offset (offset in
 * [-12,12]) can be looked up without clamping — see the +12 in the accessor
 * macros below. Values follow the H.264 spec tables (clause 8.7). */
static const uint8_t i_alpha_table[52+12*2] =
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
     7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
    255,255,255,255,255,255,255,255,255,255,255,255,
static const uint8_t i_beta_table[52+12*2] =
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
     3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
     8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
/* tC0 per (index, bS): column 0 (bS unused/intra path) is -1 as a sentinel. */
static const int8_t i_tc0_table[52+12*2][4] =
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
/* Accessors shift the (possibly negative) offset index into the padded range. */
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x) i_beta_table[(x)+12]
#define tc0_table(x) i_tc0_table[(x)+12]
/* Normal (non-intra) luma deblock of one 16-pixel edge: 4 groups of 4 lines,
 * each group clipped by its own tc0[i]. xstride steps across the edge,
 * ystride steps along it, so the same code serves vertical and horizontal
 * edges via the two wrappers below.
 * NOTE(review): the declarations of `tc`/`delta`, the tc0[i] < 0 skip, the
 * tc increments, and the per-line `pix += ystride` are on lines not visible
 * in this view. */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
    for( int i = 0; i < 4; i++ )
        for( int d = 0; d < 4; d++ )
            /* Three pixels on each side of the edge (p = before, q = after). */
            int p2 = pix[-3*xstride];
            int p1 = pix[-2*xstride];
            int p0 = pix[-1*xstride];
            int q0 = pix[ 0*xstride];
            int q1 = pix[ 1*xstride];
            int q2 = pix[ 2*xstride];
            /* Filter only where the edge looks like a blocking artifact, not
             * a real image edge (spec conditions on alpha/beta). */
            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
                if( abs( p2 - p0 ) < beta )
                    /* Luma p1' adjustment, clipped to +/-tc0. */
                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                if( abs( q2 - q0 ) < beta )
                    /* Luma q1' adjustment, clipped to +/-tc0. */
                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* Vertical edge filter: pixels across the edge are `stride` apart. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
/* Horizontal edge filter: pixels across the edge are adjacent in memory. */
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
/* Normal (non-intra) chroma deblock of one 8-pixel edge: 4 groups of 2 lines.
 * Chroma only modifies p0/q0 (one pixel each side).
 * NOTE(review): the declaration of `tc` (tc0[i]+1 per the spec's chroma rule),
 * the tc0[i] < 0 skip, and the per-line `pix += ystride` are on lines not
 * visible in this view. */
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
    for( int i = 0; i < 4; i++ )
        for( int d = 0; d < 2; d++ )
            int p1 = pix[-2*xstride];
            int p0 = pix[-1*xstride];
            int q0 = pix[ 0*xstride];
            int q1 = pix[ 1*xstride];
            /* Same artifact-vs-edge test as luma. */
            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
                int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* Vertical chroma edge: step across the edge by `stride`. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
/* Horizontal chroma edge: step across the edge by 1. */
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
/* Intra (strong, bS=4) luma deblock of one 16-pixel edge. No tc clipping;
 * instead, a stronger 4/5-tap filter is used when the edge is very flat
 * (|p0-q0| < alpha/4 + 2), otherwise a weak 3-tap filter on p0/q0 only.
 * NOTE(review): some braces and the per-line `pix += ystride` advance are on
 * lines not visible in this view. */
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
    for( int d = 0; d < 16; d++ )
        int p2 = pix[-3*xstride];
        int p1 = pix[-2*xstride];
        int p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride];
        int q1 = pix[ 1*xstride];
        int q2 = pix[ 2*xstride];
        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            /* Very flat edge: strong filtering may touch p2/q2. */
            if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
                    const int p3 = pix[-4*xstride];
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                /* else: weak p-side filter (the `else` line is not visible). */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
                    const int q3 = pix[3*xstride];
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                /* else: weak q-side filter. */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            /* else: weak filtering on p0/q0 only. */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Vertical intra luma edge wrapper. */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
/* Horizontal intra luma edge wrapper. */
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
/* Intra (strong) chroma deblock of one 8-pixel edge: simple bilinear filter
 * on p0/q0, no clipping table needed.
 * NOTE(review): the per-line `pix += ystride` advance is on a line not
 * visible in this view. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
    for( int d = 0; d < 8; d++ )
        int p1 = pix[-2*xstride];
        int p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride];
        int q1 = pix[ 1*xstride];
        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
/* Vertical intra chroma edge wrapper. */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
/* Horizontal intra chroma edge wrapper. */
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
/* Filter one non-intra edge: look up alpha/beta/tc0 from the QP-indexed
 * tables and invoke the (possibly asm) edge filter. pix2 is the second
 * chroma plane (NULL for luma; the `if( pix2 )` guard is on a line not
 * visible in this view). Chroma adds +1 to tc per the spec.
 * NOTE(review): the `int tc[4];` declaration and the early return when
 * alpha/beta are zero are also on dropped lines. */
static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
    int index_a = i_qp + h->sh.i_alpha_c0_offset;
    int alpha = alpha_table(index_a);
    int beta = beta_table(i_qp + h->sh.i_beta_offset);
    if( !alpha || !beta )
    /* Per-4-line clipping strength, selected by boundary strength bS. */
    tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
    tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
    tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
    tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
    pf_inter( pix1, i_stride, alpha, beta, tc );
    pf_inter( pix2, i_stride, alpha, beta, tc );
/* Filter one intra (bS=4) edge; no tc clipping values are needed.
 * pix2 is the second chroma plane (NULL for luma); its guard and the early
 * return after the alpha/beta test are on lines not visible in this view. */
static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
    int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
    int beta = beta_table(i_qp + h->sh.i_beta_offset);
    if( !alpha || !beta )
    pf_intra( pix1, i_stride, alpha, beta );
    pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one row of macroblocks (a row pair in MBAFF mode): for each MB,
 * compute boundary strengths and filter its left and top edges plus internal
 * edges, on luma and both chroma planes.
 * NOTE(review): this view is missing many lines inside the function and its
 * helper macros (declarations, braces, the interlaced pixel-offset fixups);
 * the macro bodies below are kept byte-identical, with comments added only
 * on standalone lines. */
void x264_frame_deblock_row( x264_t *h, int mb_y )
    int s8x8 = 2 * h->mb.i_mb_stride;
    int s4x4 = 4 * h->mb.i_mb_stride;
    int b_interlaced = h->sh.b_mbaff;
    /* MBAFF halves the vertical MV difference threshold (field MVs). */
    int mvy_limit = 4 >> b_interlaced;
    /* QPs at or below this cannot produce any filtering (alpha/beta == 0),
     * so whole MBs can be skipped early. */
    int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
    int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
    int stridey = h->fdec->i_stride[0];
    int stride2y = stridey << b_interlaced;
    int strideuv = h->fdec->i_stride[1];
    int stride2uv = strideuv << b_interlaced;
    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
    uint8_t (*nnz_backup)[16] = h->scratch_buffer;
    /* CAVLC + 8x8dct stores nnz per 4x4; temporarily rewrite it per 8x8 for
     * deblocking (see munge_cavlc_nnz_row), restored at the end. */
    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
        munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
    /* In MBAFF, alternate between the two rows of the pair: mb_y toggles
     * parity each iteration and mb_x advances every other iteration. */
    for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
        int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
        int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
        int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
        int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
        int i_qp = h->mb.qp[mb_xy];
        /* P_SKIP MBs only need their outer (left/top) edges filtered. */
        int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
        uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
        uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
        uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
        /* Bottom field of an MBAFF pair: pointer fixups are on dropped lines. */
        if( b_interlaced && (mb_y&1) )
        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
        /* Below the no-op QP threshold nothing can change: skip the MB
         * (the skip statement itself is on a dropped line). */
        if( i_qp <= qp_thresh )
#define FILTER_DIR(intra, i_dir)\
        i_qpn= h->mb.qp[mbn_xy];\
        deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
                             stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
                             h->loopf.deblock_h_luma##intra );\
        int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
        deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
                             stride2uv, bS, i_qpc, 1,\
                             h->loopf.deblock_h_chroma##intra );\
        /* horizontal edge */\
        deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
                             stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
                             h->loopf.deblock_v_luma##intra );\
        int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
        deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
                             stride2uv, bS, i_qpc, 1,\
                             h->loopf.deblock_v_chroma##intra );\
#define DEBLOCK_STRENGTH(i_dir)\
        /* *** Get bS for each 4px for the current edge *** */\
        if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
            M32( bS ) = 0x03030303;\
            M32( bS ) = 0x00000000;\
            for( int i = 0; i < 4; i++ )\
                int x  = i_dir == 0 ? i_edge : i;\
                int y  = i_dir == 0 ? i      : i_edge;\
                int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
                int yn = i_dir == 0 ? y : (y - 1)&0x03;\
                if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
                    h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
                else if(!(i_edge&no_sub8x8))\
                    if((i&no_sub8x8) && bS[i-1] != 2)\
                    int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
                    int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
                    int i4p= mb_4x4+x+y*s4x4;\
                    int i4q= mbn_4x4+xn+yn*s4x4;\
                    /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
                    if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
                        refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
                    else if( !h->mb.b_interlaced )\
                        refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
                        refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
                                     && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
                        abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
                        abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
                        (h->sh.i_type == SLICE_TYPE_B &&\
                        (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
                        abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
                        abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
        /* i_dir == 0 -> vertical edge
         * i_dir == 1 -> horizontal edge */
#define DEBLOCK_DIR(i_dir)\
        int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
        ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
        /* We don't have to consider the MBAFF case of a slice breaking in the middle\
         * of a row because x264 doesn't support that case.  If we add support for that,\
         * this will have to become significantly more complex. */\
        if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
            h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
        if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
            h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
            i_edge+= b_8x8_transform;\
        mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
        mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
        mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
        if( b_interlaced && i_dir == 1 )\
            mbn_xy -= h->mb.i_mb_stride;\
            mbn_8x8 -= 2 * s8x8;\
            mbn_4x4 -= 4 * s4x4;\
        else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
            FILTER_DIR( _intra, i_dir );\
        DEBLOCK_STRENGTH(i_dir);\
        FILTER_DIR( , i_dir);\
        i_edge += b_8x8_transform+1;\
        for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
            DEBLOCK_STRENGTH(i_dir);\
            FILTER_DIR( , i_dir);\
    /* Undo the per-8x8 nnz rewrite done before filtering. */
    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
        munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
837 void x264_frame_deblock( x264_t *h )
839 for( int mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
840 x264_frame_deblock_row( h, mb_y );
/* Prototypes for platform-optimized deblock routines (mmxext/sse2/altivec/
 * neon variants selected in x264_deblock_init below).
 * NOTE(review): the surrounding #ifdef HAVE_MMX / ARCH / HAVE_ALTIVEC guards
 * are not visible in this view. */
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* MMX only handles 8 pixels at a time: filter a 16-wide luma edge as two
 * 8-wide halves, passing the matching half of the tc0 array (tc0+2). */
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
    x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
    x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
/* Same two-half split for the intra (no-tc0) variant. */
static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
    x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Populate the deblock function table: C fallbacks first, then progressively
 * override with the best implementation the CPU supports.
 * NOTE(review): the #ifdef HAVE_MMX / HAVE_ALTIVEC / HAVE_ARMV6 guards around
 * the per-arch sections are not visible in this view. */
void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
    pf->deblock_v_luma = deblock_v_luma_c;
    pf->deblock_h_luma = deblock_h_luma_c;
    pf->deblock_v_chroma = deblock_v_chroma_c;
    pf->deblock_h_chroma = deblock_h_chroma_c;
    pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
    pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
    pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
    pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
    if( cpu&X264_CPU_MMXEXT )
        pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
        pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
        pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
        pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
        pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
        pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
        /* SSE2 luma needs aligned stack; skip if the ABI can't guarantee it. */
        if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
            pf->deblock_v_luma = x264_deblock_v_luma_sse2;
            pf->deblock_h_luma = x264_deblock_h_luma_sse2;
            pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
            pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
    if( cpu&X264_CPU_ALTIVEC )
        pf->deblock_v_luma = x264_deblock_v_luma_altivec;
        pf->deblock_h_luma = x264_deblock_h_luma_altivec;
   #endif // HAVE_ALTIVEC
    if( cpu&X264_CPU_NEON )
        pf->deblock_v_luma   = x264_deblock_v_luma_neon;
        pf->deblock_h_luma   = x264_deblock_h_luma_neon;
        pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
        pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
/* Publish deblock/reconstruction progress on this frame and wake all threads
 * blocked in x264_frame_cond_wait. The update happens under the frame mutex
 * so waiters never miss a wakeup. */
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
    x264_pthread_mutex_lock( &frame->mutex );
    frame->i_lines_completed = i_lines_completed;
    x264_pthread_cond_broadcast( &frame->cv );
    x264_pthread_mutex_unlock( &frame->mutex );
/* Block until at least i_lines_completed lines of this frame are ready.
 * Standard condition-variable wait loop: re-check the predicate after every
 * wakeup to tolerate spurious wakeups and broadcasts for other thresholds. */
void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
    x264_pthread_mutex_lock( &frame->mutex );
    while( frame->i_lines_completed < i_lines_completed )
        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
    x264_pthread_mutex_unlock( &frame->mutex );
/* Append frame at the end of a NULL-terminated frame list.
 * NOTE(review): the store `list[i] = frame;` is on a line not visible in
 * this view. */
void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
    while( list[i] ) i++;
/* Remove and return the last frame of a NULL-terminated list.
 * NOTE(review): the swap-out of list[i+1] and the return statement are on
 * lines not visible in this view; assumes the list is non-empty. */
x264_frame_t *x264_frame_pop( x264_frame_t **list )
    while( list[i+1] ) i++;
/* Insert frame at the front of a NULL-terminated list.
 * NOTE(review): the shift-right loop and front store are on lines not
 * visible in this view; only the length scan is shown. */
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
    while( list[i] ) i++;
/* Remove and return the first frame of a NULL-terminated list, shifting the
 * remaining entries down by one.
 * NOTE(review): the shift body and return statement are on lines not visible
 * in this view. */
x264_frame_t *x264_frame_shift( x264_frame_t **list )
    x264_frame_t *frame = list[0];
    for( i = 0; list[i]; i++ )
994 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
996 assert( frame->i_reference_count > 0 );
997 frame->i_reference_count--;
998 if( frame->i_reference_count == 0 )
999 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
/* Get a frame of the requested layout: reuse one from the unused pool if
 * available, otherwise allocate a fresh one. Resets the per-use lookahead
 * state and weighting data.
 * NOTE(review): the allocation-failure check and return statement are on
 * lines not visible in this view. */
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
    x264_frame_t *frame;
    if( h->frames.unused[b_fdec][0] )
        frame = x264_frame_pop( h->frames.unused[b_fdec] );
        frame = x264_frame_new( h, b_fdec );
    /* Fresh per-use state; the caller sets types/POC/etc. afterwards. */
    frame->b_last_minigop_bframe = 0;
    frame->i_reference_count = 1;
    frame->b_intra_calculated = 0;
    frame->b_scenecut = 1;
    frame->b_keyframe = 0;
    memset( frame->weight, 0, sizeof(frame->weight) );
    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1023 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1025 assert( frame->i_reference_count > 0 );
1026 frame->i_reference_count--;
1027 if( frame->i_reference_count == 0 )
1028 x264_frame_push( h->frames.blank_unused, frame );
/* Get a blank frame shell (pointers only, no buffers): reuse from the
 * blank_unused pool or allocate bare. b_duplicate marks it so
 * x264_frame_delete won't free buffers it doesn't own.
 * NOTE(review): the allocation-failure check and return statement are on
 * lines not visible in this view. */
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
    x264_frame_t *frame;
    if( h->frames.blank_unused[0] )
        frame = x264_frame_pop( h->frames.blank_unused );
        frame = x264_malloc( sizeof(x264_frame_t) );
    frame->b_duplicate = 1;
    frame->i_reference_count = 1;
/* Bubble-sort a NULL-terminated frame list into decode order (b_dts: type
 * first, then frame number) or display order.
 * NOTE(review): the enclosing do/while(!b_ok) repeat loop, the display-order
 * comparison branch, and the b_ok bookkeeping are on lines not visible in
 * this view. */
void x264_frame_sort( x264_frame_t **list, int b_dts )
    for( int i = 0; list[i+1]; i++ )
        int dtype = list[i]->i_type - list[i+1]->i_type;
        int dtime = list[i]->i_frame - list[i+1]->i_frame;
        int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
            XCHG( x264_frame_t*, list[i], list[i+1] );
/* Apply explicit weighted prediction (scale/offset in w) to a whole plane,
 * writing src into dst via the 16-wide weighting kernel.
 * NOTE(review): the `i_height -= 16;` decrement that terminates the while
 * loop is on a line not visible in this view. */
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
                              int i_width, int i_height, x264_weight_t *w )
    /* Weight horizontal strips of height 16. This was found to be the optimal height
     * in terms of the cache loads. */
    while( i_height > 0 )
        for( int x = 0; x < i_width; x += 16 )
            /* weightfn[4] is the 16-pixel-wide kernel; the last strip may be
             * shorter than 16 rows, hence the X264_MIN. */
            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
        dst += 16 * i_dst_stride;
        src += 16 * i_src_stride;
/* Delete every frame in a NULL-terminated list, then the list itself.
 * NOTE(review): the loop header, NULL check, and the x264_free of the list
 * array are on lines not visible in this view. */
void x264_frame_delete_list( x264_frame_t **list )
        x264_frame_delete( list[i++] );
/* Initialize a bounded synchronized frame list: NULL-terminated array of
 * max_size entries plus mutex and fill/empty condition variables.
 * NOTE(review): the max_size validity check, error returns, and success
 * return are on lines not visible in this view. */
int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
    slist->i_max_size = max_size;
    /* +1 slot keeps the array NULL-terminated even when full. */
    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
    if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
        x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
        x264_pthread_cond_init( &slist->cv_empty, NULL ) )
/* Tear down a synchronized frame list: destroy its synchronization
 * primitives, then free the frames it still holds and the array itself. */
void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
    x264_pthread_mutex_destroy( &slist->mutex );
    x264_pthread_cond_destroy( &slist->cv_fill );
    x264_pthread_cond_destroy( &slist->cv_empty );
    x264_frame_delete_list( slist->list );
/* Blocking producer push: wait while the list is full (cv_empty signals a
 * consumer made room), append the frame, then notify consumers via cv_fill.
 * The broadcast is deliberately done after unlocking the mutex. */
void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
    x264_pthread_mutex_lock( &slist->mutex );
    while( slist->i_size == slist->i_max_size )
        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
    slist->list[ slist->i_size++ ] = frame;
    x264_pthread_mutex_unlock( &slist->mutex );
    x264_pthread_cond_broadcast( &slist->cv_fill );