1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a; a must be a power of two. */
27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate and initialize a new frame.
 * b_fdec selects the layout: reconstructed frames (fdec) get macroblock
 * metadata (mb_type, mv, ref, row stats) and optional ESA integral buffers;
 * source frames (fenc) get lowres planes and lookahead cost arrays.
 * NOTE(review): the goto-fail cleanup and closing braces of this function
 * are not visible in this view of the file. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
/* Plane alignment matches the CPU cacheline so the cacheline-split code works. */
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
/* Round dimensions up to whole macroblocks (MB pairs when interlaced). */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* Plane 0 is luma; planes 1/2 are half-size chroma, hence the >> !!i. */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
/* plane[] points inside buffer[], past the top/left padding. */
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* Per-row SATD caches, one table per reference-distance pair. */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
76 frame->i_pic_struct = PIC_STRUCT_AUTO;
77 frame->i_field_cnt = -1;
/* HRD/timing state; filled in by the encoder later. */
79 frame->i_cpb_duration =
80 frame->i_dpb_output_delay =
81 frame->i_cpb_delay = 0;
82 frame->i_coded_fields_lookahead =
83 frame->i_cpb_delay_lookahead = -1;
87 /* all 4 luma planes allocated together, since the cacheline split code
88 * requires them to be in-phase wrt cacheline alignment. */
89 if( h->param.analyse.i_subpel_refine && b_fdec )
91 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
92 for( i = 0; i < 4; i++ )
93 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
94 frame->plane[0] = frame->filtered[0];
/* No subpel refine: a single luma plane suffices (else branch of the above). */
98 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
99 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
102 frame->b_duplicate = 0;
104 if( b_fdec ) /* fdec frame */
106 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
107 CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
108 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
109 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
/* List-1 mv/ref arrays only exist when B-frames are enabled. */
110 if( h->param.i_bframe )
112 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
113 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
118 frame->ref[1] = NULL;
120 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
121 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* ESA/TESA motion search needs a summed-area (integral) image. */
122 if( h->param.analyse.i_me_method >= X264_ME_ESA )
124 CHECKED_MALLOC( frame->buffer[3],
125 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
126 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
129 else /* fenc frame */
131 if( h->frames.b_have_lowres )
133 frame->i_width_lowres = frame->i_width[0]/2;
134 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
135 frame->i_lines_lowres = frame->i_lines[0]/2;
137 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
/* 4 lowres planes: full-pel plus the three half-pel interpolations. */
139 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
140 for( i = 0; i < 4; i++ )
141 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
/* Lookahead MVs/costs, indexed by list (j) and reference distance (i). */
143 for( j = 0; j <= !!h->param.i_bframe; j++ )
144 for( i = 0; i <= h->param.i_bframe; i++ )
146 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
147 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
/* +3 padding so SIMD readers can overread the tail safely. */
149 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
150 for( j = 0; j <= h->param.i_bframe+1; j++ )
151 for( i = 0; i <= h->param.i_bframe+1; i++ )
153 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
154 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
/* Intra cost aliases lowres_costs[0][0]; -1 marks "not yet computed". */
156 frame->i_intra_cost = frame->lowres_costs[0][0];
157 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
159 if( h->param.rc.i_aq_mode )
161 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
162 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
163 if( h->frames.b_have_lowres )
164 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
165 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* Per-frame synchronization primitives for threaded encoding. */
169 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
171 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and everything it owns. NULL members are fine: x264_free
 * (like free) accepts NULL, so unallocated optional buffers are harmless. */
181 void x264_frame_delete( x264_frame_t *frame )
184 /* Duplicate frames are blank copies of real frames (including pointers),
185 * so freeing those pointers would cause a double free later. */
186 if( !frame->b_duplicate )
188 for( i = 0; i < 4; i++ )
189 x264_free( frame->buffer[i] );
190 for( i = 0; i < 4; i++ )
191 x264_free( frame->buffer_lowres[i] );
/* Loop bounds use the compile-time maxima, not h->param.i_bframe:
 * only pointers actually allocated are non-NULL. */
192 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
193 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
194 x264_free( frame->i_row_satds[i][j] );
195 for( j = 0; j < 2; j++ )
196 for( i = 0; i <= X264_BFRAME_MAX; i++ )
198 x264_free( frame->lowres_mvs[j][i] );
199 x264_free( frame->lowres_mv_costs[j][i] );
201 x264_free( frame->i_propagate_cost );
202 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
203 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
205 x264_free( frame->lowres_costs[j][i] );
206 x264_free( frame->lowres_inter_types[j][i] );
208 x264_free( frame->f_qp_offset );
209 x264_free( frame->f_qp_offset_aq );
210 x264_free( frame->i_inv_qscale_factor );
211 x264_free( frame->i_row_bits );
212 x264_free( frame->i_row_qp );
213 x264_free( frame->mb_type );
214 x264_free( frame->mb_partition );
215 x264_free( frame->mv[0] );
216 x264_free( frame->mv[1] );
217 x264_free( frame->ref[0] );
218 x264_free( frame->ref[1] );
/* i_intra_cost is an alias of lowres_costs[0][0]; it is not freed separately. */
219 x264_pthread_mutex_destroy( &frame->mutex );
220 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied x264_picture_t into an internal frame.
 * Only I420 and YV12 input are accepted; YV12 swaps the U/V plane order
 * (the i^3 index maps 1<->2). Returns an error for any other CSP. */
225 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
227 int i_csp = src->img.i_csp & X264_CSP_MASK;
229 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
231 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
/* Copy per-frame metadata straight through. */
235 dst->i_type = src->i_type;
236 dst->i_qpplus1 = src->i_qpplus1;
237 dst->i_pts = dst->i_reordered_pts = src->i_pts;
238 dst->param = src->param;
239 dst->i_pic_struct = src->i_pic_struct;
/* Per-plane copy: s remaps chroma planes for YV12 input. */
243 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
244 uint8_t *plane = src->img.plane[s];
245 int stride = src->img.i_stride[s];
246 int width = h->param.i_width >> !!i;
247 int height = h->param.i_height >> !!i;
/* VFLIP input: start at the last row; the negated stride walks upward.
 * NOTE(review): the stride negation line is not visible in this view. */
248 if( src->img.i_csp & X264_CSP_VFLIP )
250 plane += (height-1)*stride;
253 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate a plane's edge pixels into its padding border.
 * Left/right padding duplicates each row's first/last pixel; top/bottom
 * padding (when b_pad_top/b_pad_bottom are set) duplicates the whole
 * first/last padded row. */
260 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
262 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
/* Horizontal: extend each row left and right with its edge pixel value. */
264 for( y = 0; y < i_height; y++ )
267 memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
269 memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
/* Vertical top: copy the (already horizontally padded) first row upward. */
273 for( y = 0; y < i_padv; y++ )
274 memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* Vertical bottom: copy the last row downward. */
277 for( y = 0; y < i_padv; y++ )
278 memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
/* Expand the padding border around the rows finished so far (up to mb_y),
 * so motion search in following frames can read outside the picture.
 * With MBAFF, the two fields are padded separately (stride*2 passes). */
282 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
/* Odd rows of an MBAFF pair are handled together with the even row. */
286 if( mb_y & h->sh.b_mbaff )
288 for( i = 0; i < frame->i_plane; i++ )
290 int stride = frame->i_stride[i];
291 int width = 16*h->sps->i_mb_width >> !!i;
292 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
293 int padh = PADH >> !!i;
294 int padv = PADV >> !!i;
295 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
296 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
/* Middle rows start 4 pixels higher (deblock overlap), so pad 4 more rows. */
297 if( b_end && !b_start )
298 height += 4 >> (!!i + h->sh.b_mbaff);
/* Interlaced: pad each field independently using doubled stride. */
301 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
302 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
306 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Expand the border of the three half-pel filtered luma planes.
 * The hpel filter already produced 8 valid extra pixels per edge, so the
 * replication starts from the last filtered pixel rather than the picture
 * edge (hence width+8, the -4 offset and the reduced padding). */
311 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
313 /* during filtering, 8 extra pixels were filtered on each edge,
314 * but up to 3 of the horizontal ones may be wrong.
315 we want to expand border from the last filtered pixel */
317 int stride = frame->i_stride[0];
318 int width = 16*h->sps->i_mb_width + 8;
319 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
/* Planes 1..3 are the h/v/hv half-pel interpolations; plane 0 is full-pel. */
323 for( i = 1; i < 4; i++ )
325 // buffer: 8 luma, to match the hpel filter
326 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* Interlaced: pad fields separately with doubled stride. */
329 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
330 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
334 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
339 void x264_frame_expand_border_lowres( x264_frame_t *frame )
342 for( i = 0; i < 4; i++ )
343 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the picture itself (not the outer border) up to a whole number of
 * macroblocks when width/height are not multiples of 16: replicate the
 * last column rightward and the last row(s) downward. */
346 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
349 for( i = 0; i < frame->i_plane; i++ )
351 int i_subsample = i ? 1 : 0;
352 int i_width = h->param.i_width >> i_subsample;
353 int i_height = h->param.i_height >> i_subsample;
/* Amount still missing to reach the mod-16 (mod-8 chroma) size. */
354 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
355 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
/* Extend each row to the right with its last pixel. */
359 for( y = 0; y < i_height; y++ )
360 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
361 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
/* Extend downward; with interlacing the source row alternates between the
 * last two rows so each field replicates its own last line. */
366 for( y = i_height; y < i_height + i_pady; y++ )
367 memcpy( &frame->plane[i][y*frame->i_stride[i]],
368 &frame->plane[i][(i_height-(~y&h->param.b_interlaced)-1)*frame->i_stride[i]],
375 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
376 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save each MB's 16 luma nnz bytes into buf, then rewrite them in-place so
 * that an 8x8 group is all-0x01 if any of its four 4x4 blocks was nonzero.
 * Only applied to 8x8-transform MBs (the transform check is not visible
 * in this view). restore_cavlc_nnz_row undoes this. */
377 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
379 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
380 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
382 for( x=0; x<h->sps->i_mb_width; x++ )
384 memcpy( buf+x, src+x, 16 );
/* OR the two words of each 8x8 half; spread a nonzero half into 0x01 bytes. */
387 nnz = src[x][0] | src[x][1];
388 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
389 nnz = src[x][2] | src[x][3];
390 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
395 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
397 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
399 for( x=0; x<h->sps->i_mb_width; x++ )
400 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current row and its deblocking
 * neighbours: the row above, and with MBAFF also the row below and two
 * above, each using its own slice of the scratch buffer.
 * NOTE(review): the mb_y>0 / MBAFF guard lines are not visible in this view. */
403 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
405 func( h, mb_y, buf );
407 func( h, mb_y-1, buf + h->sps->i_mb_width );
410 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
412 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
417 /* Deblocking filter */
/* Alpha threshold per index; 12 clamp entries on each side so that
 * index_a = qp + alpha_c0_offset can go out of the 0..51 range safely. */
418 static const uint8_t i_alpha_table[52+12*2] =
420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
422 0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
423 7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
424 25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
425 80, 90,101,113,127,144,162,182,203,226,
427 255,255,255,255,255,255,255,255,255,255,255,255,
/* Beta threshold per index, with the same 12-entry clamp padding as alpha. */
429 static const uint8_t i_beta_table[52+12*2] =
431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
433 0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
434 3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
435 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
436 13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
438 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
/* tc0 clipping values indexed by [index_a][bS]; column 0 (bS==0, stored as
 * -1) means "no filtering". Same 12-entry clamp padding on each side. */
440 static const int8_t i_tc0_table[52+12*2][4] =
442 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
443 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
444 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
445 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
446 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
447 {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
448 {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
449 {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
450 {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
451 {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
452 {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
453 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
454 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
/* Accessors that shift the index by the 12-entry clamp padding,
 * so callers can index with a possibly out-of-range qp+offset directly. */
456 #define alpha_table(x) i_alpha_table[(x)+12]
457 #define beta_table(x) i_beta_table[(x)+12]
458 #define tc0_table(x) i_tc0_table[(x)+12]
/* Normal (inter) luma deblocking: 4 groups of 4 pixels along the edge,
 * each group using its own tc0[i] clipping value. xstride steps across
 * the edge, ystride along it. NOTE(review): the tc<=0 skip and the tc
 * adjustment lines are not visible in this view. */
461 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
464 for( i = 0; i < 4; i++ )
471 for( d = 0; d < 4; d++ )
473 const int p2 = pix[-3*xstride];
474 const int p1 = pix[-2*xstride];
475 const int p0 = pix[-1*xstride];
476 const int q0 = pix[ 0*xstride];
477 const int q1 = pix[ 1*xstride];
478 const int q2 = pix[ 2*xstride];
/* Only filter where the edge looks like a blocking artifact, not content. */
480 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
/* Flat p-side: also adjust p1 and extend the clipping range. */
484 if( abs( p2 - p0 ) < beta )
487 pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
/* Flat q-side: also adjust q1 and extend the clipping range. */
490 if( abs( q2 - q0 ) < beta )
493 pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
497 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
498 pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
499 pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* Vertical filtering of a horizontal edge: step across rows (stride). */
505 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
507 deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
/* Horizontal filtering of a vertical edge: step across columns (1). */
509 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
511 deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
/* Normal (inter) chroma deblocking: 4 groups of 2 pixels along the edge.
 * Only p0/q0 are modified (chroma has no p1'/q1' adjustment).
 * NOTE(review): the tc<=0 skip lines are not visible in this view. */
514 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
517 for( i = 0; i < 4; i++ )
519 const int tc = tc0[i];
525 for( d = 0; d < 2; d++ )
527 const int p1 = pix[-2*xstride];
528 const int p0 = pix[-1*xstride];
529 const int q0 = pix[ 0*xstride];
530 const int q1 = pix[ 1*xstride];
/* Same artifact-vs-content test as luma. */
532 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
534 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
535 pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
536 pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* Vertical chroma filtering of a horizontal edge. */
542 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
544 deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
/* Horizontal chroma filtering of a vertical edge. */
546 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
548 deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
/* Strong (intra) luma deblocking: 16 pixels along the edge, no tc clipping.
 * Uses the stronger 4/5-tap filters when the edge is very flat.
 * NOTE(review): else keywords/braces around the weak-filter fallbacks are
 * not visible in this view. */
551 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
554 for( d = 0; d < 16; d++ )
556 const int p2 = pix[-3*xstride];
557 const int p1 = pix[-2*xstride];
558 const int p0 = pix[-1*xstride];
559 const int q0 = pix[ 0*xstride];
560 const int q1 = pix[ 1*xstride];
561 const int q2 = pix[ 2*xstride];
563 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
/* Small discontinuity: eligible for the strong 3-pixel filters. */
565 if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
567 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
569 const int p3 = pix[-4*xstride];
570 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
571 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
572 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* p-side not flat: weak p0-only filter. */
575 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
576 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
578 const int q3 = pix[3*xstride];
579 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
580 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
581 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
/* q-side not flat: weak q0-only filter. */
584 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Larger discontinuity: weak filter on both sides. */
588 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
589 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Vertical intra luma filtering of a horizontal edge. */
595 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
597 deblock_luma_intra_c( pix, stride, 1, alpha, beta );
/* Horizontal intra luma filtering of a vertical edge. */
599 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
601 deblock_luma_intra_c( pix, 1, stride, alpha, beta );
/* Strong (intra) chroma deblocking: 8 pixel pairs along the edge, no tc
 * clipping. xstride steps across the edge, ystride steps along it. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 8; d++, pix += ystride )
    {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        /* Leave strong edges alone: they are taken to be real content. */
        if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
            continue;

        pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
/* Vertical intra chroma filtering of a horizontal edge. */
622 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
624 deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
/* Horizontal intra chroma filtering of a vertical edge. */
626 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
628 deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
/* Filter one inter edge: look up alpha/beta/tc0 from the qp-derived index,
 * skip entirely when both thresholds are zero, then run the normal filter.
 * pix2 (second chroma plane) is filtered too when non-NULL; b_chroma adds
 * the +1 chroma tc adjustment. */
631 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
633 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
634 const int alpha = alpha_table(index_a);
635 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
/* Zero thresholds mean the spec disables filtering at this qp/offset. */
638 if( !alpha || !beta )
641 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
642 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
643 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
644 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
646 pf_inter( pix1, i_stride, alpha, beta, tc );
648 pf_inter( pix2, i_stride, alpha, beta, tc );
/* Filter one intra edge (bS==4 style): same threshold lookup as
 * deblock_edge but no tc0 clipping; pix2 filtered when non-NULL. */
651 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
653 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
654 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
656 if( !alpha || !beta )
659 pf_intra( pix1, i_stride, alpha, beta );
661 pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one macroblock row (an MB pair row with MBAFF): for every MB,
 * compute boundary strength bS for each 4-pixel edge segment and run the
 * appropriate intra/inter luma+chroma filters in both directions.
 * CAVLC+8x8dct frames first munge the nnz array (see munge_cavlc_nnz_row)
 * and restore it afterwards. */
664 void x264_frame_deblock_row( x264_t *h, int mb_y )
666 const int s8x8 = 2 * h->mb.i_mb_stride;
667 const int s4x4 = 4 * h->mb.i_mb_stride;
668 const int b_interlaced = h->sh.b_mbaff;
669 const int mvy_limit = 4 >> b_interlaced;
/* Below this qp no edge can produce nonzero alpha/beta: skip the whole MB. */
670 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
671 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
673 int stridey = h->fdec->i_stride[0];
674 int stride2y = stridey << b_interlaced;
675 int strideuv = h->fdec->i_stride[1];
676 int stride2uv = strideuv << b_interlaced;
677 uint8_t (*nnz_backup)[16] = h->scratch_buffer;
679 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
680 munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
/* MBAFF walks each MB pair vertically (top field MB then bottom) before
 * moving right; the loop-step expression implements that interleave. */
682 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
684 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
685 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
686 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
687 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
688 const int i_qp = h->mb.qp[mb_xy];
/* P_SKIP MBs only need their left/top boundary edges filtered. */
689 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
690 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
691 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
692 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
/* Bottom field of an MBAFF pair: rebase the pixel pointers (adjustment
 * lines not visible in this view). */
693 if( b_interlaced && (mb_y&1) )
700 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
702 if( i_qp <= qp_thresh )
/* FILTER_DIR: run one inter/intra edge filter; i_dir==0 is a vertical edge
 * (horizontal filtering), i_dir==1 a horizontal edge. */
705 #define FILTER_DIR(intra, i_dir)\
708 i_qpn= h->mb.qp[mbn_xy];\
712 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
713 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
714 h->loopf.deblock_h_luma##intra );\
718 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
719 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
720 stride2uv, bS, i_qpc, 1,\
721 h->loopf.deblock_h_chroma##intra );\
726 /* horizontal edge */\
727 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
728 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
729 h->loopf.deblock_v_luma##intra );\
733 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
734 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
735 stride2uv, bS, i_qpc, 1,\
736 h->loopf.deblock_v_chroma##intra );\
/* DEBLOCK_STRENGTH: derive bS per 4-pixel segment from intra flags, nnz,
 * reference mismatches and motion vector differences. */
741 #define DEBLOCK_STRENGTH(i_dir)\
743 /* *** Get bS for each 4px for the current edge *** */\
744 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
745 M32( bS ) = 0x03030303;\
748 M32( bS ) = 0x00000000;\
749 for( i = 0; i < 4; i++ )\
751 int x = i_dir == 0 ? i_edge : i;\
752 int y = i_dir == 0 ? i : i_edge;\
753 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
754 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
755 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
756 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
758 else if(!(i_edge&no_sub8x8))\
760 if((i&no_sub8x8) && bS[i-1] != 2)\
764 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
765 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
766 int i4p= mb_4x4+x+y*s4x4;\
767 int i4q= mbn_4x4+xn+yn*s4x4;\
769 /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
770 if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
771 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
772 else if( !h->mb.b_interlaced )\
773 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
775 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
776 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
778 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
779 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
780 (h->sh.i_type == SLICE_TYPE_B &&\
781 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
782 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
783 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
793 /* i_dir == 0 -> vertical edge
794 * i_dir == 1 -> horizontal edge */
/* DEBLOCK_DIR: filter the MB boundary edge (intra path when either MB is
 * intra), then the internal edges, stepping by 2 for 8x8 transform. */
795 #define DEBLOCK_DIR(i_dir)\
797 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
798 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
799 ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
801 i_edge+= b_8x8_transform;\
804 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
805 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
806 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
807 if( b_interlaced && i_dir == 1 )\
809 mbn_xy -= h->mb.i_mb_stride;\
810 mbn_8x8 -= 2 * s8x8;\
811 mbn_4x4 -= 4 * s4x4;\
813 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
815 FILTER_DIR( _intra, i_dir );\
818 DEBLOCK_STRENGTH(i_dir);\
820 FILTER_DIR( , i_dir);\
822 i_edge += b_8x8_transform+1;\
827 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
829 DEBLOCK_STRENGTH(i_dir);\
831 FILTER_DIR( , i_dir);\
/* Restore the nnz values munged above for CAVLC+8x8dct. */
839 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
840 munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row )
843 void x264_frame_deblock( x264_t *h )
846 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
847 x264_frame_deblock_row( h, mb_y );
/* Prototypes for the x86 assembly deblocking kernels (MMXEXT/SSE2);
 * bodies live in the platform-specific asm sources. */
851 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
852 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
853 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
854 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
856 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
857 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
858 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
859 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
/* The v8 variants filter only an 8-pixel-wide half; see the C wrappers below. */
861 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
862 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
863 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
864 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* Full 16-wide vertical luma filter built from two 8-wide MMX halves;
 * the second half uses the upper two tc0 entries. */
866 static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
868 x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
869 x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
/* Full 16-wide vertical intra luma filter from two 8-wide MMX halves. */
871 static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
873 x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta );
874 x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
/* Prototypes for the PPC AltiVec and ARM NEON deblocking kernels. */
880 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
881 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
885 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
886 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
887 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
888 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Populate the deblocking function table: C fallbacks first, then override
 * with the best available SIMD versions for the detected CPU.
 * NOTE(review): the #ifdef HAVE_MMX / ARCH_PPC / ARCH_ARM guards around the
 * platform sections are not all visible in this view. */
891 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
893 pf->deblock_v_luma = deblock_v_luma_c;
894 pf->deblock_h_luma = deblock_h_luma_c;
895 pf->deblock_v_chroma = deblock_v_chroma_c;
896 pf->deblock_h_chroma = deblock_h_chroma_c;
897 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
898 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
899 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
900 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
903 if( cpu&X264_CPU_MMXEXT )
905 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
906 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
907 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
908 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
910 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
911 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
912 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
913 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
/* SSE2 luma kernels need 16-byte stack alignment, hence the STACK_MOD4 check. */
915 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
917 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
918 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
919 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
920 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
926 if( cpu&X264_CPU_ALTIVEC )
928 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
929 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
931 #endif // HAVE_ALTIVEC
934 if( cpu&X264_CPU_NEON )
936 pf->deblock_v_luma = x264_deblock_v_luma_neon;
937 pf->deblock_h_luma = x264_deblock_h_luma_neon;
938 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
939 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
/* Publish deblock/reconstruction progress: update i_lines_completed under
 * the frame mutex and wake all threads waiting in x264_frame_cond_wait.
 * The store must happen while holding the lock so waiters never miss it. */
946 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
948 x264_pthread_mutex_lock( &frame->mutex );
949 frame->i_lines_completed = i_lines_completed;
950 x264_pthread_cond_broadcast( &frame->cv );
951 x264_pthread_mutex_unlock( &frame->mutex );
/* Block until at least i_lines_completed lines of the frame are done.
 * Standard condition-variable loop: re-check the predicate after every
 * wakeup to tolerate spurious wakeups. */
954 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
956 x264_pthread_mutex_lock( &frame->mutex );
957 while( frame->i_lines_completed < i_lines_completed )
958 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
959 x264_pthread_mutex_unlock( &frame->mutex );
/* Append frame to the end of a NULL-terminated frame list.
 * NOTE(review): the index init and the final store are not visible here. */
964 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
967 while( list[i] ) i++;
/* Remove and return the last frame of a NULL-terminated frame list.
 * NOTE(review): the removal/return lines are not visible in this view. */
971 x264_frame_t *x264_frame_pop( x264_frame_t **list )
976 while( list[i+1] ) i++;
/* Insert frame at the front of a NULL-terminated list, shifting the rest.
 * NOTE(review): the shift-down loop and front store are not visible here. */
982 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
985 while( list[i] ) i++;
/* Remove and return the first frame of a NULL-terminated list, shifting
 * the remaining entries forward (the shift body is not visible here). */
991 x264_frame_t *x264_frame_shift( x264_frame_t **list )
993 x264_frame_t *frame = list[0];
995 for( i = 0; list[i]; i++ )
1001 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
1003 assert( frame->i_reference_count > 0 );
1004 frame->i_reference_count--;
1005 if( frame->i_reference_count == 0 )
1006 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
/* Get a frame for use: recycle one from the unused list when available,
 * otherwise allocate a fresh one, then reset its per-use state.
 * NOTE(review): the allocation-failure check/return is not visible here. */
1009 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
1011 x264_frame_t *frame;
1012 if( h->frames.unused[b_fdec][0] )
1013 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1015 frame = x264_frame_new( h, b_fdec );
/* Reset lookahead/encode state that persists across reuses. */
1018 frame->b_last_minigop_bframe = 0;
1019 frame->i_reference_count = 1;
1020 frame->b_intra_calculated = 0;
1021 frame->b_scenecut = 1;
1022 frame->b_keyframe = 0;
1024 memset( frame->weight, 0, sizeof(frame->weight) );
1025 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1030 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1032 assert( frame->i_reference_count > 0 );
1033 frame->i_reference_count--;
1034 if( frame->i_reference_count == 0 )
1035 x264_frame_push( h->frames.blank_unused, frame );
/* Get a blank frame shell (used for duplicates that alias a real frame's
 * buffers): recycle from blank_unused or allocate a bare struct.
 * b_duplicate=1 makes x264_frame_delete skip freeing the aliased buffers.
 * NOTE(review): the NULL-check/return lines are not visible in this view. */
1038 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1040 x264_frame_t *frame;
1041 if( h->frames.blank_unused[0] )
1042 frame = x264_frame_pop( h->frames.blank_unused );
1044 frame = x264_malloc( sizeof(x264_frame_t) );
1047 frame->b_duplicate = 1;
1048 frame->i_reference_count = 1;
/* Bubble-sort a NULL-terminated frame list: by (type, frame number) when
 * b_dts is set, otherwise by display order (the else arm of the swap
 * expression and the outer repeat loop are not visible in this view). */
1052 void x264_frame_sort( x264_frame_t **list, int b_dts )
1057 for( i = 0; list[i+1]; i++ )
1059 int dtype = list[i]->i_type - list[i+1]->i_type;
1060 int dtime = list[i]->i_frame - list[i+1]->i_frame;
1061 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1065 XCHG( x264_frame_t*, list[i], list[i+1] );
/* Apply explicit weighted-prediction scaling to a whole plane, processing
 * horizontal strips of 16 rows and 16-pixel-wide columns via the SIMD
 * weight function table. NOTE(review): the i_height decrement at the end
 * of the strip loop is not visible in this view. */
1072 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1073 int i_width, int i_height, x264_weight_t *w )
1076 /* Weight horizontal strips of height 16. This was found to be the optimal height
1077 * in terms of the cache loads. */
1078 while( i_height > 0 )
1080 for( x = 0; x < i_width; x += 16 )
1081 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1083 dst += 16 * i_dst_stride;
1084 src += 16 * i_src_stride;
/* Delete every frame in a NULL-terminated list (loop setup and the list's
 * own deallocation are not visible in this view). */
1088 void x264_frame_delete_list( x264_frame_t **list )
1094 x264_frame_delete( list[i++] );
/* Initialize a bounded, thread-safe frame queue: a NULL-terminated list of
 * up to max_size frames plus a mutex and fill/empty condition variables.
 * Returns nonzero on failure (return statements not visible in this view). */
1098 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1102 slist->i_max_size = max_size;
/* +1 slot keeps the list NULL-terminated even when full. */
1104 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1105 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1106 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1107 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
/* Destroy a synch frame list: tear down the synchronization primitives,
 * then free all queued frames and the list array itself. Callers must
 * ensure no thread is still using the list. */
1114 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1116 x264_pthread_mutex_destroy( &slist->mutex );
1117 x264_pthread_cond_destroy( &slist->cv_fill );
1118 x264_pthread_cond_destroy( &slist->cv_empty );
1119 x264_frame_delete_list( slist->list );
/* Blocking producer push: wait (on cv_empty) while the queue is full,
 * append the frame under the lock, then signal consumers via cv_fill.
 * The broadcast is done after unlocking to avoid waking a consumer that
 * would immediately block on the still-held mutex. */
1122 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1124 x264_pthread_mutex_lock( &slist->mutex );
1125 while( slist->i_size == slist->i_max_size )
1126 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1127 slist->list[ slist->i_size++ ] = frame;
1128 x264_pthread_mutex_unlock( &slist->mutex );
1129 x264_pthread_cond_broadcast( &slist->cv_fill );