1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a. NOTE: a must be a power of two
 * (the mask trick does not work otherwise). */
27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate a new frame and all of its per-frame buffers.
 * b_fdec selects between a reconstructed/reference frame (fdec, nonzero) and
 * an input/encode frame (fenc, zero); the two need different side buffers.
 * Returns the new frame, or NULL on allocation failure (via CHECKED_MALLOC's
 * goto-fail path — the fail label is outside this excerpt).
 * NOTE(review): this chunk is a line-sampled extract; braces and some
 * statements are elided, so structure comments below are inferred — confirm
 * against the full file. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
/* Vertical padding doubles for interlaced so each field is fully padded. */
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
/* Stride alignment matches the CPU's cacheline size (64/32), else 16. */
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
/* Interlaced rounds height to 32 so both fields are mod-16. */
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* Plane 0 is luma; planes 1,2 are half-size chroma (>> !!i). */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
/* Chroma buffers: plane pointer is offset past the top/left padding. */
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* Per-row SATD arrays, indexed by (p0,p1) reference distances for ratecontrol. */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
78 /* all 4 luma planes allocated together, since the cacheline split code
79 * requires them to be in-phase wrt cacheline alignment. */
80 if( h->param.analyse.i_subpel_refine && b_fdec )
82 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
/* filtered[0..3] = full-pel, h-half, v-half, hv-half interpolation planes. */
83 for( i = 0; i < 4; i++ )
84 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
85 frame->plane[0] = frame->filtered[0];
/* No subpel refine (or fenc): a single luma plane suffices. */
89 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
90 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
93 frame->b_duplicate = 0;
95 if( b_fdec ) /* fdec frame */
97 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
/* 16 motion vectors (x,y int16 pairs) per MB for list 0. */
98 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
/* 4 reference indices (one per 8x8 block) per MB. */
99 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
100 if( h->param.i_bframe )
102 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
103 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
/* No B-frames: list-1 data is never used. */
108 frame->ref[1] = NULL;
110 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
111 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* ESA/TESA motion search needs an integral (summed-area) buffer;
 * doubled when sub-8x8 ESA partitions are enabled. */
112 if( h->param.analyse.i_me_method >= X264_ME_ESA )
114 CHECKED_MALLOC( frame->buffer[3],
115 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
116 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
119 else /* fenc frame */
121 if( h->frames.b_have_lowres )
/* Half-resolution planes used by lookahead/slicetype analysis. */
123 frame->i_width_lowres = frame->i_width[0]/2;
124 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
125 frame->i_lines_lowres = frame->i_lines[0]/2;
127 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
/* 4 lowres planes (full/h/v/hv half-pel), allocated contiguously. */
129 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
130 for( i = 0; i < 4; i++ )
131 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
/* Lowres MVs/costs per list (j) and per reference distance (i). */
133 for( j = 0; j <= !!h->param.i_bframe; j++ )
134 for( i = 0; i <= h->param.i_bframe; i++ )
136 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
137 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
/* +3 slack: SIMD code reads/writes in groups of 4 MBs. */
139 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
140 for( j = 0; j <= h->param.i_bframe+1; j++ )
141 for( i = 0; i <= h->param.i_bframe+1; i++ )
143 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
144 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
/* Intra cost aliases the [0][0] cost plane; -1 fill marks "not computed". */
146 frame->i_intra_cost = frame->lowres_costs[0][0];
147 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
149 if( h->param.rc.i_aq_mode )
151 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
152 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
153 if( h->frames.b_have_lowres )
154 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
155 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* Synchronization for threaded encoding (cond_broadcast/cond_wait below). */
159 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
161 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and every buffer x264_frame_new allocated for it.
 * Loop bounds use the compile-time maxima (X264_BFRAME_MAX), which cover the
 * runtime i_bframe-sized allocations; x264_free(NULL) is assumed to be a
 * no-op for the entries that were never allocated. */
171 void x264_frame_delete( x264_frame_t *frame )
174 /* Duplicate frames are blank copies of real frames (including pointers),
175 * so freeing those pointers would cause a double free later. */
176 if( !frame->b_duplicate )
178 for( i = 0; i < 4; i++ )
179 x264_free( frame->buffer[i] );
180 for( i = 0; i < 4; i++ )
181 x264_free( frame->buffer_lowres[i] );
182 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
183 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
184 x264_free( frame->i_row_satds[i][j] );
185 for( j = 0; j < 2; j++ )
186 for( i = 0; i <= X264_BFRAME_MAX; i++ )
188 x264_free( frame->lowres_mvs[j][i] );
189 x264_free( frame->lowres_mv_costs[j][i] );
191 x264_free( frame->i_propagate_cost );
192 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
193 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
195 x264_free( frame->lowres_costs[j][i] );
196 x264_free( frame->lowres_inter_types[j][i] );
198 x264_free( frame->f_qp_offset );
199 x264_free( frame->f_qp_offset_aq );
200 x264_free( frame->i_inv_qscale_factor );
201 x264_free( frame->i_row_bits );
202 x264_free( frame->i_row_qp );
203 x264_free( frame->mb_type );
204 x264_free( frame->mv[0] );
205 x264_free( frame->mv[1] );
206 x264_free( frame->ref[0] );
207 x264_free( frame->ref[1] );
/* Thread-sync primitives are destroyed even for duplicates (presumably each
 * duplicate initialized its own — confirm against the full file). */
208 x264_pthread_mutex_destroy( &frame->mutex );
209 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied x264_picture_t into an internal frame.
 * Only I420/YV12 input is accepted; returns an error (path elided in this
 * excerpt) for any other colorspace. Copies metadata (type, QP, PTS, params)
 * then each plane via the mc.plane_copy function pointer. */
214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
216 int i_csp = src->img.i_csp & X264_CSP_MASK;
218 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
220 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
224 dst->i_type = src->i_type;
225 dst->i_qpplus1 = src->i_qpplus1;
226 dst->i_pts = dst->i_reordered_pts = src->i_pts;
227 dst->param = src->param;
/* YV12 stores V before U; i^3 maps plane 1<->2 to normalize to I420 order. */
231 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
232 uint8_t *plane = src->img.plane[s];
233 int stride = src->img.i_stride[s];
234 int width = h->param.i_width >> !!i;
235 int height = h->param.i_height >> !!i;
/* VFLIP input: start at the last row (stride negation is elided here —
 * presumably the stride is negated on a following line; confirm). */
236 if( src->img.i_csp & X264_CSP_VFLIP )
238 plane += (height-1)*stride;
241 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Pad one plane's borders: left/right by replicating the edge pixel of each
 * row (memset), top/bottom by copying the first/last padded row (memcpy).
 * b_pad_top / b_pad_bottom gate the vertical padding so a plane can be padded
 * incrementally, one stripe at a time. */
248 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
250 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
252 for( y = 0; y < i_height; y++ )
/* Replicate leftmost pixel into the left pad, rightmost into the right pad. */
255 memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
257 memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
/* Top padding: duplicate row 0 (including its horizontal pads) upward. */
261 for( y = 0; y < i_padv; y++ )
262 memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* Bottom padding: duplicate the last row downward. */
265 for( y = 0; y < i_padv; y++ )
266 memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
/* Expand the padded borders of all planes for the stripe of macroblock rows
 * ending at mb_y (b_end marks the last stripe of the frame). For MBAFF,
 * each field is padded separately by calling plane_expand_border with a
 * doubled stride on even/odd row start. */
270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
274 if( mb_y & h->sh.b_mbaff )
276 for( i = 0; i < frame->i_plane; i++ )
278 int stride = frame->i_stride[i];
279 int width = 16*h->sps->i_mb_width >> !!i;
280 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
281 int padh = PADH >> !!i;
282 int padv = PADV >> !!i;
283 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
284 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
/* Last stripe (not also first) must cover the deblock overlap rows too. */
285 if( b_end && !b_start )
286 height += 4 >> (!!i + h->sh.b_mbaff);
/* Interlaced: pad the two fields independently (stride*2, offset rows). */
289 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
290 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
294 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Expand borders of the three half-pel filtered luma planes (filtered[1..3])
 * for the stripe ending at mb_y, analogous to x264_frame_expand_border. */
299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
301 /* during filtering, 8 extra pixels were filtered on each edge,
302 * but up to 3 of the horizontal ones may be wrong.
303 we want to expand border from the last filtered pixel */
305 int stride = frame->i_stride[0];
306 int width = 16*h->sps->i_mb_width + 8;
307 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
311 for( i = 1; i < 4; i++ )
313 // buffer: 8 luma, to match the hpel filter
314 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* Interlaced: pad each field separately via doubled stride. */
317 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
318 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
322 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Pad all four half-resolution lookahead planes in one pass (whole plane,
 * both top and bottom padding enabled). */
327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
330 for( i = 0; i < 4; i++ )
331 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the right/bottom edges of each plane out to the coded (mod-16)
 * dimensions, for input whose width/height is not a multiple of 16:
 * replicate the last real column rightward and the last real row downward. */
334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
337 for( i = 0; i < frame->i_plane; i++ )
339 int i_subsample = i ? 1 : 0;
340 int i_width = h->param.i_width >> i_subsample;
341 int i_height = h->param.i_height >> i_subsample;
/* Extra columns/rows needed to reach the MB-aligned size. */
342 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
343 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
/* Replicate rightmost pixel across the pad columns of every row. */
347 for( y = 0; y < i_height; y++ )
348 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
349 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
354 //FIXME interlace? or just let it pad using the wrong field
355 for( y = i_height; y < i_height + i_pady; y++ )
356 memcpy( &frame->plane[i][y*frame->i_stride[i]],
357 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
365 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save one MB row's nnz values into buf, then rewrite them in-place so each
 * 8x8 block's four 4x4 nnz flags agree (deblocking view). The uint32 ops
 * process 4 bytes of nnz flags at once; 0x0101/0x01010000 set the low/high
 * 16-bit halves when any coefficient in that half's pair is nonzero. */
366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
368 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
369 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
371 for( x=0; x<h->sps->i_mb_width; x++ )
/* Back up the original 16 luma nnz bytes for later restore. */
373 memcpy( buf+x, src+x, 16 );
376 nnz = src[x][0] | src[x][1];
377 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
378 nnz = src[x][2] | src[x][3];
379 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
/* Undo munge_cavlc_nnz_row: restore the saved 16 nnz bytes per MB from buf. */
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
388 for( x=0; x<h->sps->i_mb_width; x++ )
389 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current MB row and its neighbors;
 * deblocking reads nnz from rows above/below, so those are munged too,
 * each using its own slice of the backup buffer. The conditions guarding
 * the neighbor calls are elided in this excerpt. */
392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
394 func( h, mb_y, buf );
396 func( h, mb_y-1, buf + h->sps->i_mb_width );
399 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
401 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
406 /* Deblocking filter */
/* Alpha/beta thresholds and tc0 clipping values from the H.264 spec
 * (tables indexed by QP). Each table carries 12 extra entries on both ends
 * so that QP + alpha_c0/beta offsets (range ±12) never index out of bounds;
 * the accessor macros below add the +12 bias. */
407 static const uint8_t i_alpha_table[52+12*2] =
409 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
410 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
411 0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
412 7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
413 25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
414 80, 90,101,113,127,144,162,182,203,226,
416 255,255,255,255,255,255,255,255,255,255,255,255,
418 static const uint8_t i_beta_table[52+12*2] =
420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
422 0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
423 3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
424 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
425 13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
427 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
/* tc0 per QP, indexed by boundary strength bS (1..3); entry [0] is -1 as a
 * placeholder since bS==0 edges are never filtered with tc0. */
429 static const int8_t i_tc0_table[52+12*2][4] =
431 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
432 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
433 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
434 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
435 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
436 {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
437 {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
438 {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
439 {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
440 {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
441 {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
442 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
443 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
/* Biased accessors: valid for indices in [-12, 51+12]. */
445 #define alpha_table(x) i_alpha_table[(x)+12]
446 #define beta_table(x) i_beta_table[(x)+12]
447 #define tc0_table(x) i_tc0_table[(x)+12]
/* Normal (inter) luma deblock of one 16-pixel edge: 4 segments of 4 lines,
 * each segment clipped by its own tc0[i]. xstride steps across the edge,
 * ystride along it, so the same code serves vertical and horizontal edges. */
450 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
453 for( i = 0; i < 4; i++ )
460 for( d = 0; d < 4; d++ )
462 const int p2 = pix[-3*xstride];
463 const int p1 = pix[-2*xstride];
464 const int p0 = pix[-1*xstride];
465 const int q0 = pix[ 0*xstride];
466 const int q1 = pix[ 1*xstride];
467 const int q2 = pix[ 2*xstride];
/* Filter only where the edge looks like a blocking artifact, not a real edge. */
469 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
/* p1' — applied when the p-side is smooth enough. */
473 if( abs( p2 - p0 ) < beta )
476 pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
/* q1' — likewise for the q side. */
479 if( abs( q2 - q0 ) < beta )
482 pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
486 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
487 pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
488 pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* Vertical-filter wrapper: pixels across the edge are `stride` apart. */
494 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
496 deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
/* Horizontal-filter wrapper: pixels across the edge are adjacent bytes. */
498 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
500 deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
/* Normal (inter) chroma deblock of one 8-pixel edge: 4 segments of 2 lines.
 * Only p0/q0 are modified (chroma filter is shorter than luma). */
503 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
506 for( i = 0; i < 4; i++ )
508 const int tc = tc0[i];
514 for( d = 0; d < 2; d++ )
516 const int p1 = pix[-2*xstride];
517 const int p0 = pix[-1*xstride];
518 const int q0 = pix[ 0*xstride];
519 const int q1 = pix[ 1*xstride];
521 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
523 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
524 pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
525 pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* Vertical / horizontal chroma wrappers (see luma wrappers above). */
533 deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
535 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
537 deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
/* Strong (intra, bS==4) luma deblock of one 16-pixel edge. No tc0 clipping;
 * instead a stronger 4/5-tap filter is used when the edge is very flat. */
540 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
543 for( d = 0; d < 16; d++ )
545 const int p2 = pix[-3*xstride];
546 const int p1 = pix[-2*xstride];
547 const int p0 = pix[-1*xstride];
548 const int q0 = pix[ 0*xstride];
549 const int q1 = pix[ 1*xstride];
550 const int q2 = pix[ 2*xstride];
552 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
/* Extra-flat edge: use the strong 3-pixel filter on each side. */
554 if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
556 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
558 const int p3 = pix[-4*xstride];
559 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
560 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
561 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* p-side not smooth: weak filter on p0 only. */
564 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
565 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
567 const int q3 = pix[3*xstride];
568 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
569 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
570 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
573 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Not extra-flat: weak filter on p0/q0 only. */
577 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
578 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Vertical / horizontal wrappers for the strong (intra) luma filter. */
584 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
586 deblock_luma_intra_c( pix, stride, 1, alpha, beta );
588 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
590 deblock_luma_intra_c( pix, 1, stride, alpha, beta );
/* Strong (intra) chroma deblock of one 8-pixel edge: simple p0/q0 smoothing,
 * no tc0 clipping. */
593 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
596 for( d = 0; d < 8; d++ )
598 const int p1 = pix[-2*xstride];
599 const int p0 = pix[-1*xstride];
600 const int q0 = pix[ 0*xstride];
601 const int q1 = pix[ 1*xstride];
603 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
605 pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
606 pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
/* Vertical / horizontal wrappers for the strong (intra) chroma filter. */
611 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
613 deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
615 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
617 deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
/* Filter one inter edge: look up alpha/beta from QP (+slice offsets), bail
 * out if both are zero, build the per-segment tc array from bS, then call
 * the filter on pix1 (and pix2, used for the second chroma plane). */
620 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
622 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
623 const int alpha = alpha_table(index_a);
624 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
627 if( !alpha || !beta )
/* Chroma uses tc0+1 per the spec. */
630 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
631 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
632 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
633 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
635 pf_inter( pix1, i_stride, alpha, beta, tc );
637 pf_inter( pix2, i_stride, alpha, beta, tc );
/* Filter one intra (bS==4) edge: same threshold lookup as deblock_edge but
 * the strong filter needs no tc array. */
640 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
642 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
643 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
645 if( !alpha || !beta )
648 pf_intra( pix1, i_stride, alpha, beta );
650 pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one macroblock row (two rows interleaved when MBAFF).
 * Walks each MB, computes boundary strength per edge, and dispatches to the
 * h->loopf function pointers. CAVLC+8x8dct needs its nnz array munged to
 * deblocking granularity first, and restored afterwards.
 * NOTE(review): comments are added only outside the backslash-continued
 * macro bodies to avoid breaking line continuation. */
653 void x264_frame_deblock_row( x264_t *h, int mb_y )
655 const int s8x8 = 2 * h->mb.i_mb_stride;
656 const int s4x4 = 4 * h->mb.i_mb_stride;
657 const int b_interlaced = h->sh.b_mbaff;
658 const int mvy_limit = 4 >> b_interlaced;
659 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
660 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
662 int stridey = h->fdec->i_stride[0];
663 int stride2y = stridey << b_interlaced;
664 int strideuv = h->fdec->i_stride[1];
665 int stride2uv = strideuv << b_interlaced;
666 uint8_t (*nnz_backup)[16] = h->scratch_buffer;
668 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
669 munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
/* MBAFF iterates MB pairs: the x/y update expression alternates fields. */
671 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
673 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
674 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
675 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
676 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
677 const int i_qp = h->mb.qp[mb_xy];
678 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
679 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
680 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
681 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
682 if( b_interlaced && (mb_y&1) )
689 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
/* QP below threshold cannot produce nonzero alpha/beta: skip the whole MB. */
691 if( i_qp <= qp_thresh )
/* Filter all edges of one direction; i_dir==0 vertical edges (h filter),
 * i_dir==1 horizontal edges (v filter). Luma then chroma (even edges). */
694 #define FILTER_DIR(intra, i_dir)\
697 i_qpn= h->mb.qp[mbn_xy];\
701 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
702 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
703 h->loopf.deblock_h_luma##intra );\
707 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
708 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
709 stride2uv, bS, i_qpc, 1,\
710 h->loopf.deblock_h_chroma##intra );\
715 /* horizontal edge */\
716 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
717 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
718 h->loopf.deblock_v_luma##intra );\
722 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
723 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
724 stride2uv, bS, i_qpc, 1,\
725 h->loopf.deblock_v_chroma##intra );\
/* Compute bS[0..3] for the current edge: 3 for intra MBs, 2 when either side
 * has nonzero coefficients, 1 when refs/MVs differ enough, else 0. */
730 #define DEBLOCK_STRENGTH(i_dir)\
732 /* *** Get bS for each 4px for the current edge *** */\
733 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
734 M32( bS ) = 0x03030303;\
737 M32( bS ) = 0x00000000;\
738 for( i = 0; i < 4; i++ )\
740 int x = i_dir == 0 ? i_edge : i;\
741 int y = i_dir == 0 ? i : i_edge;\
742 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
743 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
744 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
745 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
747 else if(!(i_edge&no_sub8x8))\
749 if((i&no_sub8x8) && bS[i-1] != 2)\
753 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
754 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
755 int i4p= mb_4x4+x+y*s4x4;\
756 int i4q= mbn_4x4+xn+yn*s4x4;\
758 /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
759 if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
760 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
761 else if( !h->mb.b_interlaced )\
762 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
764 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
765 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
767 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
768 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
769 (h->sh.i_type == SLICE_TYPE_B &&\
770 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
771 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
772 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
782 /* i_dir == 0 -> vertical edge
783 * i_dir == 1 -> horizontal edge */
784 #define DEBLOCK_DIR(i_dir)\
786 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
787 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
788 ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
790 i_edge+= b_8x8_transform;\
793 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
794 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
795 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
796 if( b_interlaced && i_dir == 1 )\
798 mbn_xy -= h->mb.i_mb_stride;\
799 mbn_8x8 -= 2 * s8x8;\
800 mbn_4x4 -= 4 * s4x4;\
802 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
804 FILTER_DIR( _intra, i_dir );\
807 DEBLOCK_STRENGTH(i_dir);\
809 FILTER_DIR( , i_dir);\
811 i_edge += b_8x8_transform+1;\
816 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
818 DEBLOCK_STRENGTH(i_dir);\
820 FILTER_DIR( , i_dir);\
/* Restore CAVLC nnz values munged before the loop. */
828 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
829 munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row )
/* Deblock the whole frame, one MB row (or MBAFF MB-pair row) at a time. */
832 void x264_frame_deblock( x264_t *h )
835 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
836 x264_frame_deblock_row( h, mb_y );
/* Prototypes for the assembly deblock implementations (x86 MMXEXT/SSE2,
 * PPC AltiVec, ARM NEON), selected at runtime in x264_deblock_init. */
840 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
841 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
842 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
843 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
845 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
846 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
847 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
848 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
850 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
851 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
852 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
853 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* MMX registers are 8 bytes wide, so the 16-pixel vertical luma edge is
 * filtered as two 8-pixel halves (v8 variants), tc0 advancing by 2. */
855 static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
857 x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
858 x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
860 static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
862 x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta );
863 x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
869 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
870 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
874 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
875 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
876 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
877 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Fill the deblock function-pointer table: C fallbacks first, then override
 * with the best SIMD versions the cpu flags allow (the #ifdef guards for
 * each architecture are elided in this excerpt). */
880 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
882 pf->deblock_v_luma = deblock_v_luma_c;
883 pf->deblock_h_luma = deblock_h_luma_c;
884 pf->deblock_v_chroma = deblock_v_chroma_c;
885 pf->deblock_h_chroma = deblock_h_chroma_c;
886 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
887 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
888 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
889 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
892 if( cpu&X264_CPU_MMXEXT )
894 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
895 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
896 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
897 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
899 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
900 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
901 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
902 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
/* SSE2 luma requires 16-byte stack alignment (hence the STACK_MOD4 check). */
904 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
906 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
907 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
908 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
909 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
915 if( cpu&X264_CPU_ALTIVEC )
917 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
918 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
923 if( cpu&X264_CPU_NEON )
925 pf->deblock_v_luma = x264_deblock_v_luma_neon;
926 pf->deblock_h_luma = x264_deblock_h_luma_neon;
927 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
928 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
/* Publish encode progress: update i_lines_completed under the frame mutex
 * and wake every thread waiting in x264_frame_cond_wait. */
935 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
937 x264_pthread_mutex_lock( &frame->mutex );
938 frame->i_lines_completed = i_lines_completed;
939 x264_pthread_cond_broadcast( &frame->cv );
940 x264_pthread_mutex_unlock( &frame->mutex );
/* Block until at least i_lines_completed lines of the frame are done.
 * Standard condition-variable pattern: re-check the predicate in a loop to
 * tolerate spurious wakeups. */
943 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
945 x264_pthread_mutex_lock( &frame->mutex );
946 while( frame->i_lines_completed < i_lines_completed )
947 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
948 x264_pthread_mutex_unlock( &frame->mutex );
/* Append frame to the end of a NULL-terminated frame list. */
953 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
956 while( list[i] ) i++;
/* Remove and return the last frame of a NULL-terminated list
 * (list is assumed non-empty — confirmed by callers, not checked here). */
960 x264_frame_t *x264_frame_pop( x264_frame_t **list )
965 while( list[i+1] ) i++;
/* Insert frame at the front of the list (existing entries shift up; the
 * shifting loop is elided in this excerpt). */
971 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
974 while( list[i] ) i++;
/* Remove and return the first frame; shift the remaining entries down. */
980 x264_frame_t *x264_frame_shift( x264_frame_t **list )
982 x264_frame_t *frame = list[0];
984 for( i = 0; list[i]; i++ )
/* Drop one reference; when the count hits zero, return the frame to the
 * appropriate unused pool (fenc vs fdec, keyed by b_fdec). */
990 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
992 assert( frame->i_reference_count > 0 );
993 frame->i_reference_count--;
994 if( frame->i_reference_count == 0 )
995 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
/* Get a frame from the unused pool, or allocate a fresh one if the pool is
 * empty; reset the per-use state before handing it out. */
998 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
1000 x264_frame_t *frame;
1001 if( h->frames.unused[b_fdec][0] )
1002 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1004 frame = x264_frame_new( h, b_fdec );
1007 frame->b_last_minigop_bframe = 0;
1008 frame->i_reference_count = 1;
1009 frame->b_intra_calculated = 0;
1010 frame->b_scenecut = 1;
1011 frame->b_keyframe = 0;
1013 memset( frame->weight, 0, sizeof(frame->weight) );
1014 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
/* Like x264_frame_push_unused but for blank duplicate frames, which have
 * their own pool (their buffers alias a real frame's — see b_duplicate). */
1019 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1021 assert( frame->i_reference_count > 0 );
1022 frame->i_reference_count--;
1023 if( frame->i_reference_count == 0 )
1024 x264_frame_push( h->frames.blank_unused, frame );
/* Get a blank duplicate frame shell from the pool, or allocate one; it
 * carries no buffers of its own (b_duplicate=1 prevents double-free). */
1027 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1029 x264_frame_t *frame;
1030 if( h->frames.blank_unused[0] )
1031 frame = x264_frame_pop( h->frames.blank_unused );
1033 frame = x264_malloc( sizeof(x264_frame_t) );
1036 frame->b_duplicate = 1;
1037 frame->i_reference_count = 1;
/* Bubble-sort the list into decode order (b_dts: type-major, then frame
 * number) or display order (the else branch of `swap` is elided here).
 * List sizes are tiny, so O(n^2) is fine. */
1041 void x264_frame_sort( x264_frame_t **list, int b_dts )
1046 for( i = 0; list[i+1]; i++ )
1048 int dtype = list[i]->i_type - list[i+1]->i_type;
1049 int dtime = list[i]->i_frame - list[i+1]->i_frame;
1050 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1054 XCHG( x264_frame_t*, list[i], list[i+1] );
/* Apply explicit weighted-prediction scaling to a plane, processing it in
 * 16x16-wide chunks via the w->weightfn dispatch table. */
1061 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1062 int i_width, int i_height, x264_weight_t *w )
1065 /* Weight horizontal strips of height 16. This was found to be the optimal height
1066 * in terms of the cache loads. */
1067 while( i_height > 0 )
1069 for( x = 0; x < i_width; x += 16 )
1070 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1072 dst += 16 * i_dst_stride;
1073 src += 16 * i_src_stride;
/* Free every frame in a NULL-terminated list (and, presumably, the list
 * itself on an elided line — confirm against the full file). */
1077 void x264_frame_delete_list( x264_frame_t **list )
1083 x264_frame_delete( list[i++] );
/* Initialize a bounded, thread-safe frame queue: NULL-terminated list of
 * max_size entries plus mutex and fill/empty condition variables.
 * Returns nonzero on failure (error path elided in this excerpt). */
1087 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1091 slist->i_max_size = max_size;
1093 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1094 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1095 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1096 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
/* Tear down a synch frame list: destroy the sync primitives and free the
 * queued frames via x264_frame_delete_list. */
1103 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1105 x264_pthread_mutex_destroy( &slist->mutex );
1106 x264_pthread_cond_destroy( &slist->cv_fill );
1107 x264_pthread_cond_destroy( &slist->cv_empty );
1108 x264_frame_delete_list( slist->list );
1111 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1113 x264_pthread_mutex_lock( &slist->mutex );
1114 while( slist->i_size == slist->i_max_size )
1115 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1116 slist->list[ slist->i_size++ ] = frame;
1117 x264_pthread_mutex_unlock( &slist->mutex );
1118 x264_pthread_cond_broadcast( &slist->cv_fill );