1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a (a must be a power of two). */
27 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate a new x264_frame_t and every buffer it needs.
 * b_fdec != 0 -> decoded-picture flavor (mb_type/mv/ref arrays, per-row
 *   rate-control arrays, optional ESA integral plane);
 * b_fdec == 0 -> encode-input flavor (lowres planes, lookahead cost arrays).
 * NOTE(review): this dump omits interleaved lines (braces, local
 * declarations such as `frame`, the `fail:` cleanup path and the returns)
 * relative to the full source — confirm against the complete file. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
/* vertical padding doubles for interlaced encodes */
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
/* stride alignment follows the detected cacheline size (64/32/16 bytes) */
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
/* interlaced frames are rounded up to a multiple of 32 lines */
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* plane 0 is luma; planes 1/2 are chroma at half width/height (>> !!i) */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
/* plane pointer is offset past the top padding rows and left padding */
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* per-row SATD caches; one (i_bframe+2)^2 grid of row arrays */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
/* -1 = no rows reconstructed yet (see x264_frame_cond_wait) */
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
78 /* all 4 luma planes allocated together, since the cacheline split code
79 * requires them to be in-phase wrt cacheline alignment. */
80 if( h->param.analyse.i_subpel_refine && b_fdec )
82 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
83 for( i = 0; i < 4; i++ )
84 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
85 frame->plane[0] = frame->filtered[0];
/* no subpel refinement: a single luma plane suffices */
89 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
90 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
93 frame->b_duplicate = 0;
95 if( b_fdec ) /* fdec frame */
97 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
98 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
99 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
/* list-1 motion data only exists when B-frames are enabled */
100 if( h->param.i_bframe )
102 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
103 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
108 frame->ref[1] = NULL;
110 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
111 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* exhaustive search needs an integral (summed) plane; size doubles for sub-8x8 ESA */
112 if( h->param.analyse.i_me_method >= X264_ME_ESA )
114 CHECKED_MALLOC( frame->buffer[3],
115 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
116 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
119 else /* fenc frame */
121 if( h->frames.b_have_lowres )
/* half-resolution plane for the lookahead */
123 frame->i_width_lowres = frame->i_width[0]/2;
124 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
125 frame->i_lines_lowres = frame->i_lines[0]/2;
127 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*i_padv);
/* 4 lowres planes in one allocation (presumably fullpel + hpel shifts — confirm) */
129 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
130 for( i = 0; i < 4; i++ )
131 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
/* lookahead motion caches, indexed [list][B-distance] */
133 for( j = 0; j <= !!h->param.i_bframe; j++ )
134 for( i = 0; i <= h->param.i_bframe; i++ )
136 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
137 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
/* +3 elements of tail padding (matches the /4 packing below) */
139 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
140 for( j = 0; j <= h->param.i_bframe+1; j++ )
141 for( i = 0; i <= h->param.i_bframe+1; i++ )
143 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
144 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
/* intra costs alias lowres_costs[0][0]; 0xFFFF marks "not yet computed" */
146 frame->i_intra_cost = frame->lowres_costs[0][0];
147 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
149 if( h->param.rc.i_aq_mode )
151 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
152 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
153 if( h->frames.b_have_lowres )
154 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
155 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* per-frame synchronization primitives for threaded encoding */
159 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
161 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and everything x264_frame_new() allocated for it.
 * Loops run over the X264_BFRAME_MAX bounds rather than the counts
 * actually allocated — safe only because the frame struct was
 * zero-allocated, so unused slots are NULL (x264_free(NULL) is assumed
 * to be a no-op — confirm in the allocator). */
171 void x264_frame_delete( x264_frame_t *frame )
174 /* Duplicate frames are blank copies of real frames (including pointers),
175 * so freeing those pointers would cause a double free later. */
176 if( !frame->b_duplicate )
178 for( i = 0; i < 4; i++ )
179 x264_free( frame->buffer[i] );
180 for( i = 0; i < 4; i++ )
181 x264_free( frame->buffer_lowres[i] );
182 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
183 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
184 x264_free( frame->i_row_satds[i][j] );
185 for( j = 0; j < 2; j++ )
186 for( i = 0; i <= X264_BFRAME_MAX; i++ )
188 x264_free( frame->lowres_mvs[j][i] );
189 x264_free( frame->lowres_mv_costs[j][i] );
191 x264_free( frame->i_propagate_cost );
192 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
193 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
195 x264_free( frame->lowres_costs[j][i] );
196 x264_free( frame->lowres_inter_types[j][i] );
198 x264_free( frame->f_qp_offset );
199 x264_free( frame->f_qp_offset_aq );
200 x264_free( frame->i_inv_qscale_factor );
201 x264_free( frame->i_row_bits );
202 x264_free( frame->i_row_qp );
203 x264_free( frame->mb_type );
204 x264_free( frame->mv[0] );
205 x264_free( frame->mv[1] );
206 x264_free( frame->ref[0] );
207 x264_free( frame->ref[1] );
/* sync primitives are destroyed even for duplicates */
208 x264_pthread_mutex_destroy( &frame->mutex );
209 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied picture into an internal frame.
 * Only I420/YV12 input is accepted; metadata (type, qp, pts) is copied,
 * then each plane is blitted via the mc plane_copy.
 * Returns an error (value not visible in this dump) on bad CSP. */
214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
216 int i_csp = src->img.i_csp & X264_CSP_MASK;
218 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
220 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
224 dst->i_type = src->i_type;
225 dst->i_qpplus1 = src->i_qpplus1;
/* dts is initialized equal to pts; reordering happens later */
226 dst->i_pts = dst->i_dts = src->i_pts;
227 dst->param = src->param;
/* YV12 stores V before U: i^3 swaps source planes 1<->2 for chroma */
231 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
232 uint8_t *plane = src->img.plane[s];
233 int stride = src->img.i_stride[s];
234 int width = h->param.i_width >> !!i;
235 int height = h->param.i_height >> !!i;
/* VFLIP: start from the last row and walk upward (negative stride below) */
236 if( src->img.i_csp & X264_CSP_VFLIP )
238 plane += (height-1)*stride;
241 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate a plane's edge pixels into its padding border.
 * Horizontal padding duplicates the first/last pixel of each row;
 * then, if requested, whole padded rows are replicated upward
 * (b_pad_top) and downward (b_pad_bottom). */
248 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
250 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
252 for( y = 0; y < i_height; y++ )
/* left border: copy of column 0; right border: copy of the last column */
255 memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
257 memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
/* top rows replicate row 0 (including its horizontal padding) */
261 for( y = 0; y < i_padv; y++ )
262 memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* bottom rows replicate the last row */
265 for( y = 0; y < i_padv; y++ )
266 memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
/* Expand the reconstructed frame's borders for the rows finished up to
 * mb_y. b_end marks the final call for the frame (pad the bottom too).
 * For MBAFF, each field is padded separately with doubled stride. */
270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
274 if( mb_y & h->sh.b_mbaff )
276 for( i = 0; i < frame->i_plane; i++ )
278 int stride = frame->i_stride[i];
279 int width = 16*h->sps->i_mb_width >> !!i;
280 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
281 int padh = PADH >> !!i;
282 int padv = PADV >> !!i;
283 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
284 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
/* last call also covers the deblock overlap rows above this stripe */
285 if( b_end && !b_start )
286 height += 4 >> (!!i + h->sh.b_mbaff);
/* interlaced: pad the two fields independently (stride*2 per field) */
289 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
290 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
294 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Expand borders of the half-pel filtered planes (filtered[1..3]),
 * starting from the last pixel the hpel filter produced validly. */
299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
301 /* during filtering, 8 extra pixels were filtered on each edge,
302 * but up to 3 of the horizontal ones may be wrong.
303 we want to expand border from the last filtered pixel */
305 int stride = frame->i_stride[0];
306 int width = 16*h->sps->i_mb_width + 8;
307 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
/* planes 1..3 are the hpel-interpolated copies; plane 0 is handled elsewhere */
311 for( i = 1; i < 4; i++ )
313 // buffer: 8 luma, to match the hpel filter
314 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* MBAFF pads each field separately, like x264_frame_expand_border */
317 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
318 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
322 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Pad all four lowres (lookahead) planes on every side. */
327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
330 for( i = 0; i < 4; i++ )
331 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the right/bottom of each plane so the picture covers whole
 * macroblocks when the input dimensions are not multiples of 16.
 * Right padding replicates the last real column; bottom padding
 * replicates the last real row. */
334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
337 for( i = 0; i < frame->i_plane; i++ )
339 int i_subsample = i ? 1 : 0;
340 int i_width = h->param.i_width >> i_subsample;
341 int i_height = h->param.i_height >> i_subsample;
/* padding = coded (mb-aligned) size minus real input size */
342 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
343 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
347 for( y = 0; y < i_height; y++ )
348 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
349 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
354 //FIXME interlace? or just let it pad using the wrong field
355 for( y = i_height; y < i_height + i_pady; y++ )
356 memcpy( &frame->plane[i][y*frame->i_stride[i]],
357 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
365 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save one MB row's nnz into buf, then coalesce each 8x8 block's nnz
 * flags in place so the deblocker sees per-64-coeff values.
 * NOTE(review): the dump omits the condition guarding the coalescing
 * (presumably only for 8x8-transform MBs — see `transform` below). */
366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
368 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
369 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
371 for( x=0; x<h->sps->i_mb_width; x++ )
/* back up the original 16 nnz bytes before munging */
373 memcpy( buf+x, src+x, 16 );
/* if any 4x4 in the 8x8 had coefficients, mark all four as nonzero */
376 nnz = src[x][0] | src[x][1];
377 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
378 nnz = src[x][2] | src[x][3];
379 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
/* Restore the nnz bytes saved by munge_cavlc_nnz_row. */
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
388 for( x=0; x<h->sps->i_mb_width; x++ )
389 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current row plus the neighbor
 * rows the deblocker can touch; each row gets its own slice of buf. */
392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
394 func( h, mb_y, buf );
396 func( h, mb_y-1, buf + h->sps->i_mb_width );
399 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
401 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
406 /* Deblocking filter */
/* H.264 deblocking threshold tables (spec Table 8-16), indexed by
 * clipped QP. Each table is padded with 12 entries on both ends so the
 * qp + alpha/beta offset index (macros below add +12) cannot run out
 * of bounds. */
407 static const uint8_t i_alpha_table[52+12*2] =
409 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
410 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
411 0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
412 7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
413 25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
414 80, 90,101,113,127,144,162,182,203,226,
416 255,255,255,255,255,255,255,255,255,255,255,255,
418 static const uint8_t i_beta_table[52+12*2] =
420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
422 0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
423 3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
424 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
425 13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
427 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
/* tc0 clipping values, one row per QP, one column per boundary
 * strength (bS 0..3); -1 in column 0 means "no filtering". */
429 static const int8_t i_tc0_table[52+12*2][4] =
431 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
432 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
433 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
434 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
435 {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
436 {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
437 {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
438 {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
439 {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
440 {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
441 {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
442 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
443 {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
/* +12 compensates for the tables' negative-index padding */
445 #define alpha_table(x) i_alpha_table[(x)+12]
446 #define beta_table(x) i_beta_table[(x)+12]
447 #define tc0_table(x) i_tc0_table[(x)+12]
/* Normal (inter) luma deblock of one 16-pixel edge: 4 groups of 4
 * pixels, each group sharing one tc0 value. xstride steps across the
 * edge, ystride along it; the v/h wrappers below swap them. */
450 static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
453 for( i = 0; i < 4; i++ )
460 for( d = 0; d < 4; d++ )
462 const int p2 = pix[-3*xstride];
463 const int p1 = pix[-2*xstride];
464 const int p0 = pix[-1*xstride];
465 const int q0 = pix[ 0*xstride];
466 const int q1 = pix[ 1*xstride];
467 const int q2 = pix[ 2*xstride];
/* filter only if the edge looks like a blocking artifact, not a real edge */
469 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
/* flat p side: also adjust p1 (and widen the clip range, per spec) */
473 if( abs( p2 - p0 ) < beta )
475 pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
478 if( abs( q2 - q0 ) < beta )
480 pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
484 delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
485 pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
486 pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* vertical edge filter: pixels across the edge are 1 apart */
492 static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
494 deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
/* horizontal edge filter: pixels across the edge are a stride apart */
496 static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
498 deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
/* Normal (inter) chroma deblock: 4 groups of 2 pixels; only p0/q0 are
 * modified (chroma never adjusts p1/q1). */
501 static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
504 for( i = 0; i < 4; i++ )
506 const int tc = tc0[i];
512 for( d = 0; d < 2; d++ )
514 const int p1 = pix[-2*xstride];
515 const int p0 = pix[-1*xstride];
516 const int q0 = pix[ 0*xstride];
517 const int q1 = pix[ 1*xstride];
519 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
521 int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
522 pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
523 pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
/* vertical / horizontal chroma edge wrappers (see luma wrappers above) */
529 static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
531 deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
533 static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
535 deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
/* Strong (intra, bS=4) luma deblock of one 16-pixel edge. When the
 * edge is very flat a 4/5-tap filter rewrites up to 3 pixels per side;
 * otherwise a weak 3-tap filter touches only p0/q0. */
538 static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
541 for( d = 0; d < 16; d++ )
543 const int p2 = pix[-3*xstride];
544 const int p1 = pix[-2*xstride];
545 const int p0 = pix[-1*xstride];
546 const int q0 = pix[ 0*xstride];
547 const int q1 = pix[ 1*xstride];
548 const int q2 = pix[ 2*xstride];
550 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
/* extra flatness check enables the strong filter */
552 if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
554 if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
556 const int p3 = pix[-4*xstride];
557 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
558 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
559 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* p side not flat enough: weak filter on p0 only */
562 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
563 if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
565 const int q3 = pix[3*xstride];
566 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
567 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
568 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
571 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* not flat: weak filter on both p0 and q0 */
575 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
576 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* vertical / horizontal intra luma wrappers */
582 static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
584 deblock_luma_intra_c( pix, stride, 1, alpha, beta );
586 static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
588 deblock_luma_intra_c( pix, 1, stride, alpha, beta );
/* Strong (intra) chroma deblock of one 8-pixel edge; only p0/q0 are
 * rewritten, with the weak 3-tap filter. */
591 static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
594 for( d = 0; d < 8; d++ )
596 const int p1 = pix[-2*xstride];
597 const int p0 = pix[-1*xstride];
598 const int q0 = pix[ 0*xstride];
599 const int q1 = pix[ 1*xstride];
601 if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
603 pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
604 pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
/* vertical / horizontal intra chroma wrappers */
609 static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
611 deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
613 static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
615 deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
/* Filter one inter edge: look up alpha/beta/tc0 from the QP (+slice
 * offsets), skip entirely if thresholds are zero, then run pf_inter on
 * pix1 (and pix2 when non-NULL — used for the chroma u/v pair). */
618 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
620 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
621 const int alpha = alpha_table(index_a);
622 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
/* zero threshold disables filtering for this QP */
625 if( !alpha || !beta )
/* chroma gets tc0+1 per the spec */
628 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
629 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
630 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
631 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
633 pf_inter( pix1, i_stride, alpha, beta, tc );
635 pf_inter( pix2, i_stride, alpha, beta, tc );
/* Same as deblock_edge but for intra (bS=4) edges: no tc0 lookup. */
638 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
640 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
641 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
643 if( !alpha || !beta )
646 pf_intra( pix1, i_stride, alpha, beta );
648 pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one row of macroblocks (a pair of rows for MBAFF).
 * Computes boundary strength per 4-pixel edge segment, then dispatches
 * to the v/h luma/chroma filters via the FILTER_DIR / DEBLOCK_STRENGTH
 * / DEBLOCK_DIR macros. For CAVLC + 8x8dct, nnz values are temporarily
 * munged so deblocking sees per-8x8 flags, and restored afterwards.
 * NOTE(review): comments are only inserted outside the backslash-
 * continued macro bodies; the dump omits several macro lines. */
651 void x264_frame_deblock_row( x264_t *h, int mb_y )
653 const int s8x8 = 2 * h->mb.i_mb_stride;
654 const int s4x4 = 4 * h->mb.i_mb_stride;
655 const int b_interlaced = h->sh.b_mbaff;
/* vertical mv difference threshold halves for field coding */
656 const int mvy_limit = 4 >> b_interlaced;
/* below this QP no edge can pass the alpha/beta thresholds: skip the MB */
657 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
658 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
660 int stridey = h->fdec->i_stride[0];
661 int stride2y = stridey << b_interlaced;
662 int strideuv = h->fdec->i_stride[1];
663 int stride2uv = strideuv << b_interlaced;
/* CAVLC + 8x8dct: coalesce nnz flags for the duration of deblocking */
665 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
666 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
/* MBAFF walks the two rows of each MB pair alternately (mb_y toggles) */
668 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
670 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
671 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
672 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
673 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
674 const int i_qp = h->mb.qp[mb_xy];
/* P_SKIP has no internal edges: only the MB-boundary edge is filtered */
675 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
676 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
677 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
678 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
679 if( b_interlaced && (mb_y&1) )
686 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
688 if( i_qp <= qp_thresh )
/* FILTER_DIR: run the inter/intra edge filter for one edge in direction
 * i_dir (0 = vertical edge, 1 = horizontal), luma then chroma (chroma
 * only on every other edge). QP used is the p/q neighbor average. */
691 #define FILTER_DIR(intra, i_dir)\
694 i_qpn= h->mb.qp[mbn_xy];\
698 deblock_edge##intra( h, pixy + 4*i_edge,  NULL,\
699 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
700 h->loopf.deblock_h_luma##intra );\
704 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
705 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
706 stride2uv, bS, i_qpc, 1,\
707 h->loopf.deblock_h_chroma##intra );\
712 /* horizontal edge */\
713 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
714 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
715 h->loopf.deblock_v_luma##intra );\
719 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
720 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
721 stride2uv, bS, i_qpc, 1,\
722 h->loopf.deblock_v_chroma##intra );\
727 #define DEBLOCK_STRENGTH(i_dir)\
729 /* *** Get bS for each 4px for the current edge *** */\
730 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
731 M32( bS ) = 0x03030303;\
734 M32( bS ) = 0x00000000;\
735 for( i = 0; i < 4; i++ )\
737 int x  = i_dir == 0 ? i_edge : i;\
738 int y  = i_dir == 0 ? i      : i_edge;\
739 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
740 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
741 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
742 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
744 else if(!(i_edge&no_sub8x8))\
746 if((i&no_sub8x8) && bS[i-1] != 2)\
750 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
751 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
752 int i4p= mb_4x4+x+y*s4x4;\
753 int i4q= mbn_4x4+xn+yn*s4x4;\
755 /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
756 if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
757 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
758 else if( !h->mb.b_interlaced )\
759 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
761 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
762 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
764 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
765 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
766 (h->sh.i_type == SLICE_TYPE_B &&\
767 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
768 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
769 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
779 /* i_dir == 0 -> vertical edge
780 * i_dir == 1 -> horizontal edge */
/* DEBLOCK_DIR: filter all edges of the MB in one direction — the MB
 * boundary edge (intra-strength if either side is intra), then the
 * internal edges, stepping by 2 when the MB uses the 8x8 transform. */
781 #define DEBLOCK_DIR(i_dir)\
783 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
784 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
785 ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
787 i_edge+= b_8x8_transform;\
790 mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
791 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
792 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
793 if( b_interlaced && i_dir == 1 )\
795 mbn_xy -= h->mb.i_mb_stride;\
796 mbn_8x8 -= 2 * s8x8;\
797 mbn_4x4 -= 4 * s4x4;\
799 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
801 FILTER_DIR( _intra, i_dir );\
804 DEBLOCK_STRENGTH(i_dir);\
806 FILTER_DIR( , i_dir);\
808 i_edge += b_8x8_transform+1;\
813 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
815 DEBLOCK_STRENGTH(i_dir);\
817 FILTER_DIR( , i_dir);\
825 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
826 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
/* Deblock the whole frame, one MB row (or MBAFF row pair) at a time. */
829 void x264_frame_deblock( x264_t *h )
832 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
833 x264_frame_deblock_row( h, mb_y );
/* Prototypes for the assembly deblock implementations.
 * NOTE(review): the surrounding arch #ifdef guards are missing from
 * this dump — confirm these are fenced for x86 builds. */
837 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
838 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
839 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
840 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
842 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
843 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
844 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
845 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
/* v8 variants filter only 8 columns; C wrappers below call them twice */
847 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
848 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
849 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
850 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* Full 16-wide vertical luma deblock built from two 8-wide mmxext
 * calls; the second call uses the second pair of tc0 values. */
852 static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
854 x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
855 x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
/* Same two-call split for the intra (bS=4) variant. */
857 static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
859 x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
860 x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
/* AltiVec (PPC) and NEON (ARM) deblock prototypes.
 * NOTE(review): arch #ifdef guards are missing from this dump. */
866 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
867 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
871 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
872 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
873 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
874 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Fill the deblock function table: C fallbacks first, then overwrite
 * with the fastest implementation the detected CPU supports.
 * NOTE(review): the HAVE_MMX/ARCH #ifdef fences are missing from this
 * dump — each SIMD branch is normally compile-time gated. */
877 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
879 pf->deblock_v_luma = deblock_v_luma_c;
880 pf->deblock_h_luma = deblock_h_luma_c;
881 pf->deblock_v_chroma = deblock_v_chroma_c;
882 pf->deblock_h_chroma = deblock_h_chroma_c;
883 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
884 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
885 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
886 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
889 if( cpu&X264_CPU_MMXEXT )
891 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
892 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
893 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
894 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
896 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
897 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
898 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
899 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
/* SSE2 luma variants need 16-byte stack alignment, hence the
 * STACK_MOD4 exclusion */
901 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
903 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
904 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
905 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
906 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
912 if( cpu&X264_CPU_ALTIVEC )
914 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
915 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
920 if( cpu&X264_CPU_NEON )
922 pf->deblock_v_luma = x264_deblock_v_luma_neon;
923 pf->deblock_h_luma = x264_deblock_h_luma_neon;
924 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
925 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
/* Publish that reconstruction of this frame has reached
 * i_lines_completed, waking any threads blocked in
 * x264_frame_cond_wait. Mutex protects i_lines_completed. */
932 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
934 x264_pthread_mutex_lock( &frame->mutex );
935 frame->i_lines_completed = i_lines_completed;
936 x264_pthread_cond_broadcast( &frame->cv );
937 x264_pthread_mutex_unlock( &frame->mutex );
/* Block until at least i_lines_completed lines of this frame are done.
 * Standard condvar loop: re-check the predicate after every wakeup. */
940 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
942 x264_pthread_mutex_lock( &frame->mutex );
943 while( frame->i_lines_completed < i_lines_completed )
944 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
945 x264_pthread_mutex_unlock( &frame->mutex );
/* NULL-terminated frame-list helpers. Lists are arrays of frame
 * pointers with a NULL sentinel; callers guarantee capacity.
 * NOTE(review): this dump omits the actual store/shift statements of
 * these functions — only the scan loops are visible. */
/* Append frame after the last non-NULL entry. */
950 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
953 while( list[i] ) i++;
/* Remove and return the last frame in the list. */
957 x264_frame_t *x264_frame_pop( x264_frame_t **list )
962 while( list[i+1] ) i++;
/* Insert frame at the head, shifting existing entries up. */
968 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
971 while( list[i] ) i++;
/* Remove and return the head frame, shifting entries down. */
977 x264_frame_t *x264_frame_shift( x264_frame_t **list )
979 x264_frame_t *frame = list[0];
981 for( i = 0; list[i]; i++ )
/* Drop one reference; when the count hits zero, recycle the frame onto
 * the unused pool matching its fenc/fdec flavor. */
987 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
989 assert( frame->i_reference_count > 0 );
990 frame->i_reference_count--;
991 if( frame->i_reference_count == 0 )
992 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
/* Get a frame of the requested flavor: reuse one from the unused pool
 * if available, otherwise allocate a new one; then reset its per-use
 * state. NOTE(review): dump omits the NULL-check/return lines. */
995 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
998 if( h->frames.unused[b_fdec][0] )
999 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1001 frame = x264_frame_new( h, b_fdec );
1004 frame->b_last_minigop_bframe = 0;
1005 frame->i_reference_count = 1;
1006 frame->b_intra_calculated = 0;
1007 frame->b_scenecut = 1;
1008 frame->b_keyframe = 0;
1010 memset( frame->weight, 0, sizeof(frame->weight) );
1011 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
/* Same as x264_frame_push_unused, but for blank duplicate frames,
 * which have their own recycling pool. */
1016 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1018 assert( frame->i_reference_count > 0 );
1019 frame->i_reference_count--;
1020 if( frame->i_reference_count == 0 )
1021 x264_frame_push( h->frames.blank_unused, frame );
/* Get a blank duplicate frame: recycled if possible, else a bare
 * malloc (no buffers — duplicates share a real frame's pointers).
 * b_duplicate=1 keeps x264_frame_delete from double-freeing. */
1024 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1026 x264_frame_t *frame;
1027 if( h->frames.blank_unused[0] )
1028 frame = x264_frame_pop( h->frames.blank_unused );
1030 frame = x264_malloc( sizeof(x264_frame_t) );
1033 frame->b_duplicate = 1;
1034 frame->i_reference_count = 1;
/* Bubble-sort the frame list into decode order (b_dts) or display
 * order. NOTE(review): the dump omits the outer repeat-until-sorted
 * loop and the display-order comparison branch. */
1038 void x264_frame_sort( x264_frame_t **list, int b_dts )
1043 for( i = 0; list[i+1]; i++ )
1045 int dtype = list[i]->i_type - list[i+1]->i_type;
1046 int dtime = list[i]->i_frame - list[i+1]->i_frame;
/* DTS order: sort by type first, then by input frame number */
1047 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1051 XCHG( x264_frame_t*, list[i], list[i+1] );
/* Apply explicit weighted-prediction scaling from src to dst,
 * processing the plane in 16x16-ish tiles. */
1058 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1059 int i_width, int i_height, x264_weight_t *w )
1062 /* Weight horizontal strips of height 16. This was found to be the optimal height
1063 * in terms of the cache loads. */
1064 while( i_height > 0 )
/* weightfn[4] is the 16-pixel-wide kernel; last strip may be <16 rows */
1066 for( x = 0; x < i_width; x += 16 )
1067 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1069 dst += 16 * i_dst_stride;
1070 src += 16 * i_src_stride;
/* Delete every frame in a NULL-terminated list.
 * NOTE(review): the dump omits the loop header and the free of the
 * list array itself. */
1074 void x264_frame_delete_list( x264_frame_t **list )
1080 x264_frame_delete( list[i++] );
/* Initialize a bounded, thread-safe frame queue: a NULL-terminated
 * pointer array plus mutex and fill/empty condvars. Returns nonzero
 * on failure (error path not visible in this dump). */
1084 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1088 slist->i_max_size = max_size;
/* +1 slot keeps the NULL terminator even when full */
1090 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1091 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1092 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1093 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
/* Tear down the queue: primitives first, then the frames + array. */
1100 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1102 x264_pthread_mutex_destroy( &slist->mutex );
1103 x264_pthread_cond_destroy( &slist->cv_fill );
1104 x264_pthread_cond_destroy( &slist->cv_empty );
1105 x264_frame_delete_list( slist->list );
1108 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1110 x264_pthread_mutex_lock( &slist->mutex );
1111 while( slist->i_size == slist->i_max_size )
1112 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1113 slist->list[ slist->i_size++ ] = frame;
1114 x264_pthread_mutex_unlock( &slist->mutex );
1115 x264_pthread_cond_broadcast( &slist->cv_fill );