1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate and initialize a new encoder frame (fdec = reconstructed frame,
 * fenc = source frame).  Planes are padded by PADH/PADV (doubled vertically
 * when interlaced) and strides are rounded up to the CPU cacheline size.
 * Returns NULL on allocation failure (CHECKED_MALLOC* jumps to an elided
 * "fail:" cleanup path).
 * NOTE(review): this listing is extraction-mangled — stray line numbers,
 * elided braces, elided declarations (frame, i, j, luma_plane_size) and the
 * fail/cleanup path; verify against upstream x264 frame.c before building. */
29 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* Per-plane geometry: plane 0 is luma, planes 1/2 are half-size chroma. */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
57 chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
/* Chroma planes: offset skips the top/left padding (halved horizontally). */
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
/* Per-row SATD tables for every (list,ref) pair used by ratecontrol. */
64 for( i = 0; i < h->param.i_bframe + 2; i++ )
65 for( j = 0; j < h->param.i_bframe + 2; j++ )
66 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
69 frame->i_type = X264_TYPE_AUTO;
73 frame->i_frame_num = -1;
74 frame->i_lines_completed = -1;
75 frame->b_fdec = b_fdec;
78 /* all 4 luma planes allocated together, since the cacheline split code
79 * requires them to be in-phase wrt cacheline alignment. */
80 if( h->param.analyse.i_subpel_refine && b_fdec )
82 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
83 for( i = 0; i < 4; i++ )
84 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
85 frame->plane[0] = frame->filtered[0];
/* No subpel refine: a single luma plane serves as both plane and filtered[0]. */
89 CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
90 frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
93 frame->b_duplicate = 0;
95 if( b_fdec ) /* fdec frame */
97 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
98 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
99 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
100 if( h->param.i_bframe )
102 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
103 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
108 frame->ref[1] = NULL;
110 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
111 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
/* ESA/TESA motion estimation needs an integral-sum plane (doubled for sub8x8). */
112 if( h->param.analyse.i_me_method >= X264_ME_ESA )
114 CHECKED_MALLOC( frame->buffer[3],
115 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
116 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
119 else /* fenc frame */
121 if( h->frames.b_have_lowres )
123 frame->i_width_lowres = frame->i_width[0]/2;
124 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
125 frame->i_lines_lowres = frame->i_lines[0]/2;
127 luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*i_padv);
/* 4 half-pel interpolated lowres planes packed into one buffer. */
129 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
130 for( i = 0; i < 4; i++ )
131 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
/* Lookahead motion vectors / costs per list and per B-frame distance. */
133 for( j = 0; j <= !!h->param.i_bframe; j++ )
134 for( i = 0; i <= h->param.i_bframe; i++ )
136 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
137 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
139 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
140 for( j = 0; j <= h->param.i_bframe+1; j++ )
141 for( i = 0; i <= h->param.i_bframe+1; i++ )
143 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
144 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
146 frame->i_intra_cost = frame->lowres_costs[0][0];
147 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
149 if( h->param.rc.i_aq_mode )
151 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
152 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
153 if( h->frames.b_have_lowres )
154 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
155 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* Per-frame synchronization for sliced/threaded encoding. */
159 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
161 if( x264_pthread_cond_init( &frame->cv, NULL ) )
/* Free a frame and everything it owns.  Mirrors every allocation made in
 * x264_frame_new; x264_free(NULL) is assumed safe for never-allocated members.
 * NOTE(review): loop-variable declarations, braces and the final
 * x264_free(frame) appear elided from this listing — verify upstream. */
171 void x264_frame_delete( x264_frame_t *frame )
174 /* Duplicate frames are blank copies of real frames (including pointers),
175 * so freeing those pointers would cause a double free later. */
176 if( !frame->b_duplicate )
178 for( i = 0; i < 4; i++ )
179 x264_free( frame->buffer[i] );
180 for( i = 0; i < 4; i++ )
181 x264_free( frame->buffer_lowres[i] );
182 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
183 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
184 x264_free( frame->i_row_satds[i][j] );
185 for( j = 0; j < 2; j++ )
186 for( i = 0; i <= X264_BFRAME_MAX; i++ )
188 x264_free( frame->lowres_mvs[j][i] );
189 x264_free( frame->lowres_mv_costs[j][i] );
191 x264_free( frame->i_propagate_cost );
192 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
193 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
195 x264_free( frame->lowres_costs[j][i] );
196 x264_free( frame->lowres_inter_types[j][i] );
198 x264_free( frame->f_qp_offset );
199 x264_free( frame->f_qp_offset_aq );
200 x264_free( frame->i_inv_qscale_factor );
201 x264_free( frame->i_row_bits );
202 x264_free( frame->i_row_qp );
203 x264_free( frame->mb_type );
204 x264_free( frame->mv[0] );
205 x264_free( frame->mv[1] );
206 x264_free( frame->ref[0] );
207 x264_free( frame->ref[1] );
208 x264_pthread_mutex_destroy( &frame->mutex );
209 x264_pthread_cond_destroy( &frame->cv );
/* Copy a user-supplied x264_picture_t into an internal frame, converting
 * YV12 plane order to I420 (U/V swap via i^3) and handling vertical flip.
 * Returns 0 on success, nonzero on unsupported colorspace (the return
 * statements are elided from this listing).
 * NOTE(review): the per-plane `for( i = 0; i < 3; i++ )` header, braces and
 * the `stride = -stride;` for VFLIP appear elided — verify upstream. */
214 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
216 int i_csp = src->img.i_csp & X264_CSP_MASK;
218 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
220 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
224 dst->i_type = src->i_type;
225 dst->i_qpplus1 = src->i_qpplus1;
226 dst->i_pts = src->i_pts;
227 dst->param = src->param;
231 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
232 uint8_t *plane = src->img.plane[s];
233 int stride = src->img.i_stride[s];
234 int width = h->param.i_width >> !!i;
235 int height = h->param.i_height >> !!i;
236 if( src->img.i_csp & X264_CSP_VFLIP )
238 plane += (height-1)*stride;
241 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
/* Replicate the edge pixels of a plane into its padding bands: i_padh pixels
 * left/right of every row, and (optionally) i_padv full padded rows above and
 * below.  pix points at the first real pixel; the border surrounds it.
 * b_pad_top/b_pad_bottom gate the vertical bands so a partially-decoded frame
 * can be padded incrementally. */
static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
    int y;
    for( y = 0; y < i_height; y++ )
    {
        /* left band: replicate the first pixel of the row */
        memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
        /* right band: replicate the last pixel of the row */
        memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
    }
    /* upper band: duplicate the first (already horizontally padded) row */
    if( b_pad_top )
        for( y = 0; y < i_padv; y++ )
            memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
    /* lower band: duplicate the last (already horizontally padded) row */
    if( b_pad_bottom )
        for( y = 0; y < i_padv; y++ )
            memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
#undef PPIXEL
}
/* Pad the reconstructed planes around the rows belonging to macroblock row
 * mb_y (b_end = last row of the frame).  For MBAFF, strides are doubled and
 * each field is padded separately.
 * NOTE(review): elided from this listing: `int i; int b_start = !mb_y;`, the
 * early return for odd MBAFF rows, braces, and the `if( b_interlaced ) ...
 * else` around the two padding variants — verify upstream. */
270 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
274 if( mb_y & h->sh.b_mbaff )
276 for( i = 0; i < frame->i_plane; i++ )
278 int stride = frame->i_stride[i];
279 int width = 16*h->sps->i_mb_width >> !!i;
280 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
281 int padh = PADH >> !!i;
282 int padv = PADV >> !!i;
283 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
284 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
285 if( b_end && !b_start )
286 height += 4 >> (!!i + h->sh.b_mbaff);
/* interlaced: pad the two fields independently with doubled stride */
289 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
290 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
/* progressive: single call per plane */
294 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
/* Pad the three half-pel filtered luma planes (filtered[1..3]) for the rows
 * around macroblock row mb_y, starting from the last correctly-filtered
 * pixel (4 px inside the 8-px filter overrun).
 * NOTE(review): elided: `int i; int b_start = !mb_y;` and the padh/padv
 * locals, braces, and the interlaced-vs-progressive `if/else` — verify
 * upstream. */
299 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
301 /* during filtering, 8 extra pixels were filtered on each edge,
302 * but up to 3 of the horizontal ones may be wrong.
303 we want to expand border from the last filtered pixel */
305 int stride = frame->i_stride[0];
306 int width = 16*h->sps->i_mb_width + 8;
307 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
311 for( i = 1; i < 4; i++ )
313 // buffer: 8 luma, to match the hpel filter
314 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
/* interlaced: each field padded separately */
317 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
318 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
/* progressive */
322 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
327 void x264_frame_expand_border_lowres( x264_frame_t *frame )
330 for( i = 0; i < 4; i++ )
331 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
/* Pad the input picture out to mod-16 dimensions by replicating the right
 * column and bottom row, so the encoder can treat it as whole macroblocks.
 * NOTE(review): elided from this listing: `int i, y;`, braces, the
 * `if( i_padx )` / `if( i_pady )` guards, and the trailing size arguments of
 * the memset/memcpy calls (i_padx / i_width+i_padx) — verify upstream. */
334 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
337 for( i = 0; i < frame->i_plane; i++ )
339 int i_subsample = i ? 1 : 0;
340 int i_width = h->param.i_width >> i_subsample;
341 int i_height = h->param.i_height >> i_subsample;
342 int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
343 int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
/* horizontal mod16 padding: replicate last real pixel of each row */
347 for( y = 0; y < i_height; y++ )
348 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
349 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
/* vertical mod16 padding: replicate last real row */
354 //FIXME interlace? or just let it pad using the wrong field
355 for( y = i_height; y < i_height + i_pady; y++ )
356 memcpy( &frame->plane[i][y*frame->i_stride[i]],
357 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
364 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
365 * entropy coding, but per 64 coeffs for the purpose of deblocking */
/* Save the per-16 nnz values of one MB row into buf, then collapse them to
 * per-64 granularity in place (each 8x8 block becomes all-0 or all-1) for
 * 8x8-transform macroblocks, as the deblocker expects.
 * NOTE(review): elided from this listing: `int x; uint32_t nnz;`, braces, and
 * the `if( transform[x] )` guard that restricts the rewrite to 8x8-transform
 * MBs (without it `transform` is unused) — verify upstream. */
366 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
368 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
369 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
371 for( x=0; x<h->sps->i_mb_width; x++ )
373 memcpy( buf+x, src+x, 16 );
/* low halfword = top 8x8 pair, high halfword = bottom pair of each dword */
376 nnz = src[x][0] | src[x][1];
377 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
378 nnz = src[x][2] | src[x][3];
379 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
384 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
386 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
388 for( x=0; x<h->sps->i_mb_width; x++ )
389 memcpy( dst+x, buf+x, 16 );
/* Apply func (munge or restore) to the current MB row and to its deblocking
 * neighbours, each using its own slice of the backup buffer.
 * NOTE(review): the guards around the neighbour calls (`if( mb_y > 0 )`, and
 * the MBAFF block enclosing the mb_y+1 / mb_y-2 calls) appear elided from
 * this listing — as written the calls would run unconditionally; verify
 * upstream. */
392 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
394 func( h, mb_y, buf );
396 func( h, mb_y-1, buf + h->sps->i_mb_width );
399 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
401 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
/* Deblocking filter */

/* H.264 deblocking thresholds (Table 8-16 of the spec), extended by 12
 * entries on each side so that qp + alpha/beta offsets in [-12,+12] can be
 * looked up without clamping; the *_table(x) macros below add the +12 bias.
 * Note: the two entries for qp 50-51 (255,255 / 18,18) were missing from the
 * mangled listing and are restored here — without them every lookup above
 * qp 49 reads the wrong row. */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping values indexed by [biased qp][bS]; bS==0 column is -1 (unused). */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
/* Normal (bS < 4) luma deblocking of one 16-sample edge.  xstride steps
 * across the edge, ystride along it.  tc0 holds one clipping value per group
 * of 4 samples; a negative tc0[i] disables filtering for that group.
 * Note: the elided control flow (skip-group `continue`, and the tc++
 * extension when p1/q1 are also filtered) is restored per H.264 8.7.2.3. */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        if( tc0[i] < 0 )
        {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ )
        {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int tc = tc0[i];
                int delta;
                if( abs( p2 - p0 ) < beta )
                {
                    /* p1' : and extend the clipping range for the p0/q0 delta */
                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( abs( q2 - q0 ) < beta )
                {
                    /* q1' */
                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical filtering of a horizontal edge: samples cross the edge stride-wise. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal filtering of a vertical edge: samples cross the edge pixel-wise. */
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Normal (bS < 4) chroma deblocking of one 8-sample edge; only p0/q0 are
 * modified.  tc0 already includes the +1 chroma bias, so tc <= 0 (bS == 0)
 * skips the group of 2 samples. */
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        const int tc = tc0[i];
        if( tc <= 0 )
        {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ )
        {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical chroma filtering of a horizontal edge. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
/* Horizontal chroma filtering of a vertical edge. */
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Strong (bS == 4, intra) luma deblocking of one 16-sample edge.  Uses the
 * stronger 3-tap/5-tap filters when abs(p0-q0) is well below alpha, else the
 * weak 3-tap p0/q0-only filter.  Elided else-branches and braces restored per
 * H.264 8.7.2.4. */
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 16; d++ )
    {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
            {
                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
                {
                    const int p3 = pix[-4*xstride];
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                }
                else /* p0' only */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
                {
                    const int q3 = pix[3*xstride];
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                }
                else /* q0' only */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
            else /* weak filter: p0', q0' */
            {
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Vertical strong luma filtering of a horizontal edge. */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
/* Horizontal strong luma filtering of a vertical edge. */
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
/* Strong (bS == 4, intra) chroma deblocking of one 8-sample edge; only
 * p0/q0 are replaced, by the weak 3-tap filter. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 8; d++ )
    {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Vertical strong chroma filtering of a horizontal edge. */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
/* Horizontal strong chroma filtering of a vertical edge. */
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
618 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
620 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
621 const int alpha = alpha_table(index_a);
622 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
625 if( !alpha || !beta )
628 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
629 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
630 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
631 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
633 pf_inter( pix1, i_stride, alpha, beta, tc );
635 pf_inter( pix2, i_stride, alpha, beta, tc );
638 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
640 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
641 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
643 if( !alpha || !beta )
646 pf_intra( pix1, i_stride, alpha, beta );
648 pf_intra( pix2, i_stride, alpha, beta );
651 void x264_frame_deblock_row( x264_t *h, int mb_y )
/* Deblock one macroblock row (one MB pair row when MBAFF): compute boundary
 * strength bS per edge, then filter vertical and horizontal edges of luma
 * and chroma.  NOTE(review): this listing is extraction-mangled — braces,
 * several macro-internal lines (closing braces, `bS[i] = 2/1/0` assignments,
 * the refs_equal declaration and the `if( refs_equal ...` line of
 * DEBLOCK_STRENGTH, the MBAFF field-pair adjustments after line 679, and the
 * DEBLOCK_DIR(0)/DEBLOCK_DIR(1) invocations) are elided; verify against
 * upstream x264 before building. */
653 const int s8x8 = 2 * h->mb.i_mb_stride;
654 const int s4x4 = 4 * h->mb.i_mb_stride;
655 const int b_interlaced = h->sh.b_mbaff;
656 const int mvy_limit = 4 >> b_interlaced;
657 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
658 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
660 int stridey = h->fdec->i_stride[0];
661 int stride2y = stridey << b_interlaced;
662 int strideuv = h->fdec->i_stride[1];
663 int stride2uv = strideuv << b_interlaced;
665 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
666 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
/* Iterate MBs of the row; under MBAFF this walks both fields of each pair. */
668 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
670 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
671 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
672 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
673 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
674 const int i_qp = h->mb.qp[mb_xy];
675 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
676 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
677 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
678 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
679 if( b_interlaced && (mb_y&1) )
686 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
688 if( i_qp <= qp_thresh )
/* FILTER_DIR: filter one edge (luma + chroma when aligned) in direction
 * i_dir, using intra or inter variants per the `intra` token. */
691 #define FILTER_DIR(intra, i_dir)\
694 i_qpn= h->mb.qp[mbn_xy];\
698 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
699 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
700 h->loopf.deblock_h_luma##intra );\
704 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
705 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
706 stride2uv, bS, i_qpc, 1,\
707 h->loopf.deblock_h_chroma##intra );\
712 /* horizontal edge */\
713 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
714 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
715 h->loopf.deblock_v_luma##intra );\
719 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
720 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
721 stride2uv, bS, i_qpc, 1,\
722 h->loopf.deblock_v_chroma##intra );\
727 #define DEBLOCK_STRENGTH(i_dir)\
729 /* *** Get bS for each 4px for the current edge *** */\
730 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
731 M32( bS ) = 0x03030303;\
734 M32( bS ) = 0x00000000;\
735 for( i = 0; i < 4; i++ )\
737 int x = i_dir == 0 ? i_edge : i;\
738 int y = i_dir == 0 ? i : i_edge;\
739 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
740 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
741 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
742 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
744 else if(!(i_edge&no_sub8x8))\
746 if((i&no_sub8x8) && bS[i-1] != 2)\
750 /* FIXME: A given frame may occupy more than one position in\
751 * the reference list. So we should compare the frame numbers,\
752 * not the indices in the ref list.\
753 * No harm yet, as we don't generate that case.*/\
754 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
755 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
756 int i4p= mb_4x4+x+y*s4x4;\
757 int i4q= mbn_4x4+xn+yn*s4x4;\
759 if( h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
760 refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
761 else if( !h->mb.b_interlaced )\
762 refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
764 refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
765 && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
767 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
768 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
769 (h->sh.i_type == SLICE_TYPE_B &&\
770 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
771 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
772 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
782 /* i_dir == 0 -> vertical edge
783 * i_dir == 1 -> horizontal edge */
/* DEBLOCK_DIR: walk all edges of the MB in one direction, using the intra
 * path for the MB-boundary edge when either side is intra. */
784 #define DEBLOCK_DIR(i_dir)\
786 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
787 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
788 ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
790 i_edge+= b_8x8_transform;\
793 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
794 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
795 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
796 if( b_interlaced && i_dir == 1 )\
798 mbn_xy -= h->mb.i_mb_stride;\
799 mbn_8x8 -= 2 * s8x8;\
800 mbn_4x4 -= 4 * s4x4;\
802 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
804 FILTER_DIR( _intra, i_dir );\
807 DEBLOCK_STRENGTH(i_dir);\
809 FILTER_DIR( , i_dir);\
811 i_edge += b_8x8_transform+1;\
816 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
818 DEBLOCK_STRENGTH(i_dir);\
820 FILTER_DIR( , i_dir);\
828 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
829 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
832 void x264_frame_deblock( x264_t *h )
835 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
836 x264_frame_deblock_row( h, mb_y );
/* Prototypes for x86 assembly deblocking implementations.
 * NOTE(review): the #ifdef HAVE_MMX (and ARCH_X86 around the v8 variants)
 * guards appear to have been elided from this listing — verify upstream. */
840 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
841 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
842 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
843 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
845 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
846 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
847 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
848 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
850 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
851 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
852 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
853 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* MMX only handles 8 pixels at a time, so a 16-pixel vertical luma edge is
 * filtered as two 8-pixel halves (tc0 advances by 2 for the second half). */
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0 );
    x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
}
/* Same 8+8 split for the intra (strong) vertical luma filter. */
static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
    x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
}
/* Prototypes for AltiVec (PPC) and NEON (ARM) deblocking implementations.
 * NOTE(review): the ARCH_PPC / HAVE_ARMV6 #ifdef guards appear elided from
 * this listing — verify upstream. */
869 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
870 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
874 void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
875 void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
876 void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
877 void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
/* Fill the deblocking function table: C fallbacks first, then progressively
 * override with the best SIMD implementations the CPU supports.
 * NOTE(review): the #ifdef HAVE_MMX / ARCH_PPC / HAVE_ARMV6 guards and
 * braces appear elided from this listing — as written all blocks would
 * compile unconditionally; verify upstream. */
880 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
882 pf->deblock_v_luma = deblock_v_luma_c;
883 pf->deblock_h_luma = deblock_h_luma_c;
884 pf->deblock_v_chroma = deblock_v_chroma_c;
885 pf->deblock_h_chroma = deblock_h_chroma_c;
886 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
887 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
888 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
889 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
/* x86: MMXEXT, then SSE2 for luma (skipped when stack may be misaligned) */
892 if( cpu&X264_CPU_MMXEXT )
894 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
895 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
896 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
897 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
899 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
900 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
901 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
902 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
904 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
906 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
907 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
908 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
909 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
/* PPC AltiVec: luma only */
915 if( cpu&X264_CPU_ALTIVEC )
917 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
918 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
/* ARM NEON */
923 if( cpu&X264_CPU_NEON )
925 pf->deblock_v_luma = x264_deblock_v_luma_neon;
926 pf->deblock_h_luma = x264_deblock_h_luma_neon;
927 pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
928 pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
935 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
937 x264_pthread_mutex_lock( &frame->mutex );
938 frame->i_lines_completed = i_lines_completed;
939 x264_pthread_cond_broadcast( &frame->cv );
940 x264_pthread_mutex_unlock( &frame->mutex );
943 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
945 x264_pthread_mutex_lock( &frame->mutex );
946 while( frame->i_lines_completed < i_lines_completed )
947 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
948 x264_pthread_mutex_unlock( &frame->mutex );
953 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
956 while( list[i] ) i++;
960 x264_frame_t *x264_frame_pop( x264_frame_t **list )
965 while( list[i+1] ) i++;
971 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
974 while( list[i] ) i++;
980 x264_frame_t *x264_frame_shift( x264_frame_t **list )
982 x264_frame_t *frame = list[0];
984 for( i = 0; list[i]; i++ )
990 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
992 assert( frame->i_reference_count > 0 );
993 frame->i_reference_count--;
994 if( frame->i_reference_count == 0 )
995 x264_frame_push( h->frames.unused[frame->b_fdec], frame );
998 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
1000 x264_frame_t *frame;
1001 if( h->frames.unused[b_fdec][0] )
1002 frame = x264_frame_pop( h->frames.unused[b_fdec] );
1004 frame = x264_frame_new( h, b_fdec );
1007 frame->b_last_minigop_bframe = 0;
1008 frame->i_reference_count = 1;
1009 frame->b_intra_calculated = 0;
1010 frame->b_scenecut = 1;
1012 memset( frame->weight, 0, sizeof(frame->weight) );
1013 memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
1018 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
1020 assert( frame->i_reference_count > 0 );
1021 frame->i_reference_count--;
1022 if( frame->i_reference_count == 0 )
1023 x264_frame_push( h->frames.blank_unused, frame );
1026 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
1028 x264_frame_t *frame;
1029 if( h->frames.blank_unused[0] )
1030 frame = x264_frame_pop( h->frames.blank_unused );
1032 frame = x264_malloc( sizeof(x264_frame_t) );
1035 frame->b_duplicate = 1;
1036 frame->i_reference_count = 1;
1040 void x264_frame_sort( x264_frame_t **list, int b_dts )
1045 for( i = 0; list[i+1]; i++ )
1047 int dtype = list[i]->i_type - list[i+1]->i_type;
1048 int dtime = list[i]->i_frame - list[i+1]->i_frame;
1049 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
1053 XCHG( x264_frame_t*, list[i], list[i+1] );
1060 void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
1061 int i_width, int i_height, x264_weight_t *w )
1064 /* Weight horizontal strips of height 16. This was found to be the optimal height
1065 * in terms of the cache loads. */
1066 while( i_height > 0 )
1068 for( x = 0; x < i_width; x += 16 )
1069 w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
1071 dst += 16 * i_dst_stride;
1072 src += 16 * i_src_stride;
1076 void x264_frame_delete_list( x264_frame_t **list )
1082 x264_frame_delete( list[i++] );
1086 int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
1090 slist->i_max_size = max_size;
1092 CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
1093 if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
1094 x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
1095 x264_pthread_cond_init( &slist->cv_empty, NULL ) )
1102 void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
1104 x264_pthread_mutex_destroy( &slist->mutex );
1105 x264_pthread_cond_destroy( &slist->cv_fill );
1106 x264_pthread_cond_destroy( &slist->cv_empty );
1107 x264_frame_delete_list( slist->list );
1110 void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
1112 x264_pthread_mutex_lock( &slist->mutex );
1113 while( slist->i_size == slist->i_max_size )
1114 x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
1115 slist->list[ slist->i_size++ ] = frame;
1116 x264_pthread_mutex_unlock( &slist->mutex );
1117 x264_pthread_cond_broadcast( &slist->cv_fill );