1 /*****************************************************************************
2 * frame.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
/* Round x up to the next multiple of a; a must be a power of two. */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* Allocate a new x264_frame_t sized for the current encoder settings:
 * padded luma/chroma planes, half-pel filtered planes, lowres (lookahead)
 * planes, MB-tree / lookahead cost arrays, motion data, rate-control rows,
 * and the frame's mutex/condvar.
 * NOTE(review): this listing is an excerpt — braces, some declarations and
 * the failure/cleanup path are elided; CHECKED_MALLOC/CHECKED_MALLOCZERO
 * presumably jump to that cleanup path on allocation failure. */
29 x264_frame_t *x264_frame_new( x264_t *h )
34 int i_mb_count = h->mb.i_mb_count;
35 int i_stride, i_width, i_lines;
/* interlaced coding pads each field, so double the vertical padding */
36 int i_padv = PADV << h->param.b_interlaced;
38 int chroma_plane_size;
/* align strides to the detected cacheline size (fallback 16) */
39 int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
41 CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
43 /* allocate frame data (+64 for extra data for me) */
44 i_width = ALIGN( h->param.i_width, 16 );
45 i_stride = ALIGN( i_width + 2*PADH, align );
/* interlaced needs mod-32 height so each field is mod 16 */
46 i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
/* plane 0 = luma, planes 1/2 = chroma at half resolution (the >> !!i) */
49 for( i = 0; i < 3; i++ )
51 frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
52 frame->i_width[i] = i_width >> !!i;
53 frame->i_lines[i] = i_lines >> !!i;
56 luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
57 chroma_plane_size = (frame->i_stride[1] * ( frame->i_lines[1] + 2*i_padv ));
58 for( i = 1; i < 3; i++ )
60 CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
/* point past the top/left padding; /2 because chroma is half-size */
61 frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
63 /* all 4 luma planes allocated together, since the cacheline split code
64 * requires them to be in-phase wrt cacheline alignment. */
65 if( h->param.analyse.i_subpel_refine )
67 CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
/* filtered[0] is the full-pel plane, [1..3] are the h/v/hv half-pel planes */
68 for( i = 0; i < 4; i++ )
69 frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
70 frame->plane[0] = frame->filtered[0];
/* no subpel refine: only the full-pel luma plane is needed */
74 CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
75 frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
/* lookahead works on a half-resolution copy of the frame */
78 if( h->frames.b_have_lowres )
80 frame->i_width_lowres = frame->i_width[0]/2;
81 frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
82 frame->i_lines_lowres = frame->i_lines[0]/2;
84 luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
86 CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
87 for( i = 0; i < 4; i++ )
88 frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
/* lowres motion vectors/costs per (list, ref distance) pair */
90 for( j = 0; j <= !!h->param.i_bframe; j++ )
91 for( i = 0; i <= h->param.i_bframe; i++ )
93 CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
94 CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
/* +3 slop: SIMD readers may overread a few entries past the end */
96 CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
97 for( j = 0; j <= h->param.i_bframe+1; j++ )
98 for( i = 0; i <= h->param.i_bframe+1; i++ )
100 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
101 CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3) * sizeof(uint8_t) );
/* intra cost is stored in the (0,0) slot; -1 fill marks "not yet computed" */
103 frame->i_intra_cost = frame->lowres_costs[0][0];
104 memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
/* exhaustive search needs the integral-image buffer (doubled for sub8x8 ESA) */
107 if( h->param.analyse.i_me_method >= X264_ME_ESA )
109 CHECKED_MALLOC( frame->buffer[3],
110 frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
111 frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
115 frame->i_type = X264_TYPE_AUTO;
116 frame->i_qpplus1 = 0;
119 frame->i_frame_num = -1;
/* -1: no rows encoded yet; used by the frame-threading sync below */
120 frame->i_lines_completed = -1;
122 CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
/* 16 4x4 partitions per MB, 2 components per MV */
123 CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
124 CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
/* list-1 data only exists when B-frames are enabled */
125 if( h->param.i_bframe )
127 CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
128 CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
133 frame->ref[1] = NULL;
/* per-MB-row rate control statistics */
136 CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
137 CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
138 for( i = 0; i < h->param.i_bframe + 2; i++ )
139 for( j = 0; j < h->param.i_bframe + 2; j++ )
140 CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
/* adaptive quantization: per-MB QP offsets */
142 if( h->param.rc.i_aq_mode )
144 CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
145 CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
146 if( h->frames.b_have_lowres )
147 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
148 CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
/* per-frame sync primitives for sliced/frame threading */
151 if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
153 if( x264_pthread_cond_init( &frame->cv, NULL ) )
163 void x264_frame_delete( x264_frame_t *frame )
166 for( i = 0; i < 4; i++ )
167 x264_free( frame->buffer[i] );
168 for( i = 0; i < 4; i++ )
169 x264_free( frame->buffer_lowres[i] );
170 for( i = 0; i < X264_BFRAME_MAX+2; i++ )
171 for( j = 0; j < X264_BFRAME_MAX+2; j++ )
172 x264_free( frame->i_row_satds[i][j] );
173 for( j = 0; j < 2; j++ )
174 for( i = 0; i <= X264_BFRAME_MAX; i++ )
176 x264_free( frame->lowres_mvs[j][i] );
177 x264_free( frame->lowres_mv_costs[j][i] );
179 x264_free( frame->i_propagate_cost );
180 for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
181 for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
183 x264_free( frame->lowres_costs[j][i] );
184 x264_free( frame->lowres_inter_types[j][i] );
186 x264_free( frame->f_qp_offset );
187 x264_free( frame->f_qp_offset_aq );
188 x264_free( frame->i_inv_qscale_factor );
189 x264_free( frame->i_row_bits );
190 x264_free( frame->i_row_qp );
191 x264_free( frame->mb_type );
192 x264_free( frame->mv[0] );
193 x264_free( frame->mv[1] );
194 x264_free( frame->ref[0] );
195 x264_free( frame->ref[1] );
196 x264_pthread_mutex_destroy( &frame->mutex );
197 x264_pthread_cond_destroy( &frame->cv );
201 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
203 int i_csp = src->img.i_csp & X264_CSP_MASK;
205 if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
207 x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
211 dst->i_type = src->i_type;
212 dst->i_qpplus1 = src->i_qpplus1;
213 dst->i_pts = src->i_pts;
214 dst->param = src->param;
218 int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
219 uint8_t *plane = src->img.plane[s];
220 int stride = src->img.i_stride[s];
221 int width = h->param.i_width >> !!i;
222 int height = h->param.i_height >> !!i;
223 if( src->img.i_csp & X264_CSP_VFLIP )
225 plane += (height-1)*stride;
228 h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
235 static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
237 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
239 for( y = 0; y < i_height; y++ )
242 memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
244 memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
248 for( y = 0; y < i_padv; y++ )
249 memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
252 for( y = 0; y < i_padv; y++ )
253 memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
257 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
261 if( mb_y & h->sh.b_mbaff )
263 for( i = 0; i < frame->i_plane; i++ )
265 int stride = frame->i_stride[i];
266 int width = 16*h->sps->i_mb_width >> !!i;
267 int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
268 int padh = PADH >> !!i;
269 int padv = PADV >> !!i;
270 // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
271 uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
272 if( b_end && !b_start )
273 height += 4 >> (!!i + h->sh.b_mbaff);
276 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
277 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
281 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
286 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
288 /* during filtering, 8 extra pixels were filtered on each edge,
289 * but up to 3 of the horizontal ones may be wrong.
290 we want to expand border from the last filtered pixel */
292 int stride = frame->i_stride[0];
293 int width = 16*h->sps->i_mb_width + 8;
294 int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
298 for( i = 1; i < 4; i++ )
300 // buffer: 8 luma, to match the hpel filter
301 uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
304 plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
305 plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
309 plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
314 void x264_frame_expand_border_lowres( x264_frame_t *frame )
317 for( i = 0; i < 4; i++ )
318 plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
321 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
324 for( i = 0; i < frame->i_plane; i++ )
326 int i_subsample = i ? 1 : 0;
327 int i_width = h->param.i_width >> i_subsample;
328 int i_height = h->param.i_height >> i_subsample;
329 int i_padx = ( h->sps->i_mb_width * 16 - h->param.i_width ) >> i_subsample;
330 int i_pady = ( h->sps->i_mb_height * 16 - h->param.i_height ) >> i_subsample;
334 for( y = 0; y < i_height; y++ )
335 memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
336 frame->plane[i][y*frame->i_stride[i] + i_width - 1],
341 //FIXME interlace? or just let it pad using the wrong field
342 for( y = i_height; y < i_height + i_pady; y++ )
343 memcpy( &frame->plane[i][y*frame->i_stride[i]],
344 &frame->plane[i][(i_height-1)*frame->i_stride[i]],
351 /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
352 * entropy coding, but per 64 coeffs for the purpose of deblocking */
353 static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
355 uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
356 int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
358 for( x=0; x<h->sps->i_mb_width; x++ )
360 memcpy( buf+x, src+x, 16 );
363 nnz = src[x][0] | src[x][1];
364 src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
365 nnz = src[x][2] | src[x][3];
366 src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
371 static void restore_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
373 uint8_t (*dst)[24] = h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
375 for( x=0; x<h->sps->i_mb_width; x++ )
376 memcpy( dst+x, buf+x, 16 );
379 static void munge_cavlc_nnz( x264_t *h, int mb_y, uint8_t (*buf)[16], void (*func)(x264_t*, int, uint8_t (*)[16]) )
381 func( h, mb_y, buf );
383 func( h, mb_y-1, buf + h->sps->i_mb_width );
386 func( h, mb_y+1, buf + h->sps->i_mb_width * 2 );
388 func( h, mb_y-2, buf + h->sps->i_mb_width * 3 );
/* Deblocking filter */

/* H.264 deblocking thresholds (spec table 8-16), indexed by clipped QP plus
 * the slice alpha/beta offset.  Each table is padded with 12 entries on each
 * side so the offset indices need no extra clamping — hence the +12 in the
 * accessor macros below. */
static const uint8_t i_alpha_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*2] =
{
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping values per bS (columns 1..3; column 0 = -1 sentinel, bS=4 is
 * handled by the intra filters instead). Spec table 8-17, same 12-entry pad. */
static const int8_t i_tc0_table[52+12*2][4] =
{
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x)  i_beta_table[(x)+12]
#define tc0_table(x)   i_tc0_table[(x)+12]
/* Normal (inter) luma deblock of one 16-pixel edge: 4 groups of 4 lines,
 * each group with its own tc0 strength.  tc0[i] < 0 means bS==0 → skip. */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        if( tc0[i] < 0 )
        {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ )
        {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int tc = tc0[i];
                int delta;
                /* each secondary (p1/q1) correction widens the main clip range */
                if( abs( p2 - p0 ) < beta )
                {
                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( abs( q2 - q0 ) < beta )
                {
                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Edge-direction wrappers: taps run along xstride, lines advance by ystride. */
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Normal (inter) chroma deblock of one 8-pixel edge: 4 groups of 2 lines.
 * tc <= 0 means bS==0 → skip the group (tc0 already includes +1 for chroma). */
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
    int i, d;
    for( i = 0; i < 4; i++ )
    {
        const int tc = tc0[i];
        if( tc <= 0 )
        {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ )
        {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];

            if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
            {
                int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1*xstride] = x264_clip_uint8( p0 + delta );    /* p0' */
                pix[ 0*xstride] = x264_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Edge-direction wrappers for the chroma inter filter. */
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
/* Strong (intra, bS==4) luma deblock of one 16-pixel edge.  Uses the 4/5-tap
 * strong filter when the edge is flat enough, otherwise the weak 3-tap form. */
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 16; d++ )
    {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            if( abs( p0 - q0 ) < ((alpha >> 2) + 2) )
            {
                if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
                {
                    const int p3 = pix[-4*xstride];
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                }
                else /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
                {
                    const int q3 = pix[3*xstride];
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                }
                else /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
            else /* p0', q0' */
            {
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Edge-direction wrappers for the strong luma filter. */
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
/* Strong (intra, bS==4) chroma deblock of one 8-pixel edge: simple 3-tap
 * filter on p0/q0 only. */
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
    int d;
    for( d = 0; d < 8; d++ )
    {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
        {
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Edge-direction wrappers for the strong chroma filter. */
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
605 static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
607 const int index_a = i_qp + h->sh.i_alpha_c0_offset;
608 const int alpha = alpha_table(index_a);
609 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
612 if( !alpha || !beta )
615 tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
616 tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
617 tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
618 tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
620 pf_inter( pix1, i_stride, alpha, beta, tc );
622 pf_inter( pix2, i_stride, alpha, beta, tc );
625 static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
627 const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
628 const int beta = beta_table(i_qp + h->sh.i_beta_offset);
630 if( !alpha || !beta )
633 pf_intra( pix1, i_stride, alpha, beta );
635 pf_intra( pix2, i_stride, alpha, beta );
/* Deblock one row of macroblocks in the reconstructed frame.
 * Computes boundary strength (bS) per 4-pixel edge segment, then dispatches
 * to the normal or intra filter for each vertical and horizontal edge.
 * NOTE(review): excerpted listing — braces and some statements are elided;
 * the macro bodies below carry line continuations, so comments are only
 * inserted between complete constructs. */
638 void x264_frame_deblock_row( x264_t *h, int mb_y )
640 const int s8x8 = 2 * h->mb.i_mb_stride;
641 const int s4x4 = 4 * h->mb.i_mb_stride;
642 const int b_interlaced = h->sh.b_mbaff;
643 const int mvy_limit = 4 >> b_interlaced;
/* below this QP every alpha/beta threshold is zero, so the MB can be skipped */
644 const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
645 const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
/* stride2* step two rows at a time when deblocking MBAFF field pairs */
647 int stridey = h->fdec->i_stride[0];
648 int stride2y = stridey << b_interlaced;
649 int strideuv = h->fdec->i_stride[1];
650 int stride2uv = strideuv << b_interlaced;
/* CAVLC + 8x8dct stores nnz per 16 coeffs; temporarily convert to the
 * per-64-coeff layout deblocking expects (undone at the end) */
652 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
653 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row )
/* MBAFF: the loop visits each MB pair twice, toggling mb_y between fields */
655 for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
657 const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
658 const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
659 const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
660 const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
661 const int i_qp = h->mb.qp[mb_xy];
/* P_SKIP MBs only need their left/top MB-boundary edges filtered */
662 int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
663 uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
664 uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
665 uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
666 if( b_interlaced && (mb_y&1) )
673 x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
/* QP at or below the threshold: no edge can filter, skip the whole MB */
675 if( i_qp <= qp_thresh )
/* FILTER_DIR: run the chosen luma + chroma filter on one edge, vertical
 * (i_dir==0) or horizontal (i_dir==1), inter or _intra variant */
678 #define FILTER_DIR(intra, i_dir)\
681 i_qpn= h->mb.qp[mbn_xy];\
685 deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
686 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
687 h->loopf.deblock_h_luma##intra );\
691 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
692 deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
693 stride2uv, bS, i_qpc, 1,\
694 h->loopf.deblock_h_chroma##intra );\
699 /* horizontal edge */\
700 deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
701 stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
702 h->loopf.deblock_v_luma##intra );\
706 int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
707 deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
708 stride2uv, bS, i_qpc, 1,\
709 h->loopf.deblock_v_chroma##intra );\
/* DEBLOCK_STRENGTH: compute bS for the 4 segments of one edge from intra
 * flags, nnz, reference indices and motion vector differences */
714 #define DEBLOCK_STRENGTH(i_dir)\
716 /* *** Get bS for each 4px for the current edge *** */\
717 if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
718 *(uint32_t*)bS = 0x03030303;\
721 *(uint32_t*)bS = 0x00000000;\
722 for( i = 0; i < 4; i++ )\
724 int x = i_dir == 0 ? i_edge : i;\
725 int y = i_dir == 0 ? i : i_edge;\
726 int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
727 int yn = i_dir == 0 ? y : (y - 1)&0x03;\
728 if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
729 h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
731 else if(!(i_edge&no_sub8x8))\
733 if((i&no_sub8x8) && bS[i-1] != 2)\
737 /* FIXME: A given frame may occupy more than one position in\
738 * the reference list. So we should compare the frame numbers,\
739 * not the indices in the ref list.\
740 * No harm yet, as we don't generate that case.*/\
741 int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
742 int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
743 int i4p= mb_4x4+x+y*s4x4;\
744 int i4q= mbn_4x4+xn+yn*s4x4;\
745 if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
746 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
747 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
748 (h->sh.i_type == SLICE_TYPE_B &&\
749 (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
750 abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
751 abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
761 /* i_dir == 0 -> vertical edge
762 * i_dir == 1 -> horizontal edge */
763 #define DEBLOCK_DIR(i_dir)\
765 int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
766 int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
767 DECLARE_ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
769 i_edge+= b_8x8_transform;\
772 mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
773 mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
774 mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
775 if( b_interlaced && i_dir == 1 )\
777 mbn_xy -= h->mb.i_mb_stride;\
778 mbn_8x8 -= 2 * s8x8;\
779 mbn_4x4 -= 4 * s4x4;\
781 else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
783 FILTER_DIR( _intra, i_dir );\
786 DEBLOCK_STRENGTH(i_dir);\
787 if( *(uint32_t*)bS )\
788 FILTER_DIR( , i_dir);\
790 i_edge += b_8x8_transform+1;\
795 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
797 DEBLOCK_STRENGTH(i_dir);\
798 if( *(uint32_t*)bS )\
799 FILTER_DIR( , i_dir);\
/* undo the CAVLC nnz munge performed before the loop */
807 if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
808 munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
811 void x264_frame_deblock( x264_t *h )
814 for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
815 x264_frame_deblock_row( h, mb_y );
819 void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
820 void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
821 void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
822 void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
824 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
825 void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
826 void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
827 void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
829 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
830 void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
831 void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
832 void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
/* The MMX routine processes 8 pixels at a time: run it twice to cover a
 * 16-pixel luma edge.  The second half uses the last two tc0 strengths. */
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
    x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
}
static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
    x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
}
848 void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
849 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
852 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
854 pf->deblock_v_luma = deblock_v_luma_c;
855 pf->deblock_h_luma = deblock_h_luma_c;
856 pf->deblock_v_chroma = deblock_v_chroma_c;
857 pf->deblock_h_chroma = deblock_h_chroma_c;
858 pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
859 pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
860 pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
861 pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;
864 if( cpu&X264_CPU_MMXEXT )
866 pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
867 pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
868 pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
869 pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
871 pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
872 pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
873 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
874 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
876 if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
878 pf->deblock_v_luma = x264_deblock_v_luma_sse2;
879 pf->deblock_h_luma = x264_deblock_h_luma_sse2;
880 pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
881 pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
887 if( cpu&X264_CPU_ALTIVEC )
889 pf->deblock_v_luma = x264_deblock_v_luma_altivec;
890 pf->deblock_h_luma = x264_deblock_h_luma_altivec;
897 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
899 x264_pthread_mutex_lock( &frame->mutex );
900 frame->i_lines_completed = i_lines_completed;
901 x264_pthread_cond_broadcast( &frame->cv );
902 x264_pthread_mutex_unlock( &frame->mutex );
905 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
907 x264_pthread_mutex_lock( &frame->mutex );
908 while( frame->i_lines_completed < i_lines_completed )
909 x264_pthread_cond_wait( &frame->cv, &frame->mutex );
910 x264_pthread_mutex_unlock( &frame->mutex );
915 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
918 while( list[i] ) i++;
922 x264_frame_t *x264_frame_pop( x264_frame_t **list )
927 while( list[i+1] ) i++;
933 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
936 while( list[i] ) i++;
942 x264_frame_t *x264_frame_shift( x264_frame_t **list )
944 x264_frame_t *frame = list[0];
946 for( i = 0; list[i]; i++ )
952 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
954 assert( frame->i_reference_count > 0 );
955 frame->i_reference_count--;
956 if( frame->i_reference_count == 0 )
957 x264_frame_push( h->frames.unused, frame );
958 assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
961 x264_frame_t *x264_frame_pop_unused( x264_t *h )
964 if( h->frames.unused[0] )
965 frame = x264_frame_pop( h->frames.unused );
967 frame = x264_frame_new( h );
970 frame->i_reference_count = 1;
971 frame->b_intra_calculated = 0;
975 void x264_frame_sort( x264_frame_t **list, int b_dts )
980 for( i = 0; list[i+1]; i++ )
982 int dtype = list[i]->i_type - list[i+1]->i_type;
983 int dtime = list[i]->i_frame - list[i+1]->i_frame;
984 int swap = b_dts ? dtype > 0 || ( dtype == 0 && dtime > 0 )
988 XCHG( x264_frame_t*, list[i], list[i+1] );