X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fdeblock.c;h=636bf3ee37bc612ebdac653bda0f59ef93ba9be6;hb=c82c7374938f4342971adf8b2495c3a1bbe621c4;hp=812f4ea9ba8f664b93fdb4ce7ab69233ffd85a83;hpb=a93e4c4a75d05e7bf379cb9a39caad57f615eeb0;p=x264 diff --git a/common/deblock.c b/common/deblock.c index 812f4ea9..636bf3ee 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -1,11 +1,12 @@ /***************************************************************************** * deblock.c: deblocking ***************************************************************************** - * Copyright (C) 2003-2011 x264 project + * Copyright (C) 2003-2016 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -75,7 +76,7 @@ static const int8_t i_tc0_table[52+12*3][4] = #define tc0_table(x) i_tc0_table[(x)+24] /* From ffmpeg */ -static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 ) +static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; @@ -106,7 +107,7 @@ static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alph pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { @@ -119,21 +120,21 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static inline void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { for( int d = 0; d < 8; d++, pix += stride ) deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] ); } -static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, stride, 1, alpha, beta, tc0 ); } -static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, 1, stride, alpha, beta, tc0 ); } -static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc ) +static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -147,36 +148,39 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { int tc = tc0[i]; if( tc <= 0 ) { - pix += 2*ystride; + pix += height*ystride; continue; } - for( int d = 0; d < 2; d++, pix += ystride-2 ) - for( int e = 0; e < 2; e++, pix++ ) - deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); + for( int d = 0; d < height; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) + deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static inline void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { - for( int i = 0; i < 4; i++, pix += stride ) - deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] ); + deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 ); } -static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); } -static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); +} +static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); } -static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; @@ -215,26 +219,26 @@ static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, in } } } -static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 16; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, xstride, alpha, beta ); } -static inline void deblock_v_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) +static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 8; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, 1, alpha, beta ); } -static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, stride, 1, alpha, beta ); } -static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -247,29 +251,32 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } } -static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir ) +static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta ) +{ + for( int d = 0; d < height; d++, pix += ystride-2 ) + for( int e = 0; e < width; e++, pix++ ) + deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); +} +static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta ) { - for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) - for( int e = 0; e < (dir?1:2); e++, pix++ ) - deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); + deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta ); } -static inline void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { - for( int i = 0; i < 4; i++, pix += stride ) - deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); + deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta ); } -static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 ); + deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta ); } -static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 ); + deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta ); } static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, - int bframe, x264_t *h ) + int bframe ) { for( int dir = 0; dir < 2; dir++ ) { @@ -296,166 +303,11 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264 } } -void deblock_strength_mbaff_c( uint8_t nnz_cache[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe, x264_t *h ) +static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, + int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) { - int neighbour_field[2]; - neighbour_field[0] = h->mb.i_mb_left_xy[0] >= 0 && h->mb.field[h->mb.i_mb_left_xy[0]]; - neighbour_field[1] = h->mb.i_mb_top_xy >= 0 && h->mb.field[h->mb.i_mb_top_xy]; - int intra_cur = IS_INTRA( h->mb.i_type ); - - if( !intra_cur ) - { - for( int dir = 0; dir < 2; dir++ ) - { - int edge_stride = dir ? 8 : 1; - int part_stride = dir ? 1 : 8; - for( int edge = 0; edge < 4; edge++ ) - { - for( int i = 0, q = X264_SCAN8_0+edge*edge_stride; i < 4; i++, q += part_stride ) - { - int p = q - edge_stride; - if( nnz_cache[q] || nnz_cache[p] ) - { - bs[dir][edge][i] = 2; - } - else if( (edge == 0 && MB_INTERLACED != neighbour_field[dir]) || - ref[0][q] != ref[0][p] || - abs( mv[0][q][0] - mv[0][p][0] ) >= 4 || - abs( mv[0][q][1] - mv[0][p][1] ) >= mvy_limit || - (bframe && (ref[1][q] != ref[1][p] || - abs( mv[1][q][0] - mv[1][p][0] ) >= 4 || - abs( mv[1][q][1] - mv[1][p][1] ) >= mvy_limit )) ) - { - bs[dir][edge][i] = 1; - } - else - bs[dir][edge][i] = 0; - } - } - } - } - - if( h->mb.i_neighbour & MB_LEFT ) - { - if( h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED ) - { - static const uint8_t offset[2][2][8] = { - { { 0, 0, 0, 0, 1, 1, 1, 1 }, - { 2, 2, 2, 2, 3, 3, 3, 3 }, }, - { { 0, 1, 2, 3, 0, 1, 2, 3 }, - { 0, 1, 2, 3, 0, 1, 2, 3 }, } - }; - uint8_t bS[8]; - - if( intra_cur ) - memset( bS, 4, 8 ); - else - { - const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1]; - uint8_t (*nnz)[48] = h->mb.non_zero_count; - - for( int i = 0; i < 8; i++ ) - { - int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1]; - int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)]; - int nnz_left = nnz[left][3 + 4*off[i]]; - if( !h->param.b_cabac && h->pps->b_transform_8x8_mode ) - { - int j = off[i]&~1; - if( h->mb.mb_transform_size[left] ) - nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] )); - } - if( IS_INTRA( h->mb.type[left] ) ) - bS[i] = 4; - else if( nnz_left || nnz_this ) - bS[i] = 2; - else // As left is different interlaced. - bS[i] = 1; - } - } - - if( MB_INTERLACED ) - { - for( int i = 0; i < 4; i++ ) bs[0][0][i] = bS[i]; - for( int i = 0; i < 4; i++ ) bs[0][4][i] = bS[4+i]; - } - else - { - for( int i = 0; i < 4; i++ ) bs[0][0][i] = bS[2*i]; - for( int i = 0; i < 4; i++ ) bs[0][4][i] = bS[1+2*i]; - } - } - } - - if( h->mb.i_neighbour & MB_TOP ) - { - if( !(h->mb.i_mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] ) - { - /* Need to filter both fields (even for frame macroblocks). - * Filter top two rows using the top macroblock of the above - * pair and then the bottom one. */ - int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride; - uint32_t nnz_cur[4]; - nnz_cur[0] = h->mb.cache.non_zero_count[x264_scan8[0]+0]; - nnz_cur[1] = h->mb.cache.non_zero_count[x264_scan8[0]+1]; - nnz_cur[2] = h->mb.cache.non_zero_count[x264_scan8[0]+2]; - nnz_cur[3] = h->mb.cache.non_zero_count[x264_scan8[0]+3]; - /* Munge NNZ for cavlc + 8x8dct */ - if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && - h->mb.mb_transform_size[h->mb.i_mb_xy] ) - { - int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); - int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] ); - nnz_cur[0] = nnz_cur[1] = !!nnz0; - nnz_cur[2] = nnz_cur[3] = !!nnz1; - } - - for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride ) - { - int mbn_intra = IS_INTRA( h->mb.type[mbn_xy] ); - uint8_t (*nnz)[48] = h->mb.non_zero_count; - - uint32_t nnz_top[4]; - nnz_top[0] = nnz[mbn_xy][3*4+0]; - nnz_top[1] = nnz[mbn_xy][3*4+1]; - nnz_top[2] = nnz[mbn_xy][3*4+2]; - nnz_top[3] = nnz[mbn_xy][3*4+3]; - - if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && - (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[mbn_xy] ) - { - int nnz_top0 = M16( &nnz[mbn_xy][8] ) | M16( &nnz[mbn_xy][12] ); - int nnz_top1 = M16( &nnz[mbn_xy][10] ) | M16( &nnz[mbn_xy][14] ); - nnz_top[0] = nnz_top[1] = nnz_top0 ? 0x0101 : 0; - nnz_top[2] = nnz_top[3] = nnz_top1 ? 0x0101 : 0; - } - - uint8_t bS[4]; - if( intra_cur || mbn_intra ) - M32( bS ) = 0x03030303; - else - { - for( int i = 0; i < 4; i++ ) - { - if( nnz_cur[i] || nnz_top[i] ) - bS[i] = 2; - else - bS[i] = 1; - } - } - for( int i = 0; i < 4; i++ ) - bs[1][4*j][i] = bS[i]; - } - } - } -} - -static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) -{ - int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; - int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; + int index_a = i_qp + a; + int index_b = i_qp + b; int alpha = alpha_table(index_a) << (BIT_DEPTH-8); int beta = beta_table(index_b) << (BIT_DEPTH-8); int8_t tc[4]; @@ -471,10 +323,11 @@ static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS pf_inter( pix, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, + int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) { - int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; - int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; + int index_a = i_qp + a; + int index_b = i_qp + b; int alpha = alpha_table(index_a) << (BIT_DEPTH-8); int beta = beta_table(index_b) << (BIT_DEPTH-8); @@ -484,12 +337,55 @@ static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint pf_intra( pix, i_stride, alpha, beta ); } +static ALWAYS_INLINE void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y ) +{ + int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2; + + h->mb.i_neighbour = 0; + h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x; + h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy]; + h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED); + h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y; + h->mb.i_mb_left_xy[1] = + h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1; + if( SLICE_MBAFF ) + { + if( mb_y&1 ) + { + if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED ) + h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride; + } + else + { + if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] ) + { + h->mb.i_mb_top_xy += h->mb.i_mb_stride; + h->mb.i_mb_top_y++; + } + if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED ) + h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride; + } + } + + if( mb_x > 0 && (deblock_on_slice_edges || + h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) ) + h->mb.i_neighbour |= MB_LEFT; + if( mb_y > MB_INTERLACED && (deblock_on_slice_edges + || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) ) + h->mb.i_neighbour |= MB_TOP; +} + void x264_frame_deblock_row( x264_t *h, int mb_y ) { int b_interlaced = SLICE_MBAFF; - int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset ); + int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET; + int b = h->sh.i_beta_offset - QP_BD_OFFSET; + int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); int stridey = h->fdec->i_stride[0]; int strideuv = h->fdec->i_stride[1]; + int chroma444 = CHROMA444; + int chroma_height = 16 >> CHROMA_V_SHIFT; + intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1; for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced ) { @@ -497,44 +393,55 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y ); int mb_xy = h->mb.i_mb_xy; - int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy]; + int transform_8x8 = h->mb.mb_transform_size[mb_xy]; int intra_cur = IS_INTRA( h->mb.type[mb_xy] ); - uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x]; + uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x]; pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; - pixel *pixuv = h->fdec->plane[1] + (8<fdec->plane[2] - h->fdec->plane[1] : 1; + pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x; + if( mb_y & MB_INTERLACED ) { pixy -= 15*stridey; - pixuv -= ((8<mb.qp[mb_xy]; int qpc = h->chroma_qp_table[qp]; - int first_edge_only = h->mb.type[mb_xy] == P_SKIP || qp <= qp_thresh; + int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh; #define FILTER( intra, dir, edge, qp, chroma_qp )\ do\ {\ - deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\ - stride2y, bs[dir][edge], qp, 0,\ - h->loopf.deblock_luma##intra[dir] );\ - if( CHROMA444 )\ + if( !(edge & 1) || !transform_8x8 )\ {\ - deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\ - stride2uv, bs[dir][edge], chroma_qp, 0,\ - h->loopf.deblock_luma##intra[dir] );\ - deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\ - stride2uv, bs[dir][edge], chroma_qp, 0,\ + deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\ + stride2y, bs[dir][edge], qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ + if( CHROMA_FORMAT == CHROMA_444 )\ + {\ + deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\ + h->loopf.deblock_luma##intra[dir] );\ + deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\ + h->loopf.deblock_luma##intra[dir] );\ + }\ + else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\ + {\ + deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\ + h->loopf.deblock_chroma##intra[dir] );\ + }\ }\ - else if( !(edge & 1) )\ - deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\ - stride2uv, bs[dir][edge], chroma_qp, 1,\ + if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\ + {\ + deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\ h->loopf.deblock_chroma##intra[dir] );\ + }\ } while(0) if( h->mb.i_neighbour & MB_LEFT ) @@ -544,56 +451,69 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int luma_qp[2]; int chroma_qp[2]; int left_qp[2]; - int current_qp = h->mb.qp[mb_xy]; - x264_deblock_inter_t luma_deblock = deblock_v_luma_mbaff_c; - x264_deblock_inter_t chroma_deblock = CHROMA444 ? deblock_v_luma_mbaff_c : deblock_v_chroma_mbaff_c; - x264_deblock_intra_t luma_intra_deblock = deblock_v_luma_intra_mbaff_c; - x264_deblock_intra_t chroma_intra_deblock = CHROMA444 ? deblock_v_luma_intra_mbaff_c : deblock_v_chroma_intra_mbaff_c; - int c = CHROMA444 ? 0 : 1; + x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff; + x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff; + x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff; + x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff; + int c = chroma444 ? 0 : 1; left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]]; - luma_qp[0] = (current_qp + left_qp[0] + 1) >> 1; - chroma_qp[0] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[0]] + 1) >> 1; - if( bs[0][0][0] == 4) + luma_qp[0] = (qp + left_qp[0] + 1) >> 1; + chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1; + if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) ) { - deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], 0, luma_intra_deblock ); - deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_intra_deblock ); - deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_intra_deblock ); + deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock ); + deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); + if( chroma444 ) + deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); } else { - deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], 0, luma_deblock ); - deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_deblock ); - deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_deblock ); + deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock ); + deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); + if( chroma444 ) + deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); } int offy = MB_INTERLACED ? 4 : 0; - int offuv = MB_INTERLACED ? 3 : 0; - if( CHROMA444 ) offuv = offy; + int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0; left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]]; - luma_qp[1] = (current_qp + left_qp[1] + 1) >> 1; - chroma_qp[1] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[1]] + 1) >> 1; - if( bs[0][4][0] == 4) + luma_qp[1] = (qp + left_qp[1] + 1) >> 1; + chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1; + if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) ) { - deblock_edge_intra( h, pixy + (stridey<mb.qp[h->mb.i_mb_xy-1]; int qp_left = (qp + qpl + 1) >> 1; - int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1; + int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1; int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] ); + int intra_deblock = intra_cur || intra_left; + + /* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP. + * But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there. + * So reset their effective QP to max, to indicate that lack of guarantee. */ + if( h->fdec->mb_info && M32( bs[0][0] ) ) + { +#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT); + RESET_EFFECTIVE_QP(mb_xy); + RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]); + } - if( intra_cur || intra_left ) + if( intra_deblock ) FILTER( _intra, 0, 0, qp_left, qpc_left ); else FILTER( , 0, 0, qp_left, qpc_left ); @@ -601,9 +521,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) } if( !first_edge_only ) { - if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc ); - FILTER( , 0, 2, qp, qpc ); - if( !transform_8x8 ) FILTER( , 0, 3, qp, qpc ); + FILTER( , 0, 1, qp, qpc ); + FILTER( , 0, 2, qp, qpc ); + FILTER( , 0, 3, qp, qpc ); } if( h->mb.i_neighbour & MB_TOP ) @@ -612,38 +532,48 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) { int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride; - for(int j=0; j<2; j++, mbn_xy += h->mb.i_mb_stride) + for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride ) { int qpt = h->mb.qp[mbn_xy]; int qp_top = (qp + qpt + 1) >> 1; - int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1; + int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1; + int intra_top = IS_INTRA( h->mb.type[mbn_xy] ); + if( intra_cur || intra_top ) + M32( bs[1][4*j] ) = 0x03030303; // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows - deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, 0, deblock_v_luma_c ); - if( CHROMA444 ) + deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] ); + if( chroma444 ) { - deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 0, deblock_v_luma_c ); - deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 0, deblock_v_luma_c ); + deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] ); + deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] ); } else - deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 1, deblock_v_chroma_c ); + deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] ); } } else { int qpt = h->mb.qp[h->mb.i_mb_top_xy]; int qp_top = (qp + qpt + 1) >> 1; - int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1; + int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1; int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] ); + int intra_deblock = intra_cur || intra_top; - if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) - && (intra_cur || intra_top) ) + /* This edge has been modified, reset effective qp to max. */ + if( h->fdec->mb_info && M32( bs[1][0] ) ) + { + RESET_EFFECTIVE_QP(mb_xy); + RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy); + } + + if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) && intra_deblock ) { FILTER( _intra, 1, 0, qp_top, qpc_top ); } else { - if( intra_top ) + if( intra_deblock ) M32( bs[1][0] ) = 0x03030303; FILTER( , 1, 0, qp_top, qpc_top ); } @@ -652,9 +582,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) if( !first_edge_only ) { - if( !transform_8x8 ) FILTER( , 1, 1, qp, qpc ); - FILTER( , 1, 2, qp, qpc ); - if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc ); + FILTER( , 1, 1, qp, qpc ); + FILTER( , 1, 2, qp, qpc ); + FILTER( , 1, 3, qp, qpc ); } #undef FILTER @@ -665,24 +595,31 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) * TODO: * deblock macroblock edges * support analysis partitions smaller than 16x16 - * deblock chroma for 4:2:0 + * deblock chroma for 4:2:0/4:2:2 * handle duplicate refs correctly - * handle cavlc+8x8dct correctly */ void x264_macroblock_deblock( x264_t *h ) { - int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); + int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET; + int b = h->sh.i_beta_offset - QP_BD_OFFSET; + int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); + int intra_cur = IS_INTRA( h->mb.i_type ); int qp = h->mb.i_qp; int qpc = h->mb.i_chroma_qp; - if( qp <= qp_thresh || h->mb.i_type == P_SKIP ) + if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh ) return; - uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x]; - if( IS_INTRA( h->mb.i_type ) ) - memset( bs, 3, 2*8*4*sizeof(uint8_t) ); + uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; + if( intra_cur ) + { + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; + } else h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, - bs, 4 >> SLICE_MBAFF, h->sh.i_type == SLICE_TYPE_B, h ); + bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B ); int transform_8x8 = h->mb.b_transform_8x8; @@ -690,15 +627,15 @@ void x264_macroblock_deblock( x264_t *h ) do\ {\ deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\ - FDEC_STRIDE, bs[dir][edge], qp, 0,\ + FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\ h->loopf.deblock_luma[dir] );\ if( CHROMA444 )\ {\ deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\ - FDEC_STRIDE, bs[dir][edge], qpc, 0,\ + FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\ h->loopf.deblock_luma[dir] );\ deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\ - FDEC_STRIDE, bs[dir][edge], qpc, 0,\ + FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\ h->loopf.deblock_luma[dir] );\ }\ } while(0) @@ -715,73 +652,117 @@ void x264_macroblock_deblock( x264_t *h ) } #if HAVE_MMX -void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe, x264_t *h ); -void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe, x264_t *h ); -void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe, x264_t *h ); -void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe, x264_t *h ); +void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); + +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #if ARCH_X86 -void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_intra_mmxext( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_mmxext( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mmxext( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #if HIGH_BIT_DEPTH -void x264_deblock_v_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_intra_mmxext( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #else // FIXME this wrapper has a significant cpu cost -static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { - x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 ); - x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 ); + x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 ); + x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 ); } -static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ) { - x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta ); - x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta ); + x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta ); + x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta ); } #endif // HIGH_BIT_DEPTH #endif #endif #if ARCH_PPC -void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif // ARCH_PPC -#if HAVE_ARMV6 -void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * ); +#if HAVE_ARMV6 || ARCH_AARCH64 +void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA +void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, + int bframe ); +#endif #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) @@ -789,40 +770,59 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = deblock_v_luma_c; pf->deblock_luma[0] = deblock_h_luma_c; pf->deblock_chroma[1] = deblock_v_chroma_c; - pf->deblock_chroma[0] = deblock_h_chroma_c; + pf->deblock_h_chroma_420 = deblock_h_chroma_c; + pf->deblock_h_chroma_422 = deblock_h_chroma_422_c; pf->deblock_luma_intra[1] = deblock_v_luma_intra_c; pf->deblock_luma_intra[0] = deblock_h_luma_intra_c; pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c; - pf->deblock_chroma_intra[0] = deblock_h_chroma_intra_c; + pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c; + pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c; + pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c; + pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c; + pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c; + pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c; pf->deblock_strength = deblock_strength_c; #if HAVE_MMX - if( cpu&X264_CPU_MMXEXT ) + if( cpu&X264_CPU_MMX2 ) { #if ARCH_X86 - pf->deblock_luma[1] = x264_deblock_v_luma_mmxext; - pf->deblock_luma[0] = x264_deblock_h_luma_mmxext; - pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext; - pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext; - pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext; - pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext; - pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext; + pf->deblock_luma[1] = x264_deblock_v_luma_mmx2; + pf->deblock_luma[0] = x264_deblock_h_luma_mmx2; + pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2; + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; +#endif +#if !HIGH_BIT_DEPTH + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; #endif - pf->deblock_strength = x264_deblock_strength_mmxext; + pf->deblock_strength = x264_deblock_strength_mmx2; if( cpu&X264_CPU_SSE2 ) { pf->deblock_strength = x264_deblock_strength_sse2; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2; + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2; + pf->deblock_luma[1] = x264_deblock_v_luma_sse2; + pf->deblock_luma[0] = x264_deblock_h_luma_sse2; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) { - pf->deblock_luma[1] = x264_deblock_v_luma_sse2; - pf->deblock_luma[0] = x264_deblock_h_luma_sse2; pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2; - pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2; - pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; - pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2; +#if HIGH_BIT_DEPTH + pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2; +#endif } } if( cpu&X264_CPU_SSSE3 ) @@ -830,18 +830,28 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) if( cpu&X264_CPU_AVX ) { pf->deblock_strength = x264_deblock_strength_avx; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx; + pf->deblock_luma[1] = x264_deblock_v_luma_avx; + pf->deblock_luma[0] = x264_deblock_h_luma_avx; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { - pf->deblock_luma[1] = x264_deblock_v_luma_avx; - pf->deblock_luma[0] = x264_deblock_h_luma_avx; pf->deblock_chroma[1] = x264_deblock_v_chroma_avx; - pf->deblock_chroma[0] = x264_deblock_h_chroma_avx; - pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; - pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx; +#if HIGH_BIT_DEPTH + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx; +#endif } } + if( cpu&X264_CPU_AVX2 ) + { + pf->deblock_strength = x264_deblock_strength_avx2; + } } #endif @@ -851,19 +861,45 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) { pf->deblock_luma[1] = x264_deblock_v_luma_altivec; pf->deblock_luma[0] = x264_deblock_h_luma_altivec; - } + } #endif // HAVE_ALTIVEC -#if HAVE_ARMV6 - if( cpu&X264_CPU_NEON ) - { +#if HAVE_ARMV6 || ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; -// pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; -// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; - } + pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon; + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon; + pf->deblock_strength = x264_deblock_strength_neon; + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf->deblock_luma[1] = x264_deblock_v_luma_msa; + pf->deblock_luma[0] = x264_deblock_h_luma_msa; + pf->deblock_chroma[1] = x264_deblock_v_chroma_msa; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa; + pf->deblock_strength = x264_deblock_strength_msa; + } #endif #endif // !HIGH_BIT_DEPTH - if( b_mbaff ) pf->deblock_strength = deblock_strength_mbaff_c; + /* These functions are equivalent, so don't duplicate them. */ + pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420; + pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra; }