X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fframe.c;h=1d5ef24dc1f0cd9b4fe85f77b082f8ef5beb769c;hb=a6cee0ab6d2e6a9fb6580827dc854c09567c74f0;hp=ce8af34d1f211dce12c7ce46384b7bfdd899fc40;hpb=8d09ebe2e862688ce213d3f098ce7eca719fea23;p=x264 diff --git a/common/frame.c b/common/frame.c index ce8af34d..1d5ef24d 100644 --- a/common/frame.c +++ b/common/frame.c @@ -1,10 +1,10 @@ /***************************************************************************** * frame.c: h264 encoder library ***************************************************************************** - * Copyright (C) 2003 Laurent Aimar - * $Id: frame.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $ + * Copyright (C) 2003-2008 x264 project * * Authors: Laurent Aimar + * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,11 +18,13 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. *****************************************************************************/ #include "common.h" +#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) + x264_frame_t *x264_frame_new( x264_t *h ) { x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) ); @@ -31,74 +33,57 @@ x264_frame_t *x264_frame_new( x264_t *h ) int i_mb_count = h->mb.i_mb_count; int i_stride, i_width, i_lines; int i_padv = PADV << h->param.b_interlaced; + int luma_plane_size; + int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16; if( !frame ) return NULL; memset( frame, 0, sizeof(x264_frame_t) ); /* allocate frame data (+64 for extra data for me) */ - i_width = ( ( h->param.i_width + 15 ) & -16 ); - i_stride = i_width + 2*PADH; - i_lines = ( ( h->param.i_height + 15 ) & -16 ); - if( h->param.b_interlaced ) - i_lines = ( i_lines + 31 ) & -32; - - if( h->param.cpu&X264_CPU_CACHELINE_SPLIT ) - { - int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64; - i_stride = (i_stride + align-1) & -align; - } + i_width = ALIGN( h->param.i_width, 16 ); + i_stride = ALIGN( i_width + 2*PADH, align ); + i_lines = ALIGN( h->param.i_height, 16<param.b_interlaced ); frame->i_plane = 3; for( i = 0; i < 3; i++ ) { - int i_divh = 1; - int i_divw = 1; - if( i > 0 ) - { - if( h->param.i_csp == X264_CSP_I420 ) - i_divh = i_divw = 2; - else if( h->param.i_csp == X264_CSP_I422 ) - i_divw = 2; - } - frame->i_stride[i] = i_stride / i_divw; - frame->i_width[i] = i_width / i_divw; - frame->i_lines[i] = i_lines / i_divh; - CHECKED_MALLOC( frame->buffer[i], - frame->i_stride[i] * ( frame->i_lines[i] + 2*i_padv / i_divh ) ); - - frame->plane[i] = ((uint8_t*)frame->buffer[i]) + - frame->i_stride[i] * i_padv / i_divh + PADH / i_divw; + frame->i_stride[i] = i_stride >> !!i; + frame->i_width[i] = i_width >> !!i; + frame->i_lines[i] = i_lines >> !!i; } - frame->filtered[0] = frame->plane[0]; - for( i = 0; i < 3; i++ ) + luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv )); + for( i = 1; i < 3; i++ ) { - CHECKED_MALLOC( frame->buffer[4+i], - frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) ); - frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) + - frame->i_stride[0] * i_padv + PADH; + CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 ); + frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; } + /* all 4 luma planes allocated together, since the cacheline split code + * requires them to be in-phase wrt cacheline alignment. */ + CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size); + for( i = 0; i < 4; i++ ) + frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH; + frame->plane[0] = frame->filtered[0]; if( h->frames.b_have_lowres ) { frame->i_width_lowres = frame->i_width[0]/2; - frame->i_stride_lowres = frame->i_width_lowres + 2*PADH; + frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align ); frame->i_lines_lowres = frame->i_lines[0]/2; + + luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ); + + CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size ); for( i = 0; i < 4; i++ ) - { - CHECKED_MALLOC( frame->buffer_lowres[i], - frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) ); - frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) + - frame->i_stride_lowres * i_padv + PADH; - } + frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size; } if( h->param.analyse.i_me_method >= X264_ME_ESA ) { - CHECKED_MALLOC( frame->buffer[7], + CHECKED_MALLOC( frame->buffer[3], 2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) ); - frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH; + frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; } frame->i_poc = -1; @@ -142,7 +127,7 @@ fail: void x264_frame_delete( x264_frame_t *frame ) { int i, j; - for( i = 0; i < 8; i++ ) + for( i = 0; i < 4; i++ ) x264_free( frame->buffer[i] ); for( i = 0; i < 4; i++ ) x264_free( frame->buffer_lowres[i] ); @@ -161,17 +146,35 @@ void x264_frame_delete( x264_frame_t *frame ) x264_free( frame ); } -void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) +int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) { int i_csp = src->img.i_csp & X264_CSP_MASK; + int i; + if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) + { + x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" ); + return -1; + } + dst->i_type = src->i_type; dst->i_qpplus1 = src->i_qpplus1; dst->i_pts = src->i_pts; - if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) - x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" ); - else - h->csp.convert[i_csp]( &h->mc, dst, &src->img, h->param.i_width, h->param.i_height ); + for( i=0; i<3; i++ ) + { + int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i; + uint8_t *plane = src->img.plane[s]; + int stride = src->img.i_stride[s]; + int width = h->param.i_width >> !!i; + int height = h->param.i_height >> !!i; + if( src->img.i_csp & X264_CSP_VFLIP ) + { + plane += (height-1)*stride; + stride = -stride; + } + h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height ); + } + return 0; } @@ -229,19 +232,20 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) { - /* during filtering, 8 extra pixels were filtered on each edge. + /* during filtering, 8 extra pixels were filtered on each edge, + * but up to 3 of the horizontal ones may be wrong. we want to expand border from the last filtered pixel */ int b_start = !mb_y; int stride = frame->i_stride[0]; - int width = 16*h->sps->i_mb_width + 16; + int width = 16*h->sps->i_mb_width + 8; int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16; - int padh = PADH - 8; + int padh = PADH - 4; int padv = PADV - 8; int i; for( i = 1; i < 4; i++ ) { // buffer: 8 luma, to match the hpel filter - uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 8; + uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4; if( h->sh.b_mbaff ) { plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); @@ -297,16 +301,16 @@ void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] ) { uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width; int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width; - int x; + int x, nnz; for( x=0; xsps->i_mb_width; x++ ) { memcpy( buf+x, src+x, 16 ); if( transform[x] ) { - if( src[x][0] ) src[x][0] = 0x01010101; - if( src[x][1] ) src[x][1] = 0x01010101; - if( src[x][2] ) src[x][2] = 0x01010101; - if( src[x][3] ) src[x][3] = 0x01010101; + nnz = src[x][0] | src[x][1]; + src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0); + nnz = src[x][2] | src[x][3]; + src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0); } } } @@ -585,7 +589,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x; const int b_8x8_transform = h->mb.mb_transform_size[mb_xy]; const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4; - int i_edge, i_dir; + int i_edge; int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x, 8*mb_y*h->fdec->i_stride[1] + 8*mb_x, @@ -601,125 +605,116 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) /* i_dir == 0 -> vertical edge * i_dir == 1 -> horizontal edge */ - for( i_dir = 0; i_dir < 2; i_dir++ ) - { - int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0)); - int i_qp, i_qpn; - - for( i_edge = i_start; i_edge < i_edge_end; i_edge++ ) - { - int mbn_xy, mbn_8x8, mbn_4x4; - int bS[4]; /* filtering strength */ - - if( b_8x8_transform && (i_edge&1) ) - continue; - - mbn_xy = i_edge > 0 ? mb_xy : ( i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride ); - mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 ); - mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 ); - if( b_interlaced && i_edge == 0 && i_dir == 1 ) - { - mbn_xy -= h->mb.i_mb_stride; - mbn_8x8 -= 2 * s8x8; - mbn_4x4 -= 4 * s4x4; - } - - /* *** Get bS for each 4px for the current edge *** */ - if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) ) - { - bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 ); - } - else - { - int i; - for( i = 0; i < 4; i++ ) - { - int x = i_dir == 0 ? i_edge : i; - int y = i_dir == 0 ? i : i_edge; - int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03; - int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03; - - if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 || - h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 ) - { - bS[i] = 2; - } - else - { - /* FIXME: A given frame may occupy more than one position in - * the reference list. So we should compare the frame numbers, - * not the indices in the ref list. - * No harm yet, as we don't generate that case.*/ - - int i8p= mb_8x8+(x/2)+(y/2)*s8x8; - int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8; - int i4p= mb_4x4+x+y*s4x4; - int i4q= mbn_4x4+xn+yn*s4x4; - int l; - - bS[i] = 0; - - for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ ) - { - if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] || - abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 || - abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit ) - { - bS[i] = 1; - break; - } - } - } - } - } - - /* *** filter *** */ - /* Y plane */ - i_qp = h->mb.qp[mb_xy]; - i_qpn= h->mb.qp[mbn_xy]; - - if( i_dir == 0 ) - { - /* vertical edge */ - deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge], - i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0, - h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra ); - if( !(i_edge & 1) ) - { - /* U/V planes */ - int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + - i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; - deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge], - i_stride2[1], bS, i_qpc, 1, - h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra ); - deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge], - i_stride2[2], bS, i_qpc, 1, - h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra ); - } - } - else - { - /* horizontal edge */ - deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]], - i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0, - h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra ); - /* U/V planes */ - if( !(i_edge & 1) ) - { - int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + - i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; - deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]], - i_stride2[1], bS, i_qpc, 1, - h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra ); - deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]], - i_stride2[2], bS, i_qpc, 1, - h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra ); - } - } - } + #define deblock_dir(i_dir)\ + {\ + int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\ + int i_qp, i_qpn;\ + for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\ + {\ + int mbn_xy, mbn_8x8, mbn_4x4;\ + int bS[4]; /* filtering strength */\ + if( b_8x8_transform && (i_edge&1) )\ + continue;\ + mbn_xy = i_edge > 0 ? mb_xy : ( i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride );\ + mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\ + mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\ + if( b_interlaced && i_edge == 0 && i_dir == 1 )\ + {\ + mbn_xy -= h->mb.i_mb_stride;\ + mbn_8x8 -= 2 * s8x8;\ + mbn_4x4 -= 4 * s4x4;\ + }\ + /* *** Get bS for each 4px for the current edge *** */\ + if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\ + bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\ + else\ + {\ + int i;\ + for( i = 0; i < 4; i++ )\ + {\ + int x = i_dir == 0 ? i_edge : i;\ + int y = i_dir == 0 ? i : i_edge;\ + int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\ + int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\ + if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\ + h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\ + {\ + bS[i] = 2;\ + }\ + else\ + {\ + /* FIXME: A given frame may occupy more than one position in\ + * the reference list. So we should compare the frame numbers,\ + * not the indices in the ref list.\ + * No harm yet, as we don't generate that case.*/\ + int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\ + int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\ + int i4p= mb_4x4+x+y*s4x4;\ + int i4q= mbn_4x4+xn+yn*s4x4;\ + int l;\ + bS[i] = 0;\ + for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\ + {\ + if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\ + abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\ + abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\ + {\ + bS[i] = 1;\ + break;\ + }\ + }\ + }\ + }\ + }\ + /* *** filter *** */\ + /* Y plane */\ + i_qp = h->mb.qp[mb_xy];\ + i_qpn= h->mb.qp[mbn_xy];\ + if( i_dir == 0 )\ + {\ + /* vertical edge */\ + deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\ + i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\ + h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\ + if( !(i_edge & 1) )\ + {\ + /* U/V planes */\ + int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\ + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\ + deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\ + i_stride2[1], bS, i_qpc, 1,\ + h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\ + deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\ + i_stride2[2], bS, i_qpc, 1,\ + h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\ + }\ + }\ + else\ + {\ + /* horizontal edge */\ + deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\ + i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\ + h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\ + /* U/V planes */\ + if( !(i_edge & 1) )\ + {\ + int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\ + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\ + deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],\ + i_stride2[1], bS, i_qpc, 1,\ + h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\ + deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\ + i_stride2[2], bS, i_qpc, 1,\ + h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\ + }\ + }\ + }\ } + deblock_dir(0); + deblock_dir(1); + /* next mb */ if( !b_interlaced || (mb_y&1) ) mb_x++; @@ -743,18 +738,26 @@ void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -#ifdef ARCH_X86_64 void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -#else +void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +#ifdef ARCH_X86 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 ); x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 ); } +void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +{ + x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta ); + x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta ); +} #endif #endif @@ -781,17 +784,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext; pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; - -#ifdef ARCH_X86_64 - if( cpu&X264_CPU_SSE2 ) +#ifdef ARCH_X86 + pf->deblock_v_luma = x264_deblock_v_luma_mmxext; + pf->deblock_h_luma = x264_deblock_h_luma_mmxext; + pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext; + pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext; +#endif + if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_v_luma = x264_deblock_v_luma_sse2; pf->deblock_h_luma = x264_deblock_h_luma_sse2; + pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2; + pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2; } -#else - pf->deblock_v_luma = x264_deblock_v_luma_mmxext; - pf->deblock_h_luma = x264_deblock_h_luma_mmxext; -#endif } #endif @@ -806,8 +811,6 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) /* threading */ - -#ifdef HAVE_PTHREAD void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed ) { x264_pthread_mutex_lock( &frame->mutex ); @@ -824,14 +827,6 @@ void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed ) x264_pthread_mutex_unlock( &frame->mutex ); } -#else -void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed ) -{} -void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed ) -{} -#endif - - /* list operators */ void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )