const int i8 = x264_scan8[0]+x+8*y;
const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
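+    /* implicit bipred weight for this pair of refs; 32 means an even blend */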
+ const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+ src0, i_stride0, src1, i_stride1, weight );
if( h->mb.b_interlaced & i_ref0 )
mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
-
- h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx1, mvy1, 2*width, 2*height );
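+    /* chroma partitions are half-size, so i_mode+3 indexes the matching smaller avg */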
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
+ h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
-
- if( h->param.analyse.b_weighted_bipred )
- {
- const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-
- h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
- src0, i_stride0, src1, i_stride1, weight );
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
- }
- else
- {
- h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
- src0, i_stride0, src1, i_stride1 );
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
- }
+ h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx1, mvy1, 2*width, 2*height );
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
}
static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
}
}
-#define PIXEL_AVG_C( name, width, height ) \
-static void name( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2, \
- uint8_t *pix3, int i_stride_pix3 ) \
-{ \
- pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
-}
-PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
-PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
-PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
-PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
-PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
-PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
-PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
-PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
-PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
-PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
-
-
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
op_scale2(15);
}
}
+#undef op_scale2
-#define PIXEL_AVG_WEIGHT_C( width, height ) \
-static void pixel_avg_weight_##width##x##height( \
- uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2, \
- uint8_t *pix3, int i_stride_pix3, int i_weight1 ) \
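+/* single avg entry point: a weight of 32 is an even blend, so it can take the cheaper plain-average path */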
+#define PIXEL_AVG_C( name, width, height ) \
+static void name( uint8_t *pix1, int i_stride_pix1, \
+ uint8_t *pix2, int i_stride_pix2, \
+ uint8_t *pix3, int i_stride_pix3, int weight ) \
{ \
- pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, i_weight1 ); \
+    if( weight == 32 ) \
+        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
+    else \
+        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
-
-PIXEL_AVG_WEIGHT_C(16,16)
-PIXEL_AVG_WEIGHT_C(16,8)
-PIXEL_AVG_WEIGHT_C(8,16)
-PIXEL_AVG_WEIGHT_C(8,8)
-PIXEL_AVG_WEIGHT_C(8,4)
-PIXEL_AVG_WEIGHT_C(4,8)
-PIXEL_AVG_WEIGHT_C(4,4)
-PIXEL_AVG_WEIGHT_C(4,2)
-PIXEL_AVG_WEIGHT_C(2,4)
-PIXEL_AVG_WEIGHT_C(2,2)
-#undef op_scale2
-#undef PIXEL_AVG_WEIGHT_C
+PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
+PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
+PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
+PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
+PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
+PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
+PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
+PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
+PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
+PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
- pf->avg_weight[PIXEL_16x16]= pixel_avg_weight_16x16;
- pf->avg_weight[PIXEL_16x8] = pixel_avg_weight_16x8;
- pf->avg_weight[PIXEL_8x16] = pixel_avg_weight_8x16;
- pf->avg_weight[PIXEL_8x8] = pixel_avg_weight_8x8;
- pf->avg_weight[PIXEL_8x4] = pixel_avg_weight_8x4;
- pf->avg_weight[PIXEL_4x8] = pixel_avg_weight_4x8;
- pf->avg_weight[PIXEL_4x4] = pixel_avg_weight_4x4;
- pf->avg_weight[PIXEL_4x2] = pixel_avg_weight_4x2;
- pf->avg_weight[PIXEL_2x4] = pixel_avg_weight_2x4;
- pf->avg_weight[PIXEL_2x2] = pixel_avg_weight_2x2;
-
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
int mvx, int mvy,
int i_width, int i_height );
- void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int );
- void (*avg_weight[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
+ void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
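+    /* i_weight applies to src1 (64-i_weight to src2); 32 yields a plain average */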
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
SECTION .text
;=============================================================================
-; pixel avg
+; weighted prediction
;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride );
-;-----------------------------------------------------------------------------
-%macro AVGH 3
-%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
-cglobal x264_pixel_avg_%1x%2_%3,0,0
- mov eax, %2
-%ifidn %3, sse2
- test dword r4m, 15
- jnz x264_pixel_avg_w%1_mmxext
-%endif
- jmp x264_pixel_avg_w%1_%3
-%assign function_align 16
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
-; int height );
-;-----------------------------------------------------------------------------
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
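+; i.e. dst = ( src1*weight + src2*(64-weight) + 32 ) >> 6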
%ifdef ARCH_X86_64
%define t0 r0
%define t1 r1
%define t5 r5
%macro AVG_START 1
cglobal %1, 6,7
- .height_loop:
%endmacro
%else
%define t0 r1
mov t3, r3m
mov t4, r4m
mov t5, r5m
- .height_loop:
%endmacro
%endif
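+; broadcast the low word of a register to all of its word lanes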
+%macro SPLATW 2
+%if mmsize==16
+ pshuflw %1, %2, 0
+ movlhps %1, %1
+%else
+ pshufw %1, %2, 0
+%endif
+%endmacro
+
+%macro BIWEIGHT 3
+    movh      m0, %2
+    movh      m1, %3
+    punpcklbw m0, m7         ; first source: bytes -> words
+    punpcklbw m1, m7         ; second source: bytes -> words
+    pmullw    m0, m4         ; * weight
+    pmullw    m1, m5         ; * (64 - weight)
+    paddw     m0, m1
+    paddw     m0, m6         ; + 32 for rounding
+    psraw     m0, 6
+    pmaxsw    m0, m7         ; clamp negative results to 0
+    packuswb  m0, m0         ; saturate to [0,255] and repack
+    movh      %1, m0
+%endmacro
+
+%macro BIWEIGHT_START 0
+    movd   m4, r6m
+    SPLATW m4, m4             ; m4 = weight of the first source
+    mova   m5, [pw_64 GLOBAL]
+    psubw  m5, m4             ; m5 = 64 - weight, for the second source
+    mova   m6, [pw_32 GLOBAL] ; rounding constant
+    pxor   m7, m7
+.height_loop:
+%endmacro
+
+INIT_MMX
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
+;-----------------------------------------------------------------------------
+%macro AVG_WEIGHT 2
+AVG_START x264_pixel_avg_weight_w%2_%1
+ BIWEIGHT_START
+%assign x 0
+%rep %2*2/mmsize             ; sweep the row in mmsize/2-pixel chunks, two rows per iteration
+    BIWEIGHT  [t0+x],    [t2+x],    [t4+x]
+    BIWEIGHT  [t0+x+t1], [t2+x+t3], [t4+x+t5]
+%assign x x+mmsize/2
+%endrep
+ lea t0, [t0+t1*2]
+ lea t2, [t2+t3*2]
+ lea t4, [t4+t5*2]
+ sub eax, 2
+ jg .height_loop
+ REP_RET
+%endmacro
+
+AVG_WEIGHT mmxext, 4
+AVG_WEIGHT mmxext, 8
+AVG_WEIGHT mmxext, 16
+INIT_XMM
+AVG_WEIGHT sse2, 8
+AVG_WEIGHT sse2, 16
+
+
+
+;=============================================================================
+; pixel avg
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
+;-----------------------------------------------------------------------------
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3,0,0
+ mov eax, %2
+    cmp dword r6m, 32        ; weight of 32 is an even blend
+    jne x264_pixel_avg_weight_w%1_%3
+%if mmsize == 16 && %1 == 16
+ test dword r4m, 15
+ jz x264_pixel_avg_w%1_sse2
+%endif
+ jmp x264_pixel_avg_w%1_mmxext
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
+; int height, int weight );
+;-----------------------------------------------------------------------------
+
%macro AVG_END 0
sub eax, 2
lea t4, [t4+t5*2]
REP_RET
%endmacro
+INIT_MMX
+
AVG_START x264_pixel_avg_w4_mmxext
+.height_loop:
movd mm0, [t2]
movd mm1, [t2+t3]
pavgb mm0, [t4]
AVGH 4, 2, mmxext
AVG_START x264_pixel_avg_w8_mmxext
+.height_loop:
movq mm0, [t2]
movq mm1, [t2+t3]
pavgb mm0, [t4]
AVGH 8, 4, mmxext
AVG_START x264_pixel_avg_w16_mmxext
+.height_loop:
movq mm0, [t2 ]
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
AVGH 16, 8, mmxext
AVG_START x264_pixel_avg_w16_sse2
+.height_loop:
movdqu xmm0, [t2]
movdqu xmm1, [t2+t3]
pavgb xmm0, [t4]
movdqa [t0+t1], xmm1
AVG_END
+INIT_XMM
AVGH 16, 16, sse2
AVGH 16, 8, sse2
+AVGH 8, 16, sse2
+AVGH 8, 8, sse2
+AVGH 8, 4, sse2
-;=============================================================================
-; weighted prediction
-;=============================================================================
-; implicit bipred only:
-; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-
-%macro SPLATW 2
-%if mmsize==16
- pshuflw %1, %2, 0
- movlhps %1, %1
-%else
- pshufw %1, %2, 0
-%endif
-%endmacro
-
-%macro BIWEIGHT 3
- movh m0, %2
- movh m1, %3
- punpcklbw m0, m7
- punpcklbw m1, m7
- pmullw m0, m4
- pmullw m1, m5
- paddw m0, m1
- paddw m0, m6
- psraw m0, 6
- pmaxsw m0, m7
- packuswb m0, m0
- movh %1, m0
-%endmacro
-
-%macro BIWEIGHT_START 1
- movd m4, r6m
- SPLATW m4, m4 ; weight_dst
- mova m5, [pw_64 GLOBAL]
- psubw m5, m4 ; weight_src
- mova m6, [pw_32 GLOBAL] ; rounding
- pxor m7, m7
-%if %1
- %define t0 r6d
- mov r6d, r7m
-%endif
-.height_loop:
-%endmacro
-
-INIT_MMX
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext, 6,6
- BIWEIGHT_START 0
- BIWEIGHT [r0 ], [r2 ], [r4 ]
- BIWEIGHT [r0+r1 ], [r2+r3 ], [r4+r5 ]
- BIWEIGHT [r0+r1*2], [r2+r3*2], [r4+r5*2]
- add r0, r1
- add r2, r3
- add r4, r5
- BIWEIGHT [r0+r1*2], [r2+r3*2], [r4+r5*2]
- RET
-
-%macro AVG_WEIGHT 2
-cglobal x264_pixel_avg_weight_w%2_%1, 6,7
- BIWEIGHT_START 1
-%assign x 0
-%rep %2*2/mmsize
- BIWEIGHT [r0+x], [r2+x], [r4+x]
- BIWEIGHT [r0+x+r1], [r2+x+r3], [r4+x+r5]
-%assign x x+mmsize/2
-%endrep
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- lea r4, [r4+r5*2]
- sub t0, 2
- jg .height_loop
- REP_RET
-%endmacro
-
-AVG_WEIGHT mmxext, 8
-AVG_WEIGHT mmxext, 16
-INIT_XMM
-AVG_WEIGHT sse2, 8
-AVG_WEIGHT sse2, 16
-
-
-
;=============================================================================
; prefetch
;=============================================================================
#include "mc.h"
/* NASM functions */
-extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_8x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_8x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_8x4_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-#define AVG_WEIGHT(W,H,name) \
-static void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int i_weight_dst ) \
-{ \
- x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src1, i_src1, src2, i_src2, i_weight_dst, H ); \
-}
-
-AVG_WEIGHT(16,16,mmxext)
-AVG_WEIGHT(16,8,mmxext)
-AVG_WEIGHT(8,16,mmxext)
-AVG_WEIGHT(8,8,mmxext)
-AVG_WEIGHT(8,4,mmxext)
-AVG_WEIGHT(16,16,sse2)
-AVG_WEIGHT(16,8,sse2)
-AVG_WEIGHT(8,16,sse2)
-AVG_WEIGHT(8,8,sse2)
-AVG_WEIGHT(8,4,sse2)
-
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
{\
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
- pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
- pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
- pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
- pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
- pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
- pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
- // avg_weight_4x8 is rare and 4x2 is not used
-
pf->plane_copy = x264_plane_copy_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
- pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
- pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
- pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
- pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
- pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
{ \
- if( h->param.analyse.b_weighted_bipred ) \
- h->mc.avg_weight[size]( pix, stride, src1, stride1, src2, stride2, \
- h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
- else \
- h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2 ); \
+ h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
}
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
uint8_t *src0, *src1;
int stride0 = 16, stride1 = 16;
- int weight;
x264_me_t m;
int i_ref, i_mvc;
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
/* get cost of BI mode */
- weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
src0 = h->mc.get_ref( pix0, &stride0,
h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
- if( h->param.analyse.b_weighted_bipred )
- h->mc.avg_weight[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, weight );
- else
- h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1 );
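+    /* blend the two references with the implicit bipred weight */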
+ h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ REF_COST( 0, a->l0.i_ref )
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
- WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, src[0], stride[0], src[1], stride[1] );
+ h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
-
- WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, src[0], stride[0], src[1], stride[1] );
+ h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
i_part_cost = a->l0.me16x8[i].cost;
i_part_cost_bi += m->cost_mv;
}
- WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, src[0], stride[0], src[1], stride[1] );
+ h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
i_part_cost = a->l0.me8x16[i].cost;
int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
- if( i_weight == 32 ) \
- h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1] ); \
- else \
- h->mc.avg_weight[i_pixel]( pix, bw, src1[i1], stride1[i1], src0[i0], stride0[i0], i_weight ); \
+ h->mc.avg[i_pixel]( pix, bw, src1[i1], stride1[i1], src0[i0], stride0[i0], i_weight ); \
cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
+ p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
+ p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
(mv0)[0], (mv0)[1], 8, 8 ); \
src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
(mv1)[0], (mv1)[1], 8, 8 ); \
- if( i_bipred_weight != 32 ) \
- h->mc.avg_weight[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
- else \
- h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2 ); \
+ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
if( i_bcost > i_cost ) \
#undef MC_TEST_LUMA
#undef MC_TEST_CHROMA
-#define MC_TEST_AVG( name, ... ) \
+#define MC_TEST_AVG( name, weight ) \
for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
{ \
memcpy( buf2, buf1, 1024 ); \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] );\
used_asm = 1; \
- call_c1( mc_c.name[i], buf3, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
- call_a1( mc_a.name[i], buf4, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
+ call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
} \
- call_c2( mc_c.name[i], buf3, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
- call_a2( mc_a.name[i], buf4, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
+ call_c2( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_a2( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
} \
}
- MC_TEST_AVG( avg );
- report( "mc avg :" );
ok = 1; used_asm = 0;
- for( w = 32; w <= 32 && ok; w++ )
- MC_TEST_AVG( avg_weight, w );
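+    /* implicit bipred weights needn't lie in [0,64]; test the whole plausible range */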
+ for( w = -64; w <= 128 && ok; w++ )
+ MC_TEST_AVG( avg, w );
report( "mc wpredb :" );
if( mc_a.hpel_filter != mc_ref.hpel_filter )