Add x86 asm for deblock_chroma_420_mbaff and deblock_chroma_420_intra_mbaff; the 422 and 422_intra MBAFF variants are implemented using existing functions.
From Google Code-In.
{
deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 );
}
-static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
-{
- deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
-}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
{
deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta );
}
-static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
-{
- deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
-}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
+
+void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta );
#if ARCH_X86
void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
#if HIGH_BIT_DEPTH
void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
- pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c;
pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
- pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c;
pf->deblock_strength = deblock_strength_c;
#if HAVE_MMX
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
-#if !HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
-#endif
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
+ pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
+#endif
+#if !HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
+#if HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2;
+#endif
}
}
if( cpu&X264_CPU_SSSE3 )
{
pf->deblock_strength = x264_deblock_strength_avx;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
-#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
-#endif
+ pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
+#if HIGH_BIT_DEPTH
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
+ pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
+#endif
}
}
}
}
#endif
#endif // !HIGH_BIT_DEPTH
+
+ /* These functions are equivalent, so don't duplicate them. */
+ pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
+ pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
}
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
-void x264_sub8x16_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
-void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
- RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
cglobal deblock_intra_body
- RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
dec r4
jg .loop
REP_RET
-%endmacro
-%ifndef ARCH_X86_64
-INIT_MMX mmx2
-DEBLOCK_CHROMA
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_intra_mbaff, 4,6,8 ; intra variant (no tc0 argument); x86inc cglobal: 4 args, 6 GPRs, 8 vector regs
+ add r1, r1 ; r1 (stride) *= 2; presumably converts a pixel stride to bytes for the uint16_t pixels in the prototype comment above
+%if mmsize == 8
+ mov r4, 16/mmsize ; 8-byte registers: r4 = 2 loop iterations
+.loop:
+%else
+ lea r5, [r1*3] ; r5 = 3*stride, passed to CHROMA_H_LOAD / CHROMA_H_STORE
%endif
-INIT_XMM sse2
-DEBLOCK_CHROMA
-INIT_XMM avx
-DEBLOCK_CHROMA
+ CHROMA_H_LOAD r5 ; NOTE(review): r5 is not initialised on the mmsize == 8 path; presumably the macro ignores its argument there -- confirm
+ LOAD_AB m4, m5, r2, r3 ; r2 = alpha, r3 = beta (3rd and 4th arguments)
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r5
+%if mmsize == 8
+ lea r0, [r0+r1*(mmsize/4)] ; r0 += 2*stride: move pix on for the next iteration
+ dec r4
+ jg .loop ; repeat while the iteration counter is still positive
+%endif
+ REP_RET
-%macro DEBLOCK_H_CHROMA_422_INTRA_10 0
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_mbaff, 5,7,8 ; inter variant (takes tc0 in r4); x86inc cglobal: 5 args, 7 GPRs, 8 vector regs
+ add r1, r1 ; r1 (stride) *= 2; presumably converts a pixel stride to bytes for the uint16_t pixels in the prototype comment above
+ lea r6, [r1*3] ; r6 = 3*stride, passed to CHROMA_H_LOAD / CHROMA_H_STORE
+%if mmsize == 8
+ mov r5, 16/mmsize ; 8-byte registers: r5 = 2 loop iterations
+.loop:
+%endif
+ CHROMA_H_LOAD r6
+ LOAD_AB m4, m5, r2, r3 ; r2 = alpha, r3 = beta (3rd and 4th arguments)
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ movd m6, [r4] ; load 4 bytes from the tc0 pointer
+ punpcklbw m6, m6 ; duplicate each byte so every 16-bit lane holds the tc0 byte in both halves
+ psraw m6, 8 ; arithmetic shift: each lane is now the sign-extended int8 tc0 value
+ punpcklwd m6, m6 ; duplicate each word, so every tc0 value fills two adjacent lanes
+ pand m7, m6 ; m7 &= tc0; m7 is presumably the filter mask left by LOAD_MASK -- confirm
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r6
+%if mmsize == 8
+ lea r0, [r0+r1*(mmsize/4)] ; r0 += 2*stride: move pix on for the next iteration
+ add r4, mmsize/4 ; tc0 pointer += 2 bytes
+ dec r5
+ jg .loop ; repeat while the iteration counter is still positive
+%endif
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
add r1, r1
mov r4, 64/mmsize
dec r4
jg .loop
REP_RET
-%endmacro
-INIT_XMM sse2
-DEBLOCK_H_CHROMA_422_INTRA_10
-%macro DEBLOCK_H_CHROMA_422_10 0
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
add r1, r1
mov r5, 64/mmsize
lea r6, [r1*3]
.loop:
CHROMA_H_LOAD r6
- RESET_MM_PERMUTATION
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2m, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
movd m6, [r4-1]
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r6
lea r0, [r0+r1*(mmsize/4)]
- add r4, mmsize/16
+%if mmsize == 16
+ inc r4
+%else
+ mov r2, r5
+ and r2, 1
+ add r4, r2 ; increment once every 2 iterations
+%endif
dec r5
jg .loop
REP_RET
-%endmacro
+%endmacro ; DEBLOCK_CHROMA
+
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+DEBLOCK_CHROMA
+%endif
INIT_XMM sse2
-DEBLOCK_H_CHROMA_422_10
+DEBLOCK_CHROMA
+INIT_XMM avx
+DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
DEBLOCK_CHROMA
%endif
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_H_CHROMA_420_MBAFF 0 ; emits deblock_h_chroma_mbaff for whichever INIT_* instruction set is active at expansion
+cglobal deblock_h_chroma_mbaff, 5,7,8 ; x86inc cglobal: 5 args, 7 GPRs, 8 vector regs
+ dec r2d ; r2d = alpha - 1
+ dec r3d ; r3d = beta - 1
+ sub r0, 4 ; pix -= 4 bytes; presumably so the transposed load covers pixels on both sides of the edge -- confirm
+ lea t6, [r1*3] ; t6 = 3*stride
+ mov t5, r0 ; t5 = pix-4. NOTE(review): t5/t6 are aliases defined outside this macro; t5 must not alias r4, which is read as tc0 below
+ add r0, t6 ; r0 = pix-4 + 3*stride
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0: load 4 bytes from the 5th argument
+ punpcklbw m6, m6 ; duplicate each tc0 byte into two adjacent bytes
+ pand m7, m6 ; m7 &= tc0; m7 is presumably the filter mask left by LOAD_MASK -- confirm
+ DEBLOCK_P0_Q0
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ RET
+%endmacro
+
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_420_MBAFF
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+DEBLOCK_H_CHROMA_420_MBAFF
+%endif
+
%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,7,8
%ifdef ARCH_X86_64
%define t5 r4
%define t6 r5
-%macro DEBLOCK_CHROMA_INTRA 0
+%macro DEBLOCK_CHROMA_INTRA_BODY 0
cglobal chroma_intra_body
LOAD_MASK r2d, r3d
mova m5, m1
paddb m1, m5
paddb m2, m6
ret
+%endmacro
+%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 0
RET
-%endmacro ; DEBLOCK_CHROMA_INTRA
-INIT_XMM sse2
-DEBLOCK_CHROMA_INTRA
-INIT_XMM avx
-DEBLOCK_CHROMA_INTRA
-%ifndef ARCH_X86_64
-INIT_MMX mmx2
-DEBLOCK_CHROMA_INTRA
-%endif
-
-%macro DEBLOCK_H_CHROMA_422_INTRA 0
cglobal deblock_h_chroma_422_intra, 4,7,8
CHROMA_H_START
mov r6d, 32/mmsize
dec r6d
jg .loop
REP_RET
-%endmacro
+%endmacro ; DEBLOCK_CHROMA_INTRA
+
INIT_XMM sse2
-DEBLOCK_H_CHROMA_422_INTRA
-%ifndef ARCH_X86_64
+DEBLOCK_CHROMA_INTRA_BODY
+DEBLOCK_CHROMA_INTRA
+INIT_XMM avx
+DEBLOCK_CHROMA_INTRA_BODY
+DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
-DEBLOCK_H_CHROMA_422_INTRA
+DEBLOCK_CHROMA_INTRA_BODY
+%ifndef ARCH_X86_64
+DEBLOCK_CHROMA_INTRA
%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2 ; this function is only assembled in its MMX2 form
+cglobal deblock_h_chroma_intra_mbaff, 4,6,8 ; intra variant (no tc0 argument); x86inc cglobal: 4 args, 6 GPRs, 8 vector regs
+ CHROMA_H_START
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) ; t5/t6 are aliases defined outside this function, presumably set up together with r0 by CHROMA_H_START -- confirm
+ call chroma_intra_body ; shared filter body emitted by DEBLOCK_CHROMA_INTRA_BODY
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ RET
%endif ; !HIGH_BIT_DEPTH
mova [r0-2*FDEC_STRIDEB], m3
PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
mova [r0+0*FDEC_STRIDEB], m3
- PALIGNR m3, m7, m6, SIZEOF_PIXEL*3, m5
- mova [r0+2*FDEC_STRIDEB], m3
+ PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
+ mova [r0+2*FDEC_STRIDEB], m7
PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
PSLLPIX m5, m0, 1
PRED8x8_LOWPASS m0, m5, m2, m0, m7
mova [r0-1*FDEC_STRIDEB], m4
PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
mova [r0+1*FDEC_STRIDEB], m4
- PALIGNR m4, m1, m0, SIZEOF_PIXEL*4, m2
- mova [r0+3*FDEC_STRIDEB], m4
+ PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
+ mova [r0+3*FDEC_STRIDEB], m1
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
+ TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] );
+ TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] );
TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_luma_intra[0], 0 );
TEST_DEBLOCK( deblock_luma_intra[1], 1 );
TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
+ TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 );
+ TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 );
TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
if( db_a.deblock_strength != db_ref.deblock_strength )