High bit depth version of deblock_h_chroma_422.
Regular and high bit depth versions of deblock_h_chroma_intra_422.
High bit depth pixel_vsad.
SSE2 high bit depth and MMX 8-bit predict_8x8_vl.
Our first GCI patch this year!
dctf->idct4x4dc = x264_idct4x4dc_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
if( cpu&X264_CPU_AVX )
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
}
#endif // HAVE_MMX
void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
+ pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
#endif
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
{
pf->deblock_strength = x264_deblock_strength_sse2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
-#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
-#endif
+ pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
{
INIT4( hadamard_ac, _sse2 );
}
-
+ pixf->vsad = x264_pixel_vsad_sse2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
{
INIT4( hadamard_ac, _ssse3 );
}
-
+ pixf->vsad = x264_pixel_vsad_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
}
+ if( cpu&X264_CPU_XOP )
+ {
+ pixf->vsad = x264_pixel_vsad_xop;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
%include "x86util.asm"
SECTION_RODATA
+pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
%endif ; !HIGH_BIT_DEPTH
+; DCTDC_4ROW_SSE2 %1, %2
+; Accumulates four consecutive rows of encoded pixels (r1, FENC) and
+; reconstructed pixels (r2, FDEC) starting at row %2, subtracts the two
+; sums, and pairwise-combines neighbouring dword lanes of the result.
+; Output in %1; clobbers m0.  Building block for sub8x8/8x16_dct_dc
+; (high bit depth: pixels are 16-bit words).
+%macro DCTDC_4ROW_SSE2 2
+ mova %1, [r1+FENC_STRIDEB*%2]
+ mova m0, [r2+FDEC_STRIDEB*%2]
+%assign Y (%2+1)
+%rep 3 ; add the remaining three rows
+ paddw %1, [r1+FENC_STRIDEB*Y]
+ paddw m0, [r2+FDEC_STRIDEB*Y]
+%assign Y (Y+1)
+%endrep
+ psubw %1, m0 ; per-column sum of (fenc - fdec)
+ pshufd m0, %1, q2301 ; swap adjacent dword lanes
+ paddw %1, m0 ; fold neighbouring columns together
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
+; High-bit-depth: DC coefficients of the four 4x4 sub-blocks of an 8x8
+; block, combined with the 2x2 transform implied by the pw_ppmmmmpp
+; weights (1,1,-1,-1,-1,-1,1,1).
+;-----------------------------------------------------------------------------
+%macro SUB8x8_DCT_DC_10 0
+cglobal sub8x8_dct_dc, 3,3,3
+ DCTDC_4ROW_SSE2 m1, 0 ; rows 0-3 (top two 4x4 blocks)
+ DCTDC_4ROW_SSE2 m2, 4 ; rows 4-7 (bottom two 4x4 blocks)
+ mova m0, [pw_ppmmmmpp] ; sign pattern for the 2x2 transform
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ pshufd m0, m1, q2200 ; -1 -1 +0 +0
+ pshufd m1, m1, q0033 ; +0 +0 +1 +1
+ paddd m1, m0
+ pshufd m0, m2, q1023 ; -2 +2 -3 +3
+ paddd m1, m2
+ paddd m1, m0
+ mova [r0], m1 ; store the four 32-bit DC coefficients
+ RET
+%endmacro
+INIT_XMM sse2
+SUB8x8_DCT_DC_10
+
+;-----------------------------------------------------------------------------
+; void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
+; 4:2:2 variant: same as above but over 16 rows, producing eight DCs.
+;-----------------------------------------------------------------------------
+%macro SUB8x16_DCT_DC_10 0
+cglobal sub8x16_dct_dc, 3,3,6
+ DCTDC_4ROW_SSE2 m1, 0 ; rows 0-3
+ DCTDC_4ROW_SSE2 m2, 4 ; rows 4-7
+ DCTDC_4ROW_SSE2 m3, 8 ; rows 8-11
+ DCTDC_4ROW_SSE2 m4, 12 ; rows 12-15
+ mova m0, [pw_ppmmmmpp]
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ pshufd m5, m1, q2200 ; -1 -1 +0 +0
+ pshufd m1, m1, q0033 ; +0 +0 +1 +1
+ paddd m1, m5
+ pshufd m5, m2, q1023 ; -2 +2 -3 +3
+ paddd m1, m2
+ paddd m1, m5 ; a6 a2 a4 a0
+ pmaddwd m3, m0
+ pmaddwd m4, m0 ; repeat for the bottom 8x8 half
+ pshufd m5, m3, q2200
+ pshufd m3, m3, q0033
+ paddd m3, m5
+ pshufd m5, m4, q1023
+ paddd m3, m4
+ paddd m3, m5 ; a7 a3 a5 a1
+ paddd m0, m1, m3 ; butterfly: top+bottom
+ psubd m1, m3 ; butterfly: top-bottom
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ punpcklqdq m2, m0, m1 ; interleave into output order
+ punpckhqdq m1, m0
+ mova [r0+ 0], m2
+ mova [r0+16], m1
+ RET
+%endmacro
+INIT_XMM sse2
+SUB8x16_DCT_DC_10
+INIT_XMM avx
+SUB8x16_DCT_DC_10
+%endif
+
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422_intra( pixel *pix, int stride, int alpha, int beta )
+; High-bit-depth horizontal intra chroma deblock for 4:2:2 (16-row edge).
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_H_CHROMA_422_INTRA_10 0
+cglobal deblock_h_chroma_422_intra, 4,6,8
+ add r1, r1 ; stride in bytes: pixels are 16-bit
+ mov r4, 64/mmsize ; iteration count; each pass covers mmsize/4 rows
+%if mmsize == 16
+ lea r5, [r1*3] ; 3*stride, used by CHROMA_H_LOAD/STORE
+%endif
+.loop:
+ CHROMA_H_LOAD r5
+ call deblock_intra_body ; shared alpha/beta intra filter core
+ CHROMA_H_STORE r5
+ lea r0, [r0+r1*(mmsize/4)] ; advance mmsize/4 rows
+ dec r4
+ jg .loop
+ REP_RET
+%endmacro
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_422_INTRA_10
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422( pixel *pix, int stride, int alpha, int beta,
+;                            int8_t *tc0 )
+; High-bit-depth horizontal inter chroma deblock for 4:2:2 (16-row edge).
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_H_CHROMA_422_10 0
+cglobal deblock_h_chroma_422, 5,7,8
+ add r1, r1 ; stride in bytes: pixels are 16-bit
+ mov r5, 64/mmsize ; iteration count; each pass covers mmsize/4 rows
+ lea r6, [r1*3] ; 3*stride for CHROMA_H_LOAD/STORE
+.loop:
+ CHROMA_H_LOAD r6
+ RESET_MM_PERMUTATION
+ LOAD_AB m4, m5, r2, r3 ; broadcast alpha (m4) / beta (m5)
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 ; filter-enable mask in m7
+ pxor m4, m4
+ movd m6, [r4-1] ; load tc0 bytes (offset by one; see r4 advance below)
+ psraw m6, 8 ; sign-extend tc0 byte into each word lane
+ SPLATW m6, m6
+ pmaxsw m6, m4 ; clamp tc to >= 0
+ pand m7, m6 ; presumably disables filtering where tc0 < 0 — verify
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r6
+ lea r0, [r0+r1*(mmsize/4)] ; advance mmsize/4 rows
+ add r4, mmsize/16 ; next tc0 entry
+ dec r5
+ jg .loop
+ REP_RET
+%endmacro
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_422_10
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
sub t5, r1
%if mmsize==8
mov dword r0m, 2
-.skip_prologue:
+.loop:
%endif
%endmacro
lea t6, [r1*3]
mov t5, r0
add r0, t6
-%if mmsize==8
- mov dword r0m, 2
-.skip_prologue:
-%endif
%endmacro
%macro CHROMA_V_LOOP 1
add r4, 2
%endif
dec dword r0m
- jg .skip_prologue
+ jg .loop
%endif
%endmacro
add r4, 2
%endif
dec dword r0m
- jg .skip_prologue
+ jg .loop
%endif
%endmacro
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
+%if mmsize==8
+ mov dword r0m, 2
+.loop:
+%endif
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_inter_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
%else
%define cntr dword r0m
%endif
- dec r2d
- dec r3d
- sub r0, 4
- lea t6, [r1*3]
- mov t5, r0
- add r0, t6
+ CHROMA_H_START
mov cntr, 32/mmsize
-.skip_prologue:
+.loop:
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
lea t5, [t5+r1*(mmsize/2)]
add r4, mmsize/8
dec cntr
- jg .skip_prologue
+ jg .loop
REP_RET
%endmacro
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
+%if mmsize==8
+ mov dword r0m, 2
+.loop:
+%endif
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA
%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422_intra( uint8_t *pix, int stride, int alpha, int beta )
+; 8-bit horizontal intra chroma deblock for 4:2:2 (16-row edge).
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_H_CHROMA_422_INTRA 0
+cglobal deblock_h_chroma_422_intra, 4,7,8
+ CHROMA_H_START ; sets up t5/t6 row pointers, adjusts r0
+ mov r6d, 32/mmsize ; iteration count; each pass covers mmsize/2 rows
+.loop:
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) ; gather the vertical edge
+ call chroma_intra_body ; shared alpha/beta intra filter core
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) ; scatter p0/q0 back
+ lea r0, [r0+r1*(mmsize/2)] ; advance both pointers mmsize/2 rows
+ lea t5, [t5+r1*(mmsize/2)]
+ dec r6d
+ jg .loop
+ REP_RET
+%endmacro
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_422_INTRA
+%ifndef ARCH_X86_64 ; MMX fallback only needed on 32-bit
+INIT_MMX mmx2
+DEBLOCK_H_CHROMA_422_INTRA
+%endif
%endif ; !HIGH_BIT_DEPTH
int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
+int x264_pixel_vsad_ssse3( pixel *src, int stride, int height );
+int x264_pixel_vsad_xop( pixel *src, int stride, int height );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
%endif ; !HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void predict_8x8_vl( pixel *src, pixel *edge )
+; Intra 8x8 vertical-left prediction.  %1 is the pavg suffix: 'w' for
+; high bit depth (16-bit pixels), 'b' for 8-bit.
+;-----------------------------------------------------------------------------
+%macro PREDICT_8x8_VL_10 1
+cglobal predict_8x8_vl, 2,2,8
+ mova m0, [r1+16*SIZEOF_PIXEL] ; top edge t0..t7 (edge[] layout)
+ mova m1, [r1+24*SIZEOF_PIXEL] ; top edge t8..t15
+ PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 ; t1..t8
+ PSRLPIX m4, m1, 1 ; t9..t15
+ pavg%1 m6, m0, m2 ; avg(t[i],t[i+1]) -> even rows
+ pavg%1 m7, m1, m4
+ add r0, FDEC_STRIDEB*4 ; bias r0 so all row offsets fit disp8
+ mova [r0-4*FDEC_STRIDEB], m6 ; row 0
+ PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
+ mova [r0-2*FDEC_STRIDEB], m3 ; row 2: shifted by one pixel
+ PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
+ mova [r0+0*FDEC_STRIDEB], m3 ; row 4
+ PALIGNR m3, m7, m6, SIZEOF_PIXEL*3, m5
+ mova [r0+2*FDEC_STRIDEB], m3 ; row 6
+ PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
+ PSLLPIX m5, m0, 1
+ PRED8x8_LOWPASS m0, m5, m2, m0, m7 ; (a+2b+c+2)>>2 -> odd rows
+ PRED8x8_LOWPASS m1, m3, m4, m1, m7
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
+ mova [r0-3*FDEC_STRIDEB], m4 ; row 1
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
+ mova [r0-1*FDEC_STRIDEB], m4 ; row 3
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
+ mova [r0+1*FDEC_STRIDEB], m4 ; row 5
+ PALIGNR m4, m1, m0, SIZEOF_PIXEL*4, m2
+ mova [r0+3*FDEC_STRIDEB], m4 ; row 7
+ RET
+%endmacro
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_8x8_VL_10 w
+INIT_XMM ssse3
+PREDICT_8x8_VL_10 w
+INIT_XMM avx
+PREDICT_8x8_VL_10 w
+%else
+INIT_MMX mmx2
+PREDICT_8x8_VL_10 b
+%endif
+
+
;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_ssse3;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
return;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
*predict_8x8_filter = x264_predict_8x8_filter_avx;
#else
pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_mmx2;
*predict_8x8_filter = x264_predict_8x8_filter_mmx2;
#if ARCH_X86
pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2;
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
-void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
+void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
+void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] );
+void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
%endrep
%endmacro
+;-----------------------------------------------------------------------------
+; int pixel_vsad( pixel *src, int stride, int height )
+; High-bit-depth vertical SAD: sum of |row[i] - row[i+1]| over a
+; 16-pixel-wide column (two xmm loads of 16-bit pixels per row).
+; Processes two rows per iteration, reusing the bottom row of the
+; previous pair (kept in m2/m3) as the top of the next.
+;-----------------------------------------------------------------------------
+%macro PIXEL_VSAD 0
+cglobal pixel_vsad, 3,3,8
+ mova m0, [r0] ; row 0, pixels 0-7
+ mova m1, [r0+16] ; row 0, pixels 8-15
+ mova m2, [r0+2*r1] ; row 1 (2*r1: 16-bit pixels)
+ mova m3, [r0+2*r1+16]
+ lea r0, [r0+4*r1] ; advance two rows
+ psubw m0, m2
+ psubw m1, m3
+ ABSW2 m0, m1, m0, m1, m4, m5 ; accumulator = |row0 - row1|
+ paddw m0, m1
+ sub r2d, 2
+ je .end ; height == 2: done
+.loop:
+ mova m4, [r0] ; next row pair
+ mova m5, [r0+16]
+ mova m6, [r0+2*r1]
+ mova m7, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m2, m4 ; prev bottom row vs new top row
+ psubw m3, m5
+ psubw m4, m6 ; new top row vs new bottom row
+ psubw m5, m7
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ ABSW m4, m4, m1
+ ABSW m5, m5, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m2, m6 ; carry bottom row into next iteration
+ mova m3, m7
+ sub r2d, 2
+ jg .loop
+.end:
+%if BIT_DEPTH == 9
+ HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
+%else
+ HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
+%endif
+ movd eax, m0
+ RET
+%endmacro
+INIT_XMM sse2
+PIXEL_VSAD
+INIT_XMM ssse3
+PIXEL_VSAD
+INIT_XMM xop
+PIXEL_VSAD
+
+
;-----------------------------------------------------------------------------
; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
; uint16_t *pix2, int i_stride, int scores[3] )
}
report( "pixel hadamard_ac :" );
+ // maximize sum
+ for( int i = 0; i < 32; i++ )
+ for( int j = 0; j < 16; j++ )
+ pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX;
ok = 1; used_asm = 0;
if( pixel_asm.vsad != pixel_ref.vsad )
{
int res_c, res_asm;
set_func_name( "vsad" );
used_asm = 1;
- res_c = call_c( pixel_c.vsad, pbuf1, 16, h );
- res_asm = call_a( pixel_asm.vsad, pbuf1, 16, h );
- if( res_c != res_asm )
+ for( int j = 0; j < 2 && ok; j++ )
{
- ok = 0;
- fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
- break;
+ pixel *p = j ? pbuf4 : pbuf1;
+ res_c = call_c( pixel_c.vsad, p, 16, h );
+ res_asm = call_a( pixel_asm.vsad, p, 16, h );
+ if( res_c != res_asm )
+ {
+ ok = 0;
+ fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
+ break;
+ }
}
}
}
{
int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1));
int cond_b = (i == 0) ? 1 : !cond_a;
- enc[0] = enc[1] = cond_a ? PIXEL_MAX : 0;
- enc[2] = enc[3] = cond_b ? PIXEL_MAX : 0;
+ enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0;
+ enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? PIXEL_MAX : 0;
for( int k = 0; k < 4; k++ )
dec[k] = PIXEL_MAX - enc[k];