New assembly functions added by this patch: coeff_last8, coeff_level_run8, var2_8x16, predict_8x16c_dc, satd_4x16, intra_mbcmp_8x16c_x3, deblock_h_chroma_422
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
+#if !HIGH_BIT_DEPTH
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
+#endif
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
+#if !HIGH_BIT_DEPTH
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
+#endif
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
- pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
if( cpu&X264_CPU_AVX )
{
pf->deblock_strength = x264_deblock_strength_avx;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
+#if !HIGH_BIT_DEPTH
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
+#endif
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
- pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
-#if HIGH_BIT_DEPTH && HAVE_MMX
+#if HAVE_MMX
+#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
+#else
+#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 )
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 )
+#endif
#endif
// No C implementation of intra_satd_x9. See checkasm for its behavior,
INIT7( sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
- INIT7( satd, _mmx2 );
+ INIT8( satd, _mmx2 );
INIT7( satd_x3, _mmx2 );
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
- INIT7( ssd, _mmx2 );
+ INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+#if ARCH_X86
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
+#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
INIT8_NAME( sad_aligned, sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
- INIT7( satd, _mmx2 );
+ INIT8( satd, _mmx2 );
INIT7( satd_x3, _mmx2 );
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_mmx2;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
pixf->vsad = x264_pixel_vsad_sse2;
}
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT6( satd, _sse2 );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( ssd, _sse2); /* faster for width 16 on p4 */
INIT8( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
- INIT7( satd, _ssse3 );
+ INIT8( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
if( cpu&X264_CPU_SSE4 )
{
- INIT7( satd, _sse4 );
+ INIT8( satd, _sse4 );
INIT7( satd_x3, _sse4 );
INIT7( satd_x4, _sse4 );
if( !(cpu&X264_CPU_STACK_MOD4) )
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
}
if( cpu&X264_CPU_AVX )
{
- INIT7( satd, _avx );
+ INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
INIT_ADS( _avx );
INIT5( ssd, _avx );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
INIT5( ssd, _xop );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
}
#endif //HAVE_MMX
pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmx2;
- pf->coeff_last4 = x264_coeff_last4_mmx2;
+ pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+ pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
+ pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
}
+ pf->coeff_last8 = x264_coeff_last8_sse2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+ pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
+ pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
+ pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
}
}
DEBLOCK_CHROMA
%endif
+;-----------------------------------------------------------------------------
+; void deblock_h_chroma_422( pixel *pix, int stride, int alpha, int beta,
+;                            int8_t *tc0 )
+; Horizontal chroma deblocking for 4:2:2: filters 16 rows total, handling
+; mmsize/2 rows per loop iteration (4 with MMX, 8 with SSE2/AVX).
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_H_CHROMA_422 0
+cglobal deblock_h_chroma_422, 5,7,8
+%ifdef ARCH_X86_64
+ %define cntr r11 ; spare GPR holds the row-group counter on x86-64
+%else
+ %define cntr dword r0m ; x86-32: keep the counter in r0's stack home slot
+%endif
+ dec r2d ; alpha-1
+ dec r3d ; beta-1
+ sub r0, 4 ; step left so loads cover p3..q3 around the edge
+ lea t6, [r1*3] ; t6 = 3*stride
+ mov t5, r0 ; t5 = first 4 rows, r0 = next 4 rows (t5 + 3*stride)
+ add r0, t6
+ mov cntr, 32/mmsize ; 16 rows / (mmsize/2 rows per iteration)
+.skip_prologue:
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) ; gather columns into rows
+ LOAD_MASK r2d, r3d ; build filter mask from alpha/beta thresholds
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+%if mmsize == 16
+ punpcklbw m6, m6 ; broadcast tc0 bytes across the full register
+ punpcklbw m6, m6
+%else
+ pshufw m6, m6, q0000
+%endif
+ pand m7, m6 ; combine threshold mask with tc0
+ DEBLOCK_P0_Q0
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) ; scatter filtered p0/q0 back
+ lea r0, [r0+r1*(mmsize/2)] ; advance both row bases to the next group
+ lea t5, [t5+r1*(mmsize/2)]
+ add r4, mmsize/8 ; next tc0 entries
+ dec cntr
+ jg .skip_prologue
+ REP_RET
+%endmacro
+
+INIT_MMX mmx2
+DEBLOCK_H_CHROMA_422
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_422
+INIT_XMM avx
+DEBLOCK_H_CHROMA_422
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
INIT_MMX mmx2
SSD_ONE 4, 4
SSD_ONE 4, 8
+SSD_ONE 4, 16
SSD_ONE 8, 4
SSD_ONE 8, 8
SSD_ONE 8, 16
VAR
%endif ; !HIGH_BIT_DEPTH
-%macro VAR2_END 0
+%macro VAR2_END 1
HADDW m5, m7
movd r1d, m5
imul r1d, r1d
HADDD m6, m1
- shr r1d, 6
+ shr r1d, %1
movd eax, m6
mov [r4], eax
sub eax, r1d ; sqr - (sum * sum >> shift)
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_var2_8x8, 5,6
+%macro VAR2_8x8_MMX 2
+cglobal pixel_var2_8x%1, 5,6
FIX_STRIDES r1, r3
VAR_START 0
- mov r5d, 8
+ mov r5d, %1
.loop:
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
add r2, r3
dec r5d
jg .loop
- VAR2_END
- RET
+ VAR2_END %2
+%endmacro
-INIT_XMM sse2
-cglobal pixel_var2_8x8, 5,6,8
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+VAR2_8x8_MMX 8, 6
+VAR2_8x8_MMX 16, 7
+%endif
+
+%macro VAR2_8x8_SSE2 2
+cglobal pixel_var2_8x%1, 5,6,8
VAR_START 1
- mov r5d, 4
+ mov r5d, %1/2
.loop:
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
- VAR2_END
- RET
+ VAR2_END %2
+%endmacro
+
+INIT_XMM sse2
+VAR2_8x8_SSE2 8, 6
+VAR2_8x8_SSE2 16, 7
%ifndef HIGH_BIT_DEPTH
-%macro VAR2_8x8 0
-cglobal pixel_var2_8x8, 5,6,8
+%macro VAR2_8x8_SSSE3 2
+cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
- mov r5d, 2
+ mov r5d, %1/4
.loop:
movq m0, [r0]
movq m2, [r2]
lea r2, [r2+r3*2]
dec r5d
jg .loop
- VAR2_END
- RET
+ VAR2_END %2
%endmacro
INIT_XMM ssse3
-VAR2_8x8
+VAR2_8x8_SSSE3 8, 6
+VAR2_8x8_SSSE3 16, 7
INIT_XMM xop
-VAR2_8x8
+VAR2_8x8_SSSE3 8, 6
+VAR2_8x8_SSSE3 16, 7
%endif ; !HIGH_BIT_DEPTH
call pixel_satd_8x4_internal_mmx2
SATD_END_MMX
+;-----------------------------------------------------------------------------
+; int pixel_satd_4x16( pixel *pix1, int stride1, pixel *pix2, int stride2 )
+; MMX2: four stacked 4x4 SATDs accumulated in m0; the last SATD_4x4_MMX call
+; passes 0 as its final arg (no pointer advance needed after the last block).
+;-----------------------------------------------------------------------------
+cglobal pixel_satd_4x16, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX m0, 0, 1
+ SATD_4x4_MMX m1, 0, 1
+ paddw m0, m1
+ SATD_4x4_MMX m1, 0, 1
+ paddw m0, m1
+ SATD_4x4_MMX m1, 0, 0
+ paddw m0, m1
+ SATD_END_MMX
+
cglobal pixel_satd_4x8, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
%endif
%endmacro
-;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SATDS_SSE2 0
-%if cpuflag(ssse3)
-cglobal pixel_satd_4x4, 4, 6, 6
- SATD_START_MMX
- mova m4, [hmul_4p]
- LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
- LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
- LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
- LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
- DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
- HADAMARD 0, sumsub, 0, 1, 2, 3
- HADAMARD 4, sumsub, 0, 1, 2, 3
- HADAMARD 1, amax, 0, 1, 2, 3
- HADDW m0, m1
- movd eax, m0
- RET
-%endif
-
-cglobal pixel_satd_4x8, 4, 6, 8
- SATD_START_MMX
-%if cpuflag(ssse3)
- mova m7, [hmul_4p]
-%endif
+%macro SATD_4x8_SSE 2
movd m4, [r2]
movd m5, [r2+r3]
movd m6, [r2+2*r3]
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
+%if cpuflag(ssse3) && %1==1
+ mova m3, [hmul_4p]
+ DIFFOP 0, 4, 1, 5, 3
+%else
DIFFOP 0, 4, 1, 5, 7
+%endif
movd m5, [r2]
add r2, r5
movd m3, [r0]
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
+%if cpuflag(ssse3) && %1==1
+ mova m4, [hmul_4p]
+ DIFFOP 2, 6, 3, 5, 4
+%else
DIFFOP 2, 6, 3, 5, 7
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6, swap
- HADDW m6, m1
- movd eax, m6
+%endif
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SATDS_SSE2 0
+%if cpuflag(ssse3)
+cglobal pixel_satd_4x4, 4, 6, 6
+ SATD_START_MMX
+ mova m4, [hmul_4p]
+ LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
+ LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
+ LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
+ LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
+ HADAMARD 0, sumsub, 0, 1, 2, 3
+ HADAMARD 4, sumsub, 0, 1, 2, 3
+ HADAMARD 1, amax, 0, 1, 2, 3
+ HADDW m0, m1
+ movd eax, m0
+ RET
+%endif
+
+cglobal pixel_satd_4x8, 4, 6, 8
+ SATD_START_MMX
+%if cpuflag(ssse3)
+ mova m7, [hmul_4p]
+%endif
+ SATD_4x8_SSE 0, swap
+ HADDW m7, m1
+ movd eax, m7
+ RET
+
+; int pixel_satd_4x16( pixel *, int, pixel *, int )
+; SSE2/SSSE3 template: two SATD_4x8_SSE passes accumulated in m7
+; (first pass 'swap' initializes, second pass 'add' accumulates).
+cglobal pixel_satd_4x16, 4, 6, 8
+ SATD_START_MMX
+%if cpuflag(ssse3)
+ mova m7, [hmul_4p] ; ssse3 path keeps the hadamard multiplier resident
+%endif
+ SATD_4x8_SSE 0, swap
+ lea r0, [r0+r1*2] ; NOTE(review): SATD_4x8_SSE advances the pointers as it
+ lea r2, [r2+r3*2] ; loads; these leas supply the remaining 2-row step — verify
+ SATD_4x8_SSE 1, add
+ HADDW m7, m1
+ movd eax, m7
 RET
cglobal pixel_satd_8x8_internal
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
+; LOAD_LEFT row_offset
+; Sum four left-neighbor pixels (the column at x = -1, rows %1-4 .. %1-1
+; relative to r0) into r1d; clobbers r2d.
+%macro LOAD_LEFT 1
+ movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
+ movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
+ add r1d, r2d
+%endmacro
+
+
%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
pxor m7, m7
%endif
add r0, FDEC_STRIDEB*4
- movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
- movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
- add r1d, r2d
- movd m2, r1d ; s2
-
- movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
- movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
- add r1d, r2d
- movd m3, r1d ; s3
+ LOAD_LEFT 0 ; s2
+ movd m2, r1d
+ LOAD_LEFT 4 ; s3
+ movd m3, r1d
punpcklwd m0, m1
punpcklwd m2, m3
PREDICT_8x8C_DC
%endif
+%ifdef HIGH_BIT_DEPTH
+; STORE_4LINES lo, hi, row
+; Write one 16-byte row of predicted samples to rows %3-4 .. %3-1.
+; SSE2 stores the whole row from one xmm; MMX needs two 8-byte halves (%1/%2).
+%macro STORE_4LINES 3
+%if cpuflag(sse2)
+ movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
+ movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
+ movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
+ movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
+%else
+ movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
+ movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
+ movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
+ movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
+%endif
+%endmacro
+%else
+; STORE_4LINES row_data, row
+; 8-bit samples: one 8-byte movq covers the full 8-pixel row width.
+%macro STORE_4LINES 2
+ movq [r0+FDEC_STRIDEB*(%2-4)], %1
+ movq [r0+FDEC_STRIDEB*(%2-3)], %1
+ movq [r0+FDEC_STRIDEB*(%2-2)], %1
+ movq [r0+FDEC_STRIDEB*(%2-1)], %1
+%endmacro
+%endif
+
+
+;-----------------------------------------------------------------------------
+; void predict_8x16c_dc( pixel *src )
+; 8x16 chroma DC prediction: the top row yields two 4-sample sums (s0,s1),
+; the left column yields four (s2..s5, one per 4-row group). Each 4x4 quadrant
+; gets its DC from the two sums bordering it; the word-wise (a+b) is divided
+; by 8 with rounding via psrlw 2 + pavgw against zero.
+;-----------------------------------------------------------------------------
+%macro PREDICT_8x16C_DC 0
+cglobal predict_8x16c_dc, 1,3
+ pxor m7, m7
+%ifdef HIGH_BIT_DEPTH
+ movq m0, [r0-FDEC_STRIDEB+0] ; left 4 top samples (16-bit)
+ movq m1, [r0-FDEC_STRIDEB+8] ; right 4 top samples
+ HADDW m0, m2
+ HADDW m1, m2
+%else
+ movd m0, [r0-FDEC_STRIDEB+0]
+ movd m1, [r0-FDEC_STRIDEB+4]
+ psadbw m0, m7 ; s0
+ psadbw m1, m7 ; s1
+%endif
+ punpcklwd m0, m1 ; s0, s1
+
+ add r0, FDEC_STRIDEB*4
+ LOAD_LEFT 0 ; s2
+ pinsrw m0, r1d, 2
+ LOAD_LEFT 4 ; s3
+ pinsrw m0, r1d, 3 ; s0, s1, s2, s3
+ add r0, FDEC_STRIDEB*8
+ LOAD_LEFT 0 ; s4
+ pinsrw m1, r1d, 2
+ LOAD_LEFT 4 ; s5
+ pinsrw m1, r1d, 3 ; s1, __, s4, s5
+ sub r0, FDEC_STRIDEB*8 ; back to the block's upper half
+
+ pshufw m2, m0, q1310 ; s0, s1, s3, s1
+ pshufw m0, m0, q3312 ; s2, s1, s3, s3
+ pshufw m3, m1, q0302 ; s4, s1, s5, s1
+ pshufw m1, m1, q3322 ; s4, s4, s5, s5
+ paddw m0, m2 ; pairwise sums feeding the 8 quadrant DCs
+ paddw m1, m3
+ psrlw m0, 2
+ psrlw m1, 2
+ pavgw m0, m7 ; (sum>>2 + 1)>>1  ==  rounded sum/8
+ pavgw m1, m7
+%ifdef HIGH_BIT_DEPTH
+%if cpuflag(sse2)
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ punpcklwd xmm0, xmm0 ; replicate each DC word across its 4-pixel half-row
+ punpcklwd xmm1, xmm1
+ pshufd xmm2, xmm0, q3322
+ pshufd xmm3, xmm1, q3322
+ punpckldq xmm0, xmm0
+ punpckldq xmm1, xmm1
+ STORE_4LINES xmm0, xmm0, 0
+ STORE_4LINES xmm2, xmm2, 4
+ STORE_4LINES xmm1, xmm1, 8
+ STORE_4LINES xmm3, xmm3, 12
+%else
+ pshufw m2, m0, q0000 ; broadcast each 16-bit DC to a full mmx row half
+ pshufw m3, m0, q1111
+ pshufw m4, m0, q2222
+ pshufw m5, m0, q3333
+ STORE_4LINES m2, m3, 0
+ STORE_4LINES m4, m5, 4
+ pshufw m2, m1, q0000
+ pshufw m3, m1, q1111
+ pshufw m4, m1, q2222
+ pshufw m5, m1, q3333
+ STORE_4LINES m2, m3, 8
+ STORE_4LINES m4, m5, 12
+%endif
+%else
+ packuswb m0, m0 ; dc0, dc1, dc2, dc3
+ packuswb m1, m1 ; dc4, dc5, dc6, dc7
+ punpcklbw m0, m0 ; double each byte: dcN fills a 4-pixel half-row below
+ punpcklbw m1, m1
+ pshufw m2, m0, q1100
+ pshufw m3, m0, q3322
+ pshufw m4, m1, q1100
+ pshufw m5, m1, q3322
+ STORE_4LINES m2, 0
+ STORE_4LINES m3, 4
+ add r0, FDEC_STRIDEB*8
+ STORE_4LINES m4, 0
+ STORE_4LINES m5, 4
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmx2
+PREDICT_8x16C_DC
+%ifdef HIGH_BIT_DEPTH
+INIT_MMX sse2 ; mmx regs for the sums, xmm only for the 16-byte stores
+PREDICT_8x16C_DC
+%endif
+
+
%macro PREDICT_C_DC_TOP 1
%ifdef HIGH_BIT_DEPTH
INIT_XMM
H += -4 * src[-1*FDEC_STRIDE -1];\
int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
int b = ( 17 * H + 16 ) >> 5;\
- int c = ( 17 * V + 16 ) >> 5;\
+ int c = ( 17 * V + 16 ) >> 5;
#if HIGH_BIT_DEPTH
#define PREDICT_8x8_P2(cpu1, cpu2)\
if( !(cpu&X264_CPU_MMX) )
return;
#if HIGH_BIT_DEPTH
- pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
+ if( !(cpu&X264_CPU_MMX2) )
+ return;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
if( !(cpu&X264_CPU_SSE2) )
return;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
#else
pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
if( !(cpu&X264_CPU_MMX2) )
return;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
if( !(cpu&X264_CPU_SSSE3) )
return;
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_dc_mmx2( pixel *src );
+void x264_predict_8x16c_dc_sse2( uint16_t *src );
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
void x264_predict_8x16c_v_mmx( uint8_t *src );
pmovmskb %2, mm0
%elif mmsize == 16
movdqa xmm0, [%3+ 0]
+%if %1 == 8
+ packssdw xmm0, [%3+16]
+ packsswb xmm0, xmm0
+%else
movdqa xmm1, [%3+32]
packssdw xmm0, [%3+16]
packssdw xmm1, [%3+48]
packsswb xmm0, xmm1
+%endif
pcmpeqb xmm0, xmm2
pmovmskb %2, xmm0
+%elif %1 == 8
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ packssdw mm0, [%3+ 8]
+ packssdw mm1, [%3+24]
+ packsswb mm0, mm1
+ pcmpeqb mm0, mm2
+ pmovmskb %2, mm0
%else
movq mm0, [%3+ 0]
movq mm1, [%3+16]
INIT_MMX mmx2, lzcnt
COEFF_LAST4
+; int coeff_last8( dctcoef *dct )
+; HIGH_BIT_DEPTH path: return the index of the last nonzero coefficient of 8.
+; LAST_MASK packs zero/nonzero per coefficient into a byte mask; inverting it
+; and taking BSR gives the highest nonzero position.
+%macro COEFF_LAST8 0
+cglobal coeff_last8, 1,3
+ pxor m2, m2 ; zero for the compare inside LAST_MASK
+ LAST_MASK 8, r1d, r0
+%if mmsize == 16
+ xor r1d, 0xffff ; invert: set bits now mark nonzero coeffs
+ shr r1d, 8 ; sse2 mask duplicates the 8 bytes; keep one copy
+%else
+ xor r1d, 0xff
+%endif
+ BSR eax, r1d, 0x1f ; NOTE(review): 3rd arg is presumably the lzcnt
+ RET ; correction constant — confirm against the BSR macro
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+COEFF_LAST8
+%endif
+INIT_XMM sse2
+COEFF_LAST8
+INIT_XMM sse2, lzcnt
+COEFF_LAST8
+
%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
+%if %1 <= 8
+ movq mm0, [%3+ 0]
%if %1 == 4
- movq mm0, [%3]
packsswb mm0, mm0
+%else
+ packsswb mm0, [%3+ 8]
+%endif
pcmpeqb mm0, mm2
pmovmskb %2, mm0
%elif mmsize == 16
%endif
%endmacro
-%macro COEFF_LAST4 0
+%macro COEFF_LAST48 0
%ifdef ARCH_X86_64
cglobal coeff_last4, 1,1
BSR rax, [r0], 0x3f
lea eax, [eax+ecx*2]
RET
%endif
+
+cglobal coeff_last8, 1,3
+ pxor m2, m2
+ LAST_MASK 8, r1d, r0, r2d
+ xor r1d, 0xff
+ BSR eax, r1d, 0x1f
+ RET
%endmacro
INIT_MMX mmx2
-COEFF_LAST4
+COEFF_LAST48
INIT_MMX mmx2, lzcnt
-COEFF_LAST4
+COEFF_LAST48
%endif ; HIGH_BIT_DEPTH
%macro COEFF_LAST 0
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
+COEFF_LEVELRUN 8
INIT_XMM sse2
+%ifdef HIGH_BIT_DEPTH
+COEFF_LEVELRUN 8
+%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
+%ifdef HIGH_BIT_DEPTH
+COEFF_LEVELRUN 8
+%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
+COEFF_LEVELRUN 8
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
+int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
int x264_coeff_last16_mmx2( dctcoef *dct );
int x264_coeff_last64_mmx2( dctcoef *dct );
+int x264_coeff_last8_sse2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
+int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
+int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#endif