INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
+INTRA_MBCMP( sad, 4, v, h, dc, , _ssse3 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
#endif
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
}
+ if( cpu&X264_CPU_SSE4 )
+ {
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _sse4 );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+ }
+ if( cpu&X264_CPU_AVX )
+ {
+ INIT_ADS( _avx );
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _avx );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
- /* Slower on Conroe, so only enable under SSE4 */
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
if( cpu&X264_CPU_AVX )
INIT7( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
- pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
- pixf->ads[PIXEL_16x8] = x264_pixel_ads2_avx;
+ INIT_ADS( _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->denoise_dct = x264_denoise_dct_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
;=============================================================================
; prefetch
;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+; pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
INIT_MMX
%ifdef ARCH_X86_64
; NOTE(review): this span is the interior of a version-control diff hunk —
; the leading '+'/'-' markers on changed lines are diff payload, not asm.
; Code kept byte-identical; comments only.
;
; prefetch_fenc_mmx2(pix_y, stride_y, pix_uv, stride_uv, mb_x):
; issues prefetcht0 hints for the encoder's current macroblock rows in the
; luma plane and the interleaved chroma plane.
cglobal prefetch_fenc_mmx2, 5,5
; added by the patch: FIX_STRIDES presumably scales the strides by
; SIZEOF_PIXEL for high-bit-depth builds — TODO confirm against x86util
+ FIX_STRIDES r1d, r3d
; r4d = mb_x & 3 picks the horizontal quarter of the row group to prefetch
and r4d, 3
mov eax, r4d
imul r4d, r1d
; removed line: +64 byte offset hard-coded for 8-bit pixels
- lea r0, [r0+r4*4+64]
; replacement: offset scaled so it stays one cacheline ahead at any depth
+ lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0+r1]
; same pattern for the chroma plane: base r2, stride r3 (rax*2 because the
; chroma rows cover half the luma width per plane pair)
imul eax, r3d
; removed/replaced pair, same SIZEOF_PIXEL scaling as above
- lea r2, [r2+rax*2+64]
+ lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
prefetcht0 [r2]
prefetcht0 [r2+r3]
RET
mov r2, r4m
mov r1, r1m
mov r0, r0m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*4+64]
+ lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
mov r2, r4m
mov r1, r3m
mov r0, r2m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*2+64]
+ lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
ret
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref_mmx2, 3,3
+ FIX_STRIDES r1d
dec r2d
and r2d, r1d
- lea r0, [r0+r2*8+64]
+ lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
-void x264_prefetch_ref_mmx2( uint8_t *, int, int );
+void x264_prefetch_fenc_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_ref_mmx2( pixel *, int, int );
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
};
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+#if HIGH_BIT_DEPTH
+MC_COPY_WTAB(sse2,mmx,sse2,sse2)
+#else
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#endif
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
if( !(cpu&X264_CPU_MMX2) )
return;
+ pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+ pf->prefetch_ref = x264_prefetch_ref_mmx2;
+
pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH
- pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
- pf->prefetch_ref = x264_prefetch_ref_mmx2;
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&X264_CPU_CACHELINE_32 )
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
#endif // HIGH_BIT_DEPTH
}