/*
 * Packed SIMD constants.
 *
 * ff_pw_N: the 16-bit value N repeated in every 16-bit word of the constant
 *          (0x0060 = 96, 0x0080 = 128, 0x00ff = 255, 0x0200 = 512,
 *          0x03FB = 1019).
 * ff_pb_N: the byte value N repeated in every byte (0x00, 0x01).
 *
 * The uint64_t constants hold one 64-bit pattern; the xmm_reg constants are
 * initialised with the same 64-bit pattern twice.
 * NOTE(review): the first DECLARE_ALIGNED argument is presumably the required
 * alignment in bytes -- confirm against the macro definition.
 */
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
"movdqu (%1,%3), %%xmm1 \n\t"
"movdqu (%1,%3,2), %%xmm2 \n\t"
"movdqu (%1,%4), %%xmm3 \n\t"
+ "lea (%1,%3,4), %1 \n\t"
"movdqa %%xmm0, (%2) \n\t"
"movdqa %%xmm1, (%2,%3) \n\t"
"movdqa %%xmm2, (%2,%3,2) \n\t"
"movdqa %%xmm3, (%2,%4) \n\t"
"subl $4, %0 \n\t"
- "lea (%1,%3,4), %1 \n\t"
"lea (%2,%3,4), %2 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r" (pixels), "+r" (block)
"movdqu (%1,%3), %%xmm1 \n\t"
"movdqu (%1,%3,2), %%xmm2 \n\t"
"movdqu (%1,%4), %%xmm3 \n\t"
+ "lea (%1,%3,4), %1 \n\t"
"pavgb (%2), %%xmm0 \n\t"
"pavgb (%2,%3), %%xmm1 \n\t"
"pavgb (%2,%3,2), %%xmm2 \n\t"
"movdqa %%xmm2, (%2,%3,2) \n\t"
"movdqa %%xmm3, (%2,%4) \n\t"
"subl $4, %0 \n\t"
- "lea (%1,%3,4), %1 \n\t"
"lea (%2,%3,4), %2 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r" (pixels), "+r" (block)
QPEL_2TAP(avg_, 8, 3dnow)
-#if 0
-static void just_return(void) { return; }
-#endif
-
#if HAVE_YASM
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
x86_reg linesize, x86_reg start_y,
/*
 * Prototypes for the ff_{put,avg}_{h264,rv40}_chroma_mc{8,4,2}_* routines.
 * Only declarations appear here; the bodies are defined elsewhere
 * (NOTE(review): presumably in external assembly -- confirm).
 * Every routine shares the signature (dst, src, stride, h, x, y).
 * The lines marked '-' drop the rv40 prototypes from this file; the h264
 * prototypes are kept unchanged.
 */
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
int32_t max, unsigned int len);
/*
 * Prototypes for the ff_vector_clip_int32_* variants, all taking
 * (dst, src, min, max, len), and for the newly declared
 * ff_butterflies_float_interleave_{sse,avx}, taking (dst, src0, src1, len).
 * The '-'/'+' pairs rename two symbols:
 *   ff_vector_clip_int32_sse2_int -> ff_vector_clip_int32_int_sse2
 *   ff_vector_clip_int32_sse41    -> ff_vector_clip_int32_sse4
 * NOTE(review): the bodies are not visible here; presumably they live in
 * external assembly whose symbol names must match these -- confirm.
 */
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
-void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
+void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
-void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
+void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
+extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
+ const float *src1, int len);
+extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
+ const float *src1, int len);
+
/*
 * dsputil_init_mmx(): assign x86 SIMD implementations to the function
 * pointers of the DSPContext `c`.
 *
 * This text is a patch excerpt: lines starting with '-' are the old version,
 * lines starting with '+' are the new version, and the code between hunks is
 * not shown.
 *
 * The CPU capabilities come from av_get_cpu_flags() (stored in mm_flags).
 * Each group of assignments is guarded by a test of an AV_CPU_FLAG_* bit,
 * and many are further gated by build-time HAVE_* / CONFIG_* macros and by
 * high_bit_depth / bit_depth, both derived from avctx->bits_per_raw_sample.
 *
 * Changes visible in the '+' lines:
 *  - high_bit_depth no longer depends on codec_id being CODEC_ID_H264.
 *  - The simple MMX IDCT selection additionally requires
 *    bits_per_raw_sample <= 8.
 *  - All h264 chroma table assignments are additionally gated by
 *    CONFIG_H264CHROMA; the rv40 chroma table assignments are removed.
 *  - The 3DNow! / 3DNowExt branches additionally test HAVE_AMD3DNOW /
 *    HAVE_AMD3DNOWEXT.
 *  - butterflies_float_interleave gets SSE and AVX implementations.
 *  - vector_clip_int32 uses the renamed _int_sse2 and _sse4 symbols.
 *  - The emulated_edge_mc / gmc assignments are moved next to the other
 *    SSE assignments.
 *  - The disabled "#if 0" speed-testing block is deleted.
 *
 * NOTE(review): in the added SSE lines, the unbraced `if (!high_bit_depth)`
 * governs only the emulated_edge_mc assignment; c->gmc is assigned
 * unconditionally, exactly as in the removed lines it replaces.
 *
 * When CONFIG_ENCODERS is non-zero the function finishes by calling
 * dsputilenc_init_mmx(c, avctx).
 */
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
- const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+ const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
if (avctx->dsp_mask) {
if (mm_flags & AV_CPU_FLAG_MMX) {
const int idct_algo= avctx->idct_algo;
- if(avctx->lowres==0){
+ if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
c->idct_put= ff_simple_idct_put_mmx;
c->idct_add= ff_simple_idct_add_mmx;
}
#if HAVE_YASM
- if (!high_bit_depth) {
+ if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
}
- c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
- c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
-
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
#if HAVE_YASM
- c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
- c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
-
- if (!high_bit_depth) {
+ if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
}
- if (bit_depth == 10) {
+ if (bit_depth == 10 && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
#endif
#if HAVE_7REGS
- if( mm_flags&AV_CPU_FLAG_3DNOW )
+ if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
- } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
+ } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
c->prefetch = prefetch_3dnow;
if (!high_bit_depth) {
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
#if HAVE_YASM
- if (!high_bit_depth) {
+ if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
}
- c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
- c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
#endif
}
}
if(mm_flags & AV_CPU_FLAG_SSE2){
if (!high_bit_depth) {
- H264_QPEL_FUNCS(0, 1, sse2);
- H264_QPEL_FUNCS(0, 2, sse2);
- H264_QPEL_FUNCS(0, 3, sse2);
- H264_QPEL_FUNCS(1, 1, sse2);
- H264_QPEL_FUNCS(1, 2, sse2);
- H264_QPEL_FUNCS(1, 3, sse2);
- H264_QPEL_FUNCS(2, 1, sse2);
- H264_QPEL_FUNCS(2, 2, sse2);
- H264_QPEL_FUNCS(2, 3, sse2);
- H264_QPEL_FUNCS(3, 1, sse2);
- H264_QPEL_FUNCS(3, 2, sse2);
- H264_QPEL_FUNCS(3, 3, sse2);
+ H264_QPEL_FUNCS(0, 1, sse2);
+ H264_QPEL_FUNCS(0, 2, sse2);
+ H264_QPEL_FUNCS(0, 3, sse2);
+ H264_QPEL_FUNCS(1, 1, sse2);
+ H264_QPEL_FUNCS(1, 2, sse2);
+ H264_QPEL_FUNCS(1, 3, sse2);
+ H264_QPEL_FUNCS(2, 1, sse2);
+ H264_QPEL_FUNCS(2, 2, sse2);
+ H264_QPEL_FUNCS(2, 3, sse2);
+ H264_QPEL_FUNCS(3, 1, sse2);
+ H264_QPEL_FUNCS(3, 2, sse2);
+ H264_QPEL_FUNCS(3, 3, sse2);
}
#if HAVE_YASM
#define H264_QPEL_FUNCS_10(x, y, CPU)\
H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
- c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
- c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
+ if (CONFIG_H264CHROMA) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
+ }
}
#endif
}
#endif
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
#if HAVE_YASM
- if (!high_bit_depth) {
+ if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
}
#endif
- if(mm_flags & AV_CPU_FLAG_3DNOW){
+ if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
}
- if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+ if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
+ c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
+
+ if (!high_bit_depth)
+ c->emulated_edge_mc = emulated_edge_mc_sse;
+ c->gmc = gmc_sse;
#endif
}
- if(mm_flags & AV_CPU_FLAG_3DNOW)
+ if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & AV_CPU_FLAG_SSE2){
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (mm_flags & AV_CPU_FLAG_ATOM) {
- c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
+ c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
}
-
- if (!high_bit_depth)
- c->emulated_edge_mc = emulated_edge_mc_sse;
- c->gmc= gmc_sse;
#endif
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
#if HAVE_YASM
- c->vector_clip_int32 = ff_vector_clip_int32_sse41;
+ c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
}
H264_QPEL_FUNCS_10(2, 0, sse2)
H264_QPEL_FUNCS_10(3, 0, sse2)
- c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
- c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
+ if (CONFIG_H264CHROMA) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
+ }
}
+ c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
}
#endif
}
if (CONFIG_ENCODERS)
dsputilenc_init_mmx(c, avctx);
-
-#if 0
- // for speed testing
- get_pixels = just_return;
- put_pixels_clamped = just_return;
- add_pixels_clamped = just_return;
-
- pix_abs16x16 = just_return;
- pix_abs16x16_x2 = just_return;
- pix_abs16x16_y2 = just_return;
- pix_abs16x16_xy2 = just_return;
-
- put_pixels_tab[0] = just_return;
- put_pixels_tab[1] = just_return;
- put_pixels_tab[2] = just_return;
- put_pixels_tab[3] = just_return;
-
- put_no_rnd_pixels_tab[0] = just_return;
- put_no_rnd_pixels_tab[1] = just_return;
- put_no_rnd_pixels_tab[2] = just_return;
- put_no_rnd_pixels_tab[3] = just_return;
-
- avg_pixels_tab[0] = just_return;
- avg_pixels_tab[1] = just_return;
- avg_pixels_tab[2] = just_return;
- avg_pixels_tab[3] = just_return;
-
- avg_no_rnd_pixels_tab[0] = just_return;
- avg_no_rnd_pixels_tab[1] = just_return;
- avg_no_rnd_pixels_tab[2] = just_return;
- avg_no_rnd_pixels_tab[3] = just_return;
-
- //av_fdct = just_return;
- //ff_idct = just_return;
-#endif
}