X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fdsputil_mmx.c;h=b2f389bb6146a033c7ca63c20c4d4ce196c66ff7;hb=14bc1f24858a8e83a59dd61a88bdd2bc65993e2b;hp=a3d7ff1a2c95f95c00716a24f8e21b18ac2b75e5;hpb=0be9e73e3807edfc63bccb7645a99beab4d00c9b;p=ffmpeg diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index a3d7ff1a2c9..b2f389bb614 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -24,54 +24,57 @@ #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" -#include "libavcodec/h263.h" +#include "libavcodec/h264dsp.h" #include "libavcodec/mpegvideo.h" #include "libavcodec/simple_idct.h" #include "dsputil_mmx.h" -#include "vp3dsp_mmx.h" -#include "vp3dsp_sse2.h" -#include "vp6dsp_mmx.h" -#include "vp6dsp_sse2.h" #include "idct_xvid.h" //#undef NDEBUG //#include -int mm_flags; /* multimedia extension flags */ - /* pixel operations */ -DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; -DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) = +DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; - -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; - -DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 }; -DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 
0x0008000800080008ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; + +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL}; + +DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; +DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) @@ -155,6 +158,7 @@ DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; #define SET_RND MOVQ_WONE #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) +#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) #include "dsputil_mmx_rnd_template.c" @@ -176,17 +180,20 @@ DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; #undef SET_RND #undef PAVGBP #undef PAVGB +#undef OP_AVG /***********************************/ /* 3Dnow specific */ #define DEF(x) x ## _3dnow #define PAVGB "pavgusb" +#define OP_AVG PAVGB #include "dsputil_mmx_avg_template.c" #undef DEF #undef PAVGB +#undef OP_AVG /***********************************/ /* MMX2 specific */ @@ -195,11 +202,13 @@ DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; /* Introduced only in MMX2 set */ #define PAVGB "pavgb" +#define OP_AVG PAVGB #include "dsputil_mmx_avg_template.c" #undef DEF #undef PAVGB +#undef OP_AVG #define 
put_no_rnd_pixels16_mmx put_pixels16_mmx
 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
@@ -217,7 +226,7 @@ DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
 /***********************************/
 /* standard MMX */
-void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
 const DCTELEM *p;
 uint8_t *pix;
@@ -272,7 +281,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
 :"memory");
 }
-DECLARE_ASM_CONST(8, uint8_t, ff_vector128[8]) =
+DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
 #define put_signed_pixels_clamped_mmx_half(off) \
@@ -293,7 +302,7 @@ DECLARE_ASM_CONST(8, uint8_t, ff_vector128[8]) =
 "movq %%mm3, (%0, %3, 2) \n\t"\
 "movq %%mm4, (%0, %1) \n\t"
-void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
 x86_reg line_skip = line_size;
 x86_reg line_skip3;
@@ -309,7 +318,7 @@ void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int li
 :"memory");
 }
-void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
 const DCTELEM *p;
 uint8_t *pix;
@@ -522,6 +531,28 @@ static void clear_block_sse(DCTELEM *block)
 );
 }
+static void clear_blocks_sse(DCTELEM *blocks)
+{
+ __asm__ volatile(
+ "xorps %%xmm0, %%xmm0 \n"
+ "mov %1, %%"REG_a" \n"
+ "1: \n"
+ "movaps %%xmm0, (%0, %%"REG_a") \n"
+ "movaps %%xmm0, 16(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 32(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 48(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 64(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 80(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 96(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 112(%0, %%"REG_a") \n"
+ "add $128, %%"REG_a" \n"
+ " js 1b \n"
+ : : "r" (((uint8_t *)blocks)+128*6),
+ "i" (-128*6)
+ : "%"REG_a
+ );
+}
+
 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
 x86_reg i=0;
 __asm__ volatile(
@@ -569,7 +600,7 @@ static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
 }
 #if HAVE_7REGS && HAVE_TEN_OPERANDS
-static void add_hfyu_median_prediction_cmov(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top) {
+static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
 x86_reg w2 = -w;
 x86_reg x;
 int l = *left & 0xff;
@@ -675,7 +706,7 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, uint8_t *top, uint8_t
 "paddb %%mm1, %%mm6 \n\t"
 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
- if(CONFIG_ANY_H263) {
+ if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
 const int strength= ff_h263_loop_filter_strength[qscale];
 __asm__ volatile(
@@ -695,39 +726,10 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 }
 }
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
- __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
- "movd %4, %%mm0 \n\t" - "movd %5, %%mm1 \n\t" - "movd %6, %%mm2 \n\t" - "movd %7, %%mm3 \n\t" - "punpcklbw %%mm1, %%mm0 \n\t" - "punpcklbw %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "movd %%mm0, %0 \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, %1 \n\t" - "movd %%mm1, %2 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, %3 \n\t" - - : "=m" (*(uint32_t*)(dst + 0*dst_stride)), - "=m" (*(uint32_t*)(dst + 1*dst_stride)), - "=m" (*(uint32_t*)(dst + 2*dst_stride)), - "=m" (*(uint32_t*)(dst + 3*dst_stride)) - : "m" (*(uint32_t*)(src + 0*src_stride)), - "m" (*(uint32_t*)(src + 1*src_stride)), - "m" (*(uint32_t*)(src + 2*src_stride)), - "m" (*(uint32_t*)(src + 3*src_stride)) - ); -} - static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ - if(CONFIG_ANY_H263) { + if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { const int strength= ff_h263_loop_filter_strength[qscale]; - DECLARE_ALIGNED(8, uint64_t, temp[4]); + DECLARE_ALIGNED(8, uint64_t, temp)[4]; uint8_t *btemp= (uint8_t*)temp; src -= 2; @@ -1787,13 +1789,61 @@ PREFETCH(prefetch_mmx2, prefetcht0) PREFETCH(prefetch_3dnow, prefetch) #undef PREFETCH -#include "h264dsp_mmx.c" -#include "rv40dsp_mmx.c" +#include "h264_qpel_mmx.c" + +void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void 
ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); -/* CAVS specific */ -void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); -void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx); +/* CAVS specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { put_pixels8_mmx(dst, src, stride, 8); } @@ -1808,15 +1858,12 @@ void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { } /* VC1 specific */ -void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); - void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { put_pixels8_mmx(dst, src, stride, 8); } - -/* external functions, from idct_mmx.c */ -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); +void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { + avg_pixels8_mmx2(dst, src, stride, 8); +} /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */ @@ -1824,43 +1871,43 @@ void ff_mmxext_idct(DCTELEM *block); static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmx_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmx_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmxext_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmxext_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } #endif static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx2 (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx2 (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) @@ -1996,7 +2043,7 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_c } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { MIX5(IF1,IF0); } else { - DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); + DECLARE_ALIGNED(16, float, matrix_simd)[in_ch][2][4]; j = 2*in_ch*sizeof(float); __asm__ volatile( "1: \n" @@ -2094,110 +2141,46 @@ static void vector_fmul_reverse_sse(float *dst, const float *src0, const float * ); } -static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int len, int step){ +static void 
vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1, + const float *src2, int len){ x86_reg i = (len-4)*4; - if(step == 2 && src3 == 0){ - dst += (len-4)*2; - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "pfadd (%4,%0), %%mm0 \n\t" - "pfadd 8(%4,%0), %%mm1 \n\t" - "movd %%mm0, (%1) \n\t" - "movd %%mm1, 16(%1) \n\t" - "psrlq $32, %%mm0 \n\t" - "psrlq $32, %%mm1 \n\t" - "movd %%mm0, 8(%1) \n\t" - "movd %%mm1, 24(%1) \n\t" - "sub $32, %1 \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(dst) - :"r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else if(step == 1 && src3 == 0){ - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "pfadd (%4,%0), %%mm0 \n\t" - "pfadd 8(%4,%0), %%mm1 \n\t" - "movq %%mm0, (%1,%0) \n\t" - "movq %%mm1, 8(%1,%0) \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); + __asm__ volatile( + "1: \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "pfadd (%4,%0), %%mm0 \n\t" + "pfadd 8(%4,%0), %%mm1 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src0), "r"(src1), "r"(src2) + :"memory" + ); __asm__ volatile("femms"); } -static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int len, int step){ +static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1, + const float *src2, int len){ x86_reg i = (len-8)*4; - if(step == 2 && src3 == 0){ - dst += (len-8)*2; - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "addps (%4,%0), %%xmm0 \n\t" - "addps 16(%4,%0), %%xmm1 \n\t" - "movss %%xmm0, (%1) \n\t" - "movss %%xmm1, 32(%1) \n\t" - "movhlps %%xmm0, %%xmm2 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movss %%xmm2, 16(%1) \n\t" - "movss %%xmm3, 48(%1) \n\t" - "shufps $0xb1, %%xmm0, %%xmm0 \n\t" - "shufps $0xb1, %%xmm1, %%xmm1 \n\t" - "movss %%xmm0, 8(%1) \n\t" - "movss %%xmm1, 40(%1) \n\t" - "movhlps %%xmm0, %%xmm2 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movss %%xmm2, 24(%1) \n\t" - "movss %%xmm3, 56(%1) \n\t" - "sub $64, %1 \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(dst) - :"r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else if(step == 1 && src3 == 0){ - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "addps (%4,%0), %%xmm0 \n\t" - "addps 16(%4,%0), %%xmm1 \n\t" - "movaps %%xmm0, (%1,%0) \n\t" - "movaps %%xmm1, 16(%1,%0) \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); + __asm__ volatile( + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "addps (%4,%0), %%xmm0 \n\t" + "addps 16(%4,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%1,%0) \n\t" + "movaps %%xmm1, 16(%1,%0) \n\t" + "sub $32, %0 \n\t" + 
"jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src0), "r"(src1), "r"(src2) + :"memory" + ); } static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, @@ -2315,6 +2298,40 @@ static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu ); } +static void vector_clipf_sse(float *dst, const float *src, float min, float max, + int len) +{ + x86_reg i = (len-16)*4; + __asm__ volatile( + "movss %3, %%xmm4 \n" + "movss %4, %%xmm5 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "shufps $0, %%xmm5, %%xmm5 \n" + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel + "movaps 16(%2,%0), %%xmm1 \n\t" + "movaps 32(%2,%0), %%xmm2 \n\t" + "movaps 48(%2,%0), %%xmm3 \n\t" + "maxps %%xmm4, %%xmm0 \n\t" + "maxps %%xmm4, %%xmm1 \n\t" + "maxps %%xmm4, %%xmm2 \n\t" + "maxps %%xmm4, %%xmm3 \n\t" + "minps %%xmm5, %%xmm0 \n\t" + "minps %%xmm5, %%xmm1 \n\t" + "minps %%xmm5, %%xmm2 \n\t" + "minps %%xmm5, %%xmm3 \n\t" + "movaps %%xmm0, (%1,%0) \n\t" + "movaps %%xmm1, 16(%1,%0) \n\t" + "movaps %%xmm2, 32(%1,%0) \n\t" + "movaps %%xmm3, 48(%1,%0) \n\t" + "sub $64, %0 \n\t" + "jge 1b \n\t" + :"+&r"(i) + :"r"(dst), "r"(src), "m"(min), "m"(max) + :"memory" + ); +} + static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ x86_reg reglen = len; // not bit-exact: pf2id uses different rounding than C and SSE @@ -2379,25 +2396,32 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ ); } -#if HAVE_YASM +void ff_vp3_idct_mmx(int16_t *input_data); +void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); + +void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); + +void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); +void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); + +void ff_vp3_idct_sse2(int16_t *input_data); +void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); + void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); -void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top); -void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -#if ARCH_X86_32 -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) -{ - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); -} -#endif -void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); -#else +int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); +int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); +int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const 
int16_t *v2, const int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
+int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
+
+#if !HAVE_YASM
 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
@@ -2407,7 +2431,7 @@
 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- DECLARE_ALIGNED_16(int16_t, tmp[len]);\
+ DECLARE_ALIGNED(16, int16_t, tmp)[len];\
 int i,j,c;\
 for(c=0; c<channels; c++){\
 if (avctx->dsp_mask) {
 if (avctx->dsp_mask & FF_MM_FORCE)
@@ -2591,8 +2534,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
 if (mm_flags & FF_MM_MMX)
 av_log(avctx, AV_LOG_INFO, " mmx");
- if (mm_flags & FF_MM_MMXEXT)
- av_log(avctx, AV_LOG_INFO, " mmxext");
+ if (mm_flags & FF_MM_MMX2)
+ av_log(avctx, AV_LOG_INFO, " mmx2");
 if (mm_flags & FF_MM_3DNOW)
 av_log(avctx, AV_LOG_INFO, " 3dnow");
 if (mm_flags & FF_MM_SSE)
@@ -2613,7 +2556,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
 #if CONFIG_GPL
 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
- if(mm_flags & FF_MM_MMXEXT){
+ if(mm_flags & FF_MM_MMX2){
 c->idct_put= ff_libmpeg2mmx2_idct_put;
 c->idct_add= ff_libmpeg2mmx2_idct_add;
 c->idct = ff_mmxext_idct;
@@ -2624,8 +2567,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 }
 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
 #endif
- }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER) &&
- idct_algo==FF_IDCT_VP3){
+ }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
+ idct_algo==FF_IDCT_VP3 && HAVE_YASM){
 if(mm_flags & FF_MM_SSE2){
 c->idct_put= ff_vp3_idct_put_sse2;
 c->idct_add= ff_vp3_idct_add_sse2;
@@ -2645,7 +2588,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 c->idct_add= ff_idct_xvid_sse2_add;
 c->idct = ff_idct_xvid_sse2;
 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
- }else if(mm_flags & FF_MM_MMXEXT){
+ }else if(mm_flags & FF_MM_MMX2){
 c->idct_put= ff_idct_xvid_mmx2_put;
 c->idct_add= ff_idct_xvid_mmx2_add;
 c->idct = ff_idct_xvid_mmx2;
@@ -2657,13 +2600,17 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 }
 }

- c->put_pixels_clamped = put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
- c->add_pixels_clamped = add_pixels_clamped_mmx;
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
 c->clear_block = clear_block_mmx;
 c->clear_blocks = 
clear_blocks_mmx; - if (mm_flags & FF_MM_SSE) - c->clear_block = clear_block_sse; + if ((mm_flags & FF_MM_SSE) && + !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){ + /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ + c->clear_block = clear_block_sse; + c->clear_blocks = clear_blocks_sse; + } #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ @@ -2687,32 +2634,21 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->draw_edges = draw_edges_mmx; - if (CONFIG_ANY_H263) { + if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { c->h263_v_loop_filter= h263_v_loop_filter_mmx; c->h263_h_loop_filter= h263_h_loop_filter_mmx; } - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; - c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd; - - c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx; - c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx; - - c->h264_idct_dc_add= - c->h264_idct_add= ff_h264_idct_add_mmx; - c->h264_idct8_dc_add= - c->h264_idct8_add= ff_h264_idct8_add_mmx; - c->h264_idct_add16 = ff_h264_idct_add16_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; - c->h264_idct_add8 = ff_h264_idct_add8_mmx; - c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; +#if HAVE_YASM + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd; + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx; + c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd; - if (CONFIG_VP6_DECODER) { - c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; - } + c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx; + c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx; +#endif - if (mm_flags & FF_MM_MMXEXT) { + if (mm_flags & FF_MM_MMX2) { c->prefetch = prefetch_mmx2; c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; @@ -2729,13 +2665,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; - c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; - c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; - c->h264_idct_add16 = ff_h264_idct_add16_mmx2; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; - c->h264_idct_add8 = ff_h264_idct_add8_mmx2; - c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; @@ -2744,11 +2673,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; - if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) { + if (CONFIG_VP3_DECODER && HAVE_YASM) { c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; } } + if (CONFIG_VP3_DECODER && HAVE_YASM) { + c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2; + } + + if (CONFIG_VP3_DECODER + && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) { + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; + } #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ @@ -2787,40 +2725,17 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 0, 
16, mmx2); SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); - c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2; - c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2; - - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; - c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; - c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; - c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; - c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; - c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; - c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; - c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; - - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; - c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; - c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; - c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; - - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; - c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; - c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; - c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; - #if HAVE_YASM + c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2; + c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2; + + c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd; + + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2; + c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2; + c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2; + c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; #endif #if HAVE_7REGS && HAVE_TEN_OPERANDS @@ -2828,10 +2743,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; #endif - if (CONFIG_CAVS_DECODER) - ff_cavsdsp_init_mmx2(c, avctx); - - if (CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER) + if (CONFIG_VC1_DECODER) ff_vc1dsp_init_mmx(c, avctx); c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; @@ -2861,6 +2773,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; } + if (CONFIG_VP3_DECODER + && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) { + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow; + } + SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); @@ -2880,14 +2798,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); - 
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; +#if HAVE_YASM + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow; - c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow; - c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow; + c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd; - if (CONFIG_CAVS_DECODER) - ff_cavsdsp_init_3dnow(c, avctx); + c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow; + c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow; +#endif } @@ -2898,16 +2817,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ // these functions are slower than mmx on AMD, but faster on Intel -/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma c->put_pixels_tab[0][0] = put_pixels16_sse2; c->avg_pixels_tab[0][0] = avg_pixels16_sse2; -*/ H264_QPEL_FUNCS(0, 0, sse2); } if(mm_flags & FF_MM_SSE2){ - c->h264_idct8_add = ff_h264_idct8_add_sse2; - c->h264_idct8_add4= ff_h264_idct8_add4_sse2; - H264_QPEL_FUNCS(0, 1, sse2); H264_QPEL_FUNCS(0, 2, sse2); H264_QPEL_FUNCS(0, 3, sse2); @@ -2920,10 +2834,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 1, sse2); H264_QPEL_FUNCS(3, 2, sse2); H264_QPEL_FUNCS(3, 3, sse2); - - if (CONFIG_VP6_DECODER) { - c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; - } } #if HAVE_SSSE3 if(mm_flags & FF_MM_SSSE3){ @@ -2939,51 +2849,18 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 1, ssse3); H264_QPEL_FUNCS(3, 2, ssse3); H264_QPEL_FUNCS(3, 3, ssse3); - c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd; - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; - } -#endif - -#if CONFIG_GPL && HAVE_YASM - if( mm_flags&FF_MM_MMXEXT ){ -#if ARCH_X86_32 - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; -#endif - if( mm_flags&FF_MM_SSE2 ){ -#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1100 - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; -#endif - c->h264_idct_add16 = ff_h264_idct_add16_sse2; - c->h264_idct_add8 = ff_h264_idct_add8_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; - } - } -#endif - -#if CONFIG_SNOW_DECODER - if(mm_flags & FF_MM_SSE2 & 0){ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; -#if HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; -#endif - c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; - } - else{ - if(mm_flags & FF_MM_MMXEXT){ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; -#if HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; +#if HAVE_YASM + c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd; + 
c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd; + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd; + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3; + c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; + if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe + c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; #endif - } - c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; } #endif @@ -3002,27 +2879,40 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } + if(mm_flags & FF_MM_MMX2){ +#if HAVE_YASM + c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; +#endif + } if(mm_flags & FF_MM_SSE){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; c->ac3_downmix = ac3_downmix_sse; c->vector_fmul = vector_fmul_sse; c->vector_fmul_reverse = vector_fmul_reverse_sse; - c->vector_fmul_add_add = vector_fmul_add_add_sse; + c->vector_fmul_add = vector_fmul_add_sse; c->vector_fmul_window = vector_fmul_window_sse; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; + c->vector_clipf = vector_clipf_sse; c->float_to_int16 = float_to_int16_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse; +#if HAVE_YASM + c->scalarproduct_float = ff_scalarproduct_float_sse; +#endif } if(mm_flags & FF_MM_3DNOW) - c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse + c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse if(mm_flags & FF_MM_SSE2){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->float_to_int16 = float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; - c->add_int16 = add_int16_sse2; - c->sub_int16 = sub_int16_sse2; - c->scalarproduct_int16 = scalarproduct_int16_sse2; +#if HAVE_YASM + c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; +#endif } + if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; } if (CONFIG_ENCODERS)
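
The DECLARE_ALIGNED() conversions running through this patch move the array specifier outside the macro invocation (DECLARE_ALIGNED_16(const double, ff_pd_1[2]) becomes DECLARE_ALIGNED(16, const double, ff_pd_1)[2]), so the name argument is always a bare identifier and each per-compiler variant of the macro is free to place its alignment keyword around that identifier. A minimal sketch of the expansion, assuming the simplified GCC-only definition below (the real macro in libavutil carries additional per-compiler variants):

    #include <stdint.h>

    /* Simplified stand-in for the libavutil macro (assumption: GCC attribute
     * syntax only; the real definition also covers MSVC-style compilers). */
    #define DECLARE_ALIGNED(n, t, v) t __attribute__ ((aligned (n))) v

    /* Expands to:
     *   const uint64_t __attribute__ ((aligned (16))) ff_pdw_80000000[2] = {...};
     * With the old form, DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]),
     * a macro variant that has to emit tokens after the identifier would emit
     * them after the subscript instead, which need not parse. */
    DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
        { 0x8000000080000000ULL, 0x8000000080000000ULL };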