/*
* MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
-#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
+#include "vp6dsp_mmx.h"
+#include "vp6dsp_sse2.h"
#include "idct_xvid.h"
//#undef NDEBUG
DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
+#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "dsputil_mmx_rnd_template.c"
#undef SET_RND
#undef PAVGBP
#undef PAVGB
+#undef OP_AVG
/***********************************/
/* 3Dnow specific */
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
+#define OP_AVG PAVGB
#include "dsputil_mmx_avg_template.c"
#undef DEF
#undef PAVGB
+#undef OP_AVG
/***********************************/
/* MMX2 specific */
/* Introduced only in MMX2 set */
#define PAVGB "pavgb"
+#define OP_AVG PAVGB
#include "dsputil_mmx_avg_template.c"
#undef DEF
#undef PAVGB
+#undef OP_AVG
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
:"memory");
}
-static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
+DECLARE_ASM_CONST(8, uint8_t, ff_vector128[8]) =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+#define put_signed_pixels_clamped_mmx_half(off) /* asm text that converts and stores 4 rows, reading at byte offset off from %2 */ \
+ "movq "#off"(%2), %%mm1 \n\t" /* first four 16-bit words of each of four rows (rows are 16 bytes apart) */ \
+ "movq 16+"#off"(%2), %%mm2 \n\t"\
+ "movq 32+"#off"(%2), %%mm3 \n\t"\
+ "movq 48+"#off"(%2), %%mm4 \n\t"\
+ "packsswb 8+"#off"(%2), %%mm1 \n\t" /* pack with the row's next four words: 8 signed-saturated bytes per register */ \
+ "packsswb 24+"#off"(%2), %%mm2 \n\t"\
+ "packsswb 40+"#off"(%2), %%mm3 \n\t"\
+ "packsswb 56+"#off"(%2), %%mm4 \n\t"\
+ "paddb %%mm0, %%mm1 \n\t" /* add the byte vector the caller keeps in mm0 */ \
+ "paddb %%mm0, %%mm2 \n\t"\
+ "paddb %%mm0, %%mm3 \n\t"\
+ "paddb %%mm0, %%mm4 \n\t"\
+ "movq %%mm1, (%0) \n\t" /* the four rows are stored at %0, %0+%3, %0+2*%3 and %0+%1 */ \
+ "movq %%mm2, (%0, %3) \n\t"\
+ "movq %%mm3, (%0, %3, 2) \n\t"\
+ "movq %%mm4, (%0, %1) \n\t"
+
void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
- int i;
-
- movq_m2r(*vector128, mm1);
- for (i = 0; i < 8; i++) {
- movq_m2r(*(block), mm0);
- packsswb_m2r(*(block + 4), mm0);
- block += 8;
- paddb_r2r(mm1, mm0);
- movq_r2m(mm0, *pixels);
- pixels += line_size;
- }
+ x86_reg line_skip = line_size; /* stride held in an x86_reg so it can serve as an index register in the addresses below */
+ x86_reg line_skip3; /* output scratch: the first lea sets it to 3*line_skip */
+
+ __asm__ volatile ( /* stores 8 rows of 8 bytes at pixels: words from block packed to signed-saturated bytes, plus ff_vector128 (0x80 in every byte) */
+ "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
+ "lea (%3, %3, 2), %1 \n\t" /* %1 = 3 * stride */
+ put_signed_pixels_clamped_mmx_half(0) /* rows 0-3, block bytes 0..63 */
+ "lea (%0, %3, 4), %0 \n\t" /* advance the destination pointer by 4 rows */
+ put_signed_pixels_clamped_mmx_half(64) /* rows 4-7, block bytes 64..127 */
+ :"+&r" (pixels), "=&r" (line_skip3)
+ :"r" (block), "r"(line_skip)
+ :"memory");
}
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
);
}
+/**
+ * Zero 6*128 = 768 bytes starting at blocks using aligned SSE stores.
+ *
+ * The base operand points one past the end of the region and REG_a counts
+ * up from -768 in steps of 128, so "js" keeps looping while the index is
+ * still negative: six iterations of eight 16-byte movaps stores each.
+ *
+ * @param blocks start of the region to clear; movaps requires the
+ *               addresses to be 16-byte aligned
+ */
+static void clear_blocks_sse(DCTELEM *blocks)
+{
+    __asm__ volatile(
+        "xorps %%xmm0, %%xmm0 \n"
+        "mov %1, %%"REG_a" \n"
+        "1: \n"
+        "movaps %%xmm0, (%0, %%"REG_a") \n"
+        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
+        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
+        "add $128, %%"REG_a" \n"
+        " js 1b \n"
+        : : "r" (((uint8_t *)blocks)+128*6),
+            "i" (-128*6)
+        /* The stores go through an input pointer only, so the compiler must
+         * be told explicitly that memory is modified. */
+        : "%"REG_a, "memory"
+    );
+}
+
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
x86_reg i=0;
__asm__ volatile(
dst[i] = src1[i] + src2[i];
}
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+/**
+ * Median-predict one row and add the residual, using cmov instead of branches.
+ *
+ * For each k in [0, w):
+ *   t    = top[k]
+ *   pred = median(l, t, (l + t - tl) & 0xff)
+ *   l    = (pred + diff[k]) & 0xff;  dst[k] = l;  tl = t
+ * where l and tl start as the low bytes of *left and *left_top and are
+ * written back there on return.
+ *
+ * The loop condition is tested after the body, so one byte is processed
+ * even if w is 0; callers are presumably expected to pass w > 0.
+ *
+ * @param dst      output row (w bytes written)
+ * @param top      row above (w bytes read)
+ * @param diff     residuals (w bytes read)
+ * @param w        number of bytes to process
+ * @param left     in: previous output byte; out: last output byte
+ * @param left_top in: previous top byte;    out: last top byte
+ */
+static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
+    x86_reg w2 = -w;          /* negative index counting up to 0; pointers are pre-biased by +w */
+    x86_reg x;                /* scratch: holds top+w for the load, then the gradient l + t - tl */
+    int l = *left & 0xff;
+    int tl = *left_top & 0xff;
+    int t;
+    __asm__ volatile(
+        "mov %7, %3 \n"
+        "1: \n"
+        "movzx (%3,%4), %2 \n"     /* t = top[k] */
+        "mov %2, %k3 \n"
+        "sub %b1, %b3 \n"
+        "add %b0, %b3 \n"          /* x = (t - tl + l) & 0xff */
+        "mov %2, %1 \n"            /* tl = t for the next iteration */
+        "cmp %0, %2 \n"
+        "cmovg %0, %2 \n"          /* t = min(l, t) */
+        "cmovg %1, %0 \n"          /* l = max(l, t) */
+        "cmp %k3, %0 \n"
+        "cmovg %k3, %0 \n"         /* l = min(l, x) */
+        "mov %7, %3 \n"            /* reload top+w, x was used as scratch */
+        "cmp %2, %0 \n"
+        "cmovl %2, %0 \n"          /* l = max(l, t): the median */
+        "add (%6,%4), %b0 \n"      /* l += diff[k] (byte arithmetic) */
+        "mov %b0, (%5,%4) \n"      /* dst[k] = l */
+        "inc %4 \n"
+        "jl 1b \n"
+        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
+        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
+        /* dst is written and top/diff are read through pointer operands
+         * only, so the compiler must be told that memory is accessed. */
+        :"memory"
+    );
+    *left = l;
+    *left_top = tl;
+}
+#endif
+
#define H263_LOOP_FILTER \
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm0 \n\t"\
"paddb %%mm1, %%mm6 \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
- if(ENABLE_ANY_H263) {
+ if(CONFIG_ANY_H263) {
const int strength= ff_h263_loop_filter_strength[qscale];
__asm__ volatile(
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
- if(ENABLE_ANY_H263) {
+ if(CONFIG_ANY_H263) {
const int strength= ff_h263_loop_filter_strength[qscale];
DECLARE_ALIGNED(8, uint64_t, temp[4]);
uint8_t *btemp= (uint8_t*)temp;
"pabsw %%mm5, %%mm5 \n"
PAETH(mmx2, ABS3_MMX2)
-#ifdef HAVE_SSSE3
+#if HAVE_SSSE3
PAETH(ssse3, ABS3_SSSE3)
#endif
#if 0
-static void just_return() { return; }
+static void just_return(void) { return; }
#endif
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
#undef PREFETCH
#include "h264dsp_mmx.c"
+#include "rv40dsp_mmx.c"
/* CAVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
put_pixels8_mmx(dst, src, stride, 8);
}
+void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
+ avg_pixels8_mmx2(dst, src, stride, 8); /* forwards to avg_pixels8_mmx2 with a constant 8 as the last argument (presumably the row count); rnd is not used */
+}
/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
/* XXX: those functions should be suppressed ASAP when all IDCTs are
converted */
-#ifdef CONFIG_GPL
+#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_mmx_idct (block);
);
}
-static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
- const float *src2, int src3, int len, int step){
+static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
+ const float *src2, int len){ /* dst[k] = src0[k]*src1[k] + src2[k] for len floats (3DNow!).
+ * Four floats are handled per iteration, from the last group down to
+ * byte offset 0; the loop shape suggests len must be a positive
+ * multiple of 4 -- TODO confirm against callers. A femms is issued
+ * after the loop. */
x86_reg i = (len-4)*4;
- if(step == 2 && src3 == 0){
- dst += (len-4)*2;
- __asm__ volatile(
- "1: \n\t"
- "movq (%2,%0), %%mm0 \n\t"
- "movq 8(%2,%0), %%mm1 \n\t"
- "pfmul (%3,%0), %%mm0 \n\t"
- "pfmul 8(%3,%0), %%mm1 \n\t"
- "pfadd (%4,%0), %%mm0 \n\t"
- "pfadd 8(%4,%0), %%mm1 \n\t"
- "movd %%mm0, (%1) \n\t"
- "movd %%mm1, 16(%1) \n\t"
- "psrlq $32, %%mm0 \n\t"
- "psrlq $32, %%mm1 \n\t"
- "movd %%mm0, 8(%1) \n\t"
- "movd %%mm1, 24(%1) \n\t"
- "sub $32, %1 \n\t"
- "sub $16, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i), "+r"(dst)
- :"r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else if(step == 1 && src3 == 0){
- __asm__ volatile(
- "1: \n\t"
- "movq (%2,%0), %%mm0 \n\t"
- "movq 8(%2,%0), %%mm1 \n\t"
- "pfmul (%3,%0), %%mm0 \n\t"
- "pfmul 8(%3,%0), %%mm1 \n\t"
- "pfadd (%4,%0), %%mm0 \n\t"
- "pfadd 8(%4,%0), %%mm1 \n\t"
- "movq %%mm0, (%1,%0) \n\t"
- "movq %%mm1, 8(%1,%0) \n\t"
- "sub $16, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i)
- :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else
- ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2,%0), %%mm0 \n\t" /* two floats of src0 at byte offset i */
+ "movq 8(%2,%0), %%mm1 \n\t" /* the next two floats of src0 */
+ "pfmul (%3,%0), %%mm0 \n\t" /* multiply by src1 */
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "pfadd (%4,%0), %%mm0 \n\t" /* add src2 */
+ "pfadd 8(%4,%0), %%mm1 \n\t"
+ "movq %%mm0, (%1,%0) \n\t" /* store to dst at the same byte offset */
+ "movq %%mm1, 8(%1,%0) \n\t"
+ "sub $16, %0 \n\t" /* step back 16 bytes; loop while the offset is non-negative */
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
__asm__ volatile("femms");
}
-static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
- const float *src2, int src3, int len, int step){
+static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
+ const float *src2, int len){ /* dst[k] = src0[k]*src1[k] + src2[k] for len floats (SSE).
+ * Eight floats are handled per iteration, from the last group down to
+ * byte offset 0. movaps and the memory operands of mulps/addps need
+ * 16-byte-aligned addresses; the loop shape suggests len must be a
+ * positive multiple of 8 -- TODO confirm against callers. */
x86_reg i = (len-8)*4;
- if(step == 2 && src3 == 0){
- dst += (len-8)*2;
- __asm__ volatile(
- "1: \n\t"
- "movaps (%2,%0), %%xmm0 \n\t"
- "movaps 16(%2,%0), %%xmm1 \n\t"
- "mulps (%3,%0), %%xmm0 \n\t"
- "mulps 16(%3,%0), %%xmm1 \n\t"
- "addps (%4,%0), %%xmm0 \n\t"
- "addps 16(%4,%0), %%xmm1 \n\t"
- "movss %%xmm0, (%1) \n\t"
- "movss %%xmm1, 32(%1) \n\t"
- "movhlps %%xmm0, %%xmm2 \n\t"
- "movhlps %%xmm1, %%xmm3 \n\t"
- "movss %%xmm2, 16(%1) \n\t"
- "movss %%xmm3, 48(%1) \n\t"
- "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
- "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
- "movss %%xmm0, 8(%1) \n\t"
- "movss %%xmm1, 40(%1) \n\t"
- "movhlps %%xmm0, %%xmm2 \n\t"
- "movhlps %%xmm1, %%xmm3 \n\t"
- "movss %%xmm2, 24(%1) \n\t"
- "movss %%xmm3, 56(%1) \n\t"
- "sub $64, %1 \n\t"
- "sub $32, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i), "+r"(dst)
- :"r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else if(step == 1 && src3 == 0){
- __asm__ volatile(
- "1: \n\t"
- "movaps (%2,%0), %%xmm0 \n\t"
- "movaps 16(%2,%0), %%xmm1 \n\t"
- "mulps (%3,%0), %%xmm0 \n\t"
- "mulps 16(%3,%0), %%xmm1 \n\t"
- "addps (%4,%0), %%xmm0 \n\t"
- "addps 16(%4,%0), %%xmm1 \n\t"
- "movaps %%xmm0, (%1,%0) \n\t"
- "movaps %%xmm1, 16(%1,%0) \n\t"
- "sub $32, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i)
- :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else
- ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t" /* four floats of src0 at byte offset i */
+ "movaps 16(%2,%0), %%xmm1 \n\t" /* the next four floats of src0 */
+ "mulps (%3,%0), %%xmm0 \n\t" /* multiply by src1 */
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "addps (%4,%0), %%xmm0 \n\t" /* add src2 */
+ "addps 16(%4,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%1,%0) \n\t" /* store to dst at the same byte offset */
+ "movaps %%xmm1, 16(%1,%0) \n\t"
+ "sub $32, %0 \n\t" /* step back 32 bytes; loop while the offset is non-negative */
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
}
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
const float *win, float add_bias, int len){
-#ifdef HAVE_6REGS
+#if HAVE_6REGS
if(add_bias == 0){
x86_reg i = -len*4;
x86_reg j = len*4-8;
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
const float *win, float add_bias, int len){
-#ifdef HAVE_6REGS
+#if HAVE_6REGS
if(add_bias == 0){
x86_reg i = -len*4;
x86_reg j = len*4-16;
);
}
+static void vector_clipf_sse(float *dst, const float *src, float min, float max,
+ int len)
+{ /* Clamp len floats from src into [min, max] and store them to dst.
+ * Each iteration handles 16 floats (four movaps loads and stores),
+ * starting at the last group and stepping down 64 bytes until the
+ * offset goes negative. movaps needs 16-byte-aligned addresses; the
+ * loop shape suggests len must be a positive multiple of 16 --
+ * TODO confirm against callers. */
+ x86_reg i = (len-16)*4;
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n" /* load min into the low lane of xmm4 */
+ "movss %4, %%xmm5 \n" /* load max into the low lane of xmm5 */
+ "shufps $0, %%xmm4, %%xmm4 \n" /* broadcast min to all four lanes */
+ "shufps $0, %%xmm5, %%xmm5 \n" /* broadcast max to all four lanes */
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "movaps 32(%2,%0), %%xmm2 \n\t"
+ "movaps 48(%2,%0), %%xmm3 \n\t"
+ "maxps %%xmm4, %%xmm0 \n\t" /* raise values below min up to min */
+ "maxps %%xmm4, %%xmm1 \n\t"
+ "maxps %%xmm4, %%xmm2 \n\t"
+ "maxps %%xmm4, %%xmm3 \n\t"
+ "minps %%xmm5, %%xmm0 \n\t" /* lower values above max down to max */
+ "minps %%xmm5, %%xmm1 \n\t"
+ "minps %%xmm5, %%xmm2 \n\t"
+ "minps %%xmm5, %%xmm3 \n\t"
+ "movaps %%xmm0, (%1,%0) \n\t" /* store to dst at the same byte offset */
+ "movaps %%xmm1, 16(%1,%0) \n\t"
+ "movaps %%xmm2, 32(%1,%0) \n\t"
+ "movaps %%xmm3, 48(%1,%0) \n\t"
+ "sub $64, %0 \n\t"
+ "jge 1b \n\t"
+ :"+&r"(i)
+ :"r"(dst), "r"(src), "m"(min), "m"(max)
+ :"memory"
+ );
+}
+
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
x86_reg reglen = len;
// not bit-exact: pf2id uses different rounding than C and SSE
);
}
-#ifdef HAVE_YASM
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
+int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
-#ifdef ARCH_X86_32
+void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+
+#if HAVE_YASM && ARCH_X86_32
+void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
-#endif
-void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-#else
+#elif !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1,%2), %%xmm0 \n\t"
- "movdqu 16(%1,%2), %%xmm1 \n\t"
- "paddw (%0,%2), %%xmm0 \n\t"
- "paddw 16(%0,%2), %%xmm1 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm1, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqa (%0,%2), %%xmm0 \n\t"
- "movdqa 16(%0,%2), %%xmm2 \n\t"
- "movdqu (%1,%2), %%xmm1 \n\t"
- "movdqu 16(%1,%2), %%xmm3 \n\t"
- "psubw %%xmm1, %%xmm0 \n\t"
- "psubw %%xmm3, %%xmm2 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm2, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
- int res = 0;
- DECLARE_ALIGNED_16(int64_t, sh);
- x86_reg o = -(order << 1);
-
- v1 += order;
- v2 += order;
- sh = shift;
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "1: \n\t"
- "movdqu (%0,%3), %%xmm0 \n\t"
- "movdqu 16(%0,%3), %%xmm1 \n\t"
- "pmaddwd (%1,%3), %%xmm0 \n\t"
- "pmaddwd 16(%1,%3), %%xmm1 \n\t"
- "paddd %%xmm0, %%xmm7 \n\t"
- "paddd %%xmm1, %%xmm7 \n\t"
- "add $32, %3 \n\t"
- "js 1b \n\t"
- "movhlps %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "psrad %4, %%xmm7 \n\t"
- "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "movd %%xmm7, %2 \n\t"
- : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
- : "m"(sh)
- );
- return res;
-}
-
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
if (mm_flags & FF_MM_MMX)
av_log(avctx, AV_LOG_INFO, " mmx");
- if (mm_flags & FF_MM_MMXEXT)
- av_log(avctx, AV_LOG_INFO, " mmxext");
+ if (mm_flags & FF_MM_MMX2)
+ av_log(avctx, AV_LOG_INFO, " mmx2");
if (mm_flags & FF_MM_3DNOW)
av_log(avctx, AV_LOG_INFO, " 3dnow");
if (mm_flags & FF_MM_SSE)
c->idct_add= ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
-#ifdef CONFIG_GPL
+#if CONFIG_GPL
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
- if(mm_flags & FF_MM_MMXEXT){
+ if(mm_flags & FF_MM_MMX2){
c->idct_put= ff_libmpeg2mmx2_idct_put;
c->idct_add= ff_libmpeg2mmx2_idct_add;
c->idct = ff_mmxext_idct;
}
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
- }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) &&
+ }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
idct_algo==FF_IDCT_VP3){
if(mm_flags & FF_MM_SSE2){
c->idct_put= ff_vp3_idct_put_sse2;
c->idct_add= ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type= FF_SSE2_IDCT_PERM;
- }else if(mm_flags & FF_MM_MMXEXT){
+ }else if(mm_flags & FF_MM_MMX2){
c->idct_put= ff_idct_xvid_mmx2_put;
c->idct_add= ff_idct_xvid_mmx2_add;
c->idct = ff_idct_xvid_mmx2;
c->add_pixels_clamped = add_pixels_clamped_mmx;
c->clear_block = clear_block_mmx;
c->clear_blocks = clear_blocks_mmx;
- if (mm_flags & FF_MM_SSE)
- c->clear_block = clear_block_sse;
+ if (mm_flags & FF_MM_SSE){
+ c->clear_block = clear_block_sse;
+ c->clear_blocks = clear_blocks_sse;
+ }
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->draw_edges = draw_edges_mmx;
- if (ENABLE_ANY_H263) {
+ if (CONFIG_ANY_H263) {
c->h263_v_loop_filter= h263_v_loop_filter_mmx;
c->h263_h_loop_filter= h263_h_loop_filter_mmx;
}
c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
- c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
+ c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_mmx_nornd;
+
+ c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx;
+ c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx;
c->h264_idct_dc_add=
c->h264_idct_add= ff_h264_idct_add_mmx;
c->h264_idct_add8 = ff_h264_idct_add8_mmx;
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
- if (mm_flags & FF_MM_MMXEXT) {
+ if (CONFIG_VP6_DECODER) {
+ c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
+ }
+
+ if (mm_flags & FF_MM_MMX2) {
c->prefetch = prefetch_mmx2;
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
- if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
+ if (CONFIG_VP3_DECODER) {
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
}
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
+ c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2;
+ c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2;
+
+ c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_mmx2_nornd;
+
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
- if (ENABLE_CAVS_DECODER)
+#if HAVE_YASM
+ c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
+#endif
+#if HAVE_7REGS && HAVE_TEN_OPERANDS
+ if( mm_flags&FF_MM_3DNOW )
+ c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
+#endif
+
+ if (CONFIG_CAVS_DECODER)
ff_cavsdsp_init_mmx2(c, avctx);
- if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
+ if (CONFIG_VC1_DECODER)
ff_vc1dsp_init_mmx(c, avctx);
c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
- if (ENABLE_CAVS_DECODER)
+ c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow;
+ c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow;
+
+ if (CONFIG_CAVS_DECODER)
ff_cavsdsp_init_3dnow(c, avctx);
}
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
+
+ if (CONFIG_VP6_DECODER) {
+ c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
+ }
}
-#ifdef HAVE_SSSE3
+#if HAVE_SSSE3
if(mm_flags & FF_MM_SSSE3){
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
- c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
+ c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_ssse3_nornd;
+ c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_ssse3_nornd;
c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+#if HAVE_YASM
+ c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
+ if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
+ c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
+#endif
}
#endif
-#if defined(CONFIG_GPL) && defined(HAVE_YASM)
- if( mm_flags&FF_MM_MMXEXT ){
-#ifdef ARCH_X86_32
+#if CONFIG_GPL && HAVE_YASM
+ if (mm_flags & FF_MM_MMX2){
+#if ARCH_X86_32
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
#endif
if( mm_flags&FF_MM_SSE2 ){
+#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+#endif
+ c->h264_idct_add16 = ff_h264_idct_add16_sse2;
+ c->h264_idct_add8 = ff_h264_idct_add8_sse2;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
}
}
#endif
-#ifdef CONFIG_SNOW_DECODER
+#if CONFIG_SNOW_DECODER
if(mm_flags & FF_MM_SSE2 & 0){
c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
-#ifdef HAVE_7REGS
+#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
}
else{
- if(mm_flags & FF_MM_MMXEXT){
+ if(mm_flags & FF_MM_MMX2){
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
-#ifdef HAVE_7REGS
+#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
}
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
+ if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+ c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
+#endif
+ }
if(mm_flags & FF_MM_SSE){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
c->vector_fmul = vector_fmul_sse;
c->vector_fmul_reverse = vector_fmul_reverse_sse;
- c->vector_fmul_add_add = vector_fmul_add_add_sse;
+ c->vector_fmul_add = vector_fmul_add_sse;
c->vector_fmul_window = vector_fmul_window_sse;
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
+ c->vector_clipf = vector_clipf_sse;
c->float_to_int16 = float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
}
if(mm_flags & FF_MM_3DNOW)
- c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+ c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & FF_MM_SSE2){
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
- c->add_int16 = add_int16_sse2;
- c->sub_int16 = sub_int16_sse2;
- c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+ c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+#endif
}
+ if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
}
- if (ENABLE_ENCODERS)
+ if (CONFIG_ENCODERS)
dsputilenc_init_mmx(c, avctx);
#if 0