DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
- { 0x8000000080000000ULL, 0x8000000080000000ULL };
-
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
-#define OP_AVG PAVGB
#define SKIP_FOR_3DNOW
#include "dsputil_avg_template.c"
#undef DEF
#undef PAVGB
-#undef OP_AVG
#undef SKIP_FOR_3DNOW
/***********************************/
/* Introduced only in MMXEXT set */
#define PAVGB "pavgb"
-#define OP_AVG PAVGB
#include "dsputil_avg_template.c"
#undef DEF
#undef PAVGB
-#undef OP_AVG
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
/***********************************/
/* standard MMX */
-void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
- const DCTELEM *p;
+ const int16_t *p;
uint8_t *pix;
/* read the pixels */
"movq %%mm3, (%0, %3, 2) \n\t" \
"movq %%mm4, (%0, %1) \n\t"
-void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
x86_reg line_skip = line_size;
: "memory");
}
-void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
- const DCTELEM *p;
+ const int16_t *p;
uint8_t *pix;
int i;
}
#define CLEAR_BLOCKS(name, n) \
-static void name(DCTELEM *blocks) \
+static void name(int16_t *blocks) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
-static void clear_block_sse(DCTELEM *block)
+static void clear_block_sse(int16_t *block)
{
__asm__ volatile (
"xorps %%xmm0, %%xmm0 \n"
);
}
-static void clear_blocks_sse(DCTELEM *blocks)
+static void clear_blocks_sse(int16_t *blocks)
{
__asm__ volatile (
"xorps %%xmm0, %%xmm0 \n"
QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
-/***********************************/
-/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
-
-#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
- uint8_t *src, \
- int stride) \
-{ \
- OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
-}
-
-#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
- uint8_t *src, \
- int stride) \
-{ \
- OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
- S1, S2); \
-}
-
-#define QPEL_2TAP(OPNAME, SIZE, MMX) \
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
- OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
- OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
- OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
- uint8_t *src, \
- int stride) \
-{ \
- OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
-} \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
- uint8_t *src, \
- int stride) \
-{ \
- OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
- stride, SIZE); \
-} \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
-
-QPEL_2TAP(put_, 16, mmxext)
-QPEL_2TAP(avg_, 16, mmxext)
-QPEL_2TAP(put_, 8, mmxext)
-QPEL_2TAP(avg_, 8, mmxext)
-
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
put_pixels8_xy2_mmx(dst, src, stride, 8);
avg_pixels8_mmxext(dst, src, stride, 8);
}
-/* only used in VP3/5/6 */
-static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
-{
-// START_TIMER
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "movq (%1,%4), %%mm2 \n\t"
- "movq (%2,%4), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "movq %%mm5, (%3,%4) \n\t"
-
- "movq (%1,%4,2), %%mm0 \n\t"
- "movq (%2,%4,2), %%mm1 \n\t"
- "movq (%1,%5), %%mm2 \n\t"
- "movq (%2,%5), %%mm3 \n\t"
- "lea (%1,%4,4), %1 \n\t"
- "lea (%2,%4,4), %2 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3,%4,2) \n\t"
- "movq %%mm5, (%3,%5) \n\t"
- "lea (%3,%4,4), %3 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
- :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
- :"memory");
-// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
-}
-static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
-{
- put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
- put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
-}
-
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
* converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
- DCTELEM *block)
+ int16_t *block)
{
ff_mmx_idct(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
- DCTELEM *block)
+ int16_t *block)
{
ff_mmx_idct(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
- DCTELEM *block)
+ int16_t *block)
{
ff_mmxext_idct(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
- DCTELEM *block)
+ int16_t *block)
{
ff_mmxext_idct(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
-{
- int i;
- __asm__ volatile ("pxor %%mm7, %%mm7":);
- for (i = 0; i < blocksize; i += 2) {
- __asm__ volatile (
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
- "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
- "pslld $31, %%mm2 \n\t" // keep only the sign bit
- "pxor %%mm2, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "pand %%mm1, %%mm3 \n\t"
- "pandn %%mm1, %%mm4 \n\t"
- "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
- "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
- "movq %%mm3, %1 \n\t"
- "movq %%mm0, %0 \n\t"
- : "+m"(mag[i]), "+m"(ang[i])
- :: "memory"
- );
- }
- __asm__ volatile ("femms");
-}
-
-static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
-{
- int i;
-
- __asm__ volatile (
- "movaps %0, %%xmm5 \n\t"
- :: "m"(ff_pdw_80000000[0])
- );
- for (i = 0; i < blocksize; i += 4) {
- __asm__ volatile (
- "movaps %0, %%xmm0 \n\t"
- "movaps %1, %%xmm1 \n\t"
- "xorps %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm3, %%xmm3 \n\t"
- "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
- "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
- "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
- "xorps %%xmm2, %%xmm1 \n\t"
- "movaps %%xmm3, %%xmm4 \n\t"
- "andps %%xmm1, %%xmm3 \n\t"
- "andnps %%xmm1, %%xmm4 \n\t"
- "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
- "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
- "movaps %%xmm3, %1 \n\t"
- "movaps %%xmm0, %0 \n\t"
- : "+m"(mag[i]), "+m"(ang[i])
- :: "memory"
- );
- }
-}
-
-#if HAVE_6REGS
-static void vector_fmul_window_3dnowext(float *dst, const float *src0,
- const float *src1, const float *win,
- int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 8;
- __asm__ volatile (
- "1: \n"
- "pswapd (%5, %1), %%mm1 \n"
- "movq (%5, %0), %%mm0 \n"
- "pswapd (%4, %1), %%mm5 \n"
- "movq (%3, %0), %%mm4 \n"
- "movq %%mm0, %%mm2 \n"
- "movq %%mm1, %%mm3 \n"
- "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
- "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
- "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
- "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
- "pfadd %%mm3, %%mm2 \n"
- "pfsub %%mm0, %%mm1 \n"
- "pswapd %%mm2, %%mm2 \n"
- "movq %%mm1, (%2, %0) \n"
- "movq %%mm2, (%2, %1) \n"
- "sub $8, %1 \n"
- "add $8, %0 \n"
- "jl 1b \n"
- "femms \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-
-static void vector_fmul_window_sse(float *dst, const float *src0,
- const float *src1, const float *win, int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 16;
- __asm__ volatile (
- "1: \n"
- "movaps (%5, %1), %%xmm1 \n"
- "movaps (%5, %0), %%xmm0 \n"
- "movaps (%4, %1), %%xmm5 \n"
- "movaps (%3, %0), %%xmm4 \n"
- "shufps $0x1b, %%xmm1, %%xmm1 \n"
- "shufps $0x1b, %%xmm5, %%xmm5 \n"
- "movaps %%xmm0, %%xmm2 \n"
- "movaps %%xmm1, %%xmm3 \n"
- "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
- "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
- "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
- "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
- "addps %%xmm3, %%xmm2 \n"
- "subps %%xmm0, %%xmm1 \n"
- "shufps $0x1b, %%xmm2, %%xmm2 \n"
- "movaps %%xmm1, (%2, %0) \n"
- "movaps %%xmm2, (%2, %1) \n"
- "sub $16, %1 \n"
- "add $16, %0 \n"
- "jl 1b \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-#endif /* HAVE_6REGS */
-
static void vector_clipf_sse(float *dst, const float *src,
float min, float max, int len)
{
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
int w, int left);
-float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
-
-void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
- const float *src1, int len);
-void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
- const float *src1, int len);
-
-void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
- const float *src2, int len);
-void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
- const float *src2, int len);
-
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
-extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
- const float *src1, int len);
-extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
- const float *src1, int len);
-
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->add_bytes = add_bytes_mmx;
- c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
- c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
-
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
c->h263_v_loop_filter = h263_v_loop_filter_mmx;
c->h263_h_loop_filter = h263_h_loop_filter_mmx;
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
}
-
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
#endif /* HAVE_YASM */
}
-static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
- int mm_flags)
-{
-#if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
- c->vector_fmul_window = vector_fmul_window_3dnowext;
-#endif
-}
-
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
}
}
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-
-#if HAVE_6REGS
- c->vector_fmul_window = vector_fmul_window_sse;
-#endif
-
c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
- c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
- c->vector_fmul_add = ff_vector_fmul_add_sse;
-
- c->scalarproduct_float = ff_scalarproduct_float_sse;
- c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
-
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
c->gmc = gmc_sse;
#endif
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
}
}
- c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
- c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
- c->vector_fmul_add = ff_vector_fmul_add_avx;
#endif /* HAVE_AVX_EXTERNAL */
}
if (mm_flags & AV_CPU_FLAG_3DNOW)
dsputil_init_3dnow(c, avctx, mm_flags);
- if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
- dsputil_init_3dnowext(c, avctx, mm_flags);
-
if (mm_flags & AV_CPU_FLAG_SSE)
dsputil_init_sse(c, avctx, mm_flags);