:"memory");
}
-DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
- { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
#define put_signed_pixels_clamped_mmx_half(off) \
"movq "#off"(%2), %%mm1 \n\t"\
"movq 16+"#off"(%2), %%mm2 \n\t"\
x86_reg line_skip3;
__asm__ volatile (
- "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
+ "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
"lea (%3, %3, 2), %1 \n\t"
put_signed_pixels_clamped_mmx_half(0)
"lea (%0, %3, 4), %0 \n\t"
dst[i+0] += src[i+0];
}
-static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
- x86_reg i=0;
- __asm__ volatile(
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%2, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb (%3, %0), %%mm0 \n\t"
- "paddb 8(%3, %0), %%mm1 \n\t"
- "movq %%mm0, (%1, %0) \n\t"
- "movq %%mm1, 8(%1, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %4, %0 \n\t"
- " js 1b \n\t"
- : "+r" (i)
- : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
- );
- for(; i<w; i++)
- dst[i] = src1[i] + src2[i];
-}
-
#if HAVE_7REGS && HAVE_TEN_OPERANDS
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
x86_reg w2 = -w;
/* draw the edges of width 'w' of an image of size width, height
this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
{
uint8_t *ptr, *last_line;
int i;
);
}
- for(i=0;i<w;i+=4) {
- /* top and bottom (and hopefully also the corners) */
- ptr= buf - (i + 1) * wrap - w;
- __asm__ volatile(
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm0, (%0, %2) \n\t"
- "movq %%mm0, (%0, %2, 2) \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "add $8, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (ptr)
- : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
- );
- ptr= last_line + (i + 1) * wrap - w;
- __asm__ volatile(
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm0, (%0, %2) \n\t"
- "movq %%mm0, (%0, %2, 2) \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "add $8, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (ptr)
- : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
- );
+ /* top and bottom (and hopefully also the corners) */
+ if (sides&EDGE_TOP) {
+ for(i = 0; i < w; i += 4) {
+ ptr= buf - (i + 1) * wrap - w;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
+ );
+ }
}
-}
-#define PAETH(cpu, abs3)\
-static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
-{\
- x86_reg i = -bpp;\
- x86_reg end = w-3;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n"\
- "movd (%1,%0), %%mm0 \n"\
- "movd (%2,%0), %%mm1 \n"\
- "punpcklbw %%mm7, %%mm0 \n"\
- "punpcklbw %%mm7, %%mm1 \n"\
- "add %4, %0 \n"\
- "1: \n"\
- "movq %%mm1, %%mm2 \n"\
- "movd (%2,%0), %%mm1 \n"\
- "movq %%mm2, %%mm3 \n"\
- "punpcklbw %%mm7, %%mm1 \n"\
- "movq %%mm2, %%mm4 \n"\
- "psubw %%mm1, %%mm3 \n"\
- "psubw %%mm0, %%mm4 \n"\
- "movq %%mm3, %%mm5 \n"\
- "paddw %%mm4, %%mm5 \n"\
- abs3\
- "movq %%mm4, %%mm6 \n"\
- "pminsw %%mm5, %%mm6 \n"\
- "pcmpgtw %%mm6, %%mm3 \n"\
- "pcmpgtw %%mm5, %%mm4 \n"\
- "movq %%mm4, %%mm6 \n"\
- "pand %%mm3, %%mm4 \n"\
- "pandn %%mm3, %%mm6 \n"\
- "pandn %%mm0, %%mm3 \n"\
- "movd (%3,%0), %%mm0 \n"\
- "pand %%mm1, %%mm6 \n"\
- "pand %%mm4, %%mm2 \n"\
- "punpcklbw %%mm7, %%mm0 \n"\
- "movq %6, %%mm5 \n"\
- "paddw %%mm6, %%mm0 \n"\
- "paddw %%mm2, %%mm3 \n"\
- "paddw %%mm3, %%mm0 \n"\
- "pand %%mm5, %%mm0 \n"\
- "movq %%mm0, %%mm3 \n"\
- "packuswb %%mm3, %%mm3 \n"\
- "movd %%mm3, (%1,%0) \n"\
- "add %4, %0 \n"\
- "cmp %5, %0 \n"\
- "jle 1b \n"\
- :"+r"(i)\
- :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
- "m"(ff_pw_255)\
- :"memory"\
- );\
+ if (sides&EDGE_BOTTOM) {
+ for(i = 0; i < w; i += 4) {
+ ptr= last_line + (i + 1) * wrap - w;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
+ );
+ }
+ }
}
-#define ABS3_MMX2\
- "psubw %%mm5, %%mm7 \n"\
- "pmaxsw %%mm7, %%mm5 \n"\
- "pxor %%mm6, %%mm6 \n"\
- "pxor %%mm7, %%mm7 \n"\
- "psubw %%mm3, %%mm6 \n"\
- "psubw %%mm4, %%mm7 \n"\
- "pmaxsw %%mm6, %%mm3 \n"\
- "pmaxsw %%mm7, %%mm4 \n"\
- "pxor %%mm7, %%mm7 \n"
-
-#define ABS3_SSSE3\
- "pabsw %%mm3, %%mm3 \n"\
- "pabsw %%mm4, %%mm4 \n"\
- "pabsw %%mm5, %%mm5 \n"
-
-PAETH(mmx2, ABS3_MMX2)
-#if HAVE_SSSE3
-PAETH(ssse3, ABS3_SSSE3)
-#endif
-
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
}
#endif /* HAVE_6REGS */
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtpi2ps (%2,%0), %%xmm0 \n"
- "cvtpi2ps 8(%2,%0), %%xmm1 \n"
- "cvtpi2ps 16(%2,%0), %%xmm2 \n"
- "cvtpi2ps 24(%2,%0), %%xmm3 \n"
- "movlhps %%xmm1, %%xmm0 \n"
- "movlhps %%xmm3, %%xmm2 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm2 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm2, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtdq2ps (%2,%0), %%xmm0 \n"
- "cvtdq2ps 16(%2,%0), %%xmm1 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm1 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm1, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
int len)
{
);
}
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- // not bit-exact: pf2id uses different rounding than C and SSE
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "pf2id (%2,%0,2) , %%mm0 \n\t"
- "pf2id 8(%2,%0,2) , %%mm1 \n\t"
- "pf2id 16(%2,%0,2) , %%mm2 \n\t"
- "pf2id 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "femms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "emms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
- "packssdw %%xmm1 , %%xmm0 \n\t"
- "movdqa %%xmm0 , (%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
+void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- DECLARE_ALIGNED(16, int16_t, tmp)[len];\
- int i,j,c;\
- for(c=0; c<channels; c++){\
- float_to_int16_##cpu(tmp, src[c], len);\
- for(i=0, j=c; i<len; i++, j+=channels)\
- dst[j] = tmp[i];\
- }\
-}\
-\
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
- if(channels==1)\
- float_to_int16_##cpu(dst, src[0], len);\
- else if(channels==2){\
- x86_reg reglen = len; \
- const float *src0 = src[0];\
- const float *src1 = src[1];\
- __asm__ volatile(\
- "shl $2, %0 \n"\
- "add %0, %1 \n"\
- "add %0, %2 \n"\
- "add %0, %3 \n"\
- "neg %0 \n"\
- body\
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
- );\
- }else if(channels==6){\
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\
- }else\
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
-}
-
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
- "1: \n"
- "pf2id (%2,%0), %%mm0 \n"
- "pf2id 8(%2,%0), %%mm1 \n"
- "pf2id (%3,%0), %%mm2 \n"
- "pf2id 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "femms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
- "1: \n"
- "cvtps2pi (%2,%0), %%mm0 \n"
- "cvtps2pi 8(%2,%0), %%mm1 \n"
- "cvtps2pi (%3,%0), %%mm2 \n"
- "cvtps2pi 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "emms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
- "1: \n"
- "cvtps2dq (%2,%0), %%xmm0 \n"
- "cvtps2dq (%3,%0), %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "punpcklwd %%xmm1, %%xmm0 \n"
- "movdqa %%xmm0, (%1,%0) \n"
- "add $16, %0 \n"
- "js 1b \n"
-)
-
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
- if(channels==6)
- ff_float_to_int16_interleave6_3dn2(dst, src, len);
- else
- float_to_int16_interleave_3dnow(dst, src, len, channels);
-}
-
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
+ const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
if (avctx->dsp_mask) {
if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
+ if (!h264_high_depth) {
c->clear_block = clear_block_mmx;
c->clear_blocks = clear_blocks_mmx;
if ((mm_flags & AV_CPU_FLAG_SSE) &&
c->clear_block = clear_block_sse;
c->clear_blocks = clear_blocks_sse;
}
+ }
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
+ if (!h264_high_depth) {
SET_HPEL_FUNCS(put, 0, 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
SET_HPEL_FUNCS(avg, 0, 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
SET_HPEL_FUNCS(avg, 1, 8, mmx);
SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
+ }
#if ARCH_X86_32 || !HAVE_YASM
c->gmc= gmc_mmx;
#endif
#if ARCH_X86_32 && HAVE_YASM
+ if (!h264_high_depth)
c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif
c->add_bytes= add_bytes_mmx;
- c->add_bytes_l2= add_bytes_l2_mmx;
+ if (!h264_high_depth)
c->draw_edges = draw_edges_mmx;
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
}
#if HAVE_YASM
+ if (!h264_high_depth) {
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
- c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
+ }
c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->prefetch = prefetch_mmx2;
+ if (!h264_high_depth) {
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
+ }
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ if (!h264_high_depth) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+ }
if (CONFIG_VP3_DECODER && HAVE_YASM) {
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
+ if (!h264_high_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
+ }
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
- c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
-
+ if (!h264_high_depth) {
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
+ }
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
#endif
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
- if (CONFIG_VC1_DECODER)
- ff_vc1dsp_init_mmx(c, avctx);
-
- c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
} else if (mm_flags & AV_CPU_FLAG_3DNOW) {
c->prefetch = prefetch_3dnow;
+ if (!h264_high_depth) {
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
}
+ }
if (CONFIG_VP3_DECODER
&& (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
+ if (!h264_high_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
+ }
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
#if HAVE_YASM
+ if (!h264_high_depth) {
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
-
- c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
+ }
c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
// these functions are slower than mmx on AMD, but faster on Intel
+ if (!h264_high_depth) {
c->put_pixels_tab[0][0] = put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
H264_QPEL_FUNCS(0, 0, sse2);
+ }
}
if(mm_flags & AV_CPU_FLAG_SSE2){
+ if (!h264_high_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
+ }
}
#if HAVE_SSSE3
if(mm_flags & AV_CPU_FLAG_SSSE3){
+ if (!h264_high_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
- c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+ }
#if HAVE_YASM
- c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
- c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
+ if (!h264_high_depth) {
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
+ }
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
if(mm_flags & AV_CPU_FLAG_3DNOW){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16 = float_to_int16_3dnow;
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
- }
}
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
- }
}
if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
+ if (avctx->flags & CODEC_FLAG_BITEXACT) {
+ c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+ } else {
+ c->apply_window_int16 = ff_apply_window_int16_mmxext;
+ }
#endif
}
if(mm_flags & AV_CPU_FLAG_SSE){
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->vector_clipf = vector_clipf_sse;
- c->float_to_int16 = float_to_int16_sse;
- c->float_to_int16_interleave = float_to_int16_interleave_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
#endif
if(mm_flags & AV_CPU_FLAG_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & AV_CPU_FLAG_SSE2){
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
- c->float_to_int16 = float_to_int16_sse2;
- c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+ if (avctx->flags & CODEC_FLAG_BITEXACT) {
+ c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+ } else {
+ if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ c->apply_window_int16 = ff_apply_window_int16_sse2;
+ }
+ }
+ if (!h264_high_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
c->gmc= gmc_sse;
#endif
}
- if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+ if (mm_flags & AV_CPU_FLAG_SSSE3) {
+#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_ATOM) {
+ c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
+ } else {
+ c->apply_window_int16 = ff_apply_window_int16_ssse3;
+ }
+ if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+ }
+#endif
+ }
}
if (CONFIG_ENCODERS)