X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fdsputil_mmx.c;h=e34b95b0daf3f675bac82aa72f652ac03d39ff6f;hb=88b97357531bca25c22c351d5cac36e168b70803;hp=16ad965ec17018506f2d6af685231bc31270adb4;hpb=12802ec0601c3bd7b9c7a2503518e28fd5e7d744;p=ffmpeg

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 16ad965ec17..e34b95b0daf 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of FFmpeg.
+ * This file is part of Libav.
  *
- * FFmpeg is free software; you can redistribute it and/or
+ * Libav is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * FFmpeg is distributed in the hope that it will be useful,
+ * Libav is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
+ * License along with Libav; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev
@@ -42,7 +42,8 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 
 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
 {0x8000000080000000ULL, 0x8000000080000000ULL};
 
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1  ) = 0x0001000100010001ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1  ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
@@ -63,6 +64,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x00400
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};
 
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
@@ -286,9 +289,6 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_s
             :"memory");
 }
 
-DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
 #define put_signed_pixels_clamped_mmx_half(off) \
             "movq "#off"(%2), %%mm1 \n\t"\
             "movq 16+"#off"(%2), %%mm2 \n\t"\
@@ -313,7 +313,7 @@ void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int
     x86_reg line_skip3;
 
     __asm__ volatile (
-            "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
+            "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
            "lea (%3, %3, 2), %1 \n\t"
            put_signed_pixels_clamped_mmx_half(0)
            "lea (%0, %3, 4), %0 \n\t"
@@ -458,12 +458,12 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
         "movdqu (%1,%3), %%xmm1 \n\t"
         "movdqu (%1,%3,2), %%xmm2 \n\t"
         "movdqu (%1,%4), %%xmm3 \n\t"
+        "lea (%1,%3,4), %1 \n\t"
         "movdqa %%xmm0, (%2) \n\t"
         "movdqa %%xmm1, (%2,%3) \n\t"
         "movdqa %%xmm2, (%2,%3,2) \n\t"
         "movdqa %%xmm3, (%2,%4) \n\t"
         "subl $4, %0 \n\t"
-        "lea (%1,%3,4), %1 \n\t"
         "lea (%2,%3,4), %2 \n\t"
         "jnz 1b \n\t"
         : "+g"(h), "+r" (pixels), "+r" (block)
@@ -480,6 +480,7 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
         "movdqu (%1,%3), %%xmm1 \n\t"
         "movdqu (%1,%3,2), %%xmm2 \n\t"
         "movdqu (%1,%4), %%xmm3 \n\t"
+        "lea (%1,%3,4), %1 \n\t"
         "pavgb (%2), %%xmm0 \n\t"
         "pavgb (%2,%3), %%xmm1 \n\t"
         "pavgb (%2,%3,2), %%xmm2 \n\t"
@@ -489,7 +490,6 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
         "movdqa %%xmm2, (%2,%3,2) \n\t"
         "movdqa %%xmm3, (%2,%4) \n\t"
         "subl $4, %0 \n\t"
-        "lea (%1,%3,4), %1 \n\t"
         "lea (%2,%3,4), %2 \n\t"
         "jnz 1b \n\t"
         : "+g"(h), "+r" (pixels), "+r" (block)
@@ -604,7 +604,7 @@ static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
         dst[i] = src1[i] + src2[i];
 }
 
-#if HAVE_7REGS && HAVE_TEN_OPERANDS
+#if HAVE_7REGS
 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
     x86_reg w2 = -w;
     x86_reg x;
@@ -786,7 +786,7 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 
 /* draw the edges of width 'w' of an image of size width, height
    this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
 {
     uint8_t *ptr, *last_line;
     int i;
@@ -839,36 +839,43 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w
             );
     }
 
-    for(i=0;i<w;i+=4)
-    {
-        ptr= buf - (i + 1) * wrap - w;
-        memcpy(ptr, ptr + wrap, width+w+w);
-    }
+    if (sides&EDGE_TOP) {
+        for(i = 0; i < h; i += 4) {
+            ptr= buf - (i + 1) * wrap - w;
+            memcpy(ptr, ptr + wrap, width+w+w);
+        }
+    }
 
-    for(i=0;i<w;i+=4)
-    {
-        ptr= last_line + (i + 1) * wrap - w;
-        memcpy(ptr, ptr - wrap, width+w+w);
-    }
+    if (sides&EDGE_BOTTOM) {
+        for(i = 0; i < h; i += 4) {
+            ptr= last_line + (i + 1) * wrap - w;
+            memcpy(ptr, ptr - wrap, width+w+w);
+        }
+    }
 }
@@ -2392,5 +2424,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
+    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+    const int bit_depth = avctx->bits_per_raw_sample;
 
     if (avctx->dsp_mask) {
         if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
@@ -2426,7 +2460,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
     if (mm_flags & AV_CPU_FLAG_MMX) {
         const int idct_algo= avctx->idct_algo;
 
-        if(avctx->lowres==0){
+        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
             if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                 c->idct_put= ff_simple_idct_put_mmx;
                 c->idct_add= ff_simple_idct_add_mmx;
@@ -2481,6 +2515,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
         c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
         c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
+        if (!high_bit_depth) {
         c->clear_block  = clear_block_mmx;
         c->clear_blocks = clear_blocks_mmx;
         if ((mm_flags & AV_CPU_FLAG_SSE) &&
@@ -2489,6 +2524,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->clear_block  = clear_block_sse;
             c->clear_blocks = clear_blocks_sse;
         }
+        }
 
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
@@ -2496,6 +2532,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
 
+        if (!high_bit_depth) {
         SET_HPEL_FUNCS(put, 0, 16, mmx);
         SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
         SET_HPEL_FUNCS(avg, 0, 16, mmx);
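Note: SET_HPEL_FUNCS above is pure token pasting, so each invocation fills one
row of a half-pel function-pointer table. As a worked example (expansion
derived from the macro body above, not copied from the tree),
SET_HPEL_FUNCS(put, 0, 16, mmx) is equivalent to:

    c->put_pixels_tab[0][0] = put_pixels16_mmx;
    c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

The new !high_bit_depth guard keeps these 8-bit kernels from being installed
when bits_per_raw_sample is above 8, so the generic code set up earlier stays
in place for high-bit-depth decoding.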
@@ -2504,17 +2541,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
         SET_HPEL_FUNCS(avg, 1, 8, mmx);
         SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
+        }
 
 #if ARCH_X86_32 || !HAVE_YASM
         c->gmc= gmc_mmx;
 #endif
 
 #if ARCH_X86_32 && HAVE_YASM
+        if (!high_bit_depth)
         c->emulated_edge_mc = emulated_edge_mc_mmx;
 #endif
 
         c->add_bytes= add_bytes_mmx;
         c->add_bytes_l2= add_bytes_l2_mmx;
 
+        if (!high_bit_depth)
         c->draw_edges = draw_edges_mmx;
 
         if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
@@ -2523,16 +2563,18 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
 
 #if HAVE_YASM
+        if (!high_bit_depth && CONFIG_H264CHROMA) {
         c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
         c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
+        }
 
-        c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
-        c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
+        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->prefetch = prefetch_mmx2;
 
+            if (!high_bit_depth) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
             c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
@@ -2546,14 +2588,17 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
+            }
 
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+                if (!high_bit_depth) {
                 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+                }
 
             if (CONFIG_VP3_DECODER && HAVE_YASM) {
                 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
@@ -2570,63 +2615,83 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
             }
 
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
-    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
-
-            SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
-            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
-            SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
-
-            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
-            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
-            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
-            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
-
-            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
-            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
-            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
-
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
+    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
+    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
+
+            SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
+            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
+            SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
+
+            if (!high_bit_depth) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
+            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
+            }
+            else if (bit_depth == 10) {
 #if HAVE_YASM
-            c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
-            c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
-
+#if !ARCH_X86_64
+                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
+                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
+                SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
+                SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
+#endif
+                SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
+                SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
+#endif
+            }
+
+            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
+            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
+
+#if HAVE_YASM
+            if (!high_bit_depth && CONFIG_H264CHROMA) {
             c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
             c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
             c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
             c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
+            }
+            if (bit_depth == 10 && CONFIG_H264CHROMA) {
+                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
+                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
+                c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
+                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
+            }
 
             c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
 #endif
-#if HAVE_7REGS && HAVE_TEN_OPERANDS
-            if( mm_flags&AV_CPU_FLAG_3DNOW )
+#if HAVE_7REGS
+            if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
                 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
 #endif
 
             c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
-        } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
+        } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
            c->prefetch = prefetch_3dnow;
 
+            if (!high_bit_depth) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
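Note: the extra PREFIX parameter exists because the new 10-bit kernels are yasm
symbols carrying the library's ff_ prefix, while the 8-bit kernels generated
from C templates have no prefix (hence the empty last argument in the
mmx2/3dnow invocations). A worked expansion, derived from the macro definition
above: SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_) assigns entries 0
through 15, i.e.

    c->put_h264_qpel_pixels_tab[2][ 0] = ff_put_h264_qpel4_mc00_10_mmxext;
    c->put_h264_qpel_pixels_tab[2][ 1] = ff_put_h264_qpel4_mc10_10_mmxext;
    /* ... one assignment per quarter-pel position ... */
    c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_10_mmxext;

where index x + y*4 selects the (x, y) quarter-pel position.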
@@ -2649,6 +2714,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
             }
+            }
 
         if (CONFIG_VP3_DECODER
             && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
@@ -2656,31 +2722,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
         }
 
-            SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
-            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
-            SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
-
-            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
-            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
-            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
-            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
-
-            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
-            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
-            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
+            SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
+            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
+            SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
+
+            if (!high_bit_depth) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
+            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
+            }
+
+            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
+            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
 
 #if HAVE_YASM
+            if (!high_bit_depth && CONFIG_H264CHROMA) {
             c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
             c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
+            }
 
-            c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
-            c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
 #endif
     }
@@ -2692,27 +2760,53 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
     c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
 
     if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
         // these functions are slower than mmx on AMD, but faster on Intel
+        if (!high_bit_depth) {
         c->put_pixels_tab[0][0] = put_pixels16_sse2;
         c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
         c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
         H264_QPEL_FUNCS(0, 0, sse2);
+        }
     }
 
     if(mm_flags & AV_CPU_FLAG_SSE2){
-        H264_QPEL_FUNCS(0, 1, sse2);
-        H264_QPEL_FUNCS(0, 2, sse2);
-        H264_QPEL_FUNCS(0, 3, sse2);
-        H264_QPEL_FUNCS(1, 1, sse2);
-        H264_QPEL_FUNCS(1, 2, sse2);
-        H264_QPEL_FUNCS(1, 3, sse2);
-        H264_QPEL_FUNCS(2, 1, sse2);
-        H264_QPEL_FUNCS(2, 2, sse2);
-        H264_QPEL_FUNCS(2, 3, sse2);
-        H264_QPEL_FUNCS(3, 1, sse2);
-        H264_QPEL_FUNCS(3, 2, sse2);
-        H264_QPEL_FUNCS(3, 3, sse2);
+        if (!high_bit_depth) {
+            H264_QPEL_FUNCS(0, 1, sse2);
+            H264_QPEL_FUNCS(0, 2, sse2);
+            H264_QPEL_FUNCS(0, 3, sse2);
+            H264_QPEL_FUNCS(1, 1, sse2);
+            H264_QPEL_FUNCS(1, 2, sse2);
+            H264_QPEL_FUNCS(1, 3, sse2);
+            H264_QPEL_FUNCS(2, 1, sse2);
+            H264_QPEL_FUNCS(2, 2, sse2);
+            H264_QPEL_FUNCS(2, 3, sse2);
+            H264_QPEL_FUNCS(3, 1, sse2);
+            H264_QPEL_FUNCS(3, 2, sse2);
+            H264_QPEL_FUNCS(3, 3, sse2);
+        }
+#if HAVE_YASM
+#define H264_QPEL_FUNCS_10(x, y, CPU)\
+            c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
+            c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
+            c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
+            c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
+        if (bit_depth == 10) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
+            H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
+            H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
+            H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
+
+            if (CONFIG_H264CHROMA) {
+                c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
+                c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
+            }
+        }
+#endif
     }
 #if HAVE_SSSE3
     if(mm_flags & AV_CPU_FLAG_SSSE3){
+        if (!high_bit_depth) {
         H264_QPEL_FUNCS(1, 0, ssse3);
         H264_QPEL_FUNCS(1, 1, ssse3);
         H264_QPEL_FUNCS(1, 2, ssse3);
@@ -2725,12 +2819,22 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         H264_QPEL_FUNCS(3, 1, ssse3);
         H264_QPEL_FUNCS(3, 2, ssse3);
         H264_QPEL_FUNCS(3, 3, ssse3);
+        }
+#if HAVE_YASM
+        else if (bit_depth == 10) {
+            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
+            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
+            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
+        }
+#endif
         c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
 #if HAVE_YASM
+        if (!high_bit_depth && CONFIG_H264CHROMA) {
         c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
         c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
+        }
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
        if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
            c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
@@ -2738,11 +2842,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
     }
 #endif
 
-    if(mm_flags & AV_CPU_FLAG_3DNOW){
+    if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
         c->vector_fmul = vector_fmul_3dnow;
     }
-    if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+    if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
         c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
 #if HAVE_6REGS
         c->vector_fmul_window = vector_fmul_window_3dnow2;
@@ -2752,6 +2856,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
         c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
+        if (avctx->flags & CODEC_FLAG_BITEXACT) {
+            c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+        } else {
+            c->apply_window_int16 = ff_apply_window_int16_mmxext;
+        }
 #endif
     }
     if(mm_flags & AV_CPU_FLAG_SSE){
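Note: the mmxext/sse2/ssse3 apply_window_int16 variants wired up here come in
bit-exact "_ba" flavors selected under CODEC_FLAG_BITEXACT, plus Atom-specific
flavors selected via AV_CPU_FLAG_ATOM. The vector_clip_int32 hook registered
in this pass clamps an array of 32-bit samples; a minimal scalar model follows
(the prototype is assumed from the DSPContext of this era, so treat it as a
sketch rather than the tree's actual C fallback):

    #include <stdint.h>

    /* Clamp len values from src into [min, max] and store them in dst. */
    static void vector_clip_int32_c(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len)
    {
        unsigned int i;
        for (i = 0; i < len; i++) {
            int32_t v = src[i];
            dst[i]    = v < min ? min : v > max ? max : v;
        }
    }

The MMX/SSE2/SSE4 versions registered in the following hunk perform the same
clamping several samples per instruction.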
@@ -2766,58 +2875,71 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->vector_clipf = vector_clipf_sse;
 #if HAVE_YASM
         c->scalarproduct_float = ff_scalarproduct_float_sse;
+        c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
+
+        if (!high_bit_depth)
+            c->emulated_edge_mc = emulated_edge_mc_sse;
+        c->gmc = gmc_sse;
 #endif
     }
-    if(mm_flags & AV_CPU_FLAG_3DNOW)
+    if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
         c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
 
     if(mm_flags & AV_CPU_FLAG_SSE2){
 #if HAVE_YASM
         c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+        if (mm_flags & AV_CPU_FLAG_ATOM) {
+            c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
+        } else {
+            c->vector_clip_int32 = ff_vector_clip_int32_sse2;
+        }
+        if (avctx->flags & CODEC_FLAG_BITEXACT) {
+            c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+        } else {
+            if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+                c->apply_window_int16 = ff_apply_window_int16_sse2;
+            }
+        }
+#endif
+    }
+    if (mm_flags & AV_CPU_FLAG_SSSE3) {
+#if HAVE_YASM
+        if (mm_flags & AV_CPU_FLAG_ATOM) {
+            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
+        } else {
+            c->apply_window_int16 = ff_apply_window_int16_ssse3;
+        }
+        if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+        }
+#endif
+    }
 
-        c->emulated_edge_mc = emulated_edge_mc_sse;
-        c->gmc= gmc_sse;
+    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
+#if HAVE_YASM
+        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
 #endif
     }
-    if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+#if HAVE_AVX && HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_AVX) {
+        if (bit_depth == 10) {
+            //AVX implies !cache64.
+            //TODO: Port cache(32|64) detection from x264.
+            H264_QPEL_FUNCS_10(1, 0, sse2)
+            H264_QPEL_FUNCS_10(2, 0, sse2)
+            H264_QPEL_FUNCS_10(3, 0, sse2)
+
+            if (CONFIG_H264CHROMA) {
+                c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
+                c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
+            }
+        }
+        c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
+    }
+#endif
     }
 
     if (CONFIG_ENCODERS)
         dsputilenc_init_mmx(c, avctx);
-
-#if 0
-    // for speed testing
-    get_pixels = just_return;
-    put_pixels_clamped = just_return;
-    add_pixels_clamped = just_return;
-
-    pix_abs16x16 = just_return;
-    pix_abs16x16_x2 = just_return;
-    pix_abs16x16_y2 = just_return;
-    pix_abs16x16_xy2 = just_return;
-
-    put_pixels_tab[0] = just_return;
-    put_pixels_tab[1] = just_return;
-    put_pixels_tab[2] = just_return;
-    put_pixels_tab[3] = just_return;
-
-    put_no_rnd_pixels_tab[0] = just_return;
-    put_no_rnd_pixels_tab[1] = just_return;
-    put_no_rnd_pixels_tab[2] = just_return;
-    put_no_rnd_pixels_tab[3] = just_return;
-
-    avg_pixels_tab[0] = just_return;
-    avg_pixels_tab[1] = just_return;
-    avg_pixels_tab[2] = just_return;
-    avg_pixels_tab[3] = just_return;
-
-    avg_no_rnd_pixels_tab[0] = just_return;
-    avg_no_rnd_pixels_tab[1] = just_return;
-    avg_no_rnd_pixels_tab[2] = just_return;
-    avg_no_rnd_pixels_tab[3] = just_return;
-
-    //av_fdct = just_return;
-    //ff_idct = just_return;
-#endif
 }
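Note on the draw_edges_mmx change near the top of this diff: the function now
takes h (how many edge rows to generate) and sides (EDGE_TOP/EDGE_BOTTOM bit
flags), so callers can request only the borders they actually need instead of
always mirroring both. A rough scalar sketch of the gated top/bottom
replication, assuming the left/right columns have already been extended — an
illustration of the idea, not the MMX implementation:

    #include <stdint.h>
    #include <string.h>

    /* EDGE_TOP and EDGE_BOTTOM are libavcodec's edge-selection bit flags. */
    static void draw_edges_sketch(uint8_t *buf, int wrap, int width, int height,
                                  int w, int h, int sides)
    {
        uint8_t *last_line = buf + (height - 1) * wrap;
        int i;

        if (sides & EDGE_TOP)           /* replicate the first row upwards */
            for (i = 0; i < h; i++)
                memcpy(buf - (i + 1) * wrap - w, buf - w, width + 2 * w);
        if (sides & EDGE_BOTTOM)        /* replicate the last row downwards */
            for (i = 0; i < h; i++)
                memcpy(last_line + (i + 1) * wrap - w, last_line - w, width + 2 * w);
    }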