From: Naohiro KORIYAMA Date: Wed, 21 Dec 2011 08:02:09 +0000 (+0900) Subject: yadif : Add SSSE3 and SSE2 support. porting from FFmpeg. X-Git-Tag: 2.1.0-git~6603 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=5c7c27cae5ca69415f1863fd322a6da35f5d3ad8;p=vlc yadif : Add SSSE3 and SSE2 support. porting from FFmpeg. Signed-off-by: Jean-Baptiste Kempf --- diff --git a/modules/video_filter/deinterlace/algo_yadif.c b/modules/video_filter/deinterlace/algo_yadif.c index bf2f097c3f..0a9efdbba2 100644 --- a/modules/video_filter/deinterlace/algo_yadif.c +++ b/modules/video_filter/deinterlace/algo_yadif.c @@ -1,5 +1,5 @@ /***************************************************************************** - * algo_yadif.c : Wrapper for MPlayer's Yadif algorithm + * algo_yadif.c : Wrapper for FFmpeg's Yadif algorithm ***************************************************************************** * Copyright (C) 2000-2011 the VideoLAN team * $Id$ @@ -26,10 +26,6 @@ # include "config.h" #endif -#ifdef CAN_COMPILE_MMXEXT -# include "mmx.h" -#endif - #include #include @@ -47,23 +43,7 @@ * Yadif (Yet Another DeInterlacing Filter). *****************************************************************************/ -/* Yadif's private data struct */ -struct vf_priv_s { - /* - * 0: Output 1 frame for each frame. - * 1: Output 1 frame for each field. - * 2: Like 0 but skips spatial interlacing check. - * 3: Like 1 but skips spatial interlacing check. - * - * In vlc, only & 0x02 has meaning, as we do the & 0x01 ourself. - */ - int mode; -}; - -/* I am unsure it is the right one */ -typedef intptr_t x86_reg; - -/* yadif.h comes from vf_yadif.c of mplayer project. +/* yadif.h comes from yadif.c of FFmpeg project. Necessary preprocessor macros are defined in common.h. */ #include "yadif.h" @@ -125,15 +105,22 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, if( p_prev && p_cur && p_next ) { /* */ - void (*filter)(struct vf_priv_s *p, uint8_t *dst, - uint8_t *prev, uint8_t *cur, uint8_t *next, - int w, int refs, int parity); + void (*filter)(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, + int w, int prefs, int mrefs, int parity, int mode); + + filter = yadif_filter_line_c; +#if defined(HAVE_YADIF_MMX) + if( vlc_CPU() & CPU_CAPABILITY_MMX ) + filter = yadif_filter_line_mmx; +#endif #if defined(HAVE_YADIF_SSE2) if( vlc_CPU() & CPU_CAPABILITY_SSE2 ) - filter = yadif_filter_line_mmx2; - else + filter = yadif_filter_line_sse2; +#endif +#if defined(HAVE_YADIF_SSSE3) + if( vlc_CPU() & CPU_CAPABILITY_SSSE3 ) + filter = yadif_filter_line_ssse3; #endif - filter = yadif_filter_line_c; for( int n = 0; n < p_dst->i_planes; n++ ) { @@ -151,19 +138,20 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, } else { - struct vf_priv_s cfg; + int mode; /* Spatial checks only when enough data */ - cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2; + mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2; assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch ); - filter( &cfg, - &dstp->p_pixels[y * dstp->i_pitch], + filter( &dstp->p_pixels[y * dstp->i_pitch], &prevp->p_pixels[y * prevp->i_pitch], &curp->p_pixels[y * curp->i_pitch], &nextp->p_pixels[y * nextp->i_pitch], dstp->i_visible_pitch, - curp->i_pitch, - yadif_parity ); + y < dstp->i_visible_lines - 2 ? curp->i_pitch : -curp->i_pitch, + y - 1 ? -curp->i_pitch : curp->i_pitch, + yadif_parity, + mode ); } /* We duplicate the first and last lines */ diff --git a/modules/video_filter/deinterlace/yadif.h b/modules/video_filter/deinterlace/yadif.h index 260fa65bbd..a2fccac67b 100644 --- a/modules/video_filter/deinterlace/yadif.h +++ b/modules/video_filter/deinterlace/yadif.h @@ -1,274 +1,118 @@ /* * Copyright (C) 2006 Michael Niedermayer * - * This file is part of MPlayer. + * This file is part of FFmpeg. * - * MPlayer is free software; you can redistribute it and/or modify + * FFmpeg is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * - * MPlayer is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along - * with MPlayer; if not, write to the Free Software Foundation, Inc., + * with FFmpeg; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ -/* */ -#if defined(CAN_COMPILE_SSE2) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ > 0)) - -#define HAVE_YADIF_SSE2 - -#define LOAD4(mem,dst) \ - "movd "mem", "#dst" \n\t"\ - "punpcklbw %%mm7, "#dst" \n\t" - -#define PABS(tmp,dst) \ - "pxor "#tmp", "#tmp" \n\t"\ - "psubw "#dst", "#tmp" \n\t"\ - "pmaxsw "#tmp", "#dst" \n\t" - -#define CHECK(pj,mj) \ - "movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\ - "movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\ - "movq %%mm2, %%mm4 \n\t"\ - "movq %%mm2, %%mm5 \n\t"\ - "pxor %%mm3, %%mm4 \n\t"\ - "pavgb %%mm3, %%mm5 \n\t"\ - "pand %[pb1], %%mm4 \n\t"\ - "psubusb %%mm4, %%mm5 \n\t"\ - "psrlq $8, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\ - "movq %%mm2, %%mm4 \n\t"\ - "psubusb %%mm3, %%mm2 \n\t"\ - "psubusb %%mm4, %%mm3 \n\t"\ - "pmaxub %%mm3, %%mm2 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\ - "psrlq $8, %%mm3 \n\t" /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\ - "psrlq $16, %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm4, %%mm2 \n\t" /* score */ - -#define CHECK1 \ - "movq %%mm0, %%mm3 \n\t"\ - "pcmpgtw %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\ - "pminsw %%mm2, %%mm0 \n\t" /* spatial_score= score; */\ - "movq %%mm3, %%mm6 \n\t"\ - "pand %%mm3, %%mm5 \n\t"\ - "pandn %%mm1, %%mm3 \n\t"\ - "por %%mm5, %%mm3 \n\t"\ - "movq %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */ - -#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\ - hurts both quality and speed, but matches the C version. */\ - "paddw %[pw1], %%mm6 \n\t"\ - "psllw $14, %%mm6 \n\t"\ - "paddsw %%mm6, %%mm2 \n\t"\ - "movq %%mm0, %%mm3 \n\t"\ - "pcmpgtw %%mm2, %%mm3 \n\t"\ - "pminsw %%mm2, %%mm0 \n\t"\ - "pand %%mm3, %%mm5 \n\t"\ - "pandn %%mm1, %%mm3 \n\t"\ - "por %%mm5, %%mm3 \n\t"\ - "movq %%mm3, %%mm1 \n\t" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif -static void yadif_filter_line_mmx2(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){ - static const uint64_t pw_1 = 0x0001000100010001ULL; - static const uint64_t pb_1 = 0x0101010101010101ULL; - const int mode = p->mode; - uint64_t tmp0, tmp1, tmp2, tmp3; - int x; +#if defined(__GNUC__) +# define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v +# if VLC_GCC_VERSION(3,1) +# define DECLARE_ASM_CONST(n,t,v) static const t __attribute__((used)) __attribute__ ((aligned (n))) v +# else +# define DECLARE_ASM_CONST(n,t,v) static const t __attribute__ ((aligned (n))) v +# endif +#endif -#define FILTER\ - for(x=0; x>1 */\ - "movq %%mm0, %[tmp0] \n\t" /* c */\ - "movq %%mm3, %[tmp1] \n\t" /* d */\ - "movq %%mm1, %[tmp2] \n\t" /* e */\ - "psubw %%mm4, %%mm2 \n\t"\ - PABS( %%mm4, %%mm2) /* temporal_diff0 */\ - LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\ - LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\ - "psubw %%mm0, %%mm3 \n\t"\ - "psubw %%mm1, %%mm4 \n\t"\ - PABS( %%mm5, %%mm3)\ - PABS( %%mm5, %%mm4)\ - "paddw %%mm4, %%mm3 \n\t" /* temporal_diff1 */\ - "psrlw $1, %%mm2 \n\t"\ - "psrlw $1, %%mm3 \n\t"\ - "pmaxsw %%mm3, %%mm2 \n\t"\ - LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\ - LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\ - "psubw %%mm0, %%mm3 \n\t"\ - "psubw %%mm1, %%mm4 \n\t"\ - PABS( %%mm5, %%mm3)\ - PABS( %%mm5, %%mm4)\ - "paddw %%mm4, %%mm3 \n\t" /* temporal_diff2 */\ - "psrlw $1, %%mm3 \n\t"\ - "pmaxsw %%mm3, %%mm2 \n\t"\ - "movq %%mm2, %[tmp3] \n\t" /* diff */\ -\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm0, %%mm0 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psrlw $1, %%mm1 \n\t" /* spatial_pred */\ - PABS( %%mm2, %%mm0) /* ABS(c-e) */\ -\ - "movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\ - "movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\ - "movq %%mm2, %%mm4 \n\t"\ - "psubusb %%mm3, %%mm2 \n\t"\ - "psubusb %%mm4, %%mm3 \n\t"\ - "pmaxub %%mm3, %%mm2 \n\t"\ - "pshufw $9,%%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ - "punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psubw %[pw1], %%mm0 \n\t" /* spatial_score */\ -\ - CHECK(-2,0)\ - CHECK1\ - CHECK(-3,1)\ - CHECK2\ - CHECK(0,-2)\ - CHECK1\ - CHECK(1,-3)\ - CHECK2\ -\ - /* if(p->mode<2) ... */\ - "movq %[tmp3], %%mm6 \n\t" /* diff */\ - "cmpl $2, %[mode] \n\t"\ - "jge 1f \n\t"\ - LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\ - LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\ - LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\ - LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm3 \n\t"\ - "psrlw $1, %%mm2 \n\t" /* b */\ - "psrlw $1, %%mm3 \n\t" /* f */\ - "movq %[tmp0], %%mm4 \n\t" /* c */\ - "movq %[tmp1], %%mm5 \n\t" /* d */\ - "movq %[tmp2], %%mm7 \n\t" /* e */\ - "psubw %%mm4, %%mm2 \n\t" /* b-c */\ - "psubw %%mm7, %%mm3 \n\t" /* f-e */\ - "movq %%mm5, %%mm0 \n\t"\ - "psubw %%mm4, %%mm5 \n\t" /* d-c */\ - "psubw %%mm7, %%mm0 \n\t" /* d-e */\ - "movq %%mm2, %%mm4 \n\t"\ - "pminsw %%mm3, %%mm2 \n\t"\ - "pmaxsw %%mm4, %%mm3 \n\t"\ - "pmaxsw %%mm5, %%mm2 \n\t"\ - "pminsw %%mm5, %%mm3 \n\t"\ - "pmaxsw %%mm0, %%mm2 \n\t" /* max */\ - "pminsw %%mm0, %%mm3 \n\t" /* min */\ - "pxor %%mm4, %%mm4 \n\t"\ - "pmaxsw %%mm3, %%mm6 \n\t"\ - "psubw %%mm2, %%mm4 \n\t" /* -max */\ - "pmaxsw %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\ - "1: \n\t"\ -\ - "movq %[tmp1], %%mm2 \n\t" /* d */\ - "movq %%mm2, %%mm3 \n\t"\ - "psubw %%mm6, %%mm2 \n\t" /* d-diff */\ - "paddw %%mm6, %%mm3 \n\t" /* d+diff */\ - "pmaxsw %%mm2, %%mm1 \n\t"\ - "pminsw %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\ - "packuswb %%mm1, %%mm1 \n\t"\ -\ - :[tmp0]"=m"(tmp0),\ - [tmp1]"=m"(tmp1),\ - [tmp2]"=m"(tmp2),\ - [tmp3]"=m"(tmp3)\ - :[prev] "r"(prev),\ - [cur] "r"(cur),\ - [next] "r"(next),\ - [prefs]"r"((x86_reg)refs),\ - [mrefs]"r"((x86_reg)-refs),\ - [pw1] "m"(pw_1),\ - [pb1] "m"(pb_1),\ - [mode] "g"(mode)\ - );\ - __asm__ volatile("movd %%mm1, %0" :"=m"(*dst));\ - dst += 4;\ - prev+= 4;\ - cur += 4;\ - next+= 4;\ - } +typedef intptr_t x86_reg; +typedef struct { uint64_t a, b; } xmm_reg; + +DECLARE_ASM_CONST(16, const xmm_reg, pb_1) = {0x0101010101010101ULL, 0x0101010101010101ULL}; +DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 0x0001000100010001ULL}; + + +#ifdef CAN_COMPILE_SSSE3 +#if defined(__SSE__) || VLC_GCC_VERSION(4, 4) +// ================ SSSE3 ================= +#define HAVE_YADIF_SSSE3 +#define COMPILE_TEMPLATE_SSE 1 +#define COMPILE_TEMPLATE_SSSE3 1 +#define VLC_TARGET VLC_SSE +#define RENAME(a) a ## _ssse3 +#include "yadif_template.h" +#undef COMPILE_TEMPLATE_SSE +#undef COMPILE_TEMPLATE_SSSE3 +#undef VLC_TARGET +#undef RENAME +#endif +#endif - if(parity){ -#define prev2 "prev" -#define next2 "cur" - FILTER -#undef prev2 -#undef next2 - }else{ -#define prev2 "cur" -#define next2 "next" - FILTER -#undef prev2 -#undef next2 - } -} -#undef LOAD4 -#undef PABS -#undef CHECK -#undef CHECK1 -#undef CHECK2 -#undef FILTER +#ifdef CAN_COMPILE_SSE2 +#if defined(__SSE__) || VLC_GCC_VERSION(4, 4) +// ================= SSE2 ================= +#define HAVE_YADIF_SSE2 +#define COMPILE_TEMPLATE_SSE 1 +#define VLC_TARGET VLC_SSE +#define RENAME(a) a ## _sse2 +#include "yadif_template.h" +#undef COMPILE_TEMPLATE_SSE +#undef VLC_TARGET +#undef RENAME +#endif +#endif +#ifdef CAN_COMPILE_MMX +#if defined(__MMX__) || VLC_GCC_VERSION(4, 4) +// ================ MMX ================= +#define HAVE_YADIF_MMX +#define VLC_TARGET VLC_MMX +#define RENAME(a) a ## _mmx +#include "yadif_template.h" +#undef VLC_TARGET +#undef RENAME +#endif #endif -static void yadif_filter_line_c(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){ +static void yadif_filter_line_c(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) { int x; uint8_t *prev2= parity ? prev : cur ; uint8_t *next2= parity ? cur : next; for(x=0; x>1; - int e= cur[+refs]; + int e= cur[prefs]; int temporal_diff0= FFABS(prev2[0] - next2[0]); - int temporal_diff1=( FFABS(prev[-refs] - c) + FFABS(prev[+refs] - e) )>>1; - int temporal_diff2=( FFABS(next[-refs] - c) + FFABS(next[+refs] - e) )>>1; + int temporal_diff1=( FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e) )>>1; + int temporal_diff2=( FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; int diff= FFMAX3(temporal_diff0>>1, temporal_diff1, temporal_diff2); int spatial_pred= (c+e)>>1; - int spatial_score= FFABS(cur[-refs-1] - cur[+refs-1]) + FFABS(c-e) - + FFABS(cur[-refs+1] - cur[+refs+1]) - 1; + int spatial_score= FFABS(cur[mrefs-1] - cur[prefs-1]) + FFABS(c-e) + + FFABS(cur[mrefs+1] - cur[prefs+1]) - 1; #define CHECK(j)\ - { int score= FFABS(cur[-refs-1+j] - cur[+refs-1-j])\ - + FFABS(cur[-refs +j] - cur[+refs -j])\ - + FFABS(cur[-refs+1+j] - cur[+refs+1-j]);\ + { int score= FFABS(cur[mrefs-1+j] - cur[prefs-1-j])\ + + FFABS(cur[mrefs +j] - cur[prefs -j])\ + + FFABS(cur[mrefs+1+j] - cur[prefs+1-j]);\ if(score < spatial_score){\ spatial_score= score;\ - spatial_pred= (cur[-refs +j] + cur[+refs -j])>>1;\ + spatial_pred= (cur[mrefs +j] + cur[prefs -j])>>1;\ CHECK(-1) CHECK(-2) }} }} CHECK( 1) CHECK( 2) }} }} - if(p->mode<2){ - int b= (prev2[-2*refs] + next2[-2*refs])>>1; - int f= (prev2[+2*refs] + next2[+2*refs])>>1; + if(mode<2){ + int b= (prev2[2*mrefs] + next2[2*mrefs])>>1; + int f= (prev2[2*prefs] + next2[2*prefs])>>1; #if 0 - int a= cur[-3*refs]; - int g= cur[+3*refs]; + int a= cur[3*mrefs]; + int g= cur[3*prefs]; int max= FFMAX3(d-e, d-c, FFMIN3(FFMAX(b-c,f-e),FFMAX(b-c,b-a),FFMAX(f-g,f-e)) ); int min= FFMIN3(d-e, d-c, FFMAX3(FFMIN(b-c,f-e),FFMIN(b-c,b-a),FFMIN(f-g,f-e)) ); #else diff --git a/modules/video_filter/deinterlace/yadif_template.h b/modules/video_filter/deinterlace/yadif_template.h new file mode 100644 index 0000000000..146b301b0b --- /dev/null +++ b/modules/video_filter/deinterlace/yadif_template.h @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifdef COMPILE_TEMPLATE_SSE +#define REGMM "xmm" +#define MM "%%"REGMM +#define MOV "movq" +#define MOVQ "movdqa" +#define MOVQU "movdqu" +#define STEP 8 +#define LOAD(mem,dst) \ + MOV" "mem", "dst" \n\t"\ + "punpcklbw "MM"7, "dst" \n\t" +#define PSRL1(reg) "psrldq $1, "reg" \n\t" +#define PSRL2(reg) "psrldq $2, "reg" \n\t" +#define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\ + "psrldq $2, "src" \n\t" +#else +#define REGMM "mm" +#define MM "%%"REGMM +#define MOV "movd" +#define MOVQ "movq" +#define MOVQU "movq" +#define STEP 4 +#define LOAD(mem,dst) \ + MOV" "mem", "dst" \n\t"\ + "punpcklbw "MM"7, "dst" \n\t" +#define PSRL1(reg) "psrlq $8, "reg" \n\t" +#define PSRL2(reg) "psrlq $16, "reg" \n\t" +#define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t" +#endif + +#ifdef COMPILE_TEMPLATE_SSSE3 +#define PABS(tmp,dst) \ + "pabsw "dst", "dst" \n\t" +#else +#define PABS(tmp,dst) \ + "pxor "tmp", "tmp" \n\t"\ + "psubw "dst", "tmp" \n\t"\ + "pmaxsw "tmp", "dst" \n\t" +#endif + + +#define CHECK(pj,mj) \ + MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\ + MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\ + MOVQ" "MM"2, "MM"4 \n\t"\ + MOVQ" "MM"2, "MM"5 \n\t"\ + "pxor "MM"3, "MM"4 \n\t"\ + "pavgb "MM"3, "MM"5 \n\t"\ + "pand %[pb_1], "MM"4 \n\t"\ + "psubusb "MM"4, "MM"5 \n\t"\ + PSRL1(MM"5") \ + "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\ + MOVQ" "MM"2, "MM"4 \n\t"\ + "psubusb "MM"3, "MM"2 \n\t"\ + "psubusb "MM"4, "MM"3 \n\t"\ + "pmaxub "MM"3, "MM"2 \n\t"\ + MOVQ" "MM"2, "MM"3 \n\t"\ + MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\ + PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\ + PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\ + "punpcklbw "MM"7, "MM"2 \n\t"\ + "punpcklbw "MM"7, "MM"3 \n\t"\ + "punpcklbw "MM"7, "MM"4 \n\t"\ + "paddw "MM"3, "MM"2 \n\t"\ + "paddw "MM"4, "MM"2 \n\t" /* score */ + +#define CHECK1 \ + MOVQ" "MM"0, "MM"3 \n\t"\ + "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\ + "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\ + MOVQ" "MM"3, "MM"6 \n\t"\ + "pand "MM"3, "MM"5 \n\t"\ + "pandn "MM"1, "MM"3 \n\t"\ + "por "MM"5, "MM"3 \n\t"\ + MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */ + +#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\ + hurts both quality and speed, but matches the C version. */\ + "paddw %[pw_1], "MM"6 \n\t"\ + "psllw $14, "MM"6 \n\t"\ + "paddsw "MM"6, "MM"2 \n\t"\ + MOVQ" "MM"0, "MM"3 \n\t"\ + "pcmpgtw "MM"2, "MM"3 \n\t"\ + "pminsw "MM"2, "MM"0 \n\t"\ + "pand "MM"3, "MM"5 \n\t"\ + "pandn "MM"1, "MM"3 \n\t"\ + "por "MM"5, "MM"3 \n\t"\ + MOVQ" "MM"3, "MM"1 \n\t" + +VLC_TARGET static void RENAME(yadif_filter_line)(uint8_t *dst, + uint8_t *prev, uint8_t *cur, uint8_t *next, + int w, int prefs, int mrefs, int parity, int mode) +{ + DECLARE_ALIGNED(16, uint8_t, tmp0)[16]; + DECLARE_ALIGNED(16, uint8_t, tmp1)[16]; + DECLARE_ALIGNED(16, uint8_t, tmp2)[16]; + DECLARE_ALIGNED(16, uint8_t, tmp3)[16]; + int x; + +#define FILTER\ + for(x=0; x>1 */\ + MOVQ" "MM"0, %[tmp0] \n\t" /* c */\ + MOVQ" "MM"3, %[tmp1] \n\t" /* d */\ + MOVQ" "MM"1, %[tmp2] \n\t" /* e */\ + "psubw "MM"4, "MM"2 \n\t"\ + PABS( MM"4", MM"2") /* temporal_diff0 */\ + LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\ + LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\ + "psubw "MM"0, "MM"3 \n\t"\ + "psubw "MM"1, "MM"4 \n\t"\ + PABS( MM"5", MM"3")\ + PABS( MM"5", MM"4")\ + "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\ + "psrlw $1, "MM"2 \n\t"\ + "psrlw $1, "MM"3 \n\t"\ + "pmaxsw "MM"3, "MM"2 \n\t"\ + LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\ + LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\ + "psubw "MM"0, "MM"3 \n\t"\ + "psubw "MM"1, "MM"4 \n\t"\ + PABS( MM"5", MM"3")\ + PABS( MM"5", MM"4")\ + "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\ + "psrlw $1, "MM"3 \n\t"\ + "pmaxsw "MM"3, "MM"2 \n\t"\ + MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\ +\ + "paddw "MM"0, "MM"1 \n\t"\ + "paddw "MM"0, "MM"0 \n\t"\ + "psubw "MM"1, "MM"0 \n\t"\ + "psrlw $1, "MM"1 \n\t" /* spatial_pred */\ + PABS( MM"2", MM"0") /* ABS(c-e) */\ +\ + MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\ + MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\ + MOVQ" "MM"2, "MM"4 \n\t"\ + "psubusb "MM"3, "MM"2 \n\t"\ + "psubusb "MM"4, "MM"3 \n\t"\ + "pmaxub "MM"3, "MM"2 \n\t"\ + PSHUF(MM"3", MM"2") \ + "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ + "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ + "paddw "MM"2, "MM"0 \n\t"\ + "paddw "MM"3, "MM"0 \n\t"\ + "psubw %[pw_1], "MM"0 \n\t" /* spatial_score */\ +\ + CHECK(-2,0)\ + CHECK1\ + CHECK(-3,1)\ + CHECK2\ + CHECK(0,-2)\ + CHECK1\ + CHECK(1,-3)\ + CHECK2\ +\ + /* if(p->mode<2) ... */\ + MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\ + "cmpl $2, %[mode] \n\t"\ + "jge 1f \n\t"\ + LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\ + LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\ + LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\ + LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\ + "paddw "MM"4, "MM"2 \n\t"\ + "paddw "MM"5, "MM"3 \n\t"\ + "psrlw $1, "MM"2 \n\t" /* b */\ + "psrlw $1, "MM"3 \n\t" /* f */\ + MOVQ" %[tmp0], "MM"4 \n\t" /* c */\ + MOVQ" %[tmp1], "MM"5 \n\t" /* d */\ + MOVQ" %[tmp2], "MM"7 \n\t" /* e */\ + "psubw "MM"4, "MM"2 \n\t" /* b-c */\ + "psubw "MM"7, "MM"3 \n\t" /* f-e */\ + MOVQ" "MM"5, "MM"0 \n\t"\ + "psubw "MM"4, "MM"5 \n\t" /* d-c */\ + "psubw "MM"7, "MM"0 \n\t" /* d-e */\ + MOVQ" "MM"2, "MM"4 \n\t"\ + "pminsw "MM"3, "MM"2 \n\t"\ + "pmaxsw "MM"4, "MM"3 \n\t"\ + "pmaxsw "MM"5, "MM"2 \n\t"\ + "pminsw "MM"5, "MM"3 \n\t"\ + "pmaxsw "MM"0, "MM"2 \n\t" /* max */\ + "pminsw "MM"0, "MM"3 \n\t" /* min */\ + "pxor "MM"4, "MM"4 \n\t"\ + "pmaxsw "MM"3, "MM"6 \n\t"\ + "psubw "MM"2, "MM"4 \n\t" /* -max */\ + "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\ + "1: \n\t"\ +\ + MOVQ" %[tmp1], "MM"2 \n\t" /* d */\ + MOVQ" "MM"2, "MM"3 \n\t"\ + "psubw "MM"6, "MM"2 \n\t" /* d-diff */\ + "paddw "MM"6, "MM"3 \n\t" /* d+diff */\ + "pmaxsw "MM"2, "MM"1 \n\t"\ + "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\ + "packuswb "MM"1, "MM"1 \n\t"\ +\ + :[tmp0]"=m"(tmp0),\ + [tmp1]"=m"(tmp1),\ + [tmp2]"=m"(tmp2),\ + [tmp3]"=m"(tmp3)\ + :[prev] "r"(prev),\ + [cur] "r"(cur),\ + [next] "r"(next),\ + [prefs]"r"((x86_reg)prefs),\ + [mrefs]"r"((x86_reg)mrefs),\ + [pw_1] "m"(pw_1),\ + [pb_1] "m"(pb_1),\ + [mode] "g"(mode)\ + :REGMM"0",REGMM"1",REGMM"2",REGMM"3",REGMM"4",REGMM"5",REGMM"6",REGMM"7"\ + );\ + __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\ + dst += STEP;\ + prev+= STEP;\ + cur += STEP;\ + next+= STEP;\ + } + + if (parity) { +#define prev2 "prev" +#define next2 "cur" + FILTER +#undef prev2 +#undef next2 + } else { +#define prev2 "cur" +#define next2 "next" + FILTER +#undef prev2 +#undef next2 + } +} +#undef STEP +#undef REGMM +#undef MM +#undef MOV +#undef MOVQ +#undef MOVQU +#undef PSHUF +#undef PSRL1 +#undef PSRL2 +#undef LOAD +#undef PABS +#undef CHECK +#undef CHECK1 +#undef CHECK2 +#undef FILTER +