2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 #ifdef COMPILE_TEMPLATE_SSE /* helper strings for the 128-bit (movdqu/movdqa/psrldq) code path */
26 #define MOVQU "movdqu"
/* LOAD(mem,dst): move mem into dst using the MOV instruction string, then
 * interleave the low bytes of dst with register 7 (punpcklbw). */
28 #define LOAD(mem,dst) \
29 MOV" "mem", "dst" \n\t"\
30 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the whole register right by 1 or 2 bytes
 * (psrldq takes a byte count). */
31 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
32 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* PSHUF(src,dst): copy the register passed as dst into the register passed as
 * src, then shift src right by 2 bytes. Operands are written source-first, so
 * despite the parameter names it is `src` that gets overwritten. */
33 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
34 "psrldq $2, "src" \n\t"
/* Second definition of the same helpers, built from psrlq/pshufw.
 * LOAD is textually identical to the one above. */
42 #define LOAD(mem,dst) \
43 MOV" "mem", "dst" \n\t"\
44 "punpcklbw "MM"7, "dst" \n\t"
/* psrlq counts bits: 8 bits = 1 byte, 16 bits = 2 bytes. */
45 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
46 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* pshufw with selector 9 writes into src a word shuffle of dst in which
 * words 1 and 2 of dst land in the two lowest words; dst is left unchanged. */
47 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
50 #ifdef COMPILE_TEMPLATE_SSSE3 /* PABS(tmp,dst): dst = per-word absolute value of dst */
51 #define PABS(tmp,dst) \
52 "pabsw "dst", "dst" \n\t" /* single instruction; the tmp argument is unused */
/* Three-instruction form of PABS: tmp is overwritten and used as scratch. */
54 #define PABS(tmp,dst) \
55 "pxor "tmp", "tmp" \n\t" /* tmp = 0 */\
56 "psubw "dst", "tmp" \n\t" /* tmp = -dst (per 16-bit word) */\
57 "pmaxsw "tmp", "dst" \n\t" /* dst = max(dst, -dst), signed words */
/* CHECK(pj,mj): emit asm that loads two rows of cur at byte displacements pj
 * (relative to cur+mrefs) and mj (relative to cur+prefs), leaves their
 * per-pixel average as words in register 5 and a three-tap sum of absolute
 * differences ("score") as words in register 2.
 * The arguments are stringized (#pj, #mj) to form the displacement text.
 * Overwrites registers 2, 3, 4 and 5; reads register 7 as the interleave operand. */
61 #define CHECK(pj,mj) \
62 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
63 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
64 MOVQ" "MM"2, "MM"4 \n\t" /* reg4 = copy of upper row */\
65 MOVQ" "MM"2, "MM"5 \n\t" /* reg5 = copy of upper row */\
66 "pxor "MM"3, "MM"4 \n\t" /* reg4 = upper ^ lower */\
67 "pavgb "MM"3, "MM"5 \n\t" /* reg5 = byte average, rounded up */\
68 "pand %[pb_1], "MM"4 \n\t" /* mask the xor with pb_1 (presumably bytes of 1, i.e. the rounding bit - confirm) */\
69 "psubusb "MM"4, "MM"5 \n\t" /* subtract the masked bits from the average */\
71 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
72 MOVQ" "MM"2, "MM"4 \n\t" /* reg4 = copy of upper row again */\
73 "psubusb "MM"3, "MM"2 \n\t" /* reg2 = saturating upper - lower */\
74 "psubusb "MM"4, "MM"3 \n\t" /* reg3 = saturating lower - upper */\
75 "pmaxub "MM"3, "MM"2 \n\t" /* reg2 = |upper - lower| per byte */\
76 MOVQ" "MM"2, "MM"3 \n\t"\
77 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
78 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
79 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
80 "punpcklbw "MM"7, "MM"2 \n\t" /* interleave each of the three difference vectors with reg7 */\
81 "punpcklbw "MM"7, "MM"3 \n\t"\
82 "punpcklbw "MM"7, "MM"4 \n\t"\
83 "paddw "MM"3, "MM"2 \n\t" /* sum the three taps */\
84 "paddw "MM"4, "MM"2 \n\t" /* score */
87 MOVQ" "MM"0, "MM"3 \n\t" /* reg3 = copy of reg0 (the running spatial_score) */\
88 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
89 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
90 MOVQ" "MM"3, "MM"6 \n\t" /* keep a copy of the comparison mask in reg6 */\
91 "pand "MM"3, "MM"5 \n\t" /* reg5 = reg5 where the mask is set, 0 elsewhere */\
92 "pandn "MM"1, "MM"3 \n\t" /* reg3 = reg1 where the mask is clear, 0 elsewhere */\
93 "por "MM"5, "MM"3 \n\t" /* merge the two halves of the select */\
94 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
96 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
97 hurts both quality and speed, but matches the C version. */\
98 "paddw %[pw_1], "MM"6 \n\t" /* reg6 += pw_1; if pw_1 is words of 1 (confirm), an all-ones mask word becomes 0 and a zero word becomes 1 */\
99 "psllw $14, "MM"6 \n\t" /* shift each word left by 14 bits */\
100 "paddsw "MM"6, "MM"2 \n\t" /* saturating add of that penalty to the score in reg2 */\
101 MOVQ" "MM"0, "MM"3 \n\t" /* reg3 = copy of reg0 (the running minimum score) */\
102 "pcmpgtw "MM"2, "MM"3 \n\t" /* mask = reg0 > penalised score */\
103 "pminsw "MM"2, "MM"0 \n\t" /* reg0 = min(reg0, penalised score) */\
104 "pand "MM"3, "MM"5 \n\t" /* reg5 = reg5 where the mask is set, 0 elsewhere */\
105 "pandn "MM"1, "MM"3 \n\t" /* reg3 = reg1 where the mask is clear, 0 elsewhere */\
106 "por "MM"5, "MM"3 \n\t" /* merge the two halves of the select */\
107 MOVQ" "MM"3, "MM"1 \n\t" /* reg1 = selected value; unlike the sequence above, the mask is not saved to reg6 */
109 #if defined(__MINGW32__) && defined(WIN32) && !defined(WIN64)
110 __attribute__((__force_align_arg_pointer__))
112 VLC_TARGET static void RENAME(yadif_filter_line)(uint8_t *dst,
113 uint8_t *prev, uint8_t *cur, uint8_t *next,
114 int w, int prefs, int mrefs, int parity, int mode)
116 DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
117 DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
118 DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
119 DECLARE_ALIGNED(16, uint8_t, tmp3[16]);
123 for(x=0; x<w; x+=STEP){\
125 "pxor "MM"7, "MM"7 \n\t"\
126 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
127 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
128 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
129 LOAD("(%["next2"])", MM"3") /* next2[x] */\
130 MOVQ" "MM"3, "MM"4 \n\t"\
131 "paddw "MM"2, "MM"3 \n\t"\
132 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
133 MOVQ" "MM"0, %[tmp0] \n\t" /* c */\
134 MOVQ" "MM"3, %[tmp1] \n\t" /* d */\
135 MOVQ" "MM"1, %[tmp2] \n\t" /* e */\
136 "psubw "MM"4, "MM"2 \n\t"\
137 PABS( MM"4", MM"2") /* temporal_diff0 */\
138 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
139 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
140 "psubw "MM"0, "MM"3 \n\t"\
141 "psubw "MM"1, "MM"4 \n\t"\
144 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
145 "psrlw $1, "MM"2 \n\t"\
146 "psrlw $1, "MM"3 \n\t"\
147 "pmaxsw "MM"3, "MM"2 \n\t"\
148 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
149 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
150 "psubw "MM"0, "MM"3 \n\t"\
151 "psubw "MM"1, "MM"4 \n\t"\
154 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
155 "psrlw $1, "MM"3 \n\t"\
156 "pmaxsw "MM"3, "MM"2 \n\t"\
157 MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\
159 "paddw "MM"0, "MM"1 \n\t"\
160 "paddw "MM"0, "MM"0 \n\t"\
161 "psubw "MM"1, "MM"0 \n\t"\
162 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
163 PABS( MM"2", MM"0") /* ABS(c-e) */\
165 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
166 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
167 MOVQ" "MM"2, "MM"4 \n\t"\
168 "psubusb "MM"3, "MM"2 \n\t"\
169 "psubusb "MM"4, "MM"3 \n\t"\
170 "pmaxub "MM"3, "MM"2 \n\t"\
171 PSHUF(MM"3", MM"2") \
172 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
173 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
174 "paddw "MM"2, "MM"0 \n\t"\
175 "paddw "MM"3, "MM"0 \n\t"\
176 "psubw %[pw_1], "MM"0 \n\t" /* spatial_score */\
187 /* if(p->mode<2) ... */\
188 MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\
189 "cmpl $2, %[mode] \n\t"\
191 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
192 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
193 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
194 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
195 "paddw "MM"4, "MM"2 \n\t"\
196 "paddw "MM"5, "MM"3 \n\t"\
197 "psrlw $1, "MM"2 \n\t" /* b */\
198 "psrlw $1, "MM"3 \n\t" /* f */\
199 MOVQ" %[tmp0], "MM"4 \n\t" /* c */\
200 MOVQ" %[tmp1], "MM"5 \n\t" /* d */\
201 MOVQ" %[tmp2], "MM"7 \n\t" /* e */\
202 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
203 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
204 MOVQ" "MM"5, "MM"0 \n\t"\
205 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
206 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
207 MOVQ" "MM"2, "MM"4 \n\t"\
208 "pminsw "MM"3, "MM"2 \n\t"\
209 "pmaxsw "MM"4, "MM"3 \n\t"\
210 "pmaxsw "MM"5, "MM"2 \n\t"\
211 "pminsw "MM"5, "MM"3 \n\t"\
212 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
213 "pminsw "MM"0, "MM"3 \n\t" /* min */\
214 "pxor "MM"4, "MM"4 \n\t"\
215 "pmaxsw "MM"3, "MM"6 \n\t"\
216 "psubw "MM"2, "MM"4 \n\t" /* -max */\
217 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
220 MOVQ" %[tmp1], "MM"2 \n\t" /* d */\
221 MOVQ" "MM"2, "MM"3 \n\t"\
222 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
223 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
224 "pmaxsw "MM"2, "MM"1 \n\t"\
225 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
226 "packuswb "MM"1, "MM"1 \n\t"\
235 [prefs]"r"((x86_reg)prefs),\
236 [mrefs]"r"((x86_reg)mrefs),\
240 :REGMM"0",REGMM"1",REGMM"2",REGMM"3",REGMM"4",REGMM"5",REGMM"6",REGMM"7"\
242 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\