2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
/* Build a memory operand string for symbol `a`: the argument is stringified
 * and followed by "(%%rip)". Inside an extended-asm template "%%" emits a
 * literal '%', so the result addresses `a` relative to the instruction
 * pointer. Used below to reference the pb_1 / pw_1 constants. */
21 #define MANGLE(a) #a "(%%rip)"
22 #ifdef COMPILE_TEMPLATE_SSE
/* Mnemonic used for full-register loads from addresses that carry a byte
 * displacement (see CHECK and the -1(%[cur],...) loads): movdqu is the
 * unaligned 128-bit move. */
27 #define MOVQU "movdqu"
/* Load packed bytes from memory operand `mem` into register `dst` with the
 * MOV instruction, then interleave the low bytes of `dst` with register 7
 * (punpcklbw). The filter zeroes register 7 with pxor before its first
 * LOAD, so the net effect is widening unsigned bytes to 16-bit words.
 * MOV and MM are string macros expected to be defined by the including
 * context. */
29 #define LOAD(mem,dst) \
30 MOV" "mem", "dst" \n\t"\
31 "punpcklbw "MM"7, "dst" \n\t"
/* Shift the whole register `reg` right by one byte (PSRL1) or two bytes
 * (PSRL2); psrldq takes its count in bytes. CHECK uses these to line up
 * neighbouring pixels. */
32 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
33 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* Copy the register passed as `dst` into the register passed as `src`, then
 * shift that copy right by two bytes. Note the parameter names are the
 * reverse of the AT&T operand roles: the macro's `dst` is read and its
 * `src` is written. */
34 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
35 "psrldq $2, "src" \n\t"
/* Same expansion as the LOAD definition above, provided for the other
 * instruction-set variant: load packed bytes from `mem` into `dst` with MOV
 * and interleave the low bytes with register 7 (expected to be zero), which
 * widens them to 16-bit words. */
43 #define LOAD(mem,dst) \
44 MOV" "mem", "dst" \n\t"\
45 "punpcklbw "MM"7, "dst" \n\t"
/* Shift the 64-bit quadword in `reg` right by 8 bits (PSRL1) or 16 bits
 * (PSRL2), i.e. by one or two bytes; psrlq takes its count in bits. */
46 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
47 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* Word shuffle from the register passed as `dst` into the register passed as
 * `src` (the names are the reverse of the AT&T operand roles). Selector 9
 * (binary 00 00 10 01) places input words 1 and 2 in output words 0 and 1,
 * so the low four bytes of the result equal the input shifted right by two
 * bytes; the two upper words receive input word 0. */
48 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
51 #ifdef COMPILE_TEMPLATE_SSSE3
/* Replace each signed 16-bit word in `dst` with its absolute value using a
 * single pabsw. `tmp` is accepted for signature compatibility with the
 * other PABS definition and is not touched here. */
52 #define PABS(tmp,dst) \
53 "pabsw "dst", "dst" \n\t"
/* Replace each signed 16-bit word in `dst` with its absolute value without
 * pabsw: clear `tmp`, compute tmp = 0 - dst, then take the signed per-word
 * maximum of dst and tmp. `tmp` is clobbered (it ends up holding -dst). */
55 #define PABS(tmp,dst) \
56 "pxor "tmp", "tmp" \n\t"\
57 "psubw "dst", "tmp" \n\t"\
58 "pmaxsw "tmp", "dst" \n\t"
/* Evaluate one candidate edge direction.
 *
 * `pj` and `mj` are stringified and used as byte displacements for two
 * unaligned loads: pj(%[cur],%[mrefs]) into register 2 and
 * mj(%[cur],%[prefs]) into register 3.
 *
 * Results:
 *   register 5 - per-byte average of the two loaded rows, rounded down, then
 *                widened to words. pavgb rounds up, so (a ^ b) masked with
 *                pb_1 is subtracted to undo the rounding (pb_1 is presumably
 *                a constant of bytes equal to 1 defined elsewhere - confirm).
 *   register 2 - "score": the sum, as words, of the absolute byte differences
 *                at byte offsets 0, 1 and 2 of the loaded rows. The absolute
 *                difference is formed from two saturating subtractions
 *                combined with pmaxub.
 *
 * Registers 3 and 4 are used as scratch. Register 7 is used as the zero
 * source for punpcklbw and must be zero on entry. */
62 #define CHECK(pj,mj) \
63 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
64 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
65 MOVQ" "MM"2, "MM"4 \n\t"\
66 MOVQ" "MM"2, "MM"5 \n\t"\
67 "pxor "MM"3, "MM"4 \n\t"\
68 "pavgb "MM"3, "MM"5 \n\t"\
69 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
70 "psubusb "MM"4, "MM"5 \n\t"\
72 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
73 MOVQ" "MM"2, "MM"4 \n\t"\
74 "psubusb "MM"3, "MM"2 \n\t"\
75 "psubusb "MM"4, "MM"3 \n\t"\
76 "pmaxub "MM"3, "MM"2 \n\t"\
77 MOVQ" "MM"2, "MM"3 \n\t"\
78 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
79 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
80 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
81 "punpcklbw "MM"7, "MM"2 \n\t"\
82 "punpcklbw "MM"7, "MM"3 \n\t"\
83 "punpcklbw "MM"7, "MM"4 \n\t"\
84 "paddw "MM"3, "MM"2 \n\t"\
85 "paddw "MM"4, "MM"2 \n\t" /* score */
88 MOVQ" "MM"0, "MM"3 \n\t"\
89 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
90 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
91 MOVQ" "MM"3, "MM"6 \n\t"\
92 "pand "MM"3, "MM"5 \n\t"\
93 "pandn "MM"1, "MM"3 \n\t"\
94 "por "MM"5, "MM"3 \n\t"\
95 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/* Second-stage acceptance of the direction just scored by CHECK.
 *
 * On entry register 2 holds the new score, register 5 the new prediction,
 * register 0 the best score so far, register 1 the best prediction so far,
 * and register 6 a per-word compare mask (expected to be the mask saved by
 * the preceding acceptance step - confirm against the caller sequence).
 *
 * The mask is turned into a penalty: adding pw_1 and shifting left by 14
 * yields 0 for words whose mask was all ones and 0x4000 for words whose
 * mask was zero (assuming pw_1 is a constant of words equal to 1). The
 * penalty is added to the score with signed saturation, so a direction
 * whose predecessor was rejected is rejected too.
 *
 * Then, per word: where the best score is greater than the penalised score,
 * register 0 takes the lower score and register 1 takes the new prediction;
 * elsewhere both keep their old values. Register 3 is scratch, and
 * registers 5 and 6 are modified. */
97 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
98 hurts both quality and speed, but matches the C version. */\
99 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
100 "psllw $14, "MM"6 \n\t"\
101 "paddsw "MM"6, "MM"2 \n\t"\
102 MOVQ" "MM"0, "MM"3 \n\t"\
103 "pcmpgtw "MM"2, "MM"3 \n\t"\
104 "pminsw "MM"2, "MM"0 \n\t"\
105 "pand "MM"3, "MM"5 \n\t"\
106 "pandn "MM"1, "MM"3 \n\t"\
107 "por "MM"5, "MM"3 \n\t"\
108 MOVQ" "MM"3, "MM"1 \n\t"
110 #if defined(__MINGW32__) && defined(WIN32) && !defined(WIN64)
111 __attribute__((__force_align_arg_pointer__))
113 VLC_TARGET static void RENAME(yadif_filter_line)(uint8_t *dst,
114 uint8_t *prev, uint8_t *cur, uint8_t *next,
115 int w, int prefs, int mrefs, int parity, int mode)
117 DECLARE_ALIGNED(16, uint8_t, tmp[16*4]);
121 for(x=0; x<w; x+=STEP){\
123 "pxor "MM"7, "MM"7 \n\t"\
124 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
125 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
126 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
127 LOAD("(%["next2"])", MM"3") /* next2[x] */\
128 MOVQ" "MM"3, "MM"4 \n\t"\
129 "paddw "MM"2, "MM"3 \n\t"\
130 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
131 MOVQ" "MM"0, (%[tmp]) \n\t" /* c */\
132 MOVQ" "MM"3, 16(%[tmp]) \n\t" /* d */\
133 MOVQ" "MM"1, 32(%[tmp]) \n\t" /* e */\
134 "psubw "MM"4, "MM"2 \n\t"\
135 PABS( MM"4", MM"2") /* temporal_diff0 */\
136 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
137 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
138 "psubw "MM"0, "MM"3 \n\t"\
139 "psubw "MM"1, "MM"4 \n\t"\
142 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
143 "psrlw $1, "MM"2 \n\t"\
144 "psrlw $1, "MM"3 \n\t"\
145 "pmaxsw "MM"3, "MM"2 \n\t"\
146 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
147 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
148 "psubw "MM"0, "MM"3 \n\t"\
149 "psubw "MM"1, "MM"4 \n\t"\
152 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
153 "psrlw $1, "MM"3 \n\t"\
154 "pmaxsw "MM"3, "MM"2 \n\t"\
155 MOVQ" "MM"2, 48(%[tmp]) \n\t" /* diff */\
157 "paddw "MM"0, "MM"1 \n\t"\
158 "paddw "MM"0, "MM"0 \n\t"\
159 "psubw "MM"1, "MM"0 \n\t"\
160 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
161 PABS( MM"2", MM"0") /* ABS(c-e) */\
163 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
164 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
165 MOVQ" "MM"2, "MM"4 \n\t"\
166 "psubusb "MM"3, "MM"2 \n\t"\
167 "psubusb "MM"4, "MM"3 \n\t"\
168 "pmaxub "MM"3, "MM"2 \n\t"\
169 PSHUF(MM"3", MM"2") \
170 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
171 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
172 "paddw "MM"2, "MM"0 \n\t"\
173 "paddw "MM"3, "MM"0 \n\t"\
174 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
185 /* if(p->mode<2) ... */\
186 MOVQ" 48(%[tmp]), "MM"6 \n\t" /* diff */\
187 "cmpl $2, %[mode] \n\t"\
189 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
190 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
191 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
192 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
193 "paddw "MM"4, "MM"2 \n\t"\
194 "paddw "MM"5, "MM"3 \n\t"\
195 "psrlw $1, "MM"2 \n\t" /* b */\
196 "psrlw $1, "MM"3 \n\t" /* f */\
197 MOVQ" (%[tmp]), "MM"4 \n\t" /* c */\
198 MOVQ" 16(%[tmp]), "MM"5 \n\t" /* d */\
199 MOVQ" 32(%[tmp]), "MM"7 \n\t" /* e */\
200 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
201 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
202 MOVQ" "MM"5, "MM"0 \n\t"\
203 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
204 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
205 MOVQ" "MM"2, "MM"4 \n\t"\
206 "pminsw "MM"3, "MM"2 \n\t"\
207 "pmaxsw "MM"4, "MM"3 \n\t"\
208 "pmaxsw "MM"5, "MM"2 \n\t"\
209 "pminsw "MM"5, "MM"3 \n\t"\
210 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
211 "pminsw "MM"0, "MM"3 \n\t" /* min */\
212 "pxor "MM"4, "MM"4 \n\t"\
213 "pmaxsw "MM"3, "MM"6 \n\t"\
214 "psubw "MM"2, "MM"4 \n\t" /* -max */\
215 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
218 MOVQ" 16(%[tmp]), "MM"2 \n\t" /* d */\
219 MOVQ" "MM"2, "MM"3 \n\t"\
220 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
221 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
222 "pmaxsw "MM"2, "MM"1 \n\t"\
223 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
224 "packuswb "MM"1, "MM"1 \n\t"\
229 [prefs]"r"((x86_reg)prefs),\
230 [mrefs]"r"((x86_reg)mrefs),\
234 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\