/*
 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Libav; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
/*
 * Instruction-set specific string fragments used to assemble the inline
 * asm further down.  Two parallel sets of the same macro names are given:
 * the first uses the 128-bit byte-shift/move mnemonics (movdqu, psrldq,
 * movdqa), the second uses the 64-bit forms (psrlq, pshufw).
 *
 * NOTE(review): MM, MOV, MOVQ and STEP are used by this file but are not
 * defined in the lines visible here, and no #else / #endif is visible
 * between or after the two sets (the embedded numbering jumps from 33 to
 * 40).  Presumably the second set belongs to the #else branch of
 * COMPILE_TEMPLATE_SSE -- confirm against the complete file.
 */
21 #ifdef COMPILE_TEMPLATE_SSE
/* Mnemonic used for loads from addresses that may be unaligned. */
25 #define MOVQU "movdqu"
/*
 * LOAD(mem,dst): load from mem into dst with MOV, then interleave the low
 * bytes of dst with vector register 7 (punpcklbw).  When register 7 holds
 * zero this widens the low bytes to 16-bit words.
 */
27 #define LOAD(mem,dst) \
28 MOV" "mem", "dst" \n\t"\
29 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1/PSRL2: shift the whole register right by one / two bytes. */
30 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
31 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/*
 * PSHUF(src,dst): AT&T operand order puts the source first, so this copies
 * the SECOND macro argument into the FIRST and then shifts that copy right
 * by two bytes.  The second argument is left unchanged.
 */
32 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33 "psrldq $2, "src" \n\t"
/* Same LOAD as above, repeated for the second instruction set. */
40 #define LOAD(mem,dst) \
41 MOV" "mem", "dst" \n\t"\
42 "punpcklbw "MM"7, "dst" \n\t"
/* Bit-count shifts: 8 and 16 bits, i.e. again one and two bytes. */
43 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
44 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/*
 * pshufw with selector 9 writes words 1 and 2 of the second macro argument
 * into words 0 and 1 of the first, so the low half of the result matches a
 * two-byte right shift; the upper two result words are copies of word 0.
 */
45 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
/*
 * PABS(tmp,dst): replace each signed 16-bit lane of dst with its absolute
 * value.
 *
 * NOTE(review): no #else / #endif is visible between or after the two
 * definitions (the embedded numbering skips 51 and 56); presumably the
 * second definition is the fallback branch -- confirm against the complete
 * file.
 */
48 #ifdef COMPILE_TEMPLATE_SSSE3
/* Single-instruction form; the tmp argument is not used. */
49 #define PABS(tmp,dst) \
50 "pabsw "dst", "dst" \n\t"
/*
 * Three-instruction form: tmp = 0 - dst, then dst = max(dst, tmp) per signed
 * word.  The previous contents of tmp are destroyed.
 */
52 #define PABS(tmp,dst) \
53 "pxor "tmp", "tmp" \n\t"\
54 "psubw "dst", "tmp" \n\t"\
55 "pmaxsw "tmp", "dst" \n\t"
/*
 * CHECK(pj,mj): asm fragment that evaluates one candidate edge direction.
 *
 * pj and mj are stringified and used as byte displacements: pj on the load
 * from cur+mrefs, mj on the load from cur+prefs.  Both loads use MOVQU.
 *
 * Results:
 *   vector register 5 - per-byte average of the two loaded rows, widened
 *                       with punpcklbw against register 7.  pavgb rounds
 *                       up, so the low bit of (a XOR b) is subtracted again
 *                       to get the truncating (a+b)>>1; this relies on
 *                       MANGLE(pb_1) being a vector of byte value 1
 *                       (presumably -- it is defined outside this view).
 *   vector register 2 - score: the per-byte absolute difference of the two
 *                       rows (two saturating subtractions combined with
 *                       pmaxub) taken at three neighbouring byte positions
 *                       (unshifted, PSRL1, PSRL2), widened to words and
 *                       summed.
 *
 * Registers 2, 3, 4 and 5 are overwritten; register 7 is only read and is
 * expected to hold zero so that punpcklbw zero-extends.
 *
 * NOTE(review): the embedded numbering skips 67, between the psubusb and
 * the first punpcklbw on register 5.  The comment on that punpcklbw refers
 * to cur[x-refs+j] while the loads are at x-refs-1+j, so a one-byte shift
 * of register 5 may be missing from this copy -- confirm against the
 * complete file.
 */
58 #define CHECK(pj,mj) \
59 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
60 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
61 MOVQ" "MM"2, "MM"4 \n\t"\
62 MOVQ" "MM"2, "MM"5 \n\t"\
63 "pxor "MM"3, "MM"4 \n\t"\
64 "pavgb "MM"3, "MM"5 \n\t"\
65 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
66 "psubusb "MM"4, "MM"5 \n\t"\
68 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
69 MOVQ" "MM"2, "MM"4 \n\t"\
70 "psubusb "MM"3, "MM"2 \n\t"\
71 "psubusb "MM"4, "MM"3 \n\t"\
72 "pmaxub "MM"3, "MM"2 \n\t"\
73 MOVQ" "MM"2, "MM"3 \n\t"\
74 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
75 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
76 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
77 "punpcklbw "MM"7, "MM"2 \n\t"\
78 "punpcklbw "MM"7, "MM"3 \n\t"\
79 "punpcklbw "MM"7, "MM"4 \n\t"\
80 "paddw "MM"3, "MM"2 \n\t"\
81 "paddw "MM"4, "MM"2 \n\t" /* score */
/*
 * Selection step: keep whichever of the running best and the candidate in
 * registers 2/5 has the lower score.
 *
 *   register 3 = per-word mask, all ones where register 0 (spatial_score)
 *                is greater than register 2 (score)
 *   register 0 = per-word signed minimum of the two scores
 *   register 6 = copy of the mask, left behind for later code
 *   register 1 = register 5 where the mask is set, otherwise its old value
 *                (spatial_pred)
 *
 * NOTE(review): no #define line is visible above these lines and the
 * embedded numbering jumps from 81 to 84.  Presumably this is the body of a
 * macro that is paired with CHECK2 below (which consumes the mask left in
 * register 6) -- confirm against the complete file.
 */
84 MOVQ" "MM"0, "MM"3 \n\t"\
85 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
86 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
87 MOVQ" "MM"3, "MM"6 \n\t"\
88 "pand "MM"3, "MM"5 \n\t"\
89 "pandn "MM"1, "MM"3 \n\t"\
90 "por "MM"5, "MM"3 \n\t"\
91 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: selection step for a second candidate that is only allowed to win
 * if the preceding selection step accepted its candidate.
 *
 * Register 6 is read as the mask left by that preceding step (all ones per
 * word where its candidate won, zero elsewhere).  Adding MANGLE(pw_1) and
 * shifting left by 14 turns it into a penalty: 0 where the mask was set and
 * 0x4000 where it was clear, assuming pw_1 is a vector of word value 1
 * (presumably -- it is defined outside this view).  The penalty is added to
 * the score in register 2 with signed saturation, so a penalised candidate
 * cannot pass the compare unless the running best is itself that large.
 *
 * The remaining instructions repeat the compare / minimum / masked-merge
 * sequence: register 0 receives the smaller score and register 1 takes
 * register 5 where the (penalised) score was lower.  Unlike the preceding
 * step, the new mask is not copied to register 6.
 *
 * Registers 2, 3, 5 and 6 are overwritten.
 */
93 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
94 hurts both quality and speed, but matches the C version. */\
95 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
96 "psllw $14, "MM"6 \n\t"\
97 "paddsw "MM"6, "MM"2 \n\t"\
98 MOVQ" "MM"0, "MM"3 \n\t"\
99 "pcmpgtw "MM"2, "MM"3 \n\t"\
100 "pminsw "MM"2, "MM"0 \n\t"\
101 "pand "MM"3, "MM"5 \n\t"\
102 "pandn "MM"1, "MM"3 \n\t"\
103 "por "MM"5, "MM"3 \n\t"\
104 MOVQ" "MM"3, "MM"1 \n\t"
106 void RENAME(ff_yadif_filter_line)(uint8_t *dst,
107 uint8_t *prev, uint8_t *cur, uint8_t *next,
108 int w, int prefs, int mrefs, int parity, int mode)
110 DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
111 DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
112 DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
113 DECLARE_ALIGNED(16, uint8_t, tmp3[16]);
117 for(x=0; x<w; x+=STEP){\
119 "pxor "MM"7, "MM"7 \n\t"\
120 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
121 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
122 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
123 LOAD("(%["next2"])", MM"3") /* next2[x] */\
124 MOVQ" "MM"3, "MM"4 \n\t"\
125 "paddw "MM"2, "MM"3 \n\t"\
126 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
127 MOVQ" "MM"0, %[tmp0] \n\t" /* c */\
128 MOVQ" "MM"3, %[tmp1] \n\t" /* d */\
129 MOVQ" "MM"1, %[tmp2] \n\t" /* e */\
130 "psubw "MM"4, "MM"2 \n\t"\
131 PABS( MM"4", MM"2") /* temporal_diff0 */\
132 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
133 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
134 "psubw "MM"0, "MM"3 \n\t"\
135 "psubw "MM"1, "MM"4 \n\t"\
138 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
139 "psrlw $1, "MM"2 \n\t"\
140 "psrlw $1, "MM"3 \n\t"\
141 "pmaxsw "MM"3, "MM"2 \n\t"\
142 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
143 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
144 "psubw "MM"0, "MM"3 \n\t"\
145 "psubw "MM"1, "MM"4 \n\t"\
148 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
149 "psrlw $1, "MM"3 \n\t"\
150 "pmaxsw "MM"3, "MM"2 \n\t"\
151 MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\
153 "paddw "MM"0, "MM"1 \n\t"\
154 "paddw "MM"0, "MM"0 \n\t"\
155 "psubw "MM"1, "MM"0 \n\t"\
156 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
157 PABS( MM"2", MM"0") /* ABS(c-e) */\
159 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
160 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
161 MOVQ" "MM"2, "MM"4 \n\t"\
162 "psubusb "MM"3, "MM"2 \n\t"\
163 "psubusb "MM"4, "MM"3 \n\t"\
164 "pmaxub "MM"3, "MM"2 \n\t"\
165 PSHUF(MM"3", MM"2") \
166 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
167 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
168 "paddw "MM"2, "MM"0 \n\t"\
169 "paddw "MM"3, "MM"0 \n\t"\
170 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
181 /* if(p->mode<2) ... */\
182 MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\
183 "cmpl $2, %[mode] \n\t"\
185 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
186 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
187 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
188 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
189 "paddw "MM"4, "MM"2 \n\t"\
190 "paddw "MM"5, "MM"3 \n\t"\
191 "psrlw $1, "MM"2 \n\t" /* b */\
192 "psrlw $1, "MM"3 \n\t" /* f */\
193 MOVQ" %[tmp0], "MM"4 \n\t" /* c */\
194 MOVQ" %[tmp1], "MM"5 \n\t" /* d */\
195 MOVQ" %[tmp2], "MM"7 \n\t" /* e */\
196 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
197 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
198 MOVQ" "MM"5, "MM"0 \n\t"\
199 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
200 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
201 MOVQ" "MM"2, "MM"4 \n\t"\
202 "pminsw "MM"3, "MM"2 \n\t"\
203 "pmaxsw "MM"4, "MM"3 \n\t"\
204 "pmaxsw "MM"5, "MM"2 \n\t"\
205 "pminsw "MM"5, "MM"3 \n\t"\
206 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
207 "pminsw "MM"0, "MM"3 \n\t" /* min */\
208 "pxor "MM"4, "MM"4 \n\t"\
209 "pmaxsw "MM"3, "MM"6 \n\t"\
210 "psubw "MM"2, "MM"4 \n\t" /* -max */\
211 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
214 MOVQ" %[tmp1], "MM"2 \n\t" /* d */\
215 MOVQ" "MM"2, "MM"3 \n\t"\
216 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
217 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
218 "pmaxsw "MM"2, "MM"1 \n\t"\
219 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
220 "packuswb "MM"1, "MM"1 \n\t"\
229 [prefs]"r"((x86_reg)prefs),\
230 [mrefs]"r"((x86_reg)mrefs),\
233 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\