2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
/*
 * Instruction/register selection for this template.
 *
 * Two sets of LOAD/PSRL1/PSRL2/PSHUF follow. The first one is guarded by
 * COMPILE_TEMPLATE_SSE2 and uses whole-register byte shifts (psrldq) and
 * movdqa/movdqu; the second one uses per-quadword shifts (psrlq) and pshufw.
 *
 * NOTE(review): MM, MOV, MOVQ and STEP are used by this file but are not
 * defined in the visible lines, and no #else/#endif separates the two sets
 * here (the embedded numbering jumps 21->25 and 33->40). Presumably those
 * lines are missing from this listing -- confirm against the full file.
 */
21 #ifdef COMPILE_TEMPLATE_SSE2
/* Unaligned full-register move used for the cur[] loads at byte offsets. */
25 #define MOVQU "movdqu"
/*
 * LOAD(mem,dst): MOV from mem into dst, then punpcklbw dst with register 7
 * of the MM family. The filter loop clears that register with pxor at the
 * top of each iteration, so the unpack widens the loaded bytes to words.
 */
27 #define LOAD(mem,dst) \
28 MOV" "mem", "dst" \n\t"\
29 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1/PSRL2: shift the whole register right by one / two bytes, in place. */
30 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
31 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/*
 * PSHUF(src,dst): despite the parameter names, the register passed as `src`
 * is the one written (AT&T operand order): it receives a copy of `dst`
 * shifted right by two bytes. `dst` itself is left unchanged.
 */
32 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33 "psrldq $2, "src" \n\t"
/*
 * Second variant of the same four macros.
 * NOTE(review): no #else is visible before this point -- see the note at the
 * top of this block.
 */
40 #define LOAD(mem,dst) \
41 MOV" "mem", "dst" \n\t"\
42 "punpcklbw "MM"7, "dst" \n\t"
/* One / two byte right shift expressed as an 8 / 16 bit quadword shift. */
43 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
44 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/*
 * PSHUF(src,dst): writes the register passed as `src` from `dst` using
 * pshufw with selector 9: result words 0 and 1 are words 1 and 2 of `dst`,
 * result words 2 and 3 are both word 0 of `dst`. The low two words thus
 * match a two-byte right shift of `dst`.
 */
45 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
/*
 * PABS(tmp,dst): replace each signed 16-bit word of dst with its absolute
 * value.
 *
 * With COMPILE_TEMPLATE_SSSE3 this is a single pabsw and tmp is not touched.
 */
48 #ifdef COMPILE_TEMPLATE_SSSE3
49 #define PABS(tmp,dst) \
50 "pabsw "dst", "dst" \n\t"
/*
 * Second definition: tmp = 0 - dst, then dst = max(dst, tmp) on signed
 * words. tmp is clobbered, so callers must pass a register they can lose.
 * NOTE(review): no #else/#endif is visible around this definition (embedded
 * numbering jumps 50->52 and 55->58); presumably dropped from this listing --
 * confirm against the full file.
 */
52 #define PABS(tmp,dst) \
53 "pxor "tmp", "tmp" \n\t"\
54 "psubw "dst", "tmp" \n\t"\
55 "pmaxsw "tmp", "dst" \n\t"
/*
 * CHECK(pj,mj): evaluate one diagonal edge direction.
 *
 * pj and mj are stringized and used as byte displacements on the two
 * unaligned loads from (%[cur],%[mrefs]) and (%[cur],%[prefs]).
 *
 * Register use, as visible in the instruction sequence:
 *   in : register 7 is unpacked against, so it is expected to be zero
 *        (the filter loop clears it with pxor)
 *   out: register 5 = byte average of the two loaded rows, widened to words.
 *        pavgb rounds up; the xor/pand/psubusb steps subtract the masked
 *        xor of the inputs -- presumably pb_1 is a vector of byte 1s, which
 *        would turn this into a round-down average; verify pb_1.
 *        register 2 = score: sum of the byte absolute differences taken at
 *        byte offsets 0, 1 and 2 of the loaded rows, as words
 *   clobbered: registers 3 and 4
 *
 * The absolute difference is built from two saturating subtractions combined
 * with pmaxub, since one of the two is always zero.
 *
 * NOTE(review): the embedded numbering skips 67 between the psubusb and the
 * punpcklbw of register 5. The existing comment labels register 5 as the
 * average of cur[x-refs+j] and cur[x+refs-j] while the loads are at the -1
 * offsets, so a one-byte shift of register 5 presumably belongs there --
 * confirm against the full file.
 */
58 #define CHECK(pj,mj) \
59 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
60 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
61 MOVQ" "MM"2, "MM"4 \n\t"\
62 MOVQ" "MM"2, "MM"5 \n\t"\
63 "pxor "MM"3, "MM"4 \n\t"\
64 "pavgb "MM"3, "MM"5 \n\t"\
65 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
66 "psubusb "MM"4, "MM"5 \n\t"\
68 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
69 MOVQ" "MM"2, "MM"4 \n\t"\
70 "psubusb "MM"3, "MM"2 \n\t"\
71 "psubusb "MM"4, "MM"3 \n\t"\
72 "pmaxub "MM"3, "MM"2 \n\t"\
73 MOVQ" "MM"2, "MM"3 \n\t"\
74 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
75 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
76 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
77 "punpcklbw "MM"7, "MM"2 \n\t"\
78 "punpcklbw "MM"7, "MM"3 \n\t"\
79 "punpcklbw "MM"7, "MM"4 \n\t"\
80 "paddw "MM"3, "MM"2 \n\t"\
81 "paddw "MM"4, "MM"2 \n\t" /* score */
/*
 * Selection step that consumes the outputs of CHECK (score in register 2,
 * candidate prediction in register 5):
 *   - register 3 becomes a per-word mask, all ones where the running
 *     spatial_score in register 0 is greater than the new score
 *   - register 0 = min(register 0, register 2)
 *   - the mask is saved in register 6
 *   - register 1 (spatial_pred) takes register 5 where the mask is set and
 *     keeps its old value elsewhere
 * Register 5 is left masked (zero where the candidate lost).
 *
 * NOTE(review): the #define line that names this fragment is not present in
 * this listing (embedded numbering jumps 81->84). It is presumably the
 * companion of CHECK2, which reads the mask saved in register 6 -- confirm
 * against the full file.
 */
84 MOVQ" "MM"0, "MM"3 \n\t"\
85 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
86 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
87 MOVQ" "MM"3, "MM"6 \n\t"\
88 "pand "MM"3, "MM"5 \n\t"\
89 "pandn "MM"1, "MM"3 \n\t"\
90 "por "MM"5, "MM"3 \n\t"\
91 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: same compare-and-select as the preceding selection step, but first
 * penalizes the score using the mask left in register 6.
 *
 * Register 6 is incremented by pw_1 and shifted left by 14, then added to
 * the score in register 2 with signed saturation. Presumably pw_1 is a
 * vector of word 1s (verify): a mask word of all ones then becomes 0 and a
 * zero mask word becomes 0x4000, so lanes where the previous direction did
 * not win get a large penalty and cannot win here either.
 *
 * After that: register 0 = min(register 0, register 2), and register 1
 * takes register 5 where register 0 was greater than the penalized score.
 * Unlike the preceding step, the new mask is not saved; register 6 is left
 * holding the penalty. Registers 3 and 5 are clobbered.
 */
93 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
94 hurts both quality and speed, but matches the C version. */\
95 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
96 "psllw $14, "MM"6 \n\t"\
97 "paddsw "MM"6, "MM"2 \n\t"\
98 MOVQ" "MM"0, "MM"3 \n\t"\
99 "pcmpgtw "MM"2, "MM"3 \n\t"\
100 "pminsw "MM"2, "MM"0 \n\t"\
101 "pand "MM"3, "MM"5 \n\t"\
102 "pandn "MM"1, "MM"3 \n\t"\
103 "por "MM"5, "MM"3 \n\t"\
104 MOVQ" "MM"3, "MM"1 \n\t"
106 static void RENAME(yadif_filter_line)(uint8_t *dst, uint8_t *prev, uint8_t *cur,
107 uint8_t *next, int w, int prefs,
108 int mrefs, int parity, int mode)
111 uint8_t *tmp= (uint8_t*)(((uint64_t)(tmpU+15)) & ~15);
115 for(x=0; x<w; x+=STEP){\
117 "pxor "MM"7, "MM"7 \n\t"\
118 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
119 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
120 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
121 LOAD("(%["next2"])", MM"3") /* next2[x] */\
122 MOVQ" "MM"3, "MM"4 \n\t"\
123 "paddw "MM"2, "MM"3 \n\t"\
124 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
125 MOVQ" "MM"0, (%[tmp]) \n\t" /* c */\
126 MOVQ" "MM"3, 16(%[tmp]) \n\t" /* d */\
127 MOVQ" "MM"1, 32(%[tmp]) \n\t" /* e */\
128 "psubw "MM"4, "MM"2 \n\t"\
129 PABS( MM"4", MM"2") /* temporal_diff0 */\
130 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
131 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
132 "psubw "MM"0, "MM"3 \n\t"\
133 "psubw "MM"1, "MM"4 \n\t"\
136 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
137 "psrlw $1, "MM"2 \n\t"\
138 "psrlw $1, "MM"3 \n\t"\
139 "pmaxsw "MM"3, "MM"2 \n\t"\
140 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
141 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
142 "psubw "MM"0, "MM"3 \n\t"\
143 "psubw "MM"1, "MM"4 \n\t"\
146 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
147 "psrlw $1, "MM"3 \n\t"\
148 "pmaxsw "MM"3, "MM"2 \n\t"\
149 MOVQ" "MM"2, 48(%[tmp]) \n\t" /* diff */\
151 "paddw "MM"0, "MM"1 \n\t"\
152 "paddw "MM"0, "MM"0 \n\t"\
153 "psubw "MM"1, "MM"0 \n\t"\
154 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
155 PABS( MM"2", MM"0") /* ABS(c-e) */\
157 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
158 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
159 MOVQ" "MM"2, "MM"4 \n\t"\
160 "psubusb "MM"3, "MM"2 \n\t"\
161 "psubusb "MM"4, "MM"3 \n\t"\
162 "pmaxub "MM"3, "MM"2 \n\t"\
163 PSHUF(MM"3", MM"2") \
164 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
165 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
166 "paddw "MM"2, "MM"0 \n\t"\
167 "paddw "MM"3, "MM"0 \n\t"\
168 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
179 /* if(p->mode<2) ... */\
180 MOVQ" 48(%[tmp]), "MM"6 \n\t" /* diff */\
181 "cmpl $2, %[mode] \n\t"\
183 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
184 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
185 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
186 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
187 "paddw "MM"4, "MM"2 \n\t"\
188 "paddw "MM"5, "MM"3 \n\t"\
189 "psrlw $1, "MM"2 \n\t" /* b */\
190 "psrlw $1, "MM"3 \n\t" /* f */\
191 MOVQ" (%[tmp]), "MM"4 \n\t" /* c */\
192 MOVQ" 16(%[tmp]), "MM"5 \n\t" /* d */\
193 MOVQ" 32(%[tmp]), "MM"7 \n\t" /* e */\
194 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
195 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
196 MOVQ" "MM"5, "MM"0 \n\t"\
197 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
198 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
199 MOVQ" "MM"2, "MM"4 \n\t"\
200 "pminsw "MM"3, "MM"2 \n\t"\
201 "pmaxsw "MM"4, "MM"3 \n\t"\
202 "pmaxsw "MM"5, "MM"2 \n\t"\
203 "pminsw "MM"5, "MM"3 \n\t"\
204 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
205 "pminsw "MM"0, "MM"3 \n\t" /* min */\
206 "pxor "MM"4, "MM"4 \n\t"\
207 "pmaxsw "MM"3, "MM"6 \n\t"\
208 "psubw "MM"2, "MM"4 \n\t" /* -max */\
209 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
212 MOVQ" 16(%[tmp]), "MM"2 \n\t" /* d */\
213 MOVQ" "MM"2, "MM"3 \n\t"\
214 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
215 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
216 "pmaxsw "MM"2, "MM"1 \n\t"\
217 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
218 "packuswb "MM"1, "MM"1 \n\t"\
223 [prefs]"r"((x86_reg)prefs),\
224 [mrefs]"r"((x86_reg)mrefs),\
228 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\