/*
 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * SSE2/SSSE3 version (custom optimization) by h.yamagata
 *
 * Small fix by Alexander Balakhnin (fizick@avisynth.org.ru)
 *
 * MPlayer is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * MPlayer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
/* LOAD8(mem, dst): load 8 bytes from the addressing expression "mem" into
 * the low half of xmm register "dst" and zero-extend each byte to 16 bits.
 * Precondition: xmm7 must hold zero (it supplies the high bytes for
 * punpcklbw).  "mem" is passed as a string, "dst" is stringized with #.
 * (Fixed here: extraction artifacts — stray line numbers fused into each
 * line and interleaved CR lines — made the macro non-compilable.) */
#define LOAD8(mem,dst) \
            "movq "mem", "#dst" \n\t"\
            "punpcklbw %%xmm7, "#dst" \n\t"
/* CHECK(pj, mj): edge-direction probe for the spatial interpolator.
 * Loads the line above shifted by pj and the line below shifted by mj
 * (pj/mj are byte offsets relative to x-1), then computes per pixel:
 *   xmm5 = rounded-down average of the two lines, i.e.
 *          (cur[x-refs+j] + cur[x+refs-j]) >> 1   (candidate prediction)
 *   xmm2 = "score": sum of ABS differences at offsets -1, 0, +1 along
 *          the probed direction, widened to 16 bits.
 * The pxor/pavgb/pand-with-pb1/psubusb sequence turns pavgb's
 * round-to-nearest into a floor average (subtract the carry bit).
 * Clobbers xmm2-xmm5; requires xmm7 == 0 and the %[pb1] operand.
 * (Fixed here: extraction artifacts — stray fused line numbers and CR
 * lines — made the macro non-compilable; instructions are unchanged.) */
#define CHECK(pj,mj) \
            "movdqu "#pj"(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1+j] */\
            "movdqu "#mj"(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1-j] */\
            "movdqa %%xmm2, %%xmm4 \n\t"\
            "movdqa %%xmm2, %%xmm5 \n\t"\
            "pxor %%xmm3, %%xmm4 \n\t"\
            "pavgb %%xmm3, %%xmm5 \n\t"\
            "pand %[pb1], %%xmm4 \n\t"\
            "psubusb %%xmm4, %%xmm5 \n\t"\
            "psrldq $1, %%xmm5 \n\t"\
            "punpcklbw %%xmm7, %%xmm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
            "movdqa %%xmm2, %%xmm4 \n\t"\
            "psubusb %%xmm3, %%xmm2 \n\t"\
            "psubusb %%xmm4, %%xmm3 \n\t"\
            "pmaxub %%xmm3, %%xmm2 \n\t"\
            "movdqa %%xmm2, %%xmm3 \n\t"\
            "movdqa %%xmm2, %%xmm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
            "psrldq $1, %%xmm3 \n\t" /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
            "psrldq $2, %%xmm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
            "punpcklbw %%xmm7, %%xmm2 \n\t"\
            "punpcklbw %%xmm7, %%xmm3 \n\t"\
            "punpcklbw %%xmm7, %%xmm4 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "paddw %%xmm4, %%xmm2 \n\t" /* score */
/* CHECK1: accept the direction just probed by CHECK if its score beats the
 * current best.  xmm3 = (score < spatial_score) mask (pcmpgtw on the old
 * best in xmm0); spatial_score is updated with pminsw, and the mask blends
 * the candidate prediction (xmm5) into spatial_pred (xmm1) via
 * pand/pandn/por.  The mask is also copied to xmm6 for CHECK2's use.
 * NOTE(review): the "#define CHECK1 \" line itself was lost in extraction;
 * it is reconstructed here — confirm against the upstream yadif source.
 * (Also fixed: stray fused line numbers / CR lines from extraction.) */
#define CHECK1 \
            "movdqa %%xmm0, %%xmm3 \n\t"\
            "pcmpgtw %%xmm2, %%xmm3 \n\t" /* if(score < spatial_score) */\
            "pminsw %%xmm2, %%xmm0 \n\t" /* spatial_score= score; */\
            "movdqa %%xmm3, %%xmm6 \n\t"\
            "pand %%xmm3, %%xmm5 \n\t"\
            "pandn %%xmm1, %%xmm3 \n\t"\
            "por %%xmm5, %%xmm3 \n\t"\
            "movdqa %%xmm3, %%xmm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/* CHECK2: like CHECK1, but first poisons the score when the previous
 * direction (dir=1) was rejected: (mask+1)<<14 saturating-added to the
 * score forces it to lose the comparison, so dir=2 is only taken if dir=1
 * was also taken.  Matches the short-circuit evaluation of the C version.
 * (Fixed here: extraction artifacts — stray fused line numbers and CR
 * lines — made the macro non-compilable; instructions are unchanged.) */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
                  hurts both quality and speed, but matches the C version. */\
            "paddw %[pw1], %%xmm6 \n\t"\
            "psllw $14, %%xmm6 \n\t"\
            "paddsw %%xmm6, %%xmm2 \n\t"\
            "movdqa %%xmm0, %%xmm3 \n\t"\
            "pcmpgtw %%xmm2, %%xmm3 \n\t"\
            "pminsw %%xmm2, %%xmm0 \n\t"\
            "pand %%xmm3, %%xmm5 \n\t"\
            "pandn %%xmm1, %%xmm3 \n\t"\
            "por %%xmm5, %%xmm3 \n\t"\
            "movdqa %%xmm3, %%xmm1 \n\t"
/*
 * NOTE(review): this excerpt is damaged by extraction and is preserved
 * byte-for-byte below for reference only.  Every line carries a stray
 * leading number from the original file, CR-only lines are interleaved,
 * and many original lines are missing entirely (visible as jumps in those
 * numbers, e.g. 153->164, 205->209, 218->226).  The lost lines include the
 * declaration of x, the __asm__ volatile( opening, the mode<2 branch
 * structure, most of the operand/clobber lists, the loop close and
 * edge-pixel handling, and the #if/#else parity selection that wraps the
 * prev2/next2 defines at the bottom.  Restore this function from the
 * upstream MPlayer/avisynth yadif source rather than editing in place.
 *
 * What the visible code shows: FILTER_LINE_FUNC_NAME filters one output
 * line of the yadif deinterlacer, 8 pixels per SSE2 iteration.  Per pixel
 * it computes d = (prev2[x]+next2[x])>>1, the max of three temporal
 * diffs (halved), a spatial prediction refined by the edge-direction
 * CHECK/CHECK1/CHECK2 probes, optionally (mode < 2 — TODO confirm, the
 * branch target is among the missing lines) tightens diff with the
 * b/c/d/e/f min/max neighbourhood test, then clips spatial_pred to
 * [d-diff, d+diff] and stores 8 packed bytes to dst.  prev2/next2 are
 * token-pasted operand names selected by field parity.
 */
75 /* mode argument mod - Fizick */

77 /* static attribute_align_arg void FILTER_LINE_FUNC_NAME(YadifContext *yadctx, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){

78 const int mode = yadctx->mode; */

79 static attribute_align_arg void FILTER_LINE_FUNC_NAME(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){

80 DECLARE_ALIGNED(16, uint8_t, tmp0[16]);

81 DECLARE_ALIGNED(16, uint8_t, tmp1[16]);

82 DECLARE_ALIGNED(16, uint8_t, tmp2[16]);

83 DECLARE_ALIGNED(16, uint8_t, tmp3[16]);

85 static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) =

87 0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001

90 static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) =

92 0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101

97 for(x=0; x<w; x+=8){\

99 "pxor %%xmm7, %%xmm7 \n\t"\

100 LOAD8("(%[cur],%[mrefs])", %%xmm0) /* c = cur[x-refs] */\

101 LOAD8("(%[cur],%[prefs])", %%xmm1) /* e = cur[x+refs] */\

102 LOAD8("(%["prev2"])", %%xmm2) /* prev2[x] */\

103 LOAD8("(%["next2"])", %%xmm3) /* next2[x] */\

104 "movdqa %%xmm3, %%xmm4 \n\t"\

105 "paddw %%xmm2, %%xmm3 \n\t"\

106 "psraw $1, %%xmm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\

107 "movdqa %%xmm0, %[tmp0] \n\t" /* c */\

108 "movdqa %%xmm3, %[tmp1] \n\t" /* d */\

109 "movdqa %%xmm1, %[tmp2] \n\t" /* e */\

110 "psubw %%xmm4, %%xmm2 \n\t"\

111 PABS( %%xmm4, %%xmm2) /* temporal_diff0 */\

112 LOAD8("(%[prev],%[mrefs])", %%xmm3) /* prev[x-refs] */\

113 LOAD8("(%[prev],%[prefs])", %%xmm4) /* prev[x+refs] */\

114 "psubw %%xmm0, %%xmm3 \n\t"\

115 "psubw %%xmm1, %%xmm4 \n\t"\

116 PABS( %%xmm5, %%xmm3)\

117 PABS( %%xmm5, %%xmm4)\

118 "paddw %%xmm4, %%xmm3 \n\t" /* temporal_diff1 */\

119 "psrlw $1, %%xmm2 \n\t"\

120 "psrlw $1, %%xmm3 \n\t"\

121 "pmaxsw %%xmm3, %%xmm2 \n\t"\

122 LOAD8("(%[next],%[mrefs])", %%xmm3) /* next[x-refs] */\

123 LOAD8("(%[next],%[prefs])", %%xmm4) /* next[x+refs] */\

124 "psubw %%xmm0, %%xmm3 \n\t"\

125 "psubw %%xmm1, %%xmm4 \n\t"\

126 PABS( %%xmm5, %%xmm3)\

127 PABS( %%xmm5, %%xmm4)\

128 "paddw %%xmm4, %%xmm3 \n\t" /* temporal_diff2 */\

129 "psrlw $1, %%xmm3 \n\t"\

130 "pmaxsw %%xmm3, %%xmm2 \n\t"\

131 "movdqa %%xmm2, %[tmp3] \n\t" /* diff */\

133 "paddw %%xmm0, %%xmm1 \n\t"\

134 "paddw %%xmm0, %%xmm0 \n\t"\

135 "psubw %%xmm1, %%xmm0 \n\t"\

136 "psrlw $1, %%xmm1 \n\t" /* spatial_pred */\

137 PABS( %%xmm2, %%xmm0) /* ABS(c-e) */\

139 "movdqu -1(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1] */\

140 "movdqu -1(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1] */\

141 "movdqa %%xmm2, %%xmm4 \n\t"\

142 "psubusb %%xmm3, %%xmm2 \n\t"\

143 "psubusb %%xmm4, %%xmm3 \n\t"\

144 "pmaxub %%xmm3, %%xmm2 \n\t"\

145 /*"pshuflw $9,%%xmm2, %%xmm3 \n\t"*/\

146 /*"pshufhw $9,%%xmm2, %%xmm3 \n\t"*/\

147 "movdqa %%xmm2, %%xmm3 \n\t" /* correct replacement (here) */\

148 "psrldq $2, %%xmm3 \n\t"/* for "pshufw $9,%%mm2, %%mm3" - fix by Fizick */\

149 "punpcklbw %%xmm7, %%xmm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\

150 "punpcklbw %%xmm7, %%xmm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\

151 "paddw %%xmm2, %%xmm0 \n\t"\

152 "paddw %%xmm3, %%xmm0 \n\t"\

153 "psubw %[pw1], %%xmm0 \n\t" /* spatial_score */\

164 /* if(yadctx->mode<2) ... */\

165 "movdqa %[tmp3], %%xmm6 \n\t" /* diff */\

166 "cmpl $2, %[mode] \n\t"\

168 LOAD8("(%["prev2"],%[mrefs],2)", %%xmm2) /* prev2[x-2*refs] */\

169 LOAD8("(%["next2"],%[mrefs],2)", %%xmm4) /* next2[x-2*refs] */\

170 LOAD8("(%["prev2"],%[prefs],2)", %%xmm3) /* prev2[x+2*refs] */\

171 LOAD8("(%["next2"],%[prefs],2)", %%xmm5) /* next2[x+2*refs] */\

172 "paddw %%xmm4, %%xmm2 \n\t"\

173 "paddw %%xmm5, %%xmm3 \n\t"\

174 "psrlw $1, %%xmm2 \n\t" /* b */\

175 "psrlw $1, %%xmm3 \n\t" /* f */\

176 "movdqa %[tmp0], %%xmm4 \n\t" /* c */\

177 "movdqa %[tmp1], %%xmm5 \n\t" /* d */\

178 "movdqa %[tmp2], %%xmm7 \n\t" /* e */\

179 "psubw %%xmm4, %%xmm2 \n\t" /* b-c */\

180 "psubw %%xmm7, %%xmm3 \n\t" /* f-e */\

181 "movdqa %%xmm5, %%xmm0 \n\t"\

182 "psubw %%xmm4, %%xmm5 \n\t" /* d-c */\

183 "psubw %%xmm7, %%xmm0 \n\t" /* d-e */\

184 "movdqa %%xmm2, %%xmm4 \n\t"\

185 "pminsw %%xmm3, %%xmm2 \n\t"\

186 "pmaxsw %%xmm4, %%xmm3 \n\t"\

187 "pmaxsw %%xmm5, %%xmm2 \n\t"\

188 "pminsw %%xmm5, %%xmm3 \n\t"\

189 "pmaxsw %%xmm0, %%xmm2 \n\t" /* max */\

190 "pminsw %%xmm0, %%xmm3 \n\t" /* min */\

191 "pxor %%xmm4, %%xmm4 \n\t"\

192 "pmaxsw %%xmm3, %%xmm6 \n\t"\

193 "psubw %%xmm2, %%xmm4 \n\t" /* -max */\

194 "pmaxsw %%xmm4, %%xmm6 \n\t" /* diff= MAX3(diff, min, -max); */\

197 "movdqa %[tmp1], %%xmm2 \n\t" /* d */\

198 "movdqa %%xmm2, %%xmm3 \n\t"\

199 "psubw %%xmm6, %%xmm2 \n\t" /* d-diff */\

200 "paddw %%xmm6, %%xmm3 \n\t" /* d+diff */\

201 "pmaxsw %%xmm2, %%xmm1 \n\t"\

202 "pminsw %%xmm3, %%xmm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\

203 "packuswb %%xmm1, %%xmm1 \n\t"\

205 :[tmp0]"=m"(tmp0),\

209 :[prev] "r"(prev),\

212 [prefs]"r"((long)refs),\

213 [mrefs]"r"((long)-refs),\

218 __asm__ volatile("movq %%xmm1, %0" :"=m"(*dst));\

226 #define prev2 "prev"

227 #define next2 "cur"

232 #define prev2 "cur"

233 #define next2 "next"

245 #undef FILTER_LINE_FUNC_NAME