2 * Loongson SIMD optimized vp8dsp
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "vp8dsp_mips.h"
25 #include "constants.h"
26 #include "libavutil/mips/mmiutils.h"
/* 6-tap horizontal edge-pixel filter, 4 pixels wide.
 * Loads 4 bytes at src-2 .. src+3, widens u8 -> s16 (punpcklbh with the
 * zero register ftmp0), multiplies each load by the matching filter tap
 * (filter0..filter5) and accumulates with saturating add/sub, matching the
 * tap signs of FILTER_6TAP below.  Adds the rounding constant ff_pw_64,
 * arithmetic-shifts right by ftmp4 (assumed to hold 7 — set by the caller,
 * not visible here; TODO confirm), packs back to u8 and stores 4 bytes.
 * Clobbers ftmp1/ftmp2/ftmp3/ftmp5. */
#define PUT_VP8_EPEL4_H6_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    MMI_ULWC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    MMI_ULWC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
/* 4-tap horizontal edge-pixel filter, 4 pixels wide.
 * Loads 4 bytes at src-1 .. src+2, widens u8 -> s16, multiplies by the
 * inner taps filter1..filter4 and accumulates with saturating arithmetic
 * (tap signs match FILTER_4TAP below).  Adds ff_pw_64, arithmetic-shifts
 * right by ftmp4 (assumed to hold 7 — set by the caller; TODO confirm),
 * packs to u8 and stores 4 bytes.  Clobbers ftmp1/ftmp2/ftmp3/ftmp5.
 *
 * Fix: the filter4 term used wrapping "psubh" while every other EPEL macro
 * in this file uses saturating "psubsh".  Intermediate 4-tap sums can
 * exceed the int16 range (e.g. 93*255 + 50*255), so wrap-around here would
 * diverge from the saturating arithmetic used by the sibling macros. */
#define PUT_VP8_EPEL4_H4_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
/* 6-tap vertical edge-pixel filter, 4 pixels wide.
 * Same arithmetic as PUT_VP8_EPEL4_H6_MMI but steps through rows: src1 is
 * a scratch pointer register walked to src -/+ n*srcstride for the six
 * taps.  ftmp4 is assumed to hold the shift amount 7 (set by the caller;
 * TODO confirm).  Clobbers ftmp1/ftmp2/ftmp3/ftmp5 and src1. */
#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
/* 4-tap vertical edge-pixel filter, 4 pixels wide.
 * Vertical counterpart of PUT_VP8_EPEL4_H4_MMI: src1 is a scratch pointer
 * walked to the rows at src - srcstride .. src + 2*srcstride.  ftmp4 is
 * assumed to hold the shift amount 7 (caller-set; TODO confirm).
 * Clobbers ftmp1/ftmp2/ftmp3/ftmp5 and src1. */
#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
/* 6-tap horizontal edge-pixel filter, 8 pixels wide.
 * 8-wide variant of PUT_VP8_EPEL4_H6_MMI: each unaligned 8-byte load is
 * split into low (punpcklbh) and high (punpckhbh) s16 halves which are
 * filtered in parallel (ftmp5/ftmp6 accumulators, ftmp7/ftmp8 hold the
 * filter0 partial sums).  Rounds with ff_pw_64, shifts right by ftmp4
 * (assumed 7, caller-set; TODO confirm), packs both halves and stores
 * 8 bytes.  Clobbers ftmp1/ftmp2/ftmp3/ftmp5..ftmp8. */
#define PUT_VP8_EPEL8_H6_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    MMI_ULDC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    MMI_ULDC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)
/* 4-tap horizontal edge-pixel filter, 8 pixels wide.
 * 8-wide variant of PUT_VP8_EPEL4_H4_MMI: low/high s16 halves filtered in
 * parallel (ftmp5/ftmp6 accumulators, ftmp7/ftmp8 partial sums), rounded
 * with ff_pw_64, shifted by ftmp4 (assumed 7, caller-set; TODO confirm),
 * packed and stored as 8 bytes.  Clobbers ftmp1/ftmp2/ftmp3/ftmp5..ftmp8. */
#define PUT_VP8_EPEL8_H4_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)
/* 6-tap vertical edge-pixel filter, 8 pixels wide.
 * Vertical counterpart of PUT_VP8_EPEL8_H6_MMI: src1 is a scratch pointer
 * walked to the six source rows; each row is filtered as low/high s16
 * halves.  ftmp4 is assumed to hold the shift amount 7 (caller-set; TODO
 * confirm).  Clobbers ftmp1/ftmp2/ftmp3/ftmp5..ftmp8 and src1. */
#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)
/* 4-tap vertical edge-pixel filter, 8 pixels wide.
 * Vertical counterpart of PUT_VP8_EPEL8_H4_MMI: src1 is a scratch pointer
 * walked to the four source rows; each row is filtered as low/high s16
 * halves.  ftmp4 is assumed to hold the shift amount 7 (caller-set; TODO
 * confirm).  Clobbers ftmp1/ftmp2/ftmp3/ftmp5..ftmp8 and src1. */
#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)
/* Horizontal bilinear filter, 8 pixels wide:
 * dst = (a*src[x] + b*src[x+1] + 4) >> ftmp4, computed on low/high s16
 * halves in parallel.  ftmp4 is assumed to hold 3 (caller-set; TODO
 * confirm).  Clobbers ftmp1/ftmp2/ftmp3/ftmp5/ftmp6. */
#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)
/* Horizontal bilinear filter, 4 pixels wide:
 * dst = (a*src[x] + b*src[x+1] + 4) >> ftmp4.  ftmp4 is assumed to hold 3
 * (caller-set; TODO confirm).  Clobbers ftmp1/ftmp2/ftmp3. */
#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
/* Vertical bilinear filter, 8 pixels wide:
 * dst = (c*src[x] + d*src[x+sstride] + 4) >> ftmp4, computed on low/high
 * s16 halves in parallel; src1 is a scratch pointer for the next row.
 * ftmp4 is assumed to hold 3 (caller-set; TODO confirm).
 * Clobbers ftmp1/ftmp2/ftmp3/ftmp5/ftmp6 and src1. */
#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)
/* Vertical bilinear filter, 4 pixels wide:
 * dst = (c*src[x] + d*src[x+sstride] + 4) >> ftmp4; src1 is a scratch
 * pointer for the next row.  ftmp4 is assumed to hold 3 (caller-set; TODO
 * confirm).  Clobbers ftmp1/ftmp2/ftmp3 and src1. */
#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)
/* Subpel filter taps replicated 4x into 16-bit lanes for pmullh: row i
 * corresponds to subpel position i+1 (of 8), columns are taps 0..5.  The
 * values mirror subpel_filters[] below (e.g. 0x007b == 123).  Rows with
 * zero outer taps are the 4-tap filters. */
DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
{0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
{0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
{0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
{0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
{0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
{0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
{0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
/* C reference 6-tap filter: taps F[0..5] applied at src[x-2*stride] ..
 * src[x+3*stride] with signs -F[1]/-F[4] negative, rounded by +64, >>7,
 * then clamped through the crop table cm (both `cm` and `x` must be in
 * scope at the expansion site). */
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
/* C reference 4-tap filter: inner taps F[1..4] only, same rounding,
 * shift and crop-table clamp as FILTER_6TAP. */
#define FILTER_4TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
/* Scalar subpel filter taps (magnitudes; signs are applied in
 * FILTER_4TAP/FILTER_6TAP).  Row i is subpel position i+1; rows 1, 3 and 5
 * (0-based) are the 6-tap filters, the others have zero outer taps. */
static const uint8_t subpel_filters[7][6] = {
    { 0, 6, 123, 12, 1, 0 },
    { 2, 11, 108, 36, 8, 1 },
    { 0, 9, 93, 50, 6, 0 },
    { 3, 16, 77, 77, 16, 3 },
    { 0, 6, 50, 93, 9, 0 },
    { 1, 8, 36, 108, 11, 2 },
    { 0, 1, 12, 123, 6, 0 },
/* Fixed-point IDCT constants: 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and
 * 35468/65536 ~= sqrt(2)*sin(pi/8), as used by the VP8 inverse transform. */
#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a) (((a) * 35468) >> 16)
/* Clamp n to [-128, 127] via the crop table (cm must point to
 * ff_crop_tab + MAX_NEG_CROP at the expansion site). */
#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
/* Common 4-tap loop-filter core: adjusts the two pixels p0/q0 either side
 * of the edge at p by f2/f1 derived from a = clip(3*(q0-p0) + clip(p1-q1))
 * (the `a = 3*(q0-p0)` and clip steps are not visible in this view — TODO
 * confirm against the full file). */
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    a += clip_int8(p1 - q1);
    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;
    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
/* Loop-filter core for the non-4-tap case: like the is4tap variant but
 * additionally nudges the outer pixels p1/q1 by a (presumably
 * a = (f1 + 1) >> 1, computed in lines not visible here — TODO confirm). */
static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;
    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
/* Simple edge-strength test: nonzero when 2*|p0-q0| + |p1-q1|/2 <= flim,
 * i.e. the edge is weak enough to filter. */
static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
/* High-edge-variance test: nonzero when either |p1-p0| or |q1-q0| exceeds
 * thresh, selecting the stronger 4-tap filter path. */
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
/* Strong macroblock-edge filter: derives w = clip(clip(p1-q1) + 3*(q0-p0))
 * and applies graded corrections a0/a1/a2 (weights 27/18/9, rounding 63,
 * >>7) to the three pixels on each side of the edge, clamped via cm. */
static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));
    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = (9 * w + 63) >> 7;
    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
/* Normal filter-limit test: the simple edge limit E plus interior limits —
 * every neighbouring-pixel difference on both sides must be <= I. */
static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];
    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
/* Vertical (horizontal-edge) macroblock loop filter over 8 columns:
 * per-column limit check, then 4-tap filter on high edge variance,
 * otherwise the strong mbedge filter (the `else` joining the two calls is
 * in lines not visible here — TODO confirm). */
static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
    ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            if (hev(dst + i * 1, stride, hev_thresh))
                vp8_filter_common_is4tap(dst + i * 1, stride);
                filter_mbedge(dst + i * 1, stride);
/* Vertical inner-edge loop filter over 8 columns: chooses the is4tap or
 * isnot4tap common filter based on the per-column hev() result (the
 * branch on `hv` is in lines not visible here — TODO confirm). */
static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
    ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
                vp8_filter_common_is4tap(dst + i * 1, stride);
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
/* Horizontal (vertical-edge) macroblock loop filter over 8 rows: same
 * logic as vp8_v_loop_filter8_mmi with pixel stride 1 along each row (the
 * `else` joining the two calls is in lines not visible here — TODO
 * confirm). */
static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
    ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            if (hev(dst + i * stride, 1, hev_thresh))
                vp8_filter_common_is4tap(dst + i * stride, 1);
                filter_mbedge(dst + i * stride, 1);
/* Horizontal inner-edge loop filter over 8 rows: same logic as
 * vp8_v_loop_filter8_inner_mmi with pixel stride 1 along each row (the
 * branch on `hv` is in lines not visible here — TODO confirm). */
static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
    ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
                vp8_filter_common_is4tap(dst + i * stride, 1);
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
/* Inverse Walsh-Hadamard transform of the 16 luma DC coefficients.
 * The view below contains what appear to be alternative implementations
 * (an MMI inline-asm path plus a pure-C reference) — the preprocessor
 * conditionals and asm(...) scaffolding selecting between them are in
 * lines not visible here; TODO confirm against the full file.
 * Rows are transformed in-place in dc[], then the column pass writes the
 * rounded result ((. + 3) >> 3) into the DC slot block[y][x][0], and dc[]
 * is cleared for the next macroblock. */
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
    // MMI row pass: butterflies on four 4x16-bit vectors of dc[]
    MMI_LDC1(%[ftmp0], %[dc], 0x00)
    MMI_LDC1(%[ftmp1], %[dc], 0x08)
    MMI_LDC1(%[ftmp2], %[dc], 0x10)
    MMI_LDC1(%[ftmp3], %[dc], 0x18)
    "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
    "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
    "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
    "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
    "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
    "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
    "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
    "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
    MMI_SDC1(%[ftmp0], %[dc], 0x00)
    MMI_SDC1(%[ftmp1], %[dc], 0x08)
    MMI_SDC1(%[ftmp2], %[dc], 0x10)
    MMI_SDC1(%[ftmp3], %[dc], 0x18)
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),
      [ftmp7]"=&f"(ftmp[7])
    : [dc]"r"((uint8_t*)dc)
    // Column pass in C: butterfly + round and scatter into the DC slots
    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
    // Zero dc[] (32 bytes) with MMI stores
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    MMI_SDC1(%[ftmp0], %[dc], 0x00)
    MMI_SDC1(%[ftmp0], %[dc], 0x08)
    MMI_SDC1(%[ftmp0], %[dc], 0x10)
    MMI_SDC1(%[ftmp0], %[dc], 0x18)
    [ftmp0]"=&f"(ftmp[0])
    : [dc]"r"((uint8_t *)dc)
    // C reference: row pass into t00..t33, then column pass (below)
    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];
    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];
    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];
    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];
    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
/* DC-only inverse WHT: when only dc[0] is nonzero every output equals
 * (dc[0] + 3) >> 3, so broadcast that value into all 16 DC slots
 * (dc[0] is presumably cleared in a line not visible here — TODO confirm). */
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
    int val = (dc[0] + 3) >> 3;
    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
/* 4x4 VP8 inverse DCT + add to dst.  The view below contains what appear
 * to be alternative implementations (MMI inline asm plus a C reference);
 * the preprocessor conditionals and asm(...) scaffolding are in lines not
 * visible here — TODO confirm against the full file.
 * MMI path: ff_ph_22a3 (0x22a3 = 35468>>2 style constant used with
 * psllh-by-2 + pmulhh) and ff_ph_4e7b (0x4e7b = 20091) implement
 * MUL_35468/MUL_20091 per 16-bit lane; two butterfly passes with a
 * TRANSPOSE_4H between them, rounding +4 and >>3, then the result is
 * added to the four dst rows and clamped via packushb.  The block is
 * zeroed after the first pass. */
void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    MMI_LDC1(%[ftmp1], %[block], 0x00)
    MMI_LDC1(%[ftmp2], %[block], 0x08)
    MMI_LDC1(%[ftmp3], %[block], 0x10)
    MMI_LDC1(%[ftmp4], %[block], 0x18)
    // ftmp11 = 2: shift used to pre-scale before pmulhh (see ff_ph_22a3)
    "li %[tmp0], 0x02 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    // block[0...3] + block[8...11]
    "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
    // block[0...3] - block[8...11]
    "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
    // MUL_35468(block[12...15])
    "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
    "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
    // MUL_35468(block[4...7])
    "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
    "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
    // MUL_20091(block[4...7]
    "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
    "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
    // MUL_20091(block[12...15])
    "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
    "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
    "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
    "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
    "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
    "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
    "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
    "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
    "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
    "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
    // clear the coefficient block for the next call
    MMI_SDC1(%[ftmp0], %[block], 0x00)
    MMI_SDC1(%[ftmp0], %[block], 0x08)
    MMI_SDC1(%[ftmp0], %[block], 0x10)
    MMI_SDC1(%[ftmp0], %[block], 0x18)
    TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                 %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10])
    // second (column) pass, same butterfly structure
    "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
    "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
    "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
    "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
    "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
    "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
    "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
    "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
    "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
    "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
    "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
    "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
    // round (+4) and shift (>>3) the final values
    "li %[tmp0], 0x03 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
    "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
    "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
    "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
    "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
    "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
    "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
    "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
    "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
    "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
    "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
    "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
    TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                 %[ftmp9], %[tmp0], %[ftmp0], %[ftmp10])
    // add the residual to the four destination rows and clamp to u8
    MMI_LWC1(%[ftmp5], %[dst0], 0x00)
    MMI_LWC1(%[ftmp6], %[dst1], 0x00)
    MMI_LWC1(%[ftmp7], %[dst2], 0x00)
    MMI_LWC1(%[ftmp8], %[dst3], 0x00)
    "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
    "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
    "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
    "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
    "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
    "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
    "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
    "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
    "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    MMI_SWC1(%[ftmp1], %[dst0], 0x00)
    MMI_SWC1(%[ftmp2], %[dst1], 0x00)
    MMI_SWC1(%[ftmp3], %[dst2], 0x00)
    MMI_SWC1(%[ftmp4], %[dst3], 0x00)
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
    : [dst0]"r"(dst), [dst1]"r"(dst+stride),
      [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
      [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
      [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
    // C reference path: two-pass butterfly via tmp[], then add + clip
    int i, t0, t1, t2, t3;
    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1006 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
/* Add a DC-only 4x4 inverse transform to dst: all 16 pixels receive the
 * same rounded bias dc = (block[0] + 4) >> 3, clamped to [0, 255]. */
1009 int dc = (block[0] + 4) >> 3;
/* MMI path: broadcast dc into all four 16-bit lanes of ftmp5, add it to
 * each unpacked 4-pixel row, then repack with unsigned saturation. */
1016 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1017 "mtc1 %[dc], %[ftmp5] \n\t"
1018 MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1019 MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1020 MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1021 MMI_LWC1(%[ftmp4], %[dst3], 0x00)
/* pshufh with a zero selector replicates the low halfword of ftmp5 */
1022 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
/* widen each row's 4 bytes to 4 halfwords (zero high half) */
1023 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1024 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1025 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1026 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1027 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1028 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1029 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1030 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/* packushb clamps the 16-bit sums back to unsigned bytes */
1031 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1032 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1033 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1034 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1035 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1036 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1037 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1038 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1039 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1040 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1041 [ftmp4]"=&f"(ftmp[4]),
1043 [ftmp5]"=&f"(ftmp[5])
1044 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1045 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
/* Plain-C reference path: identical arithmetic, one row per iteration. */
1050 int i, dc = (block[0] + 4) >> 3;
1054 for (i = 0; i < 4; i++) {
1055 dst[0] = av_clip_uint8(dst[0] + dc);
1056 dst[1] = av_clip_uint8(dst[1] + dc);
1057 dst[2] = av_clip_uint8(dst[2] + dc);
1058 dst[3] = av_clip_uint8(dst[3] + dc);
void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    int i;

    /* Luma: four DC-only 4x4 blocks laid out side by side (16 pixels
     * across one edge), 4 columns apart. */
    for (i = 0; i < 4; i++)
        ff_vp8_idct_dc_add_mmi(dst + 4 * i, block[i], stride);
}
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    /* Chroma: four DC-only 4x4 blocks arranged as a 2x2 grid covering an
     * 8x8 area; advance four rows between the two block rows. */
    ff_vp8_idct_dc_add_mmi(dst,     block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + 4, block[1], stride);
    dst += 4 * stride;
    ff_vp8_idct_dc_add_mmi(dst,     block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 4, block[3], stride);
}
// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    int col;

    /* Filter each of the 16 luma columns that cross the horizontal
     * macroblock edge; successive taps of a column are `stride` apart. */
    for (col = 0; col < 16; col++) {
        uint8_t *p = dst + col;

        if (!vp8_normal_limit(p, stride, flim_E, flim_I))
            continue;
        if (hev(p, stride, hev_thresh))
            vp8_filter_common_is4tap(p, stride);
        else
            filter_mbedge(p, stride);
    }
}
void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    int row;

    /* Filter the 16 luma rows that cross the vertical macroblock edge;
     * within a row the filter taps are contiguous, so the step is 1. */
    for (row = 0; row < 16; row++) {
        uint8_t *p = dst + row * stride;

        if (!vp8_normal_limit(p, 1, flim_E, flim_I))
            continue;
        if (hev(p, 1, hev_thresh))
            vp8_filter_common_is4tap(p, 1);
        else
            filter_mbedge(p, 1);
    }
}
void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Run the 8-wide vertical MB-edge filter on both chroma planes. */
    for (i = 0; i < 2; i++)
        vp8_v_loop_filter8_mmi(plane[i], stride, flim_E, flim_I, hev_thresh);
}
void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Run the 8-wide horizontal MB-edge filter on both chroma planes. */
    for (i = 0; i < 2; i++)
        vp8_h_loop_filter8_mmi(plane[i], stride, flim_E, flim_I, hev_thresh);
}
// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int col;

    /* Inner (subblock) edges use the weaker filter pair: the 4-tap form
     * on high-edge-variance columns, the non-4-tap form otherwise. */
    for (col = 0; col < 16; col++) {
        uint8_t *p = dst + col;

        if (!vp8_normal_limit(p, stride, flim_E, flim_I))
            continue;
        if (hev(p, stride, hev_thresh))
            vp8_filter_common_is4tap(p, stride);
        else
            vp8_filter_common_isnot4tap(p, stride);
    }
}
void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int row;

    /* Horizontal inner-edge counterpart: one row at a time, pixel step 1. */
    for (row = 0; row < 16; row++) {
        uint8_t *p = dst + row * stride;

        if (!vp8_normal_limit(p, 1, flim_E, flim_I))
            continue;
        if (hev(p, 1, hev_thresh))
            vp8_filter_common_is4tap(p, 1);
        else
            vp8_filter_common_isnot4tap(p, 1);
    }
}
void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Inner vertical chroma edges: filter both planes identically. */
    for (i = 0; i < 2; i++)
        vp8_v_loop_filter8_inner_mmi(plane[i], stride, flim_E, flim_I,
                                     hev_thresh);
}
void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    uint8_t *plane[2] = { dstU, dstV };
    int i;

    /* Inner horizontal chroma edges: filter both planes identically. */
    for (i = 0; i < 2; i++)
        vp8_h_loop_filter8_inner_mmi(plane[i], stride, flim_E, flim_I,
                                     hev_thresh);
}
void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int col;

    /* Simple-filter mode: a single limit test per column and always the
     * 4-tap common filter; no high-edge-variance branch. */
    for (col = 0; col < 16; col++) {
        uint8_t *p = dst + col;

        if (vp8_simple_limit(p, stride, flim))
            vp8_filter_common_is4tap(p, stride);
    }
}
void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int row;

    /* Horizontal simple filter: one row per iteration, pixel step 1. */
    for (row = 0; row < 16; row++) {
        uint8_t *p = dst + row * stride;

        if (vp8_simple_limit(p, 1, flim))
            vp8_filter_common_is4tap(p, 1);
    }
}
1188 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1189 ptrdiff_t srcstride, int h, int x, int y)
/* Plain 16xh block copy (fullpel case; the x/y subpel arguments are unused).
 * MMI path copies two rows per loop pass: the low 8 bytes of each row go
 * through an FP register (MMI_ULDC1/MMI_SDC1), the high 8 bytes through a
 * GPR using unaligned ldl/ldr + sdl/sdr pairs.
 * NOTE(review): appears to assume h is even — confirm against callers. */
1199 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1200 MMI_ULDC1(%[ftmp0], %[src], 0x00)
/* ldl/ldr together load bytes 8..15 of the row regardless of alignment */
1201 "ldl %[tmp0], 0x0f(%[src]) \n\t"
1202 "ldr %[tmp0], 0x08(%[src]) \n\t"
1203 MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1204 "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1205 "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1206 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1207 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1208 "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1209 "sdr %[tmp0], 0x08(%[dst]) \n\t"
/* two rows are copied per pass, so h steps down by 2 */
1210 "addiu %[h], %[h], -0x02 \n\t"
1211 MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1212 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1213 "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1214 "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1215 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1216 "bnez %[h], 1b \n\t"
1217 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1218 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1220 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1221 [dst]"+&r"(dst), [src]"+&r"(src),
1223 : [dststride]"r"((mips_reg)dststride),
1224 [srcstride]"r"((mips_reg)srcstride)
/* Plain-C reference path: straightforward row-by-row memcpy. */
1230 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1231 memcpy(dst, src, 16);
1235 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1236 ptrdiff_t srcstride, int h, int x, int y)
/* Plain 8xh block copy (fullpel case; x/y unused).  Two rows per pass:
 * the first through an FP register, the second through a GPR using
 * unaligned ldl/ldr + sdl/sdr pairs. */
1246 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1247 MMI_ULDC1(%[ftmp0], %[src], 0x00)
/* second row (src + srcstride) loaded unaligned into tmp0 */
1248 "ldl %[tmp0], 0x07(%[addr0]) \n\t"
1249 "ldr %[tmp0], 0x00(%[addr0]) \n\t"
1250 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1251 MMI_SDC1(%[ftmp0], %[dst], 0x00)
/* two rows per pass */
1252 "addiu %[h], %[h], -0x02 \n\t"
1253 "sdl %[tmp0], 0x07(%[addr1]) \n\t"
1254 "sdr %[tmp0], 0x00(%[addr1]) \n\t"
1255 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1256 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1257 "bnez %[h], 1b \n\t"
1258 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1260 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1261 [dst]"+&r"(dst), [src]"+&r"(src),
1263 : [dststride]"r"((mips_reg)dststride),
1264 [srcstride]"r"((mips_reg)srcstride)
/* Plain-C reference path. */
1270 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1271 memcpy(dst, src, 8);
1275 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1276 ptrdiff_t srcstride, int h, int x, int y)
/* Plain 4xh block copy (fullpel case; x/y unused).  Two rows per pass:
 * one via an FP register, one via a GPR with unaligned lwl/lwr + swl/swr. */
1286 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1287 MMI_LWC1(%[ftmp0], %[src], 0x00)
1288 "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1289 "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1290 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1291 MMI_SWC1(%[ftmp0], %[dst], 0x00)
/* two rows per pass */
1292 "addiu %[h], %[h], -0x02 \n\t"
1293 "swl %[tmp0], 0x03(%[addr1]) \n\t"
1294 "swr %[tmp0], 0x00(%[addr1]) \n\t"
1295 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1296 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1297 "bnez %[h], 1b \n\t"
1298 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1300 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1301 [dst]"+&r"(dst), [src]"+&r"(src),
1303 : [dststride]"r"((mips_reg)dststride),
1304 [srcstride]"r"((mips_reg)srcstride)
/* Plain-C reference path. */
1310 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1311 memcpy(dst, src, 4);
1315 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1316 ptrdiff_t srcstride, int h, int mx, int my)
/* 16-wide horizontal 4-tap VP8 subpel filter; rounding is (sum + 64) >> 7.
 * Taps come from fourtap_subpel_filters[mx - 1].  The dst[...] reference
 * lines spell out the per-pixel formula that PUT_VP8_EPEL8_H4_MMI
 * evaluates 8 pixels at a time. */
1319 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1322 mips_reg src1, dst1;
1326 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1327 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1328 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1329 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1330 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1331 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1332 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1333 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1335 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1336 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1337 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1338 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1339 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1340 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1341 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1342 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 holds the final shift count (7) */
1345 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1346 "li %[tmp0], 0x07 \n\t"
1347 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* first 8 pixels of the row */
1351 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1352 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1353 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
/* second 8 pixels, through src+8 / dst+8 */
1355 PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1357 "addiu %[h], %[h], -0x01 \n\t"
1358 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1359 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1360 "bnez %[h], 1b \n\t"
1361 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1362 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1363 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1364 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1365 [ftmp8]"=&f"(ftmp[8]),
1366 [tmp0]"=&r"(tmp[0]),
1368 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1370 [dst]"+&r"(dst), [src]"+&r"(src)
1371 : [ff_pw_64]"f"(ff_pw_64),
1372 [srcstride]"r"((mips_reg)srcstride),
1373 [dststride]"r"((mips_reg)dststride),
1374 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1375 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C reference path using the scalar FILTER_4TAP macro. */
1379 const uint8_t *filter = subpel_filters[mx - 1];
1380 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1383 for (y = 0; y < h; y++) {
1384 for (x = 0; x < 16; x++)
1385 dst[x] = FILTER_4TAP(src, filter, 1);
1392 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1393 ptrdiff_t srcstride, int h, int mx, int my)
/* 8-wide horizontal 4-tap VP8 subpel filter; rounding (sum + 64) >> 7.
 * One PUT_VP8_EPEL8_H4_MMI invocation per row. */
1396 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1402 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1403 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1404 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1405 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1406 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1407 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1408 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1409 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1412 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1413 "li %[tmp0], 0x07 \n\t"
1414 "mtc1 %[tmp0], %[ftmp4] \n\t"
1417 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1419 "addiu %[h], %[h], -0x01 \n\t"
1420 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1421 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1422 "bnez %[h], 1b \n\t"
1423 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1424 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1425 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1426 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1427 [ftmp8]"=&f"(ftmp[8]),
1428 [tmp0]"=&r"(tmp[0]),
1431 [dst]"+&r"(dst), [src]"+&r"(src)
1432 : [ff_pw_64]"f"(ff_pw_64),
1433 [srcstride]"r"((mips_reg)srcstride),
1434 [dststride]"r"((mips_reg)dststride),
1435 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1436 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C reference path. */
1440 const uint8_t *filter = subpel_filters[mx - 1];
1441 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1444 for (y = 0; y < h; y++) {
1445 for (x = 0; x < 8; x++)
1446 dst[x] = FILTER_4TAP(src, filter, 1);
1453 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1454 ptrdiff_t srcstride, int h, int mx, int my)
/* 4-wide horizontal 4-tap VP8 subpel filter; rounding (sum + 64) >> 7.
 * One PUT_VP8_EPEL4_H4_MMI invocation per row. */
1457 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1463 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1464 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1465 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1466 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1469 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1470 "li %[tmp0], 0x07 \n\t"
1471 "mtc1 %[tmp0], %[ftmp4] \n\t"
1474 PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1476 "addiu %[h], %[h], -0x01 \n\t"
1477 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1478 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1479 "bnez %[h], 1b \n\t"
1480 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1481 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1482 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1483 [tmp0]"=&r"(tmp[0]),
1486 [dst]"+&r"(dst), [src]"+&r"(src)
1487 : [ff_pw_64]"f"(ff_pw_64),
1488 [srcstride]"r"((mips_reg)srcstride),
1489 [dststride]"r"((mips_reg)dststride),
1490 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1491 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C reference path. */
1495 const uint8_t *filter = subpel_filters[mx - 1];
1496 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1499 for (y = 0; y < h; y++) {
1500 for (x = 0; x < 4; x++)
1501 dst[x] = FILTER_4TAP(src, filter, 1);
1508 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1509 ptrdiff_t srcstride, int h, int mx, int my)
/* 16-wide horizontal 6-tap VP8 subpel filter; rounding (sum + 64) >> 7.
 * Six taps read src[-2..+3]; the dst[...] lines are the per-pixel
 * reference formula evaluated 8 pixels at a time by
 * PUT_VP8_EPEL8_H6_MMI. */
1512 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1515 mips_reg src1, dst1;
1519 dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1520 dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1521 dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1522 dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1523 dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1524 dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1525 dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1526 dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1528 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1529 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1530 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1531 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1532 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1533 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1534 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1535 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1538 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1539 "li %[tmp0], 0x07 \n\t"
1540 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* first 8 pixels of the row */
1544 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1545 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1546 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
/* second 8 pixels, through src+8 / dst+8 */
1548 PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1550 "addiu %[h], %[h], -0x01 \n\t"
1551 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1552 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1553 "bnez %[h], 1b \n\t"
1554 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1555 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1556 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1557 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1558 [ftmp8]"=&f"(ftmp[8]),
1559 [tmp0]"=&r"(tmp[0]),
1561 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1563 [dst]"+&r"(dst), [src]"+&r"(src)
1564 : [ff_pw_64]"f"(ff_pw_64),
1565 [srcstride]"r"((mips_reg)srcstride),
1566 [dststride]"r"((mips_reg)dststride),
1567 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1568 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1569 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C reference path using the scalar FILTER_6TAP macro. */
1573 const uint8_t *filter = subpel_filters[mx - 1];
1574 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1577 for (y = 0; y < h; y++) {
1578 for (x = 0; x < 16; x++)
1579 dst[x] = FILTER_6TAP(src, filter, 1);
1586 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1587 ptrdiff_t srcstride, int h, int mx, int my)
/* 8-wide horizontal 6-tap VP8 subpel filter; rounding (sum + 64) >> 7.
 * One PUT_VP8_EPEL8_H6_MMI invocation per row, taps at src[-2..+3]. */
1590 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1596 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1597 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1598 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1599 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1600 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1601 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1602 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1603 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1606 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1607 "li %[tmp0], 0x07 \n\t"
1608 "mtc1 %[tmp0], %[ftmp4] \n\t"
1611 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1613 "addiu %[h], %[h], -0x01 \n\t"
1614 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1615 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1616 "bnez %[h], 1b \n\t"
1617 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1618 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1619 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1620 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1621 [ftmp8]"=&f"(ftmp[8]),
1622 [tmp0]"=&r"(tmp[0]),
1625 [dst]"+&r"(dst), [src]"+&r"(src)
1626 : [ff_pw_64]"f"(ff_pw_64),
1627 [srcstride]"r"((mips_reg)srcstride),
1628 [dststride]"r"((mips_reg)dststride),
1629 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1630 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1631 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C reference path. */
1635 const uint8_t *filter = subpel_filters[mx - 1];
1636 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1639 for (y = 0; y < h; y++) {
1640 for (x = 0; x < 8; x++)
1641 dst[x] = FILTER_6TAP(src, filter, 1);
1648 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1649 ptrdiff_t srcstride, int h, int mx, int my)
/* 4-wide horizontal 6-tap VP8 subpel filter; rounding (sum + 64) >> 7.
 * One PUT_VP8_EPEL4_H6_MMI invocation per row, taps at src[-2..+3]. */
1652 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1658 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1659 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1660 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1661 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1664 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1665 "li %[tmp0], 0x07 \n\t"
1666 "mtc1 %[tmp0], %[ftmp4] \n\t"
1669 PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1671 "addiu %[h], %[h], -0x01 \n\t"
1672 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1673 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1674 "bnez %[h], 1b \n\t"
1675 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1676 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1677 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1678 [tmp0]"=&r"(tmp[0]),
1681 [dst]"+&r"(dst), [src]"+&r"(src)
1682 : [ff_pw_64]"f"(ff_pw_64),
1683 [srcstride]"r"((mips_reg)srcstride),
1684 [dststride]"r"((mips_reg)dststride),
1685 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1686 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1687 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C reference path. */
1691 const uint8_t *filter = subpel_filters[mx - 1];
1692 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1695 for (y = 0; y < h; y++) {
1696 for (x = 0; x < 4; x++)
1697 dst[x] = FILTER_6TAP(src, filter, 1);
1704 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1705 ptrdiff_t srcstride, int h, int mx, int my)
1708 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1711 mips_reg src0, src1, dst0;
1715 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1716 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1717 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1718 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1719 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1720 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1721 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1722 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1724 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
1725 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
1726 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
1727 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
1728 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
1729 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
1730 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
1731 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
1734 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1735 "li %[tmp0], 0x07 \n\t"
1736 "mtc1 %[tmp0], %[ftmp4] \n\t"
1740 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
1741 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
1742 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
1744 PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst], %[srcstride])
1746 "addiu %[h], %[h], -0x01 \n\t"
1747 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1748 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1749 "bnez %[h], 1b \n\t"
1750 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1751 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1752 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1753 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1754 [ftmp8]"=&f"(ftmp[8]),
1755 [tmp0]"=&r"(tmp[0]),
1757 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
1760 [dst]"+&r"(dst), [src]"+&r"(src)
1761 : [ff_pw_64]"f"(ff_pw_64),
1762 [srcstride]"r"((mips_reg)srcstride),
1763 [dststride]"r"((mips_reg)dststride),
1764 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1765 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1769 const uint8_t *filter = subpel_filters[my - 1];
1770 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1773 for (y = 0; y < h; y++) {
1774 for (x = 0; x < 16; x++)
1775 dst[x] = FILTER_4TAP(src, filter, srcstride);
1782 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1783 ptrdiff_t srcstride, int h, int mx, int my)
/* 8-wide vertical 4-tap VP8 subpel filter: taps read rows at -srcstride,
 * 0, +srcstride and +2*srcstride; rounding (sum + 64) >> 7. */
1786 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1793 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1794 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1795 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1796 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1797 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1798 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1799 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1800 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1803 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1804 "li %[tmp0], 0x07 \n\t"
1805 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* src1 is scratch used by the macro to address the neighbouring rows */
1808 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
1810 "addiu %[h], %[h], -0x01 \n\t"
1811 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1812 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1813 "bnez %[h], 1b \n\t"
1814 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1815 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1816 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1817 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1818 [ftmp8]"=&f"(ftmp[8]),
1819 [tmp0]"=&r"(tmp[0]),
1823 [dst]"+&r"(dst), [src]"+&r"(src)
1824 : [ff_pw_64]"f"(ff_pw_64),
1825 [srcstride]"r"((mips_reg)srcstride),
1826 [dststride]"r"((mips_reg)dststride),
1827 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1828 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C reference path. */
1832 const uint8_t *filter = subpel_filters[my - 1];
1833 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1836 for (y = 0; y < h; y++) {
1837 for (x = 0; x < 8; x++)
1838 dst[x] = FILTER_4TAP(src, filter, srcstride);
1845 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1846 ptrdiff_t srcstride, int h, int mx, int my)
/* 4-wide vertical 4-tap VP8 subpel filter: taps read rows at -srcstride,
 * 0, +srcstride and +2*srcstride; rounding (sum + 64) >> 7. */
1849 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1856 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1857 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1858 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1859 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
/* ftmp0 = 0 for unpacking; ftmp4 = shift count 7 */
1862 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1863 "li %[tmp0], 0x07 \n\t"
1864 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* src1 is scratch used by the macro to address the neighbouring rows */
1867 PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
1869 "addiu %[h], %[h], -0x01 \n\t"
1870 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1871 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1872 "bnez %[h], 1b \n\t"
1873 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1874 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1875 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1876 [tmp0]"=&r"(tmp[0]),
1880 [dst]"+&r"(dst), [src]"+&r"(src)
1881 : [ff_pw_64]"f"(ff_pw_64),
1882 [srcstride]"r"((mips_reg)srcstride),
1883 [dststride]"r"((mips_reg)dststride),
1884 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1885 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C reference path. */
1889 const uint8_t *filter = subpel_filters[my - 1];
1890 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1893 for (y = 0; y < h; y++) {
1894 for (x = 0; x < 4; x++)
1895 dst[x] = FILTER_4TAP(src, filter, srcstride);
/*
 * VP8 six-tap vertical sub-pixel interpolation over a 16-pixel-wide block:
 * the MMI path processes two 8-wide halves per row with PUT_VP8_EPEL8_V6_MMI.
 * NOTE(review): interleaved lines (preprocessor conditionals, declarations,
 * braces, asm open/close) are missing from this excerpt; comments only added.
 */
1902 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1903 ptrdiff_t srcstride, int h, int mx, int my)
/* NOTE(review): the table is named "fourtap" yet six taps filter[0..5] are
 * read below — verify the table layout actually provides all six. */
1906 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1909 mips_reg src0, src1, dst0;
/* Scalar reference: 6 taps at rows -2..+3, +64 bias, >> 7, crop-table clamp.
 * Presumably a kept reference formula (comment markers not visible here). */
1913 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
1914 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
1915 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
1916 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
1917 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
1918 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
1919 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
1920 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
1922 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
1923 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
1924 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
1925 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
1926 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
1927 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
1928 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
1929 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
/* ftmp0 := 0 (unpack zero); ftmp4 := 7 (post-bias shift count). */
1932 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1933 "li %[tmp0], 0x07 \n\t"
1934 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* First 8-wide half of the row... */
1938 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
/* ...then the second half at byte offset +8. */
1939 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
1940 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
1942 PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
1944 "addiu %[h], %[h], -0x01 \n\t"
1945 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1946 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1947 "bnez %[h], 1b \n\t"
1948 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1949 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1950 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1951 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1952 [ftmp8]"=&f"(ftmp[8]),
1953 [tmp0]"=&r"(tmp[0]),
1955 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
1958 [dst]"+&r"(dst), [src]"+&r"(src)
1959 : [ff_pw_64]"f"(ff_pw_64),
1960 [srcstride]"r"((mips_reg)srcstride),
1961 [dststride]"r"((mips_reg)dststride),
1962 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1963 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1964 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback over the full 16-wide block. */
1968 const uint8_t *filter = subpel_filters[my - 1];
1969 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1972 for (y = 0; y < h; y++) {
1973 for (x = 0; x < 16; x++)
1974 dst[x] = FILTER_6TAP(src, filter, srcstride);
/*
 * VP8 six-tap vertical sub-pixel interpolation over an 8-pixel-wide block
 * (MMI path plus plain-C fallback).
 * NOTE(review): interleaved lines (preprocessor conditionals, declarations,
 * braces, asm open/close) are missing from this excerpt; comments only added.
 */
1981 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1982 ptrdiff_t srcstride, int h, int mx, int my)
/* NOTE(review): "fourtap" table name but six taps filter[0..5] are read
 * below — confirm the table layout supplies all six. */
1985 const uint64_t *filter = fourtap_subpel_filters[my - 1];
/* Scalar reference formula: taps at rows -2..+3, +64 bias, >> 7, clamp.
 * Presumably kept as documentation (comment markers not visible here). */
1992 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
1993 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
1994 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
1995 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
1996 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
1997 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
1998 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
1999 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
/* ftmp0 := 0 (unpack zero); ftmp4 := 7 (post-bias shift count). */
2002 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2003 "li %[tmp0], 0x07 \n\t"
2004 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* One 8-wide output row per loop iteration. */
2007 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2009 "addiu %[h], %[h], -0x01 \n\t"
2010 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2011 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2012 "bnez %[h], 1b \n\t"
2013 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2014 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2015 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2016 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2017 [ftmp8]"=&f"(ftmp[8]),
2018 [tmp0]"=&r"(tmp[0]),
2022 [dst]"+&r"(dst), [src]"+&r"(src)
2023 : [ff_pw_64]"f"(ff_pw_64),
2024 [srcstride]"r"((mips_reg)srcstride),
2025 [dststride]"r"((mips_reg)dststride),
2026 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2027 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2028 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
2032 const uint8_t *filter = subpel_filters[my - 1];
2033 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2036 for (y = 0; y < h; y++) {
2037 for (x = 0; x < 8; x++)
2038 dst[x] = FILTER_6TAP(src, filter, srcstride);
/*
 * VP8 six-tap vertical sub-pixel interpolation over a 4-pixel-wide block
 * (MMI path plus plain-C fallback).
 * NOTE(review): interleaved lines (preprocessor conditionals, declarations,
 * braces, asm open/close) are missing from this excerpt; comments only added.
 */
2045 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2046 ptrdiff_t srcstride, int h, int mx, int my)
/* NOTE(review): "fourtap" table name but six taps filter[0..5] are read
 * below — confirm the table layout supplies all six. */
2049 const uint64_t *filter = fourtap_subpel_filters[my - 1];
/* Scalar reference formula per pixel: taps at rows -2..+3, +64 bias, >> 7,
 * crop-table clamp.  Presumably a kept reference (comment markers elided). */
2056 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2057 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2058 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2059 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
/* ftmp0 := 0 (unpack zero); ftmp4 := 7 (post-bias shift count). */
2062 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2063 "li %[tmp0], 0x07 \n\t"
2064 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* One 4-wide output row per loop iteration. */
2067 PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2069 "addiu %[h], %[h], -0x01 \n\t"
2070 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2071 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2072 "bnez %[h], 1b \n\t"
2073 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2074 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2075 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2076 [tmp0]"=&r"(tmp[0]),
2080 [dst]"+&r"(dst), [src]"+&r"(src)
2081 : [ff_pw_64]"f"(ff_pw_64),
2082 [srcstride]"r"((mips_reg)srcstride),
2083 [dststride]"r"((mips_reg)dststride),
2084 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2085 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2086 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
2090 const uint8_t *filter = subpel_filters[my - 1];
2091 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2094 for (y = 0; y < h; y++) {
2095 for (x = 0; x < 4; x++)
2096 dst[x] = FILTER_6TAP(src, filter, srcstride);
/*
 * VP8 combined 4-tap horizontal + 4-tap vertical interpolation, 16 wide:
 * two passes through an aligned temporary buffer (16 bytes per row).
 * NOTE(review): interleaved lines (preprocessor conditionals, braces,
 * the "src -= srcstride" setup) are missing from this excerpt; comments only.
 */
2103 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2104 ptrdiff_t srcstride, int h, int mx, int my)
2107 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2108 uint8_t *tmp = tmp_array;
/* Horizontal pass produces h + 3 rows: the 4-tap vertical pass reads one
 * row above and two rows below each output row. */
2111 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
/* Skip one 16-byte row so the vertical pass can address row -1. */
2112 tmp = tmp_array + 16;
2113 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme with byte-valued taps. */
2115 const uint8_t *filter = subpel_filters[mx - 1];
2116 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2118 uint8_t tmp_array[560];
2119 uint8_t *tmp = tmp_array;
2123 for (y = 0; y < h + 3; y++) {
2124 for (x = 0; x < 16; x++)
2125 tmp[x] = FILTER_4TAP(src, filter, 1);
2130 tmp = tmp_array + 16;
2131 filter = subpel_filters[my - 1];
2133 for (y = 0; y < h; y++) {
2134 for (x = 0; x < 16; x++)
2135 dst[x] = FILTER_4TAP(tmp, filter, 16);
/*
 * VP8 combined 4-tap horizontal + 4-tap vertical interpolation, 8 wide:
 * two passes through an aligned temporary buffer (8 bytes per row).
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2142 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2143 ptrdiff_t srcstride, int h, int mx, int my)
2146 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2147 uint8_t *tmp = tmp_array;
/* h + 3 rows: the 4-tap vertical pass needs one row above, two below. */
2150 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
/* Skip one 8-byte row so row -1 is addressable in the vertical pass. */
2151 tmp = tmp_array + 8;
2152 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback. */
2154 const uint8_t *filter = subpel_filters[mx - 1];
2155 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2157 uint8_t tmp_array[152];
2158 uint8_t *tmp = tmp_array;
2162 for (y = 0; y < h + 3; y++) {
2163 for (x = 0; x < 8; x++)
2164 tmp[x] = FILTER_4TAP(src, filter, 1);
2169 tmp = tmp_array + 8;
2170 filter = subpel_filters[my - 1];
2172 for (y = 0; y < h; y++) {
2173 for (x = 0; x < 8; x++)
2174 dst[x] = FILTER_4TAP(tmp, filter, 8);
/*
 * VP8 combined 4-tap horizontal + 4-tap vertical interpolation, 4 wide:
 * two passes through a small temporary buffer (4 bytes per row).
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2181 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2182 ptrdiff_t srcstride, int h, int mx, int my)
2185 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2186 uint8_t *tmp = tmp_array;
/* h + 3 rows: the 4-tap vertical pass needs one row above, two below. */
2189 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
/* Skip one 4-byte row so row -1 is addressable in the vertical pass. */
2190 tmp = tmp_array + 4;
2191 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback. */
2193 const uint8_t *filter = subpel_filters[mx - 1];
2194 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2196 uint8_t tmp_array[44];
2197 uint8_t *tmp = tmp_array;
2201 for (y = 0; y < h + 3; y++) {
2202 for (x = 0; x < 4; x++)
2203 tmp[x] = FILTER_4TAP(src, filter, 1);
2207 tmp = tmp_array + 4;
2208 filter = subpel_filters[my - 1];
2210 for (y = 0; y < h; y++) {
2211 for (x = 0; x < 4; x++)
2212 dst[x] = FILTER_4TAP(tmp, filter, 4);
/*
 * VP8 combined 4-tap horizontal + 6-tap vertical interpolation, 16 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2219 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2220 ptrdiff_t srcstride, int h, int mx, int my)
2223 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2224 uint8_t *tmp = tmp_array;
/* The 6-tap vertical pass needs 2 rows above and 3 below each output row,
 * so back up 2 rows and horizontally filter h + 5 rows into tmp. */
2226 src -= 2 * srcstride;
2227 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
/* Skip the two extra top rows (2 * 16 bytes) before filtering vertically. */
2228 tmp = tmp_array + 32;
2229 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2231 const uint8_t *filter = subpel_filters[mx - 1];
2232 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2234 uint8_t tmp_array[592];
2235 uint8_t *tmp = tmp_array;
2237 src -= 2 * srcstride;
2239 for (y = 0; y < h + 5; y++) {
2240 for (x = 0; x < 16; x++)
2241 tmp[x] = FILTER_4TAP(src, filter, 1);
2246 tmp = tmp_array + 32;
2247 filter = subpel_filters[my - 1];
2249 for (y = 0; y < h; y++) {
2250 for (x = 0; x < 16; x++)
2251 dst[x] = FILTER_6TAP(tmp, filter, 16);
/*
 * VP8 combined 4-tap horizontal + 6-tap vertical interpolation, 8 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2258 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2259 ptrdiff_t srcstride, int h, int mx, int my)
2262 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2263 uint8_t *tmp = tmp_array;
/* 6-tap vertical needs 2 rows above / 3 below: back up 2 rows and produce
 * h + 5 horizontally-filtered rows. */
2265 src -= 2 * srcstride;
2266 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
/* Skip the two extra top rows (2 * 8 bytes). */
2267 tmp = tmp_array + 16;
2268 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback. */
2270 const uint8_t *filter = subpel_filters[mx - 1];
2271 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2273 uint8_t tmp_array[168];
2274 uint8_t *tmp = tmp_array;
2276 src -= 2 * srcstride;
2278 for (y = 0; y < h + 5; y++) {
2279 for (x = 0; x < 8; x++)
2280 tmp[x] = FILTER_4TAP(src, filter, 1);
2285 tmp = tmp_array + 16;
2286 filter = subpel_filters[my - 1];
2288 for (y = 0; y < h; y++) {
2289 for (x = 0; x < 8; x++)
2290 dst[x] = FILTER_6TAP(tmp, filter, 8);
/*
 * VP8 combined 4-tap horizontal + 6-tap vertical interpolation, 4 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2297 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2298 ptrdiff_t srcstride, int h, int mx, int my)
2301 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2302 uint8_t *tmp = tmp_array;
/* 6-tap vertical needs 2 rows above / 3 below: back up 2 rows and produce
 * h + 5 horizontally-filtered rows. */
2304 src -= 2 * srcstride;
2305 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
/* Skip the two extra top rows (2 * 4 bytes). */
2306 tmp = tmp_array + 8;
2307 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback. */
2309 const uint8_t *filter = subpel_filters[mx - 1];
2310 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2312 uint8_t tmp_array[52];
2313 uint8_t *tmp = tmp_array;
2315 src -= 2 * srcstride;
2317 for (y = 0; y < h + 5; y++) {
2318 for (x = 0; x < 4; x++)
2319 tmp[x] = FILTER_4TAP(src, filter, 1);
2324 tmp = tmp_array + 8;
2325 filter = subpel_filters[my - 1];
2327 for (y = 0; y < h; y++) {
2328 for (x = 0; x < 4; x++)
2329 dst[x] = FILTER_6TAP(tmp, filter, 4);
/*
 * VP8 combined 6-tap horizontal + 4-tap vertical interpolation, 16 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2336 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2337 ptrdiff_t srcstride, int h, int mx, int my)
2340 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2341 uint8_t *tmp = tmp_array;
/* h + 3 rows: the 4-tap vertical pass needs one row above, two below. */
2344 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
/* Skip one 16-byte row so row -1 is addressable in the vertical pass. */
2345 tmp = tmp_array + 16;
2346 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback. */
2348 const uint8_t *filter = subpel_filters[mx - 1];
2349 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2351 uint8_t tmp_array[560];
2352 uint8_t *tmp = tmp_array;
2356 for (y = 0; y < h + 3; y++) {
2357 for (x = 0; x < 16; x++)
2358 tmp[x] = FILTER_6TAP(src, filter, 1);
2363 tmp = tmp_array + 16;
2364 filter = subpel_filters[my - 1];
2366 for (y = 0; y < h; y++) {
2367 for (x = 0; x < 16; x++)
2368 dst[x] = FILTER_4TAP(tmp, filter, 16);
/*
 * VP8 combined 6-tap horizontal + 4-tap vertical interpolation, 8 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2375 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2376 ptrdiff_t srcstride, int h, int mx, int my)
2379 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2380 uint8_t *tmp = tmp_array;
/* h + 3 rows: the 4-tap vertical pass needs one row above, two below. */
2383 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
/* Skip one 8-byte row so row -1 is addressable in the vertical pass. */
2384 tmp = tmp_array + 8;
2385 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback. */
2387 const uint8_t *filter = subpel_filters[mx - 1];
2388 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2390 uint8_t tmp_array[152];
2391 uint8_t *tmp = tmp_array;
2395 for (y = 0; y < h + 3; y++) {
2396 for (x = 0; x < 8; x++)
2397 tmp[x] = FILTER_6TAP(src, filter, 1);
2402 tmp = tmp_array + 8;
2403 filter = subpel_filters[my - 1];
2405 for (y = 0; y < h; y++) {
2406 for (x = 0; x < 8; x++)
2407 dst[x] = FILTER_4TAP(tmp, filter, 8);
/*
 * VP8 combined 6-tap horizontal + 4-tap vertical interpolation, 4 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2414 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2415 ptrdiff_t srcstride, int h, int mx, int my)
2418 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2419 uint8_t *tmp = tmp_array;
/* h + 3 rows: the 4-tap vertical pass needs one row above, two below. */
2422 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
/* Skip one 4-byte row so row -1 is addressable in the vertical pass. */
2423 tmp = tmp_array + 4;
2424 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback. */
2426 const uint8_t *filter = subpel_filters[mx - 1];
2427 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2429 uint8_t tmp_array[44];
2430 uint8_t *tmp = tmp_array;
2434 for (y = 0; y < h + 3; y++) {
2435 for (x = 0; x < 4; x++)
2436 tmp[x] = FILTER_6TAP(src, filter, 1);
2441 tmp = tmp_array + 4;
2442 filter = subpel_filters[my - 1];
2444 for (y = 0; y < h; y++) {
2445 for (x = 0; x < 4; x++)
2446 dst[x] = FILTER_4TAP(tmp, filter, 4);
/*
 * VP8 combined 6-tap horizontal + 6-tap vertical interpolation, 16 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2453 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2454 ptrdiff_t srcstride, int h, int mx, int my)
2457 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2458 uint8_t *tmp = tmp_array;
/* 6-tap vertical needs 2 rows above / 3 below: back up 2 rows and produce
 * h + 5 horizontally-filtered rows. */
2460 src -= 2 * srcstride;
2461 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
/* Skip the two extra top rows (2 * 16 bytes). */
2462 tmp = tmp_array + 32;
2463 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback. */
2465 const uint8_t *filter = subpel_filters[mx - 1];
2466 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2468 uint8_t tmp_array[592];
2469 uint8_t *tmp = tmp_array;
2471 src -= 2 * srcstride;
2473 for (y = 0; y < h + 5; y++) {
2474 for (x = 0; x < 16; x++)
2475 tmp[x] = FILTER_6TAP(src, filter, 1);
2480 tmp = tmp_array + 32;
2481 filter = subpel_filters[my - 1];
2483 for (y = 0; y < h; y++) {
2484 for (x = 0; x < 16; x++)
2485 dst[x] = FILTER_6TAP(tmp, filter, 16);
/*
 * VP8 combined 6-tap horizontal + 6-tap vertical interpolation, 8 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2492 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2493 ptrdiff_t srcstride, int h, int mx, int my)
2496 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2497 uint8_t *tmp = tmp_array;
/* 6-tap vertical needs 2 rows above / 3 below: back up 2 rows and produce
 * h + 5 horizontally-filtered rows. */
2499 src -= 2 * srcstride;
2500 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
/* Skip the two extra top rows (2 * 8 bytes). */
2501 tmp = tmp_array + 16;
2502 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback. */
2504 const uint8_t *filter = subpel_filters[mx - 1];
2505 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2507 uint8_t tmp_array[168];
2508 uint8_t *tmp = tmp_array;
2510 src -= 2 * srcstride;
2512 for (y = 0; y < h + 5; y++) {
2513 for (x = 0; x < 8; x++)
2514 tmp[x] = FILTER_6TAP(src, filter, 1);
2519 tmp = tmp_array + 16;
2520 filter = subpel_filters[my - 1];
2522 for (y = 0; y < h; y++) {
2523 for (x = 0; x < 8; x++)
2524 dst[x] = FILTER_6TAP(tmp, filter, 8);
/*
 * VP8 combined 6-tap horizontal + 6-tap vertical interpolation, 4 wide.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2531 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2532 ptrdiff_t srcstride, int h, int mx, int my)
2535 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2536 uint8_t *tmp = tmp_array;
/* 6-tap vertical needs 2 rows above / 3 below: back up 2 rows and produce
 * h + 5 horizontally-filtered rows. */
2538 src -= 2 * srcstride;
2539 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
/* Skip the two extra top rows (2 * 4 bytes). */
2540 tmp = tmp_array + 8;
2541 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback. */
2543 const uint8_t *filter = subpel_filters[mx - 1];
2544 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2546 uint8_t tmp_array[52];
2547 uint8_t *tmp = tmp_array;
2549 src -= 2 * srcstride;
2551 for (y = 0; y < h + 5; y++) {
2552 for (x = 0; x < 4; x++)
2553 tmp[x] = FILTER_6TAP(src, filter, 1);
2558 tmp = tmp_array + 8;
2559 filter = subpel_filters[my - 1];
2561 for (y = 0; y < h; y++) {
2562 for (x = 0; x < 4; x++)
2563 dst[x] = FILTER_6TAP(tmp, filter, 4);
/*
 * VP8 horizontal bilinear interpolation, 16 wide: out = (a*p0 + b*p1 + 4) >> 3
 * with weights a = 8 - mx, b = mx.  MMI path does two 8-wide halves per row.
 * NOTE(review): interleaved lines (preprocessor conditionals, declarations,
 * braces, asm open/close) are missing from this excerpt; comments only added.
 */
2570 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2571 ptrdiff_t sstride, int h, int mx, int my)
2574 int a = 8 - mx, b = mx;
2577 mips_reg dst0, src0;
/* Scalar reference for one 16-wide row; presumably kept as documentation
 * (enclosing comment markers not visible in this excerpt). */
2581 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2582 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2583 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2584 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2585 dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2586 dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2587 dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2588 dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2590 dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2591 dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2592 dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2593 dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2594 dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2595 dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2596 dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2597 dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
/* ftmp0 := 0; ftmp4 := 3 (the ">> 3" shift count); broadcast the scalar
 * weights a and b into all four halfwords of their FP registers. */
2600 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2601 "li %[tmp0], 0x03 \n\t"
2602 "mtc1 %[tmp0], %[ftmp4] \n\t"
2603 "pshufh %[a], %[a], %[ftmp0] \n\t"
2604 "pshufh %[b], %[b], %[ftmp0] \n\t"
/* First 8-wide half, then the second half at byte offset +8. */
2608 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2609 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2610 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2612 PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2614 "addiu %[h], %[h], -0x01 \n\t"
2615 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2616 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2617 "bnez %[h], 1b \n\t"
2618 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2619 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2620 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2621 [ftmp6]"=&f"(ftmp[6]),
2622 [tmp0]"=&r"(tmp[0]),
2624 [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2626 [dst]"+&r"(dst), [src]"+&r"(src),
2627 [a]"+&f"(a), [b]"+&f"(b)
2628 : [sstride]"r"((mips_reg)sstride),
2629 [dstride]"r"((mips_reg)dstride),
2630 [ff_pw_4]"f"(ff_pw_4)
/* Plain-C fallback. */
2634 int a = 8 - mx, b = mx;
2637 for (y = 0; y < h; y++) {
2638 for (x = 0; x < 16; x++)
2639 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/*
 * VP8 vertical bilinear interpolation, 16 wide:
 * out = (c*row0 + d*row1 + 4) >> 3 with weights c = 8 - my, d = my.
 * MMI path does two 8-wide halves per row.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2646 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2647 ptrdiff_t sstride, int h, int mx, int my)
2650 int c = 8 - my, d = my;
2653 mips_reg src0, src1, dst0;
/* Scalar reference for one 8-wide half; presumably kept as documentation
 * (enclosing comment markers not visible in this excerpt). */
2657 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2658 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2659 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2660 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2661 dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2662 dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2663 dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2664 dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
/* ftmp0 := 0; ftmp4 := 3 (shift count); broadcast weights c and d. */
2667 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2668 "li %[tmp0], 0x03 \n\t"
2669 "mtc1 %[tmp0], %[ftmp4] \n\t"
2670 "pshufh %[c], %[c], %[ftmp0] \n\t"
2671 "pshufh %[d], %[d], %[ftmp0] \n\t"
/* First 8-wide half, then the second half at byte offset +8. */
2675 PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2676 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2677 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2679 PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2681 "addiu %[h], %[h], -0x01 \n\t"
2682 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2683 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2684 "bnez %[h], 1b \n\t"
2685 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2686 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2687 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2688 [ftmp6]"=&f"(ftmp[6]),
2689 [tmp0]"=&r"(tmp[0]),
2691 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2694 [dst]"+&r"(dst), [src]"+&r"(src),
2695 [c]"+&f"(c), [d]"+&f"(d)
2696 : [sstride]"r"((mips_reg)sstride),
2697 [dstride]"r"((mips_reg)dstride),
2698 [ff_pw_4]"f"(ff_pw_4)
/* Plain-C fallback. */
2702 int c = 8 - my, d = my;
2705 for (y = 0; y < h; y++) {
2706 for (x = 0; x < 16; x++)
2707 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
/*
 * VP8 2-D bilinear interpolation, 16 wide: horizontal pass into a temporary
 * buffer, then vertical pass from it.  h + 1 rows are produced because the
 * vertical pass reads the row below each output row.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2714 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2715 ptrdiff_t sstride, int h, int mx, int my)
2718 DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2719 uint8_t *tmp = tmp_array;
2721 ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2722 ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2724 int a = 8 - mx, b = mx;
2725 int c = 8 - my, d = my;
2727 uint8_t tmp_array[528];
2728 uint8_t *tmp = tmp_array;
2730 for (y = 0; y < h + 1; y++) {
2731 for (x = 0; x < 16; x++)
2732 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2739 for (y = 0; y < h; y++) {
2740 for (x = 0; x < 16; x++)
2741 dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
/*
 * VP8 horizontal bilinear interpolation, 8 wide:
 * out = (a*p0 + b*p1 + 4) >> 3 with weights a = 8 - mx, b = mx.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2748 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2749 ptrdiff_t sstride, int h, int mx, int my)
2752 int a = 8 - mx, b = mx;
/* Scalar reference for one row; presumably kept as documentation
 * (enclosing comment markers not visible in this excerpt). */
2758 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2759 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2760 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2761 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2762 dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2763 dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2764 dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2765 dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
/* ftmp0 := 0; ftmp4 := 3 (shift count); broadcast weights a and b. */
2768 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2769 "li %[tmp0], 0x03 \n\t"
2770 "mtc1 %[tmp0], %[ftmp4] \n\t"
2771 "pshufh %[a], %[a], %[ftmp0] \n\t"
2772 "pshufh %[b], %[b], %[ftmp0] \n\t"
/* One 8-wide row per loop iteration. */
2775 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2777 "addiu %[h], %[h], -0x01 \n\t"
2778 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2779 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2780 "bnez %[h], 1b \n\t"
2781 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2782 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2783 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2784 [ftmp6]"=&f"(ftmp[6]),
2785 [tmp0]"=&r"(tmp[0]),
2788 [dst]"+&r"(dst), [src]"+&r"(src),
2789 [a]"+&f"(a), [b]"+&f"(b)
2790 : [sstride]"r"((mips_reg)sstride),
2791 [dstride]"r"((mips_reg)dstride),
2792 [ff_pw_4]"f"(ff_pw_4)
/* Plain-C fallback. */
2796 int a = 8 - mx, b = mx;
2799 for (y = 0; y < h; y++) {
2800 for (x = 0; x < 8; x++)
2801 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/*
 * VP8 vertical bilinear interpolation, 8 wide:
 * out = (c*row0 + d*row1 + 4) >> 3 with weights c = 8 - my, d = my.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2808 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2809 ptrdiff_t sstride, int h, int mx, int my)
2812 int c = 8 - my, d = my;
/* Scalar reference for one row; presumably kept as documentation
 * (enclosing comment markers not visible in this excerpt). */
2819 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2820 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2821 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2822 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2823 dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2824 dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2825 dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2826 dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
/* ftmp0 := 0; ftmp4 := 3 (shift count); broadcast weights c and d. */
2829 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2830 "li %[tmp0], 0x03 \n\t"
2831 "mtc1 %[tmp0], %[ftmp4] \n\t"
2832 "pshufh %[c], %[c], %[ftmp0] \n\t"
2833 "pshufh %[d], %[d], %[ftmp0] \n\t"
/* One 8-wide row per loop iteration. */
2836 PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2838 "addiu %[h], %[h], -0x01 \n\t"
2839 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2840 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2841 "bnez %[h], 1b \n\t"
2842 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2843 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2844 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2845 [ftmp6]"=&f"(ftmp[6]),
2846 [tmp0]"=&r"(tmp[0]),
2850 [dst]"+&r"(dst), [src]"+&r"(src),
2851 [c]"+&f"(c), [d]"+&f"(d)
2852 : [sstride]"r"((mips_reg)sstride),
2853 [dstride]"r"((mips_reg)dstride),
2854 [ff_pw_4]"f"(ff_pw_4)
/* Plain-C fallback. */
2858 int c = 8 - my, d = my;
2861 for (y = 0; y < h; y++) {
2862 for (x = 0; x < 8; x++)
2863 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
/*
 * VP8 2-D bilinear interpolation, 8 wide: horizontal pass into a temporary
 * buffer (8 bytes per row), then vertical pass.  h + 1 rows are produced
 * because the vertical pass reads the row below each output row.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2870 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2871 ptrdiff_t sstride, int h, int mx, int my)
2874 DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
2875 uint8_t *tmp = tmp_array;
2877 ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
2878 ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2880 int a = 8 - mx, b = mx;
2881 int c = 8 - my, d = my;
2883 uint8_t tmp_array[136];
2884 uint8_t *tmp = tmp_array;
2886 for (y = 0; y < h + 1; y++) {
2887 for (x = 0; x < 8; x++)
2888 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2895 for (y = 0; y < h; y++) {
2896 for (x = 0; x < 8; x++)
2897 dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
/*
 * VP8 horizontal bilinear interpolation, 4 wide:
 * out = (a*p0 + b*p1 + 4) >> 3 with weights a = 8 - mx, b = mx.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2904 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2905 ptrdiff_t sstride, int h, int mx, int my)
2908 int a = 8 - mx, b = mx;
/* Scalar reference for one row; presumably kept as documentation
 * (enclosing comment markers not visible in this excerpt). */
2915 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2916 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2917 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2918 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
/* ftmp0 := 0; ftmp4 := 3 (shift count); broadcast weights a and b. */
2921 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2922 "li %[tmp0], 0x03 \n\t"
2923 "mtc1 %[tmp0], %[ftmp4] \n\t"
2924 "pshufh %[a], %[a], %[ftmp0] \n\t"
2925 "pshufh %[b], %[b], %[ftmp0] \n\t"
/* One 4-wide row per loop iteration. */
2928 PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
2930 "addiu %[h], %[h], -0x01 \n\t"
2931 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2932 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2933 "bnez %[h], 1b \n\t"
2934 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2935 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2936 [ftmp4]"=&f"(ftmp[4]),
2937 [tmp0]"=&r"(tmp[0]),
2941 [dst]"+&r"(dst), [src]"+&r"(src),
2942 [a]"+&f"(a), [b]"+&f"(b)
2943 : [sstride]"r"((mips_reg)sstride),
2944 [dstride]"r"((mips_reg)dstride),
2945 [ff_pw_4]"f"(ff_pw_4)
/* Plain-C fallback. */
2949 int a = 8 - mx, b = mx;
2952 for (y = 0; y < h; y++) {
2953 for (x = 0; x < 4; x++)
2954 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/*
 * VP8 vertical bilinear interpolation, 4 wide:
 * out = (c*row0 + d*row1 + 4) >> 3 with weights c = 8 - my, d = my.
 * NOTE(review): interleaved lines are missing from this excerpt; comments only.
 */
2961 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2962 ptrdiff_t sstride, int h, int mx, int my)
2965 int c = 8 - my, d = my;
/* Scalar reference for one row; presumably kept as documentation
 * (enclosing comment markers not visible in this excerpt). */
2973 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2974 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2975 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2976 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
/* ftmp0 := 0; ftmp4 := 3 (shift count); broadcast weights c and d. */
2979 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2980 "li %[tmp0], 0x03 \n\t"
2981 "mtc1 %[tmp0], %[ftmp4] \n\t"
2982 "pshufh %[c], %[c], %[ftmp0] \n\t"
2983 "pshufh %[d], %[d], %[ftmp0] \n\t"
/* One 4-wide row per loop iteration. */
2986 PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
2988 "addiu %[h], %[h], -0x01 \n\t"
2989 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2990 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2991 "bnez %[h], 1b \n\t"
2992 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2993 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2994 [ftmp4]"=&f"(ftmp[4]),
2995 [tmp0]"=&r"(tmp[0]),
3000 [dst]"+&r"(dst), [src]"+&r"(src),
3001 [c]"+&f"(c), [d]"+&f"(d)
3002 : [sstride]"r"((mips_reg)sstride),
3003 [dstride]"r"((mips_reg)dstride),
3004 [ff_pw_4]"f"(ff_pw_4)
/* Plain-C fallback. */
3008 int c = 8 - my, d = my;
3011 for (y = 0; y < h; y++) {
3012 for (x = 0; x < 4; x++)
3013 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3020 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3021 ptrdiff_t sstride, int h, int mx, int my)
3024 DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3025 uint8_t *tmp = tmp_array;
3027 ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3028 ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3030 int a = 8 - mx, b = mx;
3031 int c = 8 - my, d = my;
3033 uint8_t tmp_array[36];
3034 uint8_t *tmp = tmp_array;
3036 for (y = 0; y < h + 1; y++) {
3037 for (x = 0; x < 4; x++)
3038 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3045 for (y = 0; y < h; y++) {
3046 for (x = 0; x < 4; x++)
3047 dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;